def main():
    """CLI entry point for the Broad co-cleaning (indel realignment + BQSR) pipeline.

    Takes one or more harmonized bams (or a pre-built .list file of them),
    runs GATK RealignerTargetCreator + IndelRealigner jointly over the set,
    then per-bam BaseRecalibrator + PrintReads, validating each output.
    Run/timing state is recorded in a per-uuid sqlite db next to the log.
    """
    parser = argparse.ArgumentParser('Broad cocleaning (Indel realignment and BQSR) pipeline')
    # Logging flags.
    parser.add_argument('-d', '--debug',
                        action='store_const', const=logging.DEBUG, dest='level',
                        help='Enable debug logging.',
                        )
    parser.set_defaults(level=logging.INFO)
    # Required flags.
    parser.add_argument('-r', '--reference_fasta_name', required=True,
                        help='Reference fasta path.',
                        )
    parser.add_argument('-indel', '--known_1k_genome_indel_sites', required=True,
                        help='Reference INDEL path.',
                        )
    parser.add_argument('-snp', '--dbsnp_known_snp_sites', required=True,
                        help='Reference SNP path.',
                        )
    parser.add_argument('-b', '--harmonized_bam_path', required=False, action='append',
                        help='Source bam path.',
                        )
    parser.add_argument('-list', '--harmonized_bam_list_path', required=False,
                        help='Source bam list path.',
                        )
    parser.add_argument('-s', '--scratch_dir', required=False, type=is_dir,
                        help='Scratch file directory.',
                        )
    parser.add_argument('-l', '--log_dir', required=False, type=is_dir,
                        help='Log file directory.',
                        )
    parser.add_argument('-j', '--thread_count', required=True, type=is_nat,
                        help='Maximum number of threads for execution.',
                        )
    parser.add_argument('-u', '--uuid', required=True,
                        help='analysis_id string',
                        )
    parser.add_argument('-m', '--md5', required=False, action='store_true',
                        help='calculate final size/MD5',
                        )
    parser.add_argument('-e', '--eliminate_intermediate_files', required=False, action='store_true',
                        help='do not (really) reduce disk usage. set if you want to use more disk space!'
                        )
    args = parser.parse_args()

    reference_fasta_name = args.reference_fasta_name
    known_1k_genome_indel_sites = args.known_1k_genome_indel_sites
    dbsnp_known_snp_sites = args.dbsnp_known_snp_sites
    uuid = args.uuid
    harmonized_bam_path = args.harmonized_bam_path

    # BUG FIX: the original crashed with a TypeError (None[0]) when neither
    # -b nor -list was supplied; fail fast with a clear usage error instead.
    if not args.harmonized_bam_list_path and not harmonized_bam_path:
        parser.error('at least one of -b/--harmonized_bam_path or '
                     '-list/--harmonized_bam_list_path is required')

    if not args.harmonized_bam_list_path:
        # No list file given: write one next to the first input bam.
        list_dir = os.path.dirname(harmonized_bam_path[0])
        harmonized_bam_list_path = os.path.join(list_dir, uuid + '_harmonized_bam_list.list')
        with open(harmonized_bam_list_path, "w") as handle:
            for bam in harmonized_bam_path:
                handle.write(bam + "\n")
    else:
        harmonized_bam_list_path = args.harmonized_bam_list_path

    # Scratch and log dirs default to the directory holding the bam list.
    scratch_dir = args.scratch_dir if args.scratch_dir else os.path.dirname(harmonized_bam_list_path)
    log_dir = args.log_dir if args.log_dir else os.path.dirname(harmonized_bam_list_path)
    thread_count = str(args.thread_count)
    # NOTE: -e is deliberately inverted — per its help text, PASSING the flag
    # keeps intermediate files; the default is to eliminate them.
    eliminate_intermediate_files = not args.eliminate_intermediate_files
    md5 = args.md5

    # Logging
    logging.basicConfig(
        filename=os.path.join(log_dir, 'Broad_cocleaning_' + uuid + '.log'),  # /host for docker
        level=args.level,
        filemode='a',
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s' % hostname)
    logger.info('harmonized_bam_list_path=%s' % harmonized_bam_list_path)

    # If bams were not given on the command line, read them from the list file.
    if not args.harmonized_bam_path:
        with open(harmonized_bam_list_path) as f:
            harmonized_bam_path = f.read().splitlines()
    for path in harmonized_bam_path:
        logger.info('harmonized_bam_path=%s' % path)

    # Per-run sqlite db for step/timing bookkeeping.
    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_Broad_cocleaning.db')
    engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Pipeline
    # Ensure .bai index files exist (samtools index if missing).
    RealignerTargetCreator.index(uuid, harmonized_bam_list_path, engine, logger)
    # RealignerTargetCreator over the whole harmonized bam list.
    harmonized_bam_intervals_path = RealignerTargetCreator.RTC(
        uuid, harmonized_bam_list_path, thread_count, reference_fasta_name,
        known_1k_genome_indel_sites, engine, logger)
    # IndelRealigner jointly, keeping reads in outputs corresponding to the
    # input bam each read came from.
    harmonized_IR_bam_list_path = IndelRealigner.IR(
        uuid, harmonized_bam_list_path, reference_fasta_name,
        known_1k_genome_indel_sites, harmonized_bam_intervals_path, engine, logger)
    # Per-bam BQSR table, then apply it with PrintReads and validate.
    Analysis_ready_bam_list_path = []
    for bam in harmonized_IR_bam_list_path:
        harmonized_IR_bam_BQSR_table_path = BaseRecalibrator.BQSR(
            uuid, bam, thread_count, reference_fasta_name, dbsnp_known_snp_sites, engine, logger)
        Analysis_ready_bam_path = PrintReads.PR(
            uuid, bam, thread_count, reference_fasta_name,
            harmonized_IR_bam_BQSR_table_path, engine, logger)
        bam_validate.bam_validate(uuid, Analysis_ready_bam_path, engine, logger)
        Analysis_ready_bam_list_path.append(Analysis_ready_bam_path)

    if md5:
        # Record size/MD5 for each final bam and its sibling .bai.
        for bam in Analysis_ready_bam_list_path:
            bam_dir = os.path.dirname(bam)
            bam_basename, _bam_ext = os.path.splitext(os.path.basename(bam))
            bai_path = os.path.join(bam_dir, bam_basename + '.bai')
            verify_util.store_md5_size(uuid, bam, engine, logger)
            verify_util.store_md5_size(uuid, bai_path, engine, logger)

    if eliminate_intermediate_files:
        pipe_util.remove_file_list(uuid, harmonized_IR_bam_list_path, engine, logger)

    # Final validation pass over the analysis-ready bams.
    for bam in Analysis_ready_bam_list_path:
        bam_validate.bam_validate(uuid, bam, engine, logger)
def main():
    """CLI entry point for the miRNA harmonization pipeline.

    Validates the input bam, extracts readgroups and fastq, re-aligns each
    readgroup with bwa, then sorts/merges with Picard and re-validates.
    Run state is recorded in a per-uuid sqlite db in the log directory.
    """
    parser = argparse.ArgumentParser('miRNA harmonization')
    # Logging flag
    parser.add_argument(
        '-d', '--debug',
        action='store_const',
        const=logging.DEBUG,
        dest='level',
        help='Enable debug logging.',
    )
    parser.set_defaults(level=logging.INFO)
    # Required flags
    parser.add_argument(
        '-r', '--reference_fasta_path',
        required=False,
        help='Reference fasta path.',
    )
    # NOTE(review): the default is a *list* wrapping sys.stdin, but the value
    # is later used as a single filesystem path (os.path.dirname), so the
    # default can never work — -b is effectively required. TODO confirm intent.
    parser.add_argument(
        '-b', '--bam_path',
        nargs='?',
        default=[sys.stdin],
        help='Source bam path.',
    )
    parser.add_argument(
        '-l', '--log_dir',
        required=False,
        type=is_dir,
        help='Log file directory.',
    )
    parser.add_argument(
        '-u', '--uuid',
        required=False,
        help='analysis_id string',
    )
    args = parser.parse_args()
    reference_fasta_path = args.reference_fasta_path
    preharmonized_bam_path = args.bam_path
    log_dir = args.log_dir
    uuid = args.uuid

    # Logging Setup
    logging.basicConfig(
        filename=os.path.join(log_dir, 'aln_' + uuid + '.log'),
        filemode='a',
        level=args.level,
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s' % hostname)
    logger.info('preharmonized_bam_path=%s' % preharmonized_bam_path)

    # Per-run sqlite db for step/timing bookkeeping.
    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_harmonize.db')
    engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Validate + stats on the input, then dump readgroups and fastq.
    bam_validate.bam_validate(uuid, preharmonized_bam_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, preharmonized_bam_path, reference_fasta_path, engine, logger)
    readgroup_path_dict = bam_util.write_readgroups(uuid, preharmonized_bam_path, engine, logger)
    bam_util.bam_to_fastq(uuid, preharmonized_bam_path, engine, logger)
    # Fastq files are written to a 'fastq' dir next to the input bam.
    top_dir = os.path.dirname(preharmonized_bam_path)
    fastq_dir = os.path.join(top_dir, 'fastq')
    fastq_validate.fastq_validate(uuid, fastq_dir, engine, logger)

    # Harmonization
    be_lenient = False
    harmonized_readgroup_bam_path_list = bam_util.bwa(
        uuid, preharmonized_bam_path, reference_fasta_path, readgroup_path_dict, engine, logger)
    # BUG FIX: dropped the unused fastq_path_list comprehension (pure, no side
    # effects). buildfastqlist is kept in case it has logging side effects.
    fastq_list = fastq_util.buildfastqlist(fastq_dir, logger)
    for harmonized_readgroup_bam_path in harmonized_readgroup_bam_path_list:
        # Be lenient downstream if any readgroup bam is an 'aln' bam
        # (see pipe_util.is_aln_bam for the actual criterion).
        if pipe_util.is_aln_bam(harmonized_readgroup_bam_path, logger):
            be_lenient = True
    harmonized_sorted_bam_path_list = picard_bam_sort.bam_sort(
        uuid, preharmonized_bam_path, harmonized_readgroup_bam_path_list,
        reference_fasta_path, engine, logger, be_lenient)
    harmonized_bam_merge_path = picard_bam_merge.bam_merge(
        uuid, preharmonized_bam_path, harmonized_sorted_bam_path_list, engine, logger, be_lenient)
    # Validate + stats on the merged, harmonized bam.
    bam_validate.bam_validate(uuid, harmonized_bam_merge_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, harmonized_bam_merge_path, reference_fasta_path, engine, logger)
def main():
    """CLI entry point for the miRNA harmonization pipeline.

    Validates the input bam, extracts readgroups and fastq, re-aligns each
    readgroup with bwa, then sorts/merges with Picard and re-validates.
    Run state is recorded in a per-uuid sqlite db in the log directory.
    """
    parser = argparse.ArgumentParser('miRNA harmonization')
    # Logging flag
    parser.add_argument('-d', '--debug',
                        action='store_const', const=logging.DEBUG, dest='level',
                        help='Enable debug logging.',
                        )
    parser.set_defaults(level=logging.INFO)
    # Required flags
    parser.add_argument('-r', '--reference_fasta_path', required=False,
                        help='Reference fasta path.',
                        )
    # NOTE(review): the default is a *list* wrapping sys.stdin, but the value
    # is later used as a single filesystem path (os.path.dirname), so the
    # default can never work — -b is effectively required. TODO confirm intent.
    parser.add_argument('-b', '--bam_path', nargs='?', default=[sys.stdin],
                        help='Source bam path.',
                        )
    parser.add_argument('-l', '--log_dir', required=False, type=is_dir,
                        help='Log file directory.',
                        )
    parser.add_argument('-u', '--uuid', required=False,
                        help='analysis_id string',
                        )
    args = parser.parse_args()
    reference_fasta_path = args.reference_fasta_path
    preharmonized_bam_path = args.bam_path
    log_dir = args.log_dir
    uuid = args.uuid

    # Logging Setup
    logging.basicConfig(
        filename=os.path.join(log_dir, 'aln_' + uuid + '.log'),
        filemode='a',
        level=args.level,
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s' % hostname)
    logger.info('preharmonized_bam_path=%s' % preharmonized_bam_path)

    # Per-run sqlite db for step/timing bookkeeping.
    engine_path = 'sqlite:///' + os.path.join(log_dir, uuid + '_harmonize.db')
    engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')

    # Validate + stats on the input, then dump readgroups and fastq.
    bam_validate.bam_validate(uuid, preharmonized_bam_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, preharmonized_bam_path, reference_fasta_path, engine, logger)
    readgroup_path_dict = bam_util.write_readgroups(uuid, preharmonized_bam_path, engine, logger)
    bam_util.bam_to_fastq(uuid, preharmonized_bam_path, engine, logger)
    # Fastq files are written to a 'fastq' dir next to the input bam.
    top_dir = os.path.dirname(preharmonized_bam_path)
    fastq_dir = os.path.join(top_dir, 'fastq')
    fastq_validate.fastq_validate(uuid, fastq_dir, engine, logger)

    # Harmonization
    be_lenient = False
    harmonized_readgroup_bam_path_list = bam_util.bwa(
        uuid, preharmonized_bam_path, reference_fasta_path, readgroup_path_dict, engine, logger)
    # BUG FIX: dropped the unused fastq_path_list comprehension (pure, no side
    # effects). buildfastqlist is kept in case it has logging side effects.
    fastq_list = fastq_util.buildfastqlist(fastq_dir, logger)
    for harmonized_readgroup_bam_path in harmonized_readgroup_bam_path_list:
        # Be lenient downstream if any readgroup bam is an 'aln' bam
        # (see pipe_util.is_aln_bam for the actual criterion).
        if pipe_util.is_aln_bam(harmonized_readgroup_bam_path, logger):
            be_lenient = True
    harmonized_sorted_bam_path_list = picard_bam_sort.bam_sort(
        uuid, preharmonized_bam_path, harmonized_readgroup_bam_path_list,
        reference_fasta_path, engine, logger, be_lenient)
    harmonized_bam_merge_path = picard_bam_merge.bam_merge(
        uuid, preharmonized_bam_path, harmonized_sorted_bam_path_list, engine, logger, be_lenient)
    # Validate + stats on the merged, harmonized bam.
    bam_validate.bam_validate(uuid, harmonized_bam_merge_path, engine, logger)
    bam_stats.do_samtools_flagstat(uuid, harmonized_bam_merge_path, reference_fasta_path, engine, logger)
def main():
    """CLI entry point for the bwa-mem harmonization pipeline over S3-hosted inputs.

    Fetches the target reference fasta and the source bam from S3, also fetches
    the bam's ORIGINAL reference (for stats), then re-aligns with bwa mem,
    sorts, merges, marks duplicates, validates, and stores size/MD5 records.
    Stats/timing are written to a postgres db.
    """
    parser = argparse.ArgumentParser('harmonization pipeline')
    # Logging flags.
    parser.add_argument('-d', '--debug',
                        action='store_const', const=logging.DEBUG, dest='level',
                        help='Enable debug logging.',
                        )
    parser.set_defaults(level=logging.INFO)
    # Required flags.
    parser.add_argument('-g', '--s3_reference_bucket', required=True,
                        help='S3 bucket name containing reference fasta.',
                        )
    parser.add_argument('-b', '--s3_bam_bucket', required=True,
                        help='S3 bucket name containing source bam file.',
                        )
    parser.add_argument('-r', '--reference_fasta_name', required=True,
                        help='Reference fasta S3 key name.',
                        )
    parser.add_argument('-a', '--bam_analysis_id', required=True,
                        help='Source bam file S3 key name.',
                        )
    parser.add_argument('-s', '--scratch_dir', required=True, type=is_dir,
                        help='Scratch file directory.',
                        )
    parser.add_argument('-l', '--log_dir', required=True, type=is_dir,
                        help='Log file directory.',
                        )
    parser.add_argument('-f', '--s3cfg_dir', required=True, type=is_dir,
                        help='.s3cfg file directory.',
                        )
    parser.add_argument('-t', '--thread_count', required=True, type=is_nat,
                        help='Maximum number of threads for execution.',
                        )
    parser.add_argument('-c', '--csv_stats', required=True,
                        help='Write to csv rather than postgres'
                        )
    parser.add_argument('-u', '--postgres_user', required=False,
                        help='postgres username'
                        )
    parser.add_argument('-p', '--postgres_password', required=False,
                        help='postgres password'
                        )
    parser.add_argument('-n', '--postgres_hostname', required=False,
                        help='postgres hostname'
                        )
    args = parser.parse_args()

    s3_reference_bucket = args.s3_reference_bucket
    s3_bam_bucket = args.s3_bam_bucket
    # S3 key names must not carry leading/trailing slashes.
    reference_fasta_name = args.reference_fasta_name.strip('/')
    bam_analysis_id = args.bam_analysis_id.strip('/')
    scratch_dir = args.scratch_dir
    log_dir = args.log_dir
    s3cfg_dir = args.s3cfg_dir
    thread_count = str(args.thread_count)
    # literal_eval parses the 'True'/'False' string safely (no eval()).
    csv_stats = literal_eval(args.csv_stats)
    postgres_user = args.postgres_user
    postgres_password = args.postgres_password
    postgres_hostname = args.postgres_hostname

    # Logging
    uuid = pipe_util.get_uuid_from_path(bam_analysis_id)
    logging.basicConfig(
        filename=os.path.join(log_dir, 'aln_' + uuid + '.log'),  # /host for docker
        level=args.level,
        filemode='a',
        format='%(asctime)s %(levelname)s %(message)s',
        datefmt='%Y-%m-%d_%H:%M:%S_%Z',
    )
    logging.getLogger('sqlalchemy.engine').setLevel(logging.INFO)
    logger = logging.getLogger(__name__)
    hostname = os.uname()[1]
    logger.info('hostname=%s' % hostname)

    # Open stats and timing db.
    if not csv_stats:
        if (postgres_user is None) or (postgres_password is None) or (postgres_hostname is None):
            # BUG FIX: was logger.debug(), invisible at the default INFO level.
            logger.error('must enter postgres user, password and hostname if not writing to csv (or use -c True)')
            sys.exit(1)
        engine_path = ('postgresql://' + postgres_user + ':' + postgres_password +
                       '@' + postgres_hostname + '/gdc_harmonize')
        engine = sqlalchemy.create_engine(engine_path, isolation_level='SERIALIZABLE')
    else:
        # BUG FIX: the original never created an engine for the csv case and
        # crashed later with NameError; fail fast with a clear message instead.
        # TODO(review): implement actual csv stats output.
        logger.error('csv stats output is not implemented')
        sys.exit(1)

    # Get reference genome.
    get_s3_objects(uuid, s3_reference_bucket, reference_fasta_name, scratch_dir, s3cfg_dir, engine, logger)
    reference_fasta_path = os.path.join(scratch_dir, reference_fasta_name)
    logger.info('scratch_dir=%s' % scratch_dir)
    logger.info('reference_fasta_name=%s' % reference_fasta_name)
    logger.info('reference_fasta_path=%s' % reference_fasta_path)

    # Get bam to be harmonized.
    get_s3_objects(uuid, s3_bam_bucket, bam_analysis_id, scratch_dir, s3cfg_dir, engine, logger)
    bam_path = bam_util.get_bam_path(bam_analysis_id, scratch_dir, logger)

    # Get the original reference genome the bam was aligned to (for stats).
    original_fasta_name = bam_util.get_bam_reference(bam_path, logger)
    if not (original_fasta_name.endswith('.fa') or original_fasta_name.endswith('.fasta')):
        original_fasta_name += '.fa'
    original_fasta_path = os.path.join(scratch_dir, original_fasta_name)
    get_s3_objects(uuid, s3_reference_bucket, original_fasta_name, scratch_dir, s3cfg_dir, engine, logger)

    # Pipeline
    # verify_util.verify_cgquery_md5(reference_fasta_name, logger)
    verify_util.verify_cgquery(uuid, bam_path, engine, logger)
    bam_validate.bam_validate(uuid, bam_path, engine, logger)
    bam_stats.bam_stats(uuid, bam_path, original_fasta_path, engine, logger)
    bam_util.bam_to_fastq(uuid, bam_path, engine, logger)
    fastq_length = fastq_validate.fastqc_validate(uuid, bam_path, thread_count, engine, logger)
    readgroup_path_dict = bam_util.write_readgroups(uuid, bam_path, engine, logger)  # to file and db
    # NOTE(review): a commented-out short-read branch (bwa aln keyed on
    # fastq_length < 70) used to live here; only the bwa mem path is active,
    # which is why fastq_length is currently unused.
    bam_path_list = bwa_mem.bwa_mem(uuid, bam_path, reference_fasta_path, readgroup_path_dict, thread_count, engine, logger)
    bam_sort_path_list = bam_sort.bam_sort(uuid, bam_path, bam_path_list, reference_fasta_path, thread_count, engine, logger)
    bam_merge_path = bam_merge.bam_merge(uuid, bam_path, bam_sort_path_list, engine, logger)
    bam_md_path = bam_mark_duplicates.bam_mark_duplicates(uuid, bam_merge_path, thread_count, engine, logger)
    bam_validate.bam_validate(uuid, bam_md_path, engine, logger)
    # bam_stats.bam_stats(uuid, bam_md_path, reference_fasta_path, engine, logger)
    # Record size/MD5 for the final bam and its index.
    verify_util.store_md5_size(uuid, bam_md_path, engine, logger)
    bai_md_path = bam_md_path + '.bai'
    verify_util.store_md5_size(uuid, bai_md_path, engine, logger)