Esempio n. 1
0
File: stmp.py Progetto: ysm0128/stmp
		stmp_annotation_util.annotate_point(db_conn, args.vcf, args.scratch_dir, sample_db_path, print_sql_query_only=True, debug=args.debug_point_annotations)
		exit(0)
	
	if args.print_region_cmd_only:
		stmp_annotation_util.annotate_range(db_conn, args.vcf, args.scratch_dir, modules_yaml_path=args.modules, yaml_commands=yaml_commands, print_range_cmd_only=True, force_overwrite_beds=args.clean_beds)
		exit(0)
	
	# pgx doesn't depend on other annotations
	if(args.pgx_only):
		pgx(args)
		exit(0)
	
	# tiering requires annotation to already be done
	if(args.tiering_only):
		joined_outfile = stmp_annotation_util.generateJoinedOutfilePath(args.output_dir, stmp_annotation_util.getSampleName(args.vcf)) # generates path to outfile but does not create it
		stmp_annotation_checker.check_annotated_output(joined_outfile)
		tier_real(args, joined_outfile, yaml_commands)
		exit(0)
	
	if(args.annotate_only):
		joined_outfile = annotate(args)
		stmp_annotation_checker.check_annotated_output(joined_outfile)
		exit(0)
		
		
	######## DEFAULT #######
	
	# Run full pipeline (global, clinical, candidate, secondary, pgx)
	
	#annotation is common to all modes
	joined_outfile = annotate(args)
Esempio n. 2
0
def main():
	"""Command-line entry point for the pipeline.

	Parses the command line and the YAML configuration, handles the
	dataset download/check modes and the database maintenance options,
	and, when an input VCF is given, prepares it (path completion,
	multiallelic splitting, "chr" prefix stripping) before the annotation,
	tiering and pgx helpers defined further down in this function are
	dispatched.

	Exits with status 0 after the dataset-only modes and with status 1 when
	a database update is requested without an input directory or YAML file.
	"""
	# parse args
	parser, args = parse_args(sys.argv[1:])
	
	# parse YAML
	yaml_commands = yaml_utils.parse_yaml_input_files(args.yaml, args.modules)
	
	# dataset-only modes return before the database connection is opened
	if args.download_datasets_only:
		stmp_annotation_util.downloadDBs(yaml_commands, args.dataset_output_dir, args.log)
		sys.exit(0)
	elif args.check_datasets_only:
		stmp_annotation_util.checkDBs(yaml_commands, args.dataset_output_dir)
		sys.exit(0)
	
	# open the connection to the database
	db_conn = stmp_annotation_util.connect_db(db_file=args.database_file, host_name='', user_name='', db_name='', unix_socket_loc='')
	
	if args.drop_samples:
		stmp_annotation_util.drop_samples(db_conn)
	
	if args.update_db: # check if database setup is requested
		if args.input_directory is not None:
			# Complete the filepath if an absolute filepath is not provided.
			# The completed path is stored back on args.input_directory so
			# that setup_db actually receives it.
			args.input_directory = stmp_annotation_util.root_or_cwd(args.input_directory)
			stmp_annotation_util.setup_db(args.input_directory, db_conn, not args.force_overwrite) # Launch DB updating process.
		elif args.yaml is not None:
			stmp_annotation_util.setup_db_yaml(db_conn, yaml_commands, not args.force_overwrite)
		else:
			print('Error: neither YAML nor directory with input datasets specified')
			parser.print_help()
			sys.exit(1)
	
	if args.test:
		# use our test data as the input vcf
		testing_cfg = yaml_commands[yaml_keys.kModules][yaml_keys.kTesting]
		args.vcf = general_utils.root_or_code_dir(testing_cfg[yaml_keys.kTeTestDatasetPath])
		if args.output_dir is None:
			args.output_dir = general_utils.root_or_code_dir(testing_cfg[yaml_keys.kTeTestDefaultOutPath])
	
	if args.vcf is not None: # annotation
		
		# run preflight checks
		stmp_preflight_checker.preflight_checks(yaml_commands, db_conn)
		# TODO split into separate checks based on what we're running (e.g. just annotation, just tiering, etc.)
		
		# Files and Directories
		args.vcf = stmp_annotation_util.root_or_cwd(args.vcf) # complete the filepath if an absolute filepath is not provided.
		args.output_dir = stmp_annotation_util.root_or_cwd(args.output_dir) # ditto
		args.scratch_dir = os.path.join(args.output_dir, 'scratch')
		if not os.path.exists(args.scratch_dir):
			os.makedirs(args.scratch_dir)
		
		# Convert multiallelic to single line for easier merging of functional annotations later.
		# The print-only modes and --reuse_multiallelic reuse an existing split file (skip_if_exists=True).
		if (args.reuse_multiallelic and not args.skip_multiallelic) or args.print_sql_query_only or args.print_region_cmd_only:
			args.vcf = stmp_annotation_util.splitMultiallelic(args.vcf, args.scratch_dir, skip_if_exists=True)
		elif not args.skip_multiallelic:
			args.vcf = stmp_annotation_util.splitMultiallelic(args.vcf, args.scratch_dir)
		else:
			print('Skipping multiallelic check')
		
		# strip chr prefix
		args.vcf = stmp_annotation_util.stripChrPrefix(args.vcf, args.scratch_dir, skip_if_exists=True)
		
		# extract FORMAT tags
		# TODO finish
	# 	modifiedVCF = 
	
	############### MAIN FUNCTIONS (ANNOTATION, TIERING, PGX) ################
		
		# ANNOTATION MAIN FUNCTION
		def annotate(args):
			"""Annotate args.vcf and return the path of the joined annotation file.

			Launches snpEff and ANNOVAR, uploads the VCF to the database, runs the
			region and point annotations, waits for the two launched processes
			and joins all results into one file under args.output_dir.

			Two flags end the process early with exit status 0:
			- args.region_annotations_only: only the region annotation is run.
			- args.point_annotations_only: snpEff, ANNOVAR and region annotation
			  are skipped and the process exits after the point annotation.

			Uses db_conn and yaml_commands from the enclosing main() scope.
			"""
			def _annotate_regions():
				# Find annotations which cover a genomic interval.  This is done with BEDtools.
				return stmp_annotation_util.annotate_range(db_conn, args.vcf, args.scratch_dir, modules_yaml_path=args.modules, datasets_yaml_path=args.yaml, yaml_commands=yaml_commands, force_overwrite_beds=args.clean_beds)
			
			if args.region_annotations_only:
				# region annotation does not require sample vcf to be uploaded to our db
				_annotate_regions()
				sys.exit(0)
			
			# Functional variant effect prediction (snpEff).
			# Launched first: it runs in the background and is only waited for further down.
			if not args.point_annotations_only:
				[snpeff_proc, sample_name, snpeff_vcf_outfile] = stmp_annotation_util.snpeff(args.vcf, args.scratch_dir, yaml_commands)
			
			# Actual annotation
			# upload vcf
			sample_db_path = stmp_annotation_util.upload_vcf(db_conn, args.vcf, args.scratch_dir, args.force_input_overwrite)
			
			# annovar + region annotation
			if not args.point_annotations_only:
				[annovar_proc, annovar_outfile] = stmp_annotation_util.annovar_annotate_functional(args.vcf, args.scratch_dir)
				region_outfiles = _annotate_regions()
			
			# point annotation: find annotations which are associated with a single locus.  This is done with a SQL join.
			point_outfiles = stmp_annotation_util.annotate_point(db_conn, args.vcf, args.scratch_dir, sample_db_path, debug=args.debug_point_annotations)
			if args.point_annotations_only:
				sys.exit(0) # stop after point annotation done
			
			# wait for snpeff and annovar to complete, if they haven't already.
			print('Waiting for snpeff to complete...')
			snpeff_proc.wait()
			print('Waiting for annovar to complete...')
			annovar_proc.wait()
			
			# convert snpeff vcf to tsv (remove all lines with '#')
			snpeff_tsv_outfile = stmp_annotation_util.snpeff2tsv(sample_name, snpeff_vcf_outfile, args.scratch_dir)
			
			# join the results into a single file
			joined_outfile = stmp_annotation_util.joinFiles(args.vcf, snpeff_tsv_outfile, annovar_outfile, region_outfiles, point_outfiles, args.output_dir, args.skip_join_checks, yaml_commands=yaml_commands, skip_annovar=args.skip_annovar)
			
			# TODO may need to clean up temporary range annotation files (in output/scratch dir) to avoid issues with region annotation.
			
			print('Done annotating ' + joined_outfile)
			return joined_outfile
		
		# end annotation function
		
		
		#######################################################################
		####### Variant Tiering ##############
		
		## HELPER FUNCTIONS
		#wrapper to call mapped processes
		def call_process(command):
			"""Run *command* with os.system and return a status message.

			The command is echoed to stdout before it runs.  The returned string
			is "Return code for command <command>:<status>", where <status> is
			the raw value returned by os.system.
			"""
			print("Processing command: "+command)
			return_code = os.system(command)
			return "Return code for command "+command+":"+str(return_code)
		   
		#function to add rsid to GATK output
		def add_rsid(intervals, in_file, out_file):
			"""Copy in_file to out_file, filling the third column from the intervals file.

			intervals: tab-separated file; for every line without a "#", columns
				1 and 2 form the key "col1:col2" and column 3 (with "target_" and
				the newline removed) is the value written into the output.
			in_file: tab-separated input, opened with
				stmp_annotation_util.open_compressed_or_regular.
			out_file: path of the file to write.

			Lines of in_file containing "#" are copied unchanged.  Other lines are
			written with their third column replaced when their "col1:col2" key
			is present in the intervals file; lines whose key is absent are not
			written at all.
			"""
			# Build the "col1:col2" -> id lookup from the intervals file.
			rsid_by_locus = {}
			with open(intervals, "r") as intervals_fh:
				for line in intervals_fh:
					if "#" not in line:
						fields = line.split("\t")
						rsid_by_locus[fields[0]+":"+fields[1]] = fields[2].replace("target_", "").replace("\n", "")
			
			# try/finally rather than "with": the object returned by the helper
			# is only known to provide readline() and close().
			in_fh = stmp_annotation_util.open_compressed_or_regular(in_file, "r")
			try:
				with open(out_file, "w") as out_fh:
					while True:
						line = in_fh.readline()
						if not line:
							break
						if "#" in line:
							# header/comment line: pass through untouched
							out_fh.write(line)
							continue
						fields = line.split("\t")
						locus = fields[0]+":"+fields[1]
						if locus in rsid_by_locus:
							out_fh.write(fields[0]+"\t"+fields[1]+"\t"+rsid_by_locus[locus]+"\t"+"\t".join(fields[3:]))
			finally:
				in_fh.close()
		
		
		# TIERING (VARIANT PRIORITIZATION) MAIN FUNCTION
		def tier(args, annotation_joined_outfile, yaml_cmds, output_dir=None, tier_name='Global'):
			"""Run variant prioritization (tiering) on an annotated, joined file.

			args: parsed command-line arguments; output_dir, target_genes,
				ethnicity and sfs_file are read.
			annotation_joined_outfile: path of the joined annotation file.
			yaml_cmds: parsed YAML configuration, passed on to tiers_allvars.
			output_dir: directory for the tiering output; defaults to
				args.output_dir.
			tier_name: currently unused; kept for interface compatibility.

			Output is written using the prefix <output_dir>/tiering_allvars.  When
			an SFS file is configured, tiers 1-4 are additionally SFS-filtered.
			"""
			print('Performing variant prioritization')
			
			base_dir = args.output_dir if output_dir is None else output_dir
			# Used as a path prefix below: per-tier files are "<prefix>.tier<N>.txt".
			output_prefix = os.path.join(base_dir, 'tiering_allvars')
			
			# targeted tiering only if BAM file provided? (TODO)
			
			# standard VCF prioritization (tiering)
			stmp_tiering_util.tiers_allvars(annotation_joined_outfile, output_prefix, args.target_genes, pop=args.ethnicity, yaml_cmds=yaml_cmds)
			
			# SFS filtering (TODO)
			# "None" (the string) is the existing "no file" sentinel; a real None is
			# treated the same way so it is never handed to filter_sfs as a path.
			if args.sfs_file is not None and args.sfs_file != "None":
				print('Performing SFS filtering (check output at ' + str(output_prefix) + ')')
				for tier_number in range(1, 5):
					tier_path = output_prefix + '.tier' + str(tier_number)
					stmp_tiering_util.filter_sfs(tier_path + '.txt', args.sfs_file, tier_path + "-sfs_filtered.txt", 2)
		
		# end tiering function
		
		# real tiering function (uses above function as helper)
		def tier_real(args, joined_outfile, yaml_commands):
			"""Tier the joined annotation file against every configured gene list.

			Runs tier() once per gene list, each into its own sub-directory of
			args.output_dir:
			1. 'Candidate' - the user-specified args.target_genes, if given
			2. 'Global'    - no gene list (args.target_genes is set to None)
			3. one directory per gene list named in the YAML tiering module

			args.target_genes is overwritten along the way.  Returns the path
			returned by stmp_tiering_util.tiers2xls for the combined output.
			"""
			def _make_out_dir(name):
				# Return <args.output_dir>/<name>, creating the directory if it is missing.
				path = os.path.join(args.output_dir, name)
				if not os.path.isdir(path):
					os.makedirs(path)
				return path
			
			produced_dirs = []
			
			# 1. candidate genes (user-specified target gene list)
			if args.target_genes is not None:
				args.target_genes = general_utils.root_or_cwd(args.target_genes)
				candidate_dir = _make_out_dir('Candidate')
				print('Tiering candidate genes (user-specified)')
				tier(args, joined_outfile, yaml_cmds=yaml_commands, output_dir=candidate_dir)
				produced_dirs.append(candidate_dir)
			
			# 2. global
			global_dir = _make_out_dir('Global')
			args.target_genes = None # forces global tiering (no filtering based on a certain gene list)
			print('Tiering all genes (Global)')
			tier(args, joined_outfile, yaml_cmds=yaml_commands, output_dir=global_dir)
			produced_dirs.append(global_dir)
			
			# 3. other gene lists (specified in YAML)
			gene_lists = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTTargetGeneLists]
			for list_name in gene_lists:
				# ignore all "clinical" gene lists if we are suppressing incidental findings (e.g. for family members)
				if args.suppress_incidentals and 'clinical' in list_name.lower():
					print('Note: Skipping tiering for ' + str(list_name) + ' to avoid incidental findings.')
					continue
				list_dir = _make_out_dir(list_name)
				args.target_genes = yaml_utils.get_abs_path(gene_lists[list_name])
				print('Tiering ' + str(list_name) + ' genes')
				tier(args, joined_outfile, yaml_cmds=yaml_commands, output_dir=list_dir)
				produced_dirs.append(list_dir)
			
			# Generate final output as excel workbook
			workbook_path = stmp_tiering_util.tiers2xls(produced_dirs, args.output_dir, yaml_commands)
			print('**** Tiering output written to ' + str(workbook_path) + ' *****')
			return workbook_path
		
		#################################################################################################
		
		########### Pharmgkb and ClinVar Annotation #######################
	
		# PGX MAIN FUNCTION (PHARMGKB AND CLINVAR ANNOTATION)
		def pgx(args, output_dir=None):
			"""Run the PharmGKB + ClinVar (pgx) annotation on args.vcf.

			output_dir: directory for the results; defaults to
				<args.output_dir>/pgx.  The chosen directory is stored on
				args.output and created if it does not exist.

			Both pgxUtils steps receive the output prefix
			<args.output>/pharmacogenomics and read their reference data from
			below resources_path.
			"""
			args.output = os.path.join(args.output_dir, 'pgx') if output_dir is None else output_dir
			if not os.path.isdir(args.output):
				os.makedirs(args.output)
			
			out_prefix = os.path.join(args.output, "pharmacogenomics")
			
			print(str(datetime.datetime.now()) + ': Performing PharmGKB + ClinVar (pgx) annotation')
			# pharmgkb annotation
			snv_metadata = os.path.join(resources_path, "pgx_vars", "clinical_ann_metadata-snvs.txt")
			pgxUtils.pgx_annotator(args.vcf, snv_metadata, out_prefix)
			haplotype_dir = os.path.join(resources_path, "pgx_haps/")
			pgxUtils.star_caller(haplotype_dir, args.vcf, out_prefix)
			print(str(datetime.datetime.now()) + ': Done with pgx/clinvar annotation')
		# end pgx function
		
		
	
	###################### MAIN CODE (logic to call main functions) ###############
	
		#just print sql query if specified
		# Each "*_only" mode below runs its step and ends the process with
		# status 0; if none is selected the full pipeline runs.
		
		# just print sql query if specified
		if args.print_sql_query_only:
			sample_db_path = stmp_annotation_util.get_sample_db_path(args.scratch_dir, stmp_annotation_util.getSampleName(args.vcf))
			stmp_annotation_util.annotate_point(db_conn, args.vcf, args.scratch_dir, sample_db_path, print_sql_query_only=True, debug=args.debug_point_annotations)
			sys.exit(0)
		
		# in all the cases below, output the yaml commands to the output dir
		yaml_utils.write_output_yaml_files(yaml_commands, args.output_dir)
		
		if args.print_region_cmd_only:
			stmp_annotation_util.annotate_range(db_conn, args.vcf, args.scratch_dir, modules_yaml_path=args.modules, datasets_yaml_path=args.yaml, yaml_commands=yaml_commands, print_range_cmd_only=True, force_overwrite_beds=args.clean_beds)
			sys.exit(0)
		
		# pgx doesn't depend on other annotations
		if args.pgx_only:
			pgx(args)
			sys.exit(0)
		
		# tiering requires annotation to already be done
		if args.tiering_only:
			# generates path to outfile but does not create it
			joined_outfile = stmp_annotation_util.generateJoinedOutfilePath(args.output_dir, stmp_annotation_util.getSampleName(args.vcf))
			stmp_annotation_checker.check_annotated_output(joined_outfile)
			tier_real(args, joined_outfile, yaml_commands)
			sys.exit(0)
		
		if args.annotate_only:
			joined_outfile = annotate(args)
			stmp_annotation_checker.check_annotated_output(joined_outfile)
			sys.exit(0)
		
		
		######## DEFAULT #######
		
		# Run full pipeline (global, clinical, candidate, secondary, pgx)
		
		# annotation is common to all modes
		joined_outfile = annotate(args)
		# check annotated output file
		stmp_annotation_checker.check_annotated_output(joined_outfile)
		
		# tier
		tier_real(args, joined_outfile, yaml_commands)
		
		# 5. pgx - goes in its own folder by default
		pgx(args)
		
		
		# remove sample from database if desired
		if args.drop_sample:
			stmp_annotation_util.drop_sample(db_conn, args.vcf)
		
		# clean up samples in database if needed
		if args.drop_samples:
			stmp_annotation_util.drop_samples(db_conn)