def annotate(args):
    """Annotate ``args.vcf`` and return the path of the joined annotation file.

    Steps, in order: launch snpEff, upload the VCF to the database, launch
    ANNOVAR, run range (region) annotation, run point annotation, wait for
    the snpEff and ANNOVAR processes, convert the snpEff VCF to TSV, and join
    everything into one output file.

    Uses ``db_conn`` and ``yaml_commands`` from the enclosing scope.

    The interpreter exits with status 0 (nothing is returned) when
    ``args.region_annotations_only`` or ``args.point_annotations_only`` is set.
    """
    util = stmp_annotation_util

    if args.region_annotations_only:
        # Region annotation does not require the sample VCF to be uploaded to our db.
        util.annotate_range(
            db_conn, args.vcf, args.scratch_dir,
            modules_yaml_path=args.modules,
            datasets_yaml_path=args.yaml,
            yaml_commands=yaml_commands,
            force_overwrite_beds=args.clean_beds)
        exit(0)

    point_only = args.point_annotations_only

    # Functional variant effect prediction (snpEff). The process is launched
    # here and only waited on at the end, so it runs alongside the other steps.
    if not point_only:
        snpeff_proc, sample_name, snpeff_vcf_outfile = util.snpeff(
            args.vcf, args.scratch_dir, yaml_commands)

    # Actual annotation: upload the VCF first.
    sample_db_path = util.upload_vcf(
        db_conn, args.vcf, args.scratch_dir, args.force_input_overwrite)

    # ANNOVAR + region annotation.
    if not point_only:
        annovar_proc, annovar_outfile = util.annovar_annotate_functional(
            args.vcf, args.scratch_dir)
        # Find annotations which cover a genomic interval (done with BEDtools).
        region_outfiles = util.annotate_range(
            db_conn, args.vcf, args.scratch_dir,
            modules_yaml_path=args.modules,
            datasets_yaml_path=args.yaml,
            yaml_commands=yaml_commands,
            force_overwrite_beds=args.clean_beds)

    # Find annotations which are associated with a single locus (SQL join).
    point_outfiles = util.annotate_point(
        db_conn, args.vcf, args.scratch_dir, sample_db_path,
        debug=args.debug_point_annotations)

    if point_only:
        exit(0)  # stop after point annotation done

    # Wait for snpEff and ANNOVAR to complete, if they haven't already.
    print('Waiting for snpeff to complete...')
    snpeff_proc.wait()
    print('Waiting for annovar to complete...')
    annovar_proc.wait()

    # Convert snpEff VCF to TSV (remove all lines with '#').
    snpeff_tsv_outfile = util.snpeff2tsv(
        sample_name, snpeff_vcf_outfile, args.scratch_dir)

    # Join the results into a single file.
    joined_outfile = util.joinFiles(
        args.vcf, snpeff_tsv_outfile, annovar_outfile,
        region_outfiles, point_outfiles,
        args.output_dir, args.skip_join_checks,
        yaml_commands=yaml_commands,
        skip_annovar=args.skip_annovar)

    # TODO may need to clean up temporary range annotation files (in the
    # output/scratch dir) to avoid issues with region annotation, e.g.
    #   rm -f /tmp/stmp2/intersected/*
    #   mv /tmp/stmp2/intersected/* /tmp/stmp2/intersected/old/

    print('Done annotating ' + joined_outfile)
    return joined_outfile
# to be tested
def write_output_yaml(yaml_commands, output_dir):
    """Dump ``yaml_commands`` to ``<output_dir>/config.yml``.

    Args:
        yaml_commands: object passed to ``yaml.dump``.
        output_dir: directory in which ``config.yml`` is created (the file is
            opened in ``'w'`` mode, so an existing file is overwritten).

    Returns:
        The file object for ``config.yml``. It is returned still open, as
        before, so the caller is responsible for closing it; its contents
        have already been flushed.

    Raises:
        Whatever ``open`` or ``yaml.dump`` raises; the handle is closed first
        if the dump fails.
    """
    outfile = open(os.path.join(output_dir, 'config.yml'), 'w')
    try:
        yaml.dump(yaml_commands, outfile, default_flow_style=False)
        # Flush so the file on disk is complete even while the caller keeps
        # the handle open.
        outfile.flush()
    except Exception:
        # Do not leak the handle when serialisation fails.
        outfile.close()
        raise
    return outfile


###################### MAIN CODE (logic to call main functions) ###############

#just print sql query if specified
if args.print_sql_query_only:
    sample_db_path = stmp_annotation_util.get_sample_db_path(args.scratch_dir, stmp_annotation_util.getSampleName(args.vcf))
    stmp_annotation_util.annotate_point(db_conn, args.vcf, args.scratch_dir, sample_db_path, print_sql_query_only=True, debug=args.debug_point_annotations)
    exit(0)

if args.print_region_cmd_only:
    stmp_annotation_util.annotate_range(db_conn, args.vcf, args.scratch_dir, modules_yaml_path=args.modules, yaml_commands=yaml_commands, print_range_cmd_only=True, force_overwrite_beds=args.clean_beds)
    exit(0)

# pgx doesn't depend on other annotations
if(args.pgx_only):
    pgx(args)
    exit(0)

# tiering requires annotation to already be done
if(args.tiering_only):
    joined_outfile = stmp_annotation_util.generateJoinedOutfilePath(args.output_dir, stmp_annotation_util.getSampleName(args.vcf)) # generates path to outfile but does not create it
    stmp_annotation_checker.check_annotated_output(joined_outfile)
    tier_real(args, joined_outfile, yaml_commands)
    exit(0)

# NOTE(review): the statement below is cut off in this part of the file (its
# body is not visible), so it is preserved verbatim as a comment, not guessed at.
# if(args.annotate_only):
print str(datetime.datetime.now()) + ': Done with pgx/clinvar annotation' # end pgx function ###################### MAIN CODE (logic to call main functions) ############### #just print sql query if specified if args.print_sql_query_only: sample_db_path = stmp_annotation_util.get_sample_db_path(args.scratch_dir, stmp_annotation_util.getSampleName(args.vcf)) stmp_annotation_util.annotate_point(db_conn, args.vcf, args.scratch_dir, sample_db_path, print_sql_query_only=True, debug=args.debug_point_annotations) exit(0) if args.print_region_cmd_only: stmp_annotation_util.annotate_range(db_conn, args.vcf, args.scratch_dir, db_bed_dir=args.db_bed_dir, print_range_cmd_only=True) exit(0) # pgx doesn't depend on other annotations if(args.pgx_only): pgx(args) exit(0) # tiering requires annotation to already be done if(args.tiering_only): joined_outfile = stmp_annotation_util.generateJoinedOutfilePath(args.output_dir, stmp_annotation_util.getSampleName(args.vcf)) # generates path to outfile but does not create it tier(args, joined_outfile, yaml_cmds=yaml_commands) exit(0) if(args.annotate_only): annotate(args)
def main():
    """Command-line entry point: set up the database and run the STMP pipeline.

    Flow:
      1. Parse the command line and the YAML configuration.
      2. Handle the dataset download/check modes (each exits immediately).
      3. Open the database connection; optionally drop samples and/or
         (re)build the database from a directory or from the YAML config.
      4. If an input VCF is available (``args.vcf`` or the test dataset),
         run preflight checks, normalise paths, split multiallelic sites,
         strip the ``chr`` prefix, then dispatch to one of the modes
         (print-only, pgx only, tiering only, annotate only) or run the full
         pipeline: annotate -> check -> tier -> pgx.

    Exits via ``sys.exit`` for every early-terminating mode; status 1 is used
    when ``args.update_db`` is set but neither an input directory nor a YAML
    file was given.
    """
    # parse args
    parser, args = parse_args(sys.argv[1:])

    # parse YAML
    yaml_commands = yaml_utils.parse_yaml_input_files(args.yaml, args.modules)

    if args.download_datasets_only:
        stmp_annotation_util.downloadDBs(yaml_commands, args.dataset_output_dir, args.log)
        sys.exit(0)
    elif args.check_datasets_only:
        stmp_annotation_util.checkDBs(yaml_commands, args.dataset_output_dir)
        sys.exit(0)

    # open the connection to the database
    db_conn = stmp_annotation_util.connect_db(
        db_file=args.database_file, host_name='', user_name='', db_name='',
        unix_socket_loc='')

    if args.drop_samples:
        stmp_annotation_util.drop_samples(db_conn)

    if args.update_db:  # check if database setup is requested
        if args.input_directory is not None:
            # Complete the filepath if an absolute filepath is not provided.
            # The result must go back into input_directory (it was previously
            # stored in args.update_db, so setup_db never saw the completed path).
            args.input_directory = stmp_annotation_util.root_or_cwd(args.input_directory)
            # Launch DB updating process.
            stmp_annotation_util.setup_db(args.input_directory, db_conn, not args.force_overwrite)
        elif args.yaml is not None:
            stmp_annotation_util.setup_db_yaml(db_conn, yaml_commands, not args.force_overwrite)
        else:
            print('Error: neither YAML nor directory with input datasets specified')
            parser.print_help()
            sys.exit(1)

    if args.test:  # use our test data as the input vcf
        args.vcf = general_utils.root_or_code_dir(
            yaml_commands[yaml_keys.kModules][yaml_keys.kTesting][yaml_keys.kTeTestDatasetPath])
        if args.output_dir is None:
            args.output_dir = general_utils.root_or_code_dir(
                yaml_commands[yaml_keys.kModules][yaml_keys.kTesting][yaml_keys.kTeTestDefaultOutPath])

    # NOTE(review): everything below is treated as belonging to the
    # "a VCF was given" branch, because it relies on args.scratch_dir, which
    # is only assigned in that branch -- confirm against the original layout.
    if args.vcf is None:
        return

    # run preflight checks
    # TODO split into separate checks based on what we're running
    # (e.g. just annotation, just tiering, etc.)
    stmp_preflight_checker.preflight_checks(yaml_commands, db_conn)

    # Files and directories: complete the filepaths if absolute ones were not provided.
    args.vcf = stmp_annotation_util.root_or_cwd(args.vcf)
    args.output_dir = stmp_annotation_util.root_or_cwd(args.output_dir)
    args.scratch_dir = os.path.join(args.output_dir, 'scratch')
    if not os.path.exists(args.scratch_dir):
        os.makedirs(args.scratch_dir)

    # Convert multiallelic to single line for easier merging of functional annotations later
    if (args.reuse_multiallelic and not args.skip_multiallelic) or args.print_sql_query_only or args.print_region_cmd_only:
        args.vcf = stmp_annotation_util.splitMultiallelic(args.vcf, args.scratch_dir, skip_if_exists=True)
    elif not args.skip_multiallelic:
        args.vcf = stmp_annotation_util.splitMultiallelic(args.vcf, args.scratch_dir)
    else:
        print('Skipping multiallelic check')

    # strip chr prefix
    args.vcf = stmp_annotation_util.stripChrPrefix(args.vcf, args.scratch_dir, skip_if_exists=True)

    # extract FORMAT tags
    # TODO finish (modifiedVCF = ...)

    ############### MAIN FUNCTIONS (ANNOTATION, TIERING, PGX) ################

    # ANNOTATION MAIN FUNCTION
    def annotate(args):
        """Annotate ``args.vcf`` and return the path of the joined output file.

        Exits with status 0 instead of returning when
        ``args.region_annotations_only`` or ``args.point_annotations_only``
        is set. Uses ``db_conn`` and ``yaml_commands`` from ``main``.
        """
        if args.region_annotations_only:
            # region annotation does not require sample vcf to be uploaded to our db
            stmp_annotation_util.annotate_range(
                db_conn, args.vcf, args.scratch_dir,
                modules_yaml_path=args.modules, datasets_yaml_path=args.yaml,
                yaml_commands=yaml_commands, force_overwrite_beds=args.clean_beds)
            sys.exit(0)

        # Functional variant effect prediction (snpEff). This will chug in the
        # background; the script waits for the returned process at the end if
        # it hasn't terminated by that time.
        if not args.point_annotations_only:
            snpeff_proc, sample_name, snpeff_vcf_outfile = stmp_annotation_util.snpeff(
                args.vcf, args.scratch_dir, yaml_commands)

        # Actual annotation
        # upload vcf
        sample_db_path = stmp_annotation_util.upload_vcf(
            db_conn, args.vcf, args.scratch_dir, args.force_input_overwrite)

        # annovar + region annotation
        if not args.point_annotations_only:
            annovar_proc, annovar_outfile = stmp_annotation_util.annovar_annotate_functional(
                args.vcf, args.scratch_dir)
            # Find annotations which cover a genomic interval. This is done with BEDtools.
            region_outfiles = stmp_annotation_util.annotate_range(
                db_conn, args.vcf, args.scratch_dir,
                modules_yaml_path=args.modules, datasets_yaml_path=args.yaml,
                yaml_commands=yaml_commands, force_overwrite_beds=args.clean_beds)

        # Point annotation: find annotations which are associated with a
        # single locus. This is done with a SQL join.
        point_outfiles = stmp_annotation_util.annotate_point(
            db_conn, args.vcf, args.scratch_dir, sample_db_path,
            debug=args.debug_point_annotations)

        if args.point_annotations_only:
            sys.exit(0)  # stop after point annotation done

        # wait for snpeff and annovar to complete, if they haven't already.
        print('Waiting for snpeff to complete...')
        snpeff_proc.wait()
        print('Waiting for annovar to complete...')
        annovar_proc.wait()

        # convert snpeff vcf to tsv (remove all lines with '#')
        snpeff_tsv_outfile = stmp_annotation_util.snpeff2tsv(
            sample_name, snpeff_vcf_outfile, args.scratch_dir)

        # join the results into a single file
        joined_outfile = stmp_annotation_util.joinFiles(
            args.vcf, snpeff_tsv_outfile, annovar_outfile, region_outfiles,
            point_outfiles, args.output_dir, args.skip_join_checks,
            yaml_commands=yaml_commands, skip_annovar=args.skip_annovar)

        # TODO may need to clean up temporary range annotation files (in
        # output/scratch dir) to avoid issues with region annotation, e.g.
        #   rm -f /tmp/stmp2/intersected/*
        #   mv /tmp/stmp2/intersected/* /tmp/stmp2/intersected/old/

        print('Done annotating ' + joined_outfile)
        return joined_outfile
    # end annotation function

    #######################################################################
    ####### Variant Tiering ##############

    ## HELPER FUNCTIONS

    # wrapper to call mapped processes
    def call_process(command):
        """Run ``command`` through ``os.system`` and return a status string."""
        print("Processing command: " + command)
        status = "Return code for command " + command + ":" + str(os.system(command))
        return status

    # function to add rsid to GATK output
    def add_rsid(intervals, in_file, out_file):
        """Copy ``in_file`` to ``out_file``, filling column 3 from ``intervals``.

        ``intervals`` is read as tab-separated lines; lines containing ``#``
        are ignored. Columns 1 and 2 form the lookup key and column 3 (with
        ``target_`` and the newline removed) is the value written into the
        third column of matching ``in_file`` records.
        """
        rsdict = {}
        with open(intervals, "r") as interval_file:
            for line in interval_file:
                if "#" not in line:
                    linelist = line.split("\t")
                    rsdict[linelist[0] + ":" + linelist[1]] = linelist[2].replace("target_", "").replace("\n", "")

        in_handle = stmp_annotation_util.open_compressed_or_regular(in_file, "r")
        try:
            with open(out_file, "w") as out_handle:
                while 1:
                    temp = in_handle.readline()
                    if not temp:
                        break
                    if "#" not in temp:
                        linelist = temp.split("\t")
                        key = linelist[0] + ":" + linelist[1]
                        if key in rsdict:
                            out_handle.write(linelist[0] + "\t" + linelist[1] + "\t" + rsdict[key] + "\t" + "\t".join(linelist[3:]))
                    else:
                        # NOTE(review): the source's indentation was lost; this
                        # `else` is assumed to pair with the '#' test, so lines
                        # containing '#' are copied through unchanged -- confirm.
                        out_handle.write(temp)
        finally:
            # Close the input even if reading or writing fails.
            in_handle.close()

    # TIERING (VARIANT PRIORITIZATION) MAIN FUNCTION
    def tier(args, annotation_joined_outfile, yaml_cmds, output_dir=None, tier_name='Global'):
        """Run ``tiers_allvars`` on the joined annotation file, then optional SFS filtering.

        Output files use the prefix ``<output_dir>/tiering_allvars`` where
        ``output_dir`` defaults to ``args.output_dir``. ``tier_name`` is
        accepted but not used here.
        """
        print('Performing variant prioritization')
        if output_dir is None:
            output_dir = os.path.join(args.output_dir, 'tiering_allvars')
        else:
            output_dir = os.path.join(output_dir, 'tiering_allvars')

        # targeted tiering only if BAM file provided? (TODO)
        # stmp_tiering_util.tiers_target(os.path.join(args.output_dir, 'tiers_target.tsv'), os.path.join(args.output_dir, 'tiers_target'), args.target_genes, pop=args.ethnicity)

        # standard VCF prioritization (tiering)
        stmp_tiering_util.tiers_allvars(
            annotation_joined_outfile, output_dir, args.target_genes,
            pop=args.ethnicity, yaml_cmds=yaml_cmds)

        # SFS filtering (TODO)
        # NOTE(review): this compares against the *string* "None"; presumably
        # that is the argument's default value -- confirm in parse_args.
        if args.sfs_file != "None":
            print('Performing SFS filtering (check output at ' + str(output_dir) + ')')
            for i in range(1, 5):
                stmp_tiering_util.filter_sfs(
                    output_dir + '.tier' + str(i) + '.txt', args.sfs_file,
                    output_dir + '.tier' + str(i) + "-sfs_filtered.txt", 2)
    # end tiering function

    # real tiering function (uses above function as helper)
    def tier_real(args, joined_outfile, yaml_commands):
        """Tier for each gene list and return the path produced by ``tiers2xls``.

        Runs ``tier`` for the user-specified candidate genes (if any), for all
        genes ("Global"), and for every gene list named in the YAML tiering
        module. ``args.target_genes`` is overwritten along the way.
        """
        # tiering is separate
        tiering_output_dirs = []

        # 1. candidate genes (user-specified target gene list)
        if args.target_genes is not None:
            args.target_genes = general_utils.root_or_cwd(args.target_genes)
            candidate_out_dir = os.path.join(args.output_dir, 'Candidate')
            if not os.path.isdir(candidate_out_dir):
                os.makedirs(candidate_out_dir)
            print('Tiering candidate genes (user-specified)')
            tier(args, joined_outfile, yaml_cmds=yaml_commands, output_dir=candidate_out_dir)
            tiering_output_dirs.append(candidate_out_dir)

        # 2. global
        global_tiering_out_dir = os.path.join(args.output_dir, 'Global')
        if not os.path.isdir(global_tiering_out_dir):
            os.makedirs(global_tiering_out_dir)
        args.target_genes = None  # forces global tiering (no filtering based on a certain gene list)
        print('Tiering all genes (Global)')
        tier(args, joined_outfile, yaml_cmds=yaml_commands, output_dir=global_tiering_out_dir)
        tiering_output_dirs.append(global_tiering_out_dir)

        # 3. other gene lists (specified in YAML)
        tiering_gene_lists = yaml_commands[yaml_keys.kModules][yaml_keys.kTiering][yaml_keys.kTTargetGeneLists]
        for tiering_gene_list in tiering_gene_lists:
            # ignore all "clinical" gene lists if we are suppressing incidental
            # findings (e.g. for family members)
            if args.suppress_incidentals and 'clinical' in tiering_gene_list.lower():
                print('Note: Skipping tiering for ' + str(tiering_gene_list) + ' to avoid incidental findings.')
                continue
            out_dir = os.path.join(args.output_dir, tiering_gene_list)
            if not os.path.isdir(out_dir):
                os.makedirs(out_dir)
            args.target_genes = yaml_utils.get_abs_path(tiering_gene_lists[tiering_gene_list])
            print('Tiering ' + str(tiering_gene_list) + ' genes')
            tier(args, joined_outfile, yaml_cmds=yaml_commands, output_dir=out_dir)
            tiering_output_dirs.append(out_dir)

        # Generate final output as excel workbook
        final_tiering_out_file_path = stmp_tiering_util.tiers2xls(
            tiering_output_dirs, args.output_dir, yaml_commands)
        print('**** Tiering output written to ' + str(final_tiering_out_file_path) + ' *****')
        return final_tiering_out_file_path

    #################################################################################################
    ########### Pharmgkb and ClinVar Annotation #######################

    # PGX MAIN FUNCTION (PHARMGKB AND CLINVAR ANNOTATION)
    def pgx(args, output_dir=None):
        """Run the PharmGKB annotator and star caller on ``args.vcf``.

        Results go under ``output_dir`` (default ``<args.output_dir>/pgx``),
        which is created if missing and stored in ``args.output``.
        """
        if output_dir is None:
            args.output = os.path.join(args.output_dir, 'pgx')
        else:
            args.output = output_dir
        if not os.path.isdir(args.output):
            os.makedirs(args.output)

        print(str(datetime.datetime.now()) + ': Performing PharmGKB + ClinVar (pgx) annotation')

        # pharmgkb annotation
        pgxUtils.pgx_annotator(
            args.vcf,
            os.path.join(resources_path, "pgx_vars", "clinical_ann_metadata-snvs.txt"),
            os.path.join(args.output, "pharmacogenomics"))
        pgxUtils.star_caller(
            os.path.join(resources_path, "pgx_haps/"),
            args.vcf,
            os.path.join(args.output, "pharmacogenomics"))

        print(str(datetime.datetime.now()) + ': Done with pgx/clinvar annotation')
    # end pgx function

    ###################### MAIN CODE (logic to call main functions) ###############

    # just print sql query if specified
    if args.print_sql_query_only:
        sample_db_path = stmp_annotation_util.get_sample_db_path(
            args.scratch_dir, stmp_annotation_util.getSampleName(args.vcf))
        stmp_annotation_util.annotate_point(
            db_conn, args.vcf, args.scratch_dir, sample_db_path,
            print_sql_query_only=True, debug=args.debug_point_annotations)
        sys.exit(0)

    # in all the cases below, output the yaml commands to the output dir
    yaml_utils.write_output_yaml_files(yaml_commands, args.output_dir)

    if args.print_region_cmd_only:
        stmp_annotation_util.annotate_range(
            db_conn, args.vcf, args.scratch_dir,
            modules_yaml_path=args.modules, datasets_yaml_path=args.yaml,
            yaml_commands=yaml_commands, print_range_cmd_only=True,
            force_overwrite_beds=args.clean_beds)
        sys.exit(0)

    # pgx doesn't depend on other annotations
    if args.pgx_only:
        pgx(args)
        sys.exit(0)

    # tiering requires annotation to already be done
    if args.tiering_only:
        # generates path to outfile but does not create it
        joined_outfile = stmp_annotation_util.generateJoinedOutfilePath(
            args.output_dir, stmp_annotation_util.getSampleName(args.vcf))
        stmp_annotation_checker.check_annotated_output(joined_outfile)
        tier_real(args, joined_outfile, yaml_commands)
        sys.exit(0)

    if args.annotate_only:
        joined_outfile = annotate(args)
        stmp_annotation_checker.check_annotated_output(joined_outfile)
        sys.exit(0)

    ######## DEFAULT #######
    # Run full pipeline (global, clinical, candidate, secondary, pgx)

    # annotation is common to all modes
    joined_outfile = annotate(args)

    # check annotated output file
    stmp_annotation_checker.check_annotated_output(joined_outfile)

    # tier
    tier_real(args, joined_outfile, yaml_commands)

    # 5. pgx - goes in its own folder by default
    pgx(args)

    # remove sample from database if desired
    if args.drop_sample:
        stmp_annotation_util.drop_sample(db_conn, args.vcf)

    # clean up samples in database if needed
    if args.drop_samples:
        stmp_annotation_util.drop_samples(db_conn)