def _ensure_dir(path):
    """Create directory *path* (and missing parents) if it is not there yet."""
    try:
        os.makedirs(path)
    except OSError:
        # Only swallow the error when the directory already exists; anything
        # else (permissions, a regular file in the way, ...) is a real failure.
        if not os.path.isdir(path):
            raise


def main():
    """Generate the per-gene autoigv files plus the shared master files.

    For every gene returned by ``parse_vcf_file`` a sub-directory named after
    the gene is created below ``args.output_dir``; it receives the autoigv
    command script (``SCRIPT_NAME``) and the positions file
    (``POSITIONS_NAME``). Afterwards the master script ``run_all.sh`` and the
    preferences file (``PREFS_NAME``) are written to ``args.output_dir``.
    """
    args = parse_args()
    bam_map = parse_bam_map_file(args.bam_map_file)
    vcf_reader = vcf.Reader(args.vcf_file)
    vep_cols = parse_vep_cols(vcf_reader)
    gene_list = parse_genes(args.genes)
    records_per_gene = parse_vcf_file(vcf_reader, vep_cols, gene_list)

    # Make sure the output directory exists up front: the master files below
    # are written there even when no gene produced any records.
    _ensure_dir(args.output_dir)

    for gene, records in records_per_gene.items():
        gene_dir = os.path.join(args.output_dir, gene)
        _ensure_dir(gene_dir)

        # Create script file
        with open(os.path.join(gene_dir, SCRIPT_NAME), "w") as sf:
            sf.write(
                generate_autoigv_cmd(args.python, args.autoigv, args.genome))

        # Create positions file
        with open(os.path.join(gene_dir, POSITIONS_NAME), "w") as pf:
            pf.write(generate_autoigv_positions(gene, records, bam_map))

    # Create master script file (once, next to the gene directories)
    with open(os.path.join(args.output_dir, "run_all.sh"), "w") as mf:
        mf.write(generate_master_script())

    # Create prefs file
    with open(os.path.join(args.output_dir, PREFS_NAME), "w") as pf:
        pf.write(generate_prefs())
def main():
    """Write the autoigv script/positions files per gene and the master files.

    Steps:
      1. Parse the command line, the BAM map file, the VCF header (VEP
         columns) and the gene selection.
      2. Collect the VCF records per gene via ``parse_vcf_file``.
      3. For each gene, create ``<output_dir>/<gene>/`` holding the autoigv
         command script (``SCRIPT_NAME``) and the positions file
         (``POSITIONS_NAME``).
      4. Write ``run_all.sh`` and the prefs file (``PREFS_NAME``) into the
         output directory itself.
    """

    def make_dir(path):
        # EAFP instead of "exists() then mkdir()": no window in which another
        # process can create the directory between check and creation, and
        # missing parent directories are created as well.
        try:
            os.makedirs(path)
        except OSError:
            if not os.path.isdir(path):
                raise

    args = parse_args()
    bam_map = parse_bam_map_file(args.bam_map_file)
    vcf_reader = vcf.Reader(args.vcf_file)
    vep_cols = parse_vep_cols(vcf_reader)
    gene_list = parse_genes(args.genes)
    records_per_gene = parse_vcf_file(vcf_reader, vep_cols, gene_list)

    out_dir = args.output_dir
    # The master files are written to out_dir regardless of how many genes
    # were found, so it has to exist even when the loop below never runs.
    make_dir(out_dir)

    for gene, records in records_per_gene.items():
        gene_dir = os.path.join(out_dir, gene)
        make_dir(gene_dir)

        # Create script file
        script_path = os.path.join(gene_dir, SCRIPT_NAME)
        with open(script_path, "w") as script_file:
            script_file.write(
                generate_autoigv_cmd(args.python, args.autoigv, args.genome))

        # Create positions file
        positions_path = os.path.join(gene_dir, POSITIONS_NAME)
        with open(positions_path, "w") as positions_file:
            positions_file.write(
                generate_autoigv_positions(gene, records, bam_map))

    # Create master script file
    with open(os.path.join(out_dir, "run_all.sh"), "w") as master_file:
        master_file.write(generate_master_script())

    # Create prefs file
    with open(os.path.join(out_dir, PREFS_NAME), "w") as prefs_file:
        prefs_file.write(generate_prefs())
def _is_affected_call(call, min_depth, homo_vaf_threshold):
    """Return True when *call* passes the genotype, depth and VAF filters.

    A call is kept when its genotype type is not 0, its ``DP`` is at least
    *min_depth*, and the fraction ``AD[1] / (AD[0] + AD[1])`` is below
    *homo_vaf_threshold*. Calls without any allele depth (``AD`` missing or
    summing to zero) are rejected, because the fraction is undefined for them.
    """
    # NOTE(review): gt_type is presumably None for uncalled genotypes, which
    # passes this test just as it did before -- confirm that is intended.
    if call.gt_type == 0:
        return False

    # The getattr default only covers a field absent from the call data; a
    # field that is present but unset appears to come back as None (TODO
    # confirm against PyVCF), so normalise that to 0 as well.
    depth = getattr(call.data, "DP", 0) or 0
    if depth < min_depth:
        return False

    allele_depths = getattr(call.data, "AD", None) or (0, 0)
    ref_depth = allele_depths[0] or 0
    alt_depth = allele_depths[1] or 0
    total_depth = ref_depth + alt_depth
    if total_depth == 0:
        # Previously this divided by zero and aborted the whole run.
        return False

    return alt_depth / total_depth < homo_vaf_threshold


def main():
    """Main program

    Reads the input VCF, groups the samples that carry a qualifying call by
    the gene (ID and symbol) of the record's ``TOP_CSQ`` VEP annotation, and
    writes a tab-separated table to ``args.output`` with one row per gene:
    gene ID, gene symbol, number of samples, number of samples with a
    moderate-impact effect and number of samples with a high-impact effect.
    Rows are sorted by the number of samples, largest first.

    Records are skipped when their position or gene is in the exclusion
    sets, when ``NUM_SAMPLES`` exceeds ``args.max_samples`` (if set), or when
    ``args.symbol`` is set and the annotation has no gene symbol.
    """
    # Argument parsing
    args = parse_args()

    # Setup
    vcf_reader = vcf.Reader(args.input_vcf)
    vep_cols = parse_vep_cols(vcf_reader)

    # Create set of genes to be excluded
    excl_genes_set = build_exclude_genes(args.exclude_genes)

    # Create set of positions to be excluded
    excl_pos_set = build_exclude_positions(args.exclude_positions)

    # Build dict of genes with affected samples
    # Sets: num_samples, num_samples_mod_impact, num_samples_high_impact
    SampleSets = namedtuple("SampleSets", ["all", "moderate", "high"])
    genes = defaultdict(lambda: SampleSets(set(), set(), set()))

    # Iterate over VCF file
    for record in vcf_reader:
        # Filter on position, if applicable
        pos_id = create_pos_id(record.CHROM, record.POS)
        if pos_id in excl_pos_set:
            continue

        # Filter on NUM_SAMPLES
        if args.max_samples and record.INFO["NUM_SAMPLES"] > args.max_samples:
            continue

        # Parse VEP output and select the first and only one
        vep_effect = parse_vep(vep_cols, record, tag="TOP_CSQ")[0]

        # Extract gene ID and symbol
        gid, gsymbol = vep_effect["Gene"], vep_effect["SYMBOL"]

        # Skip if symbol is absent
        if args.symbol and gsymbol == "":
            continue

        # Exclude on gene ID or symbol
        if gid in excl_genes_set or gsymbol in excl_genes_set:
            continue

        # Extract samples whose call passes the genotype/depth/VAF filters
        samples = {
            call.sample
            for call in record.samples
            if _is_affected_call(call, args.min_depth,
                                 args.homo_vaf_threshold)
        }

        # Add samples to genes dict. The entry is created even when no sample
        # qualifies, so such genes still show up in the output with count 0.
        gene_sets = genes[(gid, gsymbol)]
        gene_sets.all.update(samples)

        # Update sample lists based on variant type
        consequence = vep_effect["Consequence"]
        if any(eff in consequence for eff in HIGH_IMPACT):
            gene_sets.high.update(samples)
        elif any(eff in consequence for eff in MODERATE_IMPACT):
            gene_sets.moderate.update(samples)

    # Order genes by number of affected samples
    genes_list = [(gene[0], gene[1], len(sets.all), len(sets.moderate),
                   len(sets.high)) for gene, sets in genes.items()]
    genes_list.sort(key=lambda x: x[2], reverse=True)

    # Output sorted gene list
    header = "\t".join([
        "gene_id", "gene_symbol", "num_samples",
        "num_samples_with_moderate_effect", "num_samples_with_high_effect"
    ]) + "\n"
    try:
        args.output.write(header)
        for gene in genes_list:
            line = "\t".join(map(str, gene)) + "\n"
            args.output.write(line)
    finally:
        # Cleanup: close the stream even if a write fails
        args.output.close()
def main():
    """Main program

    Counts, per gene, the samples carrying a qualifying variant call and
    writes the result as a tab-separated table to ``args.output``.

    For every VCF record that survives the position, ``NUM_SAMPLES``, symbol
    and gene exclusion filters, the samples are collected whose call has a
    genotype type other than 0, a ``DP`` of at least ``args.min_depth`` and an
    alternate allele fraction ``AD[1] / (AD[0] + AD[1])`` below
    ``args.homo_vaf_threshold``. They are added to the "all" set of the gene
    named by the record's ``TOP_CSQ`` VEP annotation, and additionally to the
    "high" or "moderate" set depending on which of ``HIGH_IMPACT`` /
    ``MODERATE_IMPACT`` matches the annotated consequence. Output rows are
    ordered by the size of the "all" set, largest first.
    """
    # Argument parsing
    args = parse_args()

    # Setup
    vcf_reader = vcf.Reader(args.input_vcf)
    vep_cols = parse_vep_cols(vcf_reader)

    # Create set of genes to be excluded
    excl_genes_set = build_exclude_genes(args.exclude_genes)

    # Create set of positions to be excluded
    excl_pos_set = build_exclude_positions(args.exclude_positions)

    # Build dict of genes with affected samples
    # Sets: num_samples, num_samples_mod_impact, num_samples_high_impact
    SampleSets = namedtuple("SampleSets", ["all", "moderate", "high"])
    genes = defaultdict(lambda: SampleSets(set(), set(), set()))

    # Iterate over VCF file
    for record in vcf_reader:
        # Filter on position, if applicable
        if create_pos_id(record.CHROM, record.POS) in excl_pos_set:
            continue

        # Filter on NUM_SAMPLES
        if args.max_samples and record.INFO["NUM_SAMPLES"] > args.max_samples:
            continue

        # Parse VEP output and select the first and only one
        vep_effect = parse_vep(vep_cols, record, tag="TOP_CSQ")[0]

        # Skip if symbol is absent
        if args.symbol and vep_effect["SYMBOL"] == "":
            continue

        # Exclude on gene ID or symbol
        if (vep_effect["Gene"] in excl_genes_set
                or vep_effect["SYMBOL"] in excl_genes_set):
            continue

        # Extract samples with a non-reference call of sufficient depth
        samples = set()
        for call in record.samples:
            # NOTE(review): an uncalled genotype presumably has gt_type None
            # and therefore passes this check, as before -- confirm intent.
            if call.gt_type == 0:
                continue

            # The getattr default only applies when the field is absent; a
            # present-but-unset value seems to be None (TODO confirm against
            # PyVCF), which must not reach the comparison below.
            depth = getattr(call.data, "DP", 0) or 0
            if depth < args.min_depth:
                continue

            allele_depths = getattr(call.data, "AD", None) or (0, 0)
            ref_depth = allele_depths[0] or 0
            alt_depth = allele_depths[1] or 0
            covered = ref_depth + alt_depth
            if covered == 0:
                # No allele depth: the fraction is undefined. This used to
                # raise ZeroDivisionError; such calls are now left out.
                continue

            if alt_depth / covered < args.homo_vaf_threshold:
                samples.add(call.sample)

        # Add samples to genes dict; keyed on gene ID and symbol. Looking the
        # key up creates the entry even when `samples` is empty.
        key = (vep_effect["Gene"], vep_effect["SYMBOL"])
        genes[key].all.update(samples)

        # Update sample lists based on variant type
        consequence = vep_effect["Consequence"]
        if any(eff in consequence for eff in HIGH_IMPACT):
            genes[key].high.update(samples)
        elif any(eff in consequence for eff in MODERATE_IMPACT):
            genes[key].moderate.update(samples)

    # Order genes by number of affected samples
    genes_list = []
    for (gene_id, gene_symbol), sets in genes.items():
        genes_list.append((gene_id, gene_symbol, len(sets.all),
                           len(sets.moderate), len(sets.high)))
    genes_list.sort(key=lambda row: row[2], reverse=True)

    # Output sorted gene list
    columns = ["gene_id", "gene_symbol", "num_samples",
               "num_samples_with_moderate_effect",
               "num_samples_with_high_effect"]
    try:
        args.output.write("\t".join(columns) + "\n")
        for row in genes_list:
            args.output.write("\t".join(map(str, row)) + "\n")
    finally:
        # Cleanup: make sure the output stream is closed even on write errors
        args.output.close()