Python parse_vep_cols 예제들, annotate_vcf_on_cohort.parse_vep_cols Python 예제들

예제 #1

0

파일 보기

파일: autoigv_per_gene.py 프로젝트: morinlab/lab_scripts

def main():
    """Create per-gene autoigv working directories from a VCF file.

    For every gene returned by ``parse_vcf_file``, a sub-directory named
    after the gene is created under ``args.output_dir`` and populated with
    a script file (``SCRIPT_NAME``) and a positions file
    (``POSITIONS_NAME``).  A master ``run_all.sh`` script and a preferences
    file (``PREFS_NAME``) are then written directly into
    ``args.output_dir``.
    """
    args = parse_args()
    bam_map = parse_bam_map_file(args.bam_map_file)
    vcf_reader = vcf.Reader(args.vcf_file)
    vep_cols = parse_vep_cols(vcf_reader)
    gene_list = parse_genes(args.genes)
    records_per_gene = parse_vcf_file(vcf_reader, vep_cols, gene_list)

    # The master script and prefs file are written here even when no gene
    # produced a sub-directory, so the output directory must always exist.
    os.makedirs(args.output_dir, exist_ok=True)

    for gene, records in records_per_gene.items():
        gene_dir = os.path.join(args.output_dir, gene)
        # exist_ok avoids the check-then-create race of
        # os.path.exists() followed by os.mkdir().
        os.makedirs(gene_dir, exist_ok=True)
        # Create script file
        with open(os.path.join(gene_dir, SCRIPT_NAME), "w") as sf:
            sf.write(generate_autoigv_cmd(args.python, args.autoigv, args.genome))
        # Create positions file
        with open(os.path.join(gene_dir, POSITIONS_NAME), "w") as pf:
            pf.write(generate_autoigv_positions(gene, records, bam_map))

    # Create master script file
    with open(os.path.join(args.output_dir, "run_all.sh"), "w") as mf:
        mf.write(generate_master_script())
    # Create prefs file
    with open(os.path.join(args.output_dir, PREFS_NAME), "w") as pf:
        pf.write(generate_prefs())

예제 #2

0

파일 보기

파일: autoigv_per_gene.py 프로젝트: jaysonwujq/lab_scripts

def main():
    """Generate one autoigv working directory per gene found in the VCF.

    Each gene returned by ``parse_vcf_file`` gets its own sub-directory of
    ``args.output_dir`` containing a script file (``SCRIPT_NAME``) and a
    positions file (``POSITIONS_NAME``).  Afterwards a master
    ``run_all.sh`` script and a preferences file (``PREFS_NAME``) are
    written at the top of ``args.output_dir``.
    """
    args = parse_args()
    bam_map = parse_bam_map_file(args.bam_map_file)
    vcf_reader = vcf.Reader(args.vcf_file)
    vep_cols = parse_vep_cols(vcf_reader)
    gene_list = parse_genes(args.genes)
    records_per_gene = parse_vcf_file(vcf_reader, vep_cols, gene_list)

    # Guarantee the output directory exists up front: the master script and
    # prefs file below are written into it whether or not any gene matched.
    os.makedirs(args.output_dir, exist_ok=True)

    for gene, records in records_per_gene.items():
        gene_dir = os.path.join(args.output_dir, gene)
        # Create the directory without a separate existence check, which
        # could race with another process creating the same directory.
        os.makedirs(gene_dir, exist_ok=True)
        # Create script file
        with open(os.path.join(gene_dir, SCRIPT_NAME), "w") as sf:
            sf.write(
                generate_autoigv_cmd(args.python, args.autoigv, args.genome))
        # Create positions file
        with open(os.path.join(gene_dir, POSITIONS_NAME), "w") as pf:
            pf.write(generate_autoigv_positions(gene, records, bam_map))

    # Create master script file
    with open(os.path.join(args.output_dir, "run_all.sh"), "w") as mf:
        mf.write(generate_master_script())
    # Create prefs file
    with open(os.path.join(args.output_dir, PREFS_NAME), "w") as pf:
        pf.write(generate_prefs())

예제 #3

0

파일 보기

def main():
    """Tabulate, per gene, the samples carrying variants in a VCF file.

    Records are read from ``args.input_vcf``.  A record is skipped when its
    position is in the excluded-position set, when ``args.max_samples`` is
    set and the record's ``NUM_SAMPLES`` INFO value exceeds it, when
    ``args.symbol`` is set and the VEP effect has an empty ``SYMBOL``, or
    when its gene ID or symbol is in the excluded-gene set.

    For each remaining record, samples are collected from calls whose
    ``gt_type`` is not 0, whose depth is at least ``args.min_depth`` and
    whose alt-allele fraction is below ``args.homo_vaf_threshold``.  The
    result is written to ``args.output`` as a tab-separated table sorted by
    descending number of samples, and ``args.output`` is closed afterwards.
    """

    # Argument parsing
    args = parse_args()

    # Setup
    vcf_reader = vcf.Reader(args.input_vcf)
    vep_cols = parse_vep_cols(vcf_reader)

    # Create set of genes to be excluded
    excl_genes_set = build_exclude_genes(args.exclude_genes)

    # Create set of positions to be excluded
    excl_pos_set = build_exclude_positions(args.exclude_positions)

    # Build dict of genes with affected samples
    # Sets: num_samples, num_samples_mod_impact, num_samples_high_impact
    SampleSets = namedtuple("SampleSets", ["all", "moderate", "high"])
    genes = defaultdict(lambda: SampleSets(set(), set(), set()))

    # Iterate over VCF file
    for record in vcf_reader:
        # Filter on position, if applicable
        pos_id = create_pos_id(record.CHROM, record.POS)
        if pos_id in excl_pos_set:
            continue
        # Filter on NUM_SAMPLES
        if args.max_samples and record.INFO["NUM_SAMPLES"] > args.max_samples:
            continue
        # Parse VEP output and select the first and only one
        vep_effect = parse_vep(vep_cols, record, tag="TOP_CSQ")[0]
        # Skip if symbol is absent
        if args.symbol and vep_effect["SYMBOL"] == "":
            continue
        # Extract gene ID and symbol
        gid, gsymbol = vep_effect["Gene"], vep_effect["SYMBOL"]
        # Exclude on gene ID or symbol
        if gid in excl_genes_set or gsymbol in excl_genes_set:
            continue

        # Collect samples whose call passes the genotype, depth and VAF
        # filters
        samples = set()
        for call in record.samples:
            # A field that is declared but empty for this sample may come
            # back as None instead of triggering the getattr default
            # (confirm against the PyVCF version in use), so fall back
            # explicitly.
            depth = getattr(call.data, "DP", 0) or 0
            allele_depths = getattr(call.data, "AD", (0, 0)) or (0, 0)
            if call.gt_type == 0 or depth < args.min_depth:
                continue
            ref_depth, alt_depth = allele_depths[0], allele_depths[1]
            total_depth = ref_depth + alt_depth
            # Without allele-depth evidence the alt-allele fraction is
            # undefined; skip the call rather than divide by zero.
            if total_depth == 0:
                continue
            # float() keeps the ratio fractional even where "/" would
            # perform integer division.
            if float(alt_depth) / total_depth < args.homo_vaf_threshold:
                samples.add(call.sample)

        # Add samples to genes dict.  The lookup is done even when no sample
        # passed, so the gene still gets a row in the output.
        gene_sets = genes[(gid, gsymbol)]
        gene_sets.all.update(samples)
        # Update sample lists based on variant type
        consequence = vep_effect["Consequence"]
        if any(eff in consequence for eff in HIGH_IMPACT):
            gene_sets.high.update(samples)
        elif any(eff in consequence for eff in MODERATE_IMPACT):
            gene_sets.moderate.update(samples)

    # Order genes by number of affected samples
    genes_list = [(gene[0], gene[1], len(sets.all), len(sets.moderate),
                   len(sets.high)) for gene, sets in genes.items()]
    genes_list.sort(key=lambda x: x[2], reverse=True)

    # Output sorted gene list
    header = "\t".join([
        "gene_id", "gene_symbol", "num_samples",
        "num_samples_with_moderate_effect", "num_samples_with_high_effect"
    ]) + "\n"
    try:
        args.output.write(header)
        for gene in genes_list:
            args.output.write("\t".join(map(str, gene)) + "\n")
    finally:
        # Cleanup: close the output even if a write fails part-way
        args.output.close()

예제 #4

0

파일 보기

파일: tabulate_genes.py 프로젝트: morinlab/lab_scripts

def main():
    """Count, for each gene, the samples affected by variants in a VCF file.

    Reads records from ``args.input_vcf`` and drops those that are at an
    excluded position, that exceed ``args.max_samples`` (when set) in their
    ``NUM_SAMPLES`` INFO value, that have an empty VEP ``SYMBOL`` while
    ``args.symbol`` is set, or whose gene ID or symbol is excluded.

    Within each kept record, a sample is counted when its call has a
    ``gt_type`` other than 0, a depth of at least ``args.min_depth`` and an
    alt-allele fraction below ``args.homo_vaf_threshold``.  One
    tab-separated row per gene is written to ``args.output``, ordered by
    descending sample count; ``args.output`` is closed before returning.
    """

    # Argument parsing
    args = parse_args()

    # Setup
    vcf_reader = vcf.Reader(args.input_vcf)
    vep_cols = parse_vep_cols(vcf_reader)

    # Create set of genes to be excluded
    excl_genes_set = build_exclude_genes(args.exclude_genes)

    # Create set of positions to be excluded
    excl_pos_set = build_exclude_positions(args.exclude_positions)

    # Build dict of genes with affected samples
    # Sets: num_samples, num_samples_mod_impact, num_samples_high_impact
    SampleSets = namedtuple("SampleSets", ["all", "moderate", "high"])
    genes = defaultdict(lambda: SampleSets(set(), set(), set()))

    # Iterate over VCF file
    for record in vcf_reader:
        # Filter on position, if applicable
        pos_id = create_pos_id(record.CHROM, record.POS)
        if pos_id in excl_pos_set:
            continue
        # Filter on NUM_SAMPLES
        if args.max_samples and record.INFO["NUM_SAMPLES"] > args.max_samples:
            continue
        # Parse VEP output and select the first and only one
        vep_effect = parse_vep(vep_cols, record, tag="TOP_CSQ")[0]
        # Skip if symbol is absent
        if args.symbol and vep_effect["SYMBOL"] == "":
            continue
        # Extract gene ID and symbol
        gid, gsymbol = vep_effect["Gene"], vep_effect["SYMBOL"]
        # Exclude on gene ID or symbol
        if gid in excl_genes_set or gsymbol in excl_genes_set:
            continue

        # Extract samples from calls that pass every filter
        samples = set()
        for call in record.samples:
            # Guard against a present-but-empty field being reported as None
            # rather than falling through to the getattr default (confirm
            # against the PyVCF version in use).
            depth = getattr(call.data, "DP", 0) or 0
            allele_depths = getattr(call.data, "AD", (0, 0)) or (0, 0)
            if call.gt_type == 0 or depth < args.min_depth:
                continue
            ref_depth, alt_depth = allele_depths[0], allele_depths[1]
            total_depth = ref_depth + alt_depth
            if total_depth == 0:
                # No allele-depth evidence: the alt-allele fraction cannot
                # be computed, and dividing would raise ZeroDivisionError.
                continue
            # float() forces true division regardless of how "/" behaves
            # for two integers.
            vaf = float(alt_depth) / total_depth
            if vaf < args.homo_vaf_threshold:
                samples.add(call.sample)

        # Add samples to genes dict.  Indexing the defaultdict registers the
        # gene even when no sample passed, so it still appears in the output.
        gene_sets = genes[(gid, gsymbol)]
        gene_sets.all.update(samples)
        # Update sample lists based on variant type
        consequence = vep_effect["Consequence"]
        if any(eff in consequence for eff in HIGH_IMPACT):
            gene_sets.high.update(samples)
        elif any(eff in consequence for eff in MODERATE_IMPACT):
            gene_sets.moderate.update(samples)

    # Order genes by number of affected samples
    genes_list = [
        (gene[0], gene[1], len(sets.all), len(sets.moderate), len(sets.high))
        for gene, sets in genes.items()
    ]
    genes_list.sort(key=lambda x: x[2], reverse=True)

    # Output sorted gene list
    header = "\t".join(["gene_id", "gene_symbol", "num_samples", "num_samples_with_moderate_effect", "num_samples_with_high_effect"]) + "\n"
    try:
        args.output.write(header)
        for gene in genes_list:
            line = "\t".join(map(str, gene)) + "\n"
            args.output.write(line)
    finally:
        # Cleanup: runs even when a write raises
        args.output.close()