Exemple #1
0
def cli(vcf_file, split_variants, outfile, silent, verbose):
    """Add the ``GeneticModels`` info entry to a VCF's metadata and print it.

    The VCF is parsed from ``vcf_file``, or from standard input when
    ``vcf_file`` is ``'-'``. ``split_variants`` is handed to the parser
    unchanged. ``outfile``, ``silent`` and ``verbose`` are accepted but are
    not read anywhere in this function.
    """
    from vcf_parser import VCFParser

    # Pick the one keyword that tells the parser where to read from; the
    # remaining parser arguments are the same for both sources.
    if vcf_file == '-':
        source = {'fsock': sys.stdin}
    else:
        source = {'infile': vcf_file}
    variant_parser = VCFParser(split_variants=split_variants, **source)

    header_metadata = variant_parser.metadata

    add_metadata(
        header_metadata,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant.",
    )

    # Emit the headers so the freshly added entry can be checked.
    print_headers(header_metadata)
Exemple #2
0
def analyze(variant_file, family_type, frequency_treshold, frequency_keyword,
            cadd_treshold, cadd_keyword, coverage, gq_treshold, outdir, silent,
            exclude_problematic, verbose):
    """Analyze the annotated variants in a VCF file.

        If there are multiple families in the ped, one analysis per family
        is done. The variants are analyzed in five different categories
        based on which inheritance patterns they follow.
        The different analyses are:

                AR compound\n
                AR homozygote\n
                Dominant\n
                X linked\n
                Dominant dn\n

        Which variants to consider is specified on the command line.
        Defaults are (based on a rare disease assumption):

            MAF < 0.02\n
            CADD score > 12\n
            Coverage in all individuals > 7\n
            Call quality > 20\n

        The highest scoring variants of each category are printed to screen.
        The full list of each category is written to new vcf files in a
        directory specified by the user. Default current dir.
        File names are the same as the input vcf with the name of the
        analysis appended.

    """
    # Parameter notes. They are kept out of the docstring on purpose:
    # NOTE(review): the literal "\n" markers above suggest the docstring
    # doubles as command-line help text -- confirm before reflowing it.
    #   variant_file  -- path of the input VCF, or '-' to read from stdin.
    #   outdir        -- directory the per-category VCF files are written to.
    #   frequency_treshold, frequency_keyword, cadd_treshold, cadd_keyword,
    #   gq_treshold, coverage, exclude_problematic
    #                 -- forwarded unchanged to get_interesting_variants();
    #                    the two keywords are also passed to print_results().
    #   family_type, silent, verbose
    #                 -- accepted but not read in this function.

    start_time_analysis = datetime.now()

    # TODO: config-file overrides for the frequency and inheritance settings
    # were sketched here at one point but are currently disabled.

    families = check_families(variant_file)
    # Input name without directory or extension; used as the prefix of every
    # output file. For stdin input ('-') the prefix is simply '-'.
    file_name = os.path.splitext(os.path.basename(variant_file))[0]

    if variant_file == '-':
        variant_parser = VCFParser(fsock=sys.stdin)
    else:
        variant_parser = VCFParser(infile=variant_file)

    # NOTE(review): the same parser instance is handed to
    # get_interesting_variants() for every family. If VCFParser can only be
    # iterated once, families after the first would see no variants --
    # confirm against the parser implementation.
    for family_id in families:
        print('Analysis for family: %s' % family_id)

        head = variant_parser.metadata

        # One result container per inheritance category; they are filled in
        # place by get_interesting_variants().
        dominant_dict = {}
        homozygote_dict = {}
        compound_dict = {}
        x_linked_dict = {}
        dominant_dn_dict = {}

        get_interesting_variants(variant_parser, family_id, dominant_dict,
                                 homozygote_dict, compound_dict, x_linked_dict,
                                 dominant_dn_dict, frequency_treshold,
                                 frequency_keyword, cadd_treshold,
                                 cadd_keyword, gq_treshold, coverage,
                                 exclude_problematic)

        remove_inacurate_compounds(compound_dict, family_id)

        # (variants, output file suffix, print_results mode), listed in the
        # order in which the result files are produced.
        categories = (
            (dominant_dict, '_dominant_analysis.vcf', 'dominant'),
            (homozygote_dict, '_homozygote_analysis.vcf', 'homozygote'),
            (compound_dict, '_compound_analysis.vcf', 'compound'),
            (x_linked_dict, '_x_linked_analysis.vcf', 'xlinked'),
            (dominant_dn_dict, '_ad_denovo_analysis.vcf', 'denovo'),
        )

        for variants, suffix, mode in categories:
            # Categories without any variants get no output file at all.
            if not variants:
                continue

            result_file = os.path.join(outdir, file_name + suffix)

            print_headers(head, result_file)

            print_results(variants,
                          result_file,
                          family_id,
                          variant_parser.header,
                          cadd_keyword,
                          frequency_keyword,
                          mode=mode)

        print('')

        print('Number of interesting Dominant variants: %s' %
              len(dominant_dict))
        print('Number of interesting Homozygote variants: %s' %
              len(homozygote_dict))
        print('Number of interesting Compound variants: %s' %
              len(compound_dict))
        print('Number of interesting X-linked variants: %s' %
              len(x_linked_dict))
        print('Number of interesting Autosomal Dominant de novo variants: %s' %
              len(dominant_dn_dict))

        # The clock started before the family loop, so for every family after
        # the first this figure also includes the earlier families.
        elapsed = datetime.now() - start_time_analysis
        print('Time for analysis: %s' % elapsed)
Exemple #3
0
def analyze(
    variant_file,
    family_type,
    frequency_treshold,
    frequency_keyword,
    cadd_treshold,
    cadd_keyword,
    coverage,
    gq_treshold,
    outdir,
    silent,
    exclude_problematic,
    verbose,
):
    """Analyze the annotated variants in a VCF file.

        If there are multiple families in the ped, one analysis per family
        is done. The variants are analyzed in five different categories
        based on which inheritance patterns they follow.
        The different analyses are:

                AR compound\n
                AR homozygote\n
                Dominant\n
                X linked\n
                Dominant dn\n

        Which variants to consider is specified on the command line.
        Defaults are (based on a rare disease assumption):

            MAF < 0.02\n
            CADD score > 12\n
            Coverage in all individuals > 7\n
            Call quality > 20\n

        The highest scoring variants of each category are printed to screen.
        The full list of each category is written to new vcf files in a
        directory specified by the user. Default current dir.
        File names are the same as the input vcf with the name of the
        analysis appended.

    """
    # Parameter notes. They are kept out of the docstring on purpose:
    # NOTE(review): the literal "\n" markers above suggest the docstring
    # doubles as command-line help text -- confirm before reflowing it.
    #   variant_file  -- path of the input VCF, or "-" to read from stdin.
    #   outdir        -- directory the per-category VCF files are written to.
    #   frequency_treshold, frequency_keyword, cadd_treshold, cadd_keyword,
    #   gq_treshold, coverage, exclude_problematic
    #                 -- forwarded unchanged to get_interesting_variants();
    #                    the two keywords are also passed to print_results().
    #   family_type, silent, verbose
    #                 -- accepted but not read in this function.

    start_time_analysis = datetime.now()

    # TODO: config-file overrides for the frequency and inheritance settings
    # were sketched here at one point but are currently disabled.

    families = check_families(variant_file)
    # Input name without directory or extension; used as the prefix of every
    # output file. For stdin input ("-") the prefix is simply "-".
    file_name = os.path.splitext(os.path.basename(variant_file))[0]

    if variant_file == "-":
        variant_parser = VCFParser(fsock=sys.stdin)
    else:
        variant_parser = VCFParser(infile=variant_file)

    # NOTE(review): the same parser instance is handed to
    # get_interesting_variants() for every family. If VCFParser can only be
    # iterated once, families after the first would see no variants --
    # confirm against the parser implementation.
    for family_id in families:
        print("Analysis for family: %s" % family_id)

        head = variant_parser.metadata

        # One result container per inheritance category; they are filled in
        # place by get_interesting_variants().
        dominant_dict = {}
        homozygote_dict = {}
        compound_dict = {}
        x_linked_dict = {}
        dominant_dn_dict = {}

        get_interesting_variants(
            variant_parser,
            family_id,
            dominant_dict,
            homozygote_dict,
            compound_dict,
            x_linked_dict,
            dominant_dn_dict,
            frequency_treshold,
            frequency_keyword,
            cadd_treshold,
            cadd_keyword,
            gq_treshold,
            coverage,
            exclude_problematic,
        )

        remove_inacurate_compounds(compound_dict, family_id)

        # (variants, output file suffix, print_results mode), listed in the
        # order in which the result files are produced.
        categories = (
            (dominant_dict, "_dominant_analysis.vcf", "dominant"),
            (homozygote_dict, "_homozygote_analysis.vcf", "homozygote"),
            (compound_dict, "_compound_analysis.vcf", "compound"),
            (x_linked_dict, "_x_linked_analysis.vcf", "xlinked"),
            (dominant_dn_dict, "_ad_denovo_analysis.vcf", "denovo"),
        )

        for variants, suffix, mode in categories:
            # Categories without any variants get no output file at all.
            if not variants:
                continue

            result_file = os.path.join(outdir, file_name + suffix)

            print_headers(head, result_file)

            print_results(
                variants,
                result_file,
                family_id,
                variant_parser.header,
                cadd_keyword,
                frequency_keyword,
                mode=mode,
            )

        print("")

        print("Number of interesting Dominant variants: %s" % len(dominant_dict))
        print("Number of interesting Homozygote variants: %s" % len(homozygote_dict))
        print("Number of interesting Compound variants: %s" % len(compound_dict))
        print("Number of interesting X-linked variants: %s" % len(x_linked_dict))
        print("Number of interesting Autosomal Dominant de novo variants: %s" % len(dominant_dn_dict))

        # The clock started before the family loop, so for every family after
        # the first this figure also includes the earlier families.
        elapsed = datetime.now() - start_time_analysis
        print("Time for analysis: %s" % elapsed)