Ejemplo n.º 1
0
def add_cadd(header):
    """Register the 'CADD' INFO field in a vcf header.

    Args:
        header: Header object that is forwarded to ``add_metadata``.
    """
    logger.info("Adding 'CADD' to vcf header")
    # Collect the keyword part of the INFO definition in one place.
    info_fields = {
        'annotation_number': '1',
        'entry_type': 'Integer',
        'description': "The CADD relative score for this alternative.",
    }
    add_metadata(header, 'info', 'CADD', **info_fields)
Ejemplo n.º 2
0
def add_cadd_raw(header):
    """Register the 'CADD_raw' INFO field in a vcf header.

    Args:
        header: Header object that is forwarded to ``add_metadata``.
    """
    # The message names the field that is actually registered below
    # ('CADD_raw'), so it can be told apart from the plain 'CADD' entry.
    logger.info("Adding 'CADD_raw' to vcf header")
    add_metadata(header,
                 'info',
                 'CADD_raw',
                 annotation_number='1',
                 entry_type='Float',
                 description="The CADD raw score(s) for this alternative(s).")
    return
Ejemplo n.º 3
0
def add_spidex(header):
    """Register the 'SPIDEX' INFO field in a vcf header.

    Args:
        header: Header object that is forwarded to ``add_metadata``.
    """
    logger.info("Adding 'SPIDEX' to vcf header")
    info_fields = dict(
        annotation_number='1',
        entry_type='Float',
        description="Z score from the spidex database.",
    )
    add_metadata(header, 'info', 'SPIDEX', **info_fields)
Ejemplo n.º 4
0
def add_thousandg(header):
    """Register the '1000GAF' INFO field in a vcf header.

    Args:
        header: Header object that is forwarded to ``add_metadata``.
    """
    logger.info("Adding '1000GAF' to vcf header")
    info_fields = {
        'annotation_number': '1',
        'entry_type': 'Float',
        'description': "Frequency in the 1000G database.",
    }
    add_metadata(header, 'info', '1000GAF', **info_fields)
Ejemplo n.º 5
0
def add_thousandg_max(header):
    """Register the '1000G_MAX_AF' INFO field in a vcf header.

    Args:
        header: Header object that is forwarded to ``add_metadata``.
    """
    logger.info("Adding '1000G_MAX_AF' to vcf header")
    info_fields = dict(
        annotation_number='1',
        entry_type='Float',
        description="The max af for thousand genomes populations.",
    )
    add_metadata(header, 'info', '1000G_MAX_AF', **info_fields)
Ejemplo n.º 6
0
def add_exac(header):
    """Register the 'EXACAF' INFO field in a vcf header.

    Args:
        header: Header object that is forwarded to ``add_metadata``.
    """
    logger.info("Adding 'EXACAF' to vcf header")
    info_fields = {
        'annotation_number': '1',
        'entry_type': 'Float',
        'description': "Frequency in the ExAC database.",
    }
    add_metadata(header, 'info', 'EXACAF', **info_fields)
Ejemplo n.º 7
0
def add_exac_max(header):
    """Register the 'EXAC_MAX_AF' (exac max) INFO field in a vcf header.

    Args:
        header: Header object that is forwarded to ``add_metadata``.
    """
    logger.info("Adding 'EXAC_MAX_AF' to vcf header")
    info_fields = dict(
        annotation_number='1',
        entry_type='Float',
        description="The max af for ExAC populations.",
    )
    add_metadata(header, 'info', 'EXAC_MAX_AF', **info_fields)
Ejemplo n.º 8
0
def add_cosmic(header):
    """Register the 'COSMIC' flag INFO field in a vcf header.

    Args:
        header: Header object that is forwarded to ``add_metadata``.
    """
    logger.info("Adding 'COSMIC' to vcf header")
    # A Flag entry is declared with annotation number '0'.
    info_fields = {
        'annotation_number': '0',
        'entry_type': 'Flag',
        'description': "If variant is in COSMIC database.",
    }
    add_metadata(header, 'info', 'COSMIC', **info_fields)
Ejemplo n.º 9
0
def add_regions(header):
    """Register the region 'Annotation' INFO field in a vcf header.

    Args:
        header: Header object that is forwarded to ``add_metadata``.
    """
    logger.info("Adding 'Annotation' to vcf header")
    info_fields = dict(
        annotation_number='.',
        entry_type='String',
        description='Annotates what feature(s) this variant belongs to.',
    )
    add_metadata(header, 'info', 'Annotation', **info_fields)
Ejemplo n.º 10
0
def models(variant_file, family_file, family_type, reduced_penetrance, vep,
           keyword, phased, strict, silent, processes, whole_gene, outfile,
           temp_dir):
    """
    Annotate genetic models for vcf variants.

    Checks what patterns of inheritance that are followed in a VCF file.
    The analysis is family based so each family that are specified in the family
    file and exists in the variant file will get it's own annotation.
    """
    # NOTE(review): the docstring above is kept verbatim because it may be
    # displayed as command help (decorators are not visible from here).
    #
    # Parameters, as they are used below:
    #   variant_file       -- iterable of vcf lines, header lines first.
    #   family_file        -- family input handed to FamilyParser (required).
    #   family_type        -- second argument handed to FamilyParser.
    #   reduced_penetrance -- optional iterable of lines; the first
    #                         whitespace separated column is a gene id.
    #   vep                -- if truthy the header must define 'CSQ'.
    #   keyword            -- INFO key that must be defined when vep is unset.
    #   phased, strict, whole_gene -- forwarded to every VariantAnnotator.
    #   silent, outfile    -- forwarded to the printing helpers.
    #   processes          -- number of VariantAnnotator workers to start.
    #   temp_dir           -- optional directory for the temp file that is
    #                         used when more than one worker is running.
    #
    # Exits through sys.exit() on invalid input and when the vcf holds no
    # variant lines (the parsed header is printed first in that case).
    logger = logging.getLogger(__name__)

    ######### This is for logging the command line string #########
    # Only the declared arguments are recorded. Walking the argument names
    # (instead of every frame local) keeps helper locals such as ``logger``
    # out of the command line that ends up in the vcf header.
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        name + '=' + str(values[name]) for name in args if values[name]
    ]

    ###########################################################################

    logger.info("Running GENMOD annotate version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if line.startswith('#'):
                continue
            columns = line.split()
            if not columns:
                # A blank line has no gene id; indexing it would raise.
                continue
            gene_id = columns[0]
            nr_reduced_penetrance_genes += 1
            logger.debug("Adding gene {0} to reduced penetrance genes".format(
                gene_id
            ))
            reduced_penetrance_genes.add(gene_id)

        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        logger.info("Exiting")
        sys.exit(1)

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    # Only families with at least one affected individual are analysed.
    families = {}
    logger.info("Check if the familys have any affected")
    for family_id in family_parser.families:
        family_obj = family_parser.families[family_id]
        found_affected = any(
            family_obj.individuals[ind_id].affected
            for ind_id in family_obj.individuals
        )
        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."\
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning("Please provide at least one family with affected individuals")
        sys.exit(0)
    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(
                    ','.join(list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(
                    ','.join(list(family_parser.individuals.keys()))))

    head = HeaderParser()

    # Consume the header; remember the first non header line (if any) so it
    # can be put back in front of the remaining variants.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    if first_variant is None:
        # Empty or header-only input: there is nothing to annotate, and no
        # header (or stale) line must be handed on as if it were a variant.
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    #Add the first variant to the iterator
    variant_file = itertools.chain([first_variant], variant_file)

    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning("vep flag is used but there is no CSQ field specified in header")
            logger.info("Please check VCF file")
            logger.info("Exiting...")
            sys.exit(1)
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning("Annotation key {0} could not be found in VCF header".format(keyword))
            logger.info("Please check VCF file")
            logger.info("Exiting...")
            sys.exit(1)
        else:
            logger.info("Using {0} annotation".format(keyword))

    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf"\
        " header.")
        logger.info("Exiting...")
        sys.exit(1)

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(
                    info_id='genmod',
                    version=__version__,
                    date=datetime.now().strftime("%Y-%m-%d %H:%M"),
                    command_line=' '.join(argument_list)
                )

    logger.debug("Version added")
    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant."
    )

    logger.debug("Genetic models added")
    logger.info("Adding model score to vcf header")
    add_metadata(
        head,
        'info',
        'ModelScore',
        annotation_number='.',
        entry_type='String',
        description="PHRED score for genotype models."
    )
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=("List of compound pairs for this variant."
        "The list is splitted on ',' family id is separated with compounds"
        "with ':'. Compounds are separated with '|'.")
    )
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(', '.join(vcf_individuals)))

    start_time_analysis = datetime.now()

    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(
                        ', '.join(family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(', '.join(vcf_individuals)))
        logger.info("Exiting...")
        sys.exit(1)

    analysis_individuals = list(family_parser.individuals.keys())

    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ###################################################################
    ### The task queue is where all jobs(in this case batches that  ###
    ### represents variants in a region) is put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    #Adapt the number of processes to the machine that run the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Seting up the workers')
    model_checkers = [
        VariantAnnotator(
            task_queue=variant_queue,
            results_queue=results,
            families=families,
            individuals=analysis_individuals,
            phased=phased,
            strict=strict,
            whole_gene=whole_gene,
            vep=vep,
            reduced_penetrance_genes = reduced_penetrance_genes
        )
        for i in range(num_model_checkers)
    ]
    logger.info('Starting the workers')
    for worker in model_checkers:
        logger.debug('Starting worker {0}'.format(worker))
        worker.start()

    # Set as soon as a temp file exists so the ``finally`` clause removes it
    # on every exit path, including failures.
    temp_file = None
    try:
        # This process prints the variants to temporary files
        logger.info('Seting up the variant printer')
        if len(model_checkers) == 1:
            # A single worker: results are written straight to the outfile.
            print_headers(head=head, outfile=outfile, silent=silent)
            variant_printer = VariantPrinter(
                    task_queue=results,
                    head=head,
                    mode='normal',
                    outfile = outfile
            )
        else:
            # We use a temp file to store the processed variants
            logger.debug("Build a tempfile for printing the variants")
            # ``dir=None`` is the default, so a missing temp_dir needs no
            # separate call.
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir or None)
            temp_file.close()

            variant_printer = VariantPrinter(
                    task_queue=results,
                    head=head,
                    mode='chromosome',
                    outfile = temp_file.name
            )

        logger.info('Starting the variant printer process')
        variant_printer.start()

        # This process parses the original vcf and create batches to put in the variant queue:
        logger.info('Start parsing the variants')
        get_batches(
            variants = variant_file,
            batch_queue = variant_queue,
            header = head,
            vep = vep,
            annotation_keyword = keyword
        )

        # One stop sign per worker, then wait until the queue is drained
        # before telling the printer to stop.
        logger.debug("Put stop signs in the variant queue")
        for i in range(num_model_checkers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        if len(model_checkers) > 1:
            # The temp file is sorted before it is copied to the outfile.
            sort_variants(infile=temp_file.name, mode='chromosome')

            print_headers(head=head, outfile=outfile, silent=silent)

            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(
                        variant_line=line,
                        outfile=outfile,
                        mode='modified',
                        silent=silent
                    )
    finally:
        if temp_file is not None:
            logger.debug("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analyis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
Ejemplo n.º 11
0
def score(variant_file, family_id, family_file, family_type, score_config,
          silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.
    """
    # NOTE(review): the docstring above is kept verbatim because it may be
    # displayed as command help (decorators are not visible from here).
    #
    # Parameters, as they are used below:
    #   variant_file      -- iterable of vcf lines, header lines first.
    #   family_id         -- id written in front of every rank score;
    #                        replaced by the first family of family_file
    #                        when a family file is given.
    #   family_file, family_type -- arguments handed to FamilyParser.
    #   score_config      -- config handed to ConfigParser (required).
    #   silent, outfile   -- forwarded to the printing helpers.
    #   skip_plugin_check -- if truthy, plugins missing from the header do
    #                        not stop the run.
    #   rank_results      -- if truthy, a 'RankResult' entry with the score
    #                        of each category is added to every variant.
    logger = logging.getLogger(__name__)

    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]

    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        logger.info("Exiting")
        sys.exit(1)

    logger.debug("Parsing config file")

    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        # Exceptions are not guaranteed to carry a ``message`` attribute;
        # fall back to the exception itself.
        logger.error(getattr(e, 'message', e))
        logger.info("Exiting")
        sys.exit(1)

    score_categories = list(config_parser.categories.keys())

    logger.debug("Config parsed succesfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # Consume the header; remember the first non header line (if any) so it
    # can be put back in front of the remaining variants.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins has to be defined in vcf header")
            logger.info("Exiting")
            sys.exit(1)
    else:
        logger.info("All plugins are defined in vcf")

    #Add the first variant to the iterator
    if first_variant is not None:
        # With empty or header-only input there is nothing to put back; the
        # (updated) header is still printed below.
        variant_file = itertools.chain([first_variant], variant_file)
    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        logger.info("Exiting...")
        sys.exit(1)

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description="The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(
            head,
            'info',
            'RankResult',
            annotation_number='.',
            entry_type='String',
            description= '|'.join(score_categories)
        )

    print_headers(
        head=head,
        outfile=outfile,
        silent=silent
    )
    start_scoring = datetime.now()
    last_twenty = datetime.now()
    # Counts scored variants; starts at zero so the totals that are logged
    # match the number of variants that were actually scored.
    nr_of_variants = 0

    for line in variant_file:
        if line.startswith('#'):
            continue

        variant = get_variant_dict(line, header_line)
        variant['info_dict'] = get_info_dict(variant['INFO'])
        # The rank score is the sum of the scores of all categories.
        rank_score = 0
        # This is for printing results to vcf:
        category_scores = []
        for category in score_categories:
            category_score = get_category_score(variant, category, config_parser)
            logger.debug("Adding category score {0} to rank_score".format(category_score))

            rank_score += category_score
            logger.debug("Updating rank score to {0}".format(rank_score))

            category_scores.append(str(category_score))

        variant = add_vcf_info(
            keyword = 'RankScore',
            variant_dict=variant,
            annotation="{0}:{1}".format(family_id, rank_score)
        )

        if rank_results:
            variant = add_vcf_info(
                keyword = 'RankResult',
                variant_dict=variant,
                annotation="|".join(category_scores)
            )

        print_variant(
            variant_dict=variant,
            header_line=header_line,
            outfile=outfile,
            silent=silent
        )

        nr_of_variants += 1

        if nr_of_variants % 20000 == 0:
            logger.info("{0} variants scored.".format(nr_of_variants))
            logger.info("Last 20000 took {0} to score.".format(datetime.now()-last_twenty))
            last_twenty = datetime.now()

    logger.info("Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now()-start_scoring))
Ejemplo n.º 12
0
def score(context, variant_file, family_id, family_file, family_type,
          score_config, silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.
    """
    # NOTE(review): the docstring above is kept verbatim because it may be
    # displayed as command help (decorators are not visible from here).
    #
    # Parameters, as they are used below:
    #   context           -- object whose ``abort()`` is called on errors.
    #   variant_file      -- handed to get_file_handle(); the result is
    #                        iterated as vcf lines, header lines first.
    #   family_id         -- id written in front of every rank score;
    #                        replaced by the first family of family_file
    #                        when a family file is given.
    #   family_file, family_type -- arguments handed to FamilyParser.
    #   score_config      -- config handed to ConfigParser (required).
    #   silent, outfile   -- forwarded to the printing helpers.
    #   skip_plugin_check -- if truthy, plugins missing from the header do
    #                        not stop the run.
    #   rank_results      -- if truthy, a 'RankResult' entry with the score
    #                        of each category is added to every variant.
    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")

    variant_file = get_file_handle(variant_file)

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]

    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        context.abort()

    logger.debug("Parsing config file")

    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        # Exceptions are not guaranteed to carry a ``message`` attribute;
        # fall back to the exception itself.
        logger.error(getattr(e, 'message', e))
        context.abort()

    score_categories = list(config_parser.categories.keys())

    logger.debug("Config parsed succesfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # Consume the header; remember the first non header line (if any) so it
    # can be put back in front of the remaining variants.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins has to be defined in vcf header")
            context.abort()
    else:
        logger.info("All plugins are defined in vcf")

    csq_format = head.vep_columns

    if first_variant is None:
        # Empty or header-only input: nothing to score, just echo the
        # header that was parsed.
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    #Add the first variant to the iterator
    variant_file = itertools.chain([first_variant], variant_file)

    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        context.abort()

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description=
        "The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(head,
                     'info',
                     'RankResult',
                     annotation_number='.',
                     entry_type='String',
                     description='|'.join(score_categories))

    print_headers(head=head, outfile=outfile, silent=silent)
    start_scoring = datetime.now()
    last_twenty = datetime.now()
    # Counts scored variants; starts at zero so the totals that are logged
    # match the number of variants that were actually scored.
    nr_of_variants = 0

    for line in variant_file:
        if line.startswith('#'):
            continue

        variant = get_variant_dict(line, header_line)
        variant['info_dict'] = get_info_dict(variant['INFO'])
        # The rank score is the sum of the scores of all categories.
        rank_score = 0
        # This is for printing results to vcf:
        category_scores = []
        for category in score_categories:
            category_score = get_category_score(
                variant=variant,
                category=category,
                config_parser=config_parser,
                csq_format=csq_format)
            logger.debug("Adding category score {0} to rank_score".format(
                category_score))

            rank_score += category_score
            logger.debug("Updating rank score to {0}".format(rank_score))

            category_scores.append(str(category_score))

        variant = add_vcf_info(keyword='RankScore',
                               variant_dict=variant,
                               annotation="{0}:{1}".format(
                                   family_id, rank_score))

        if rank_results:
            variant = add_vcf_info(keyword='RankResult',
                                   variant_dict=variant,
                                   annotation="|".join(category_scores))

        print_variant(variant_dict=variant,
                      header_line=header_line,
                      outfile=outfile,
                      silent=silent)

        nr_of_variants += 1

        if nr_of_variants % 20000 == 0:
            logger.info("{0} variants scored.".format(nr_of_variants))
            logger.info(
                "Last 20000 took {0} to score.".format(datetime.now() -
                                                       last_twenty))
            last_twenty = datetime.now()

    logger.info(
        "Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now() -
                                                     start_scoring))
Ejemplo n.º 13
0
def models(context, variant_file, family_file, family_type, reduced_penetrance,
           vep, keyword, phased, strict, silent, processes, outfile, temp_dir,
           whole_gene):
    """
    Annotate genetic models for vcf variants.

    Checks what patterns of inheritance that are followed in a VCF file.
    The analysis is family based so each family that are specified in the family
    file and exists in the variant file will get it's own annotation.
    """
    # NOTE(review): the docstring above is kept verbatim because it may be
    # displayed as command help (decorators are not visible from here).
    #
    # Parameters, as they are used below:
    #   context            -- object whose ``abort()`` is called on errors.
    #   variant_file       -- handed to get_file_handle(); the result is
    #                         iterated as vcf lines, header lines first.
    #   family_file        -- family input handed to FamilyParser (required).
    #   family_type        -- second argument handed to FamilyParser.
    #   reduced_penetrance -- optional iterable of lines; the first
    #                         whitespace separated column is a gene id.
    #   vep                -- if truthy the header must define 'CSQ'.
    #   keyword            -- INFO key that must be defined when vep is unset.
    #   phased, strict     -- forwarded to every VariantAnnotator.
    #   silent, outfile    -- forwarded to the printing helpers.
    #   processes          -- number of VariantAnnotator workers to start.
    #   temp_dir           -- optional directory for the temp file that is
    #                         used when more than one worker is running.
    #   whole_gene         -- only shows up in the logged argument list; it
    #                         is not forwarded anywhere in this function.

    ######### This is for logging the command line string #########
    # Walk the declared argument names rather than every frame local, so the
    # recorded command line never depends on which helper locals happen to
    # exist in the frame.
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        name + '=' + str(values[name]) for name in args if values[name]
    ]

    variant_file = get_file_handle(variant_file)
    ###########################################################################

    logger.info(
        "Running GENMOD annotate models version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if line.startswith('#'):
                continue
            columns = line.split()
            if not columns:
                # A blank line has no gene id; indexing it would raise.
                continue
            gene_id = columns[0]
            nr_reduced_penetrance_genes += 1
            logger.debug(
                "Adding gene {0} to reduced penetrance genes".format(
                    gene_id))
            reduced_penetrance_genes.add(gene_id)

        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        context.abort()

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    # Only families with at least one affected individual are analysed.
    families = {}
    logger.info("Check if the familys have any affected")
    for family_id in family_parser.families:
        family_obj = family_parser.families[family_id]
        found_affected = any(
            family_obj.individuals[ind_id].affected
            for ind_id in family_obj.individuals
        )
        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."\
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning(
            "Please provide at least one family with affected individuals")
        context.abort()
    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(','.join(
        list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(','.join(
        list(family_parser.individuals.keys()))))

    head = HeaderParser()

    # Consume the header; remember the first non header line (if any) so it
    # can be put back in front of the remaining variants.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            first_variant = line
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    if first_variant is None:
        # Empty or header-only input: nothing to annotate, just echo the
        # header that was parsed.
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    #Add the first variant to the iterator
    variant_file = itertools.chain([first_variant], variant_file)

    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning(
                "vep flag is used but there is no CSQ field specified in header"
            )
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning(
                "Annotation key {0} could not be found in VCF header".format(
                    keyword))
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using {0} annotation".format(keyword))

    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf"\
        " header.")
        context.abort()

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(info_id='genmod',
                              version=__version__,
                              date=datetime.now().strftime("%Y-%m-%d %H:%M"),
                              command_line=' '.join(argument_list))

    logger.debug("Version added")
    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant.")

    logger.debug("Genetic models added")
    logger.info("Adding model score to vcf header")
    add_metadata(head,
                 'info',
                 'ModelScore',
                 annotation_number='.',
                 entry_type='String',
                 description="PHRED score for genotype models.")
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=(
            "List of compound pairs for this variant."
            "The list is splitted on ',' family id is separated with compounds"
            "with ':'. Compounds are separated with '|'."))
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(
        ', '.join(vcf_individuals)))

    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(', '.join(
            family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(
            ', '.join(vcf_individuals)))

        context.abort()

    start_time_analysis = datetime.now()

    analysis_individuals = list(family_parser.individuals.keys())

    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ###################################################################
    ### The task queue is where all jobs(in this case batches that  ###
    ### represents variants in a region) is put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    # One batch consists of all variants from one or several overlapping genes
    # there can be a significant amount of variants in a batch for whole genome
    # data...
    variant_queue = JoinableQueue(maxsize=100)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    #Adapt the number of processes to the machine that run the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # Bookkeeping for the error and cleanup paths below. Everything is bound
    # before the ``try`` so that the handlers never hit an unbound name and
    # hide the original error.
    started_processes = []  # workers/printer whose start() has returned
    temp_file = None        # set once a temp file has been created

    # These are the workers that do the heavy part of the analysis
    logger.info('Seting up the workers')
    try:
        model_checkers = [
            VariantAnnotator(task_queue=variant_queue,
                             results_queue=results,
                             families=families,
                             individuals=analysis_individuals,
                             phased=phased,
                             strict=strict,
                             vep=vep,
                             reduced_penetrance_genes=reduced_penetrance_genes)
            for i in range(num_model_checkers)
        ]
        logger.info('Starting the workers')
        for worker in model_checkers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()
            started_processes.append(worker)

        # This process prints the variants to temporary files
        logger.info('Seting up the variant printer')
        if len(model_checkers) == 1:
            # A single worker: results are written straight to the outfile.
            print_headers(head=head, outfile=outfile, silent=silent)
            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='normal',
                                             outfile=outfile)
        else:
            # We use a temp file to store the processed variants
            logger.debug("Build a tempfile for printing the variants")
            # ``dir=None`` is the default, so a missing temp_dir needs no
            # separate call.
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir or None)
            temp_file.close()

            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='chromosome',
                                             outfile=temp_file.name)

        logger.info('Starting the variant printer process')
        variant_printer.start()
        started_processes.append(variant_printer)

        # This process parses the original vcf and create batches to put in the variant queue:
        logger.info('Start parsing the variants')
        get_batches(variants=variant_file,
                    batch_queue=variant_queue,
                    header=head,
                    vep=vep,
                    annotation_keyword=keyword)

        # One stop sign per worker, then wait until the queue is drained
        # before telling the printer to stop.
        logger.debug("Put stop signs in the variant queue")
        for i in range(num_model_checkers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        if len(model_checkers) > 1:
            # The temp file is sorted before it is copied to the outfile.
            sort_variants(infile=temp_file.name, mode='chromosome')

            print_headers(head=head, outfile=outfile, silent=silent)

            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(variant_line=line,
                                  outfile=outfile,
                                  mode='modified',
                                  silent=silent)

    except Exception as err:
        logger.warning(err)
        # Only processes that were really started are terminated (workers
        # first, then the printer).
        for process in started_processes:
            process.terminate()
        context.abort()
    finally:
        if temp_file is not None:
            logger.info("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analyis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
Ejemplo n.º 14
0
def annotate(variant_file, annotate_regions, cadd_file, thousand_g, exac,
             spidex, annotation_dir, outfile, silent, cadd_raw, cosmic, max_af,
             processes, temp_dir):
    """
    Annotate vcf variants.

    Annotate variants with a number of different sources.
    Please use --help for more info.
    """
    # NOTE(review): the docstring is kept short on purpose; the "--help" hint
    # suggests it doubles as CLI help text -- confirm before expanding it.
    #
    # Parameters, as they are used below:
    #   variant_file     -- iterable of text lines: vcf header lines first
    #                       (prefixed with '#'), then the variant lines.
    #   annotate_regions -- if truthy, gene/exon annotations are loaded from
    #                       ``annotation_dir`` and handed to the workers.
    #   cadd_file        -- iterable of strings (joined with ', ' for logging),
    #                       handed to the workers as 'cadd_files'.
    #   thousand_g, exac, spidex, cosmic
    #                    -- optional annotation sources; each truthy value is
    #                       passed to the workers under the same key.
    #   annotation_dir   -- argument for ``load_annotations``.
    #   outfile, silent  -- forwarded to ``print_headers``/``print_variant``
    #                       (``outfile`` also to the single-worker printer).
    #   cadd_raw         -- only looked at when ``cadd_file`` is set.
    #   max_af           -- if truthy, max-AF INFO entries are declared for
    #                       whichever of ``thousand_g``/``exac`` is set.
    #   processes        -- number of annotator workers to start.
    #   temp_dir         -- directory for the intermediate file; a falsy value
    #                       selects the platform default.
    #
    # Returns None. The intermediate file is removed even when an error
    # interrupts the run; the error itself is propagated unchanged.

    logger.info("Running genmod annotate_variant version {0}".format(__version__))

    start_time_analysis = datetime.now()
    annotator_arguments = {}

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # Consume the header. The loop stops at the first line that is not a
    # header line, which at that point has already been taken off the iterator.
    line = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    # Put the last line read back in front of the remaining lines so the first
    # variant is not lost. If the input held only header lines this re-adds a
    # header line, which the feeding loop below filters out again.
    if line:
        variant_file = itertools.chain([line], variant_file)

    # Read before any INFO entries are added below, exactly as before.
    header_line = head.header
    annotator_arguments['header_line'] = header_line

    if annotate_regions:
        logger.info("Loading annotations")
        gene_trees, exon_trees = load_annotations(annotation_dir)
        annotator_arguments['gene_trees'] = gene_trees
        annotator_arguments['exon_trees'] = exon_trees

        _add_info_field(
            head, 'Annotation', '.', 'String',
            'Annotates what feature(s) this variant belongs to.')
        _add_info_field(
            head, 'Exonic', '0', 'Flag',
            'Indicates if the variant is exonic.')

    if exac:
        logger.info("Annotating ExAC frequencies")
        logger.debug("Using ExAC file: {0}".format(exac))
        annotator_arguments['exac'] = exac
        _add_info_field(
            head, 'ExACAF', '1', 'Float',
            "Frequency in the ExAC database.")

    if thousand_g:
        logger.info("Annotating 1000G frequencies")
        logger.debug("Using 1000G file: {0}".format(thousand_g))
        annotator_arguments['thousand_g'] = thousand_g
        _add_info_field(
            head, '1000GAF', '1', 'Float',
            "Frequency in the 1000G database.")

    if spidex:
        logger.info("Annotating Spidex z scores")
        logger.debug("Using Spidex file: {0}".format(spidex))
        annotator_arguments['spidex'] = spidex
        _add_info_field(
            head, 'SPIDEX', '1', 'Float',
            "Z score from the spidex database.")

    if cadd_file:
        logger.info("Annotating CADD scores")
        logger.debug("Using CADD file(s): {0}".format(', '.join(cadd_file)))
        annotator_arguments['cadd_files'] = cadd_file

        _add_info_field(
            head, 'CADD', '1', 'Integer',
            "The CADD relative score for this alternative.")
        if cadd_raw:
            annotator_arguments['cadd_raw'] = cadd_raw
            logger.debug("Adding vcf metadata for CADD raw score")
            _add_info_field(
                head, 'CADD_raw', '1', 'Float',
                "The CADD raw score(s) for this alternative(s).")

    if max_af:
        annotator_arguments['max_af'] = max_af
        if thousand_g:
            _add_info_field(
                head, '1000G_MAX_AF', '1', 'Float',
                "The max af for thousand genomes populations.")
        if exac:
            _add_info_field(
                head, 'ExAC_MAX_AF', '1', 'Float',
                "The max af for ExAC populations.")

    if cosmic:
        logger.info("Annotating if variant is in COSMIC")
        logger.debug("Using COSMOC file: {0}".format(cosmic))
        annotator_arguments['cosmic'] = cosmic
        _add_info_field(
            head, 'COSMIC', '0', 'Flag',
            "If variant is in COSMIC database.")

    ###################################################################
    ### The task queue is where all jobs (in this case the variant  ###
    ### lines) are put. The consumers will then pick their jobs     ###
    ### from this queue.                                            ###
    ###################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_annotators = processes
    # CADD and Spidex annotation is heavier, so use up to 8 workers for those.
    # The bump is only applied while ``processes`` still equals
    # min(4, cpu_count()) -- presumably the command line default, so that an
    # explicit choice by the user is respected (verify against the caller).
    if (cadd_file or spidex) and num_annotators == min(4, cpu_count()):
        num_annotators = min(8, cpu_count())

    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of annotators: {}'.format(num_annotators))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    annotators = [
        VariantAnnotator(variant_queue, results, **annotator_arguments)
        for _ in range(num_annotators)
    ]

    logger.info('Starting the workers')
    for worker in annotators:
        logger.debug('Starting worker {0}'.format(worker))
        worker.start()

    # Path of the intermediate file; stays None when results are printed
    # directly to ``outfile``.
    temp_path = None
    try:
        # With exactly one annotator the results can be printed as soon as
        # they are done. Otherwise they are collected in a temp file first and
        # sorted before the final output is written.
        logger.info('Setting up the variant printer')
        if len(annotators) == 1:
            print_headers(head, outfile, silent)
            var_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='normal',
                outfile=outfile
            )
        else:
            logger.debug("Build a tempfile for printing the variants")
            # ``dir=None`` makes tempfile fall back to its default location.
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir or None)
            temp_path = temp_file.name
            # Only the name is needed here; the printer is given the path.
            temp_file.close()

            var_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='chromosome',
                outfile=temp_path
            )

        logger.info('Starting the variant printer process')
        var_printer.start()

        start_time_twenty = datetime.now()
        nr_of_lines = 0
        # Feed the variant lines of the original vcf to the workers
        logger.info('Start parsing the variants')

        for line in variant_file:
            line = line.rstrip()
            if line.startswith('#'):
                continue

            variant_queue.put(line)
            nr_of_lines += 1

            if nr_of_lines % 20000 == 0:
                logger.info('{0} variants parsed'.format(nr_of_lines))
                logger.info('Last 20000 took {0} to parse'.format(
                    datetime.now()-start_time_twenty))
                start_time_twenty = datetime.now()

        logger.info('Put stop signs in the variant queue')
        # One stop sign (None) per worker
        for _ in range(num_annotators):
            variant_queue.put(None)

        variant_queue.join()
        # All variant lines are processed: tell the printer to stop as well
        results.put(None)
        var_printer.join()

        if len(annotators) > 1:
            logger.info("Start sorting the variants")
            sort_variants(temp_path, mode='chromosome')

            logger.info("Print the headers")
            print_headers(head, outfile, silent)

            with open(temp_path, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(
                        variant_line=line,
                        outfile=outfile,
                        mode='modified',
                        silent=silent
                    )
    finally:
        # The temp file was created with delete=False, so it has to be removed
        # by hand -- also when something above failed.
        if temp_path is not None:
            logger.info("Removing temp file")
            os.remove(temp_path)
            logger.debug("Temp file removed")

    logger.info('Time for whole analyis: {0}'.format(
        str(datetime.now() - start_time_analysis)))


def _add_info_field(head, info_id, number, entry_type, description):
    """Declare one 'info' entry on ``head`` by delegating to ``add_metadata``."""
    add_metadata(
        head,
        'info',
        info_id,
        annotation_number=number,
        entry_type=entry_type,
        description=description
    )