def run(self):
        """Print variants from the task queue until a ``None`` sentinel arrives.

        If ``self.outfile`` is a string it is treated as a path and opened for
        writing with UTF-8 encoding; any other truthy value is used directly
        as the output handle.

        Every item fetched from ``self.task_queue`` is a variant dictionary.
        It is handed to ``print_variant`` together with a priority that
        depends on ``self.mode``:

        * ``'chromosome'`` -> ``get_chromosome_priority(variant['CHROM'])``
        * ``'score'``      -> ``get_rank_score(variant_dict=variant)``
        * anything else    -> ``None``

        The output handle is closed when the loop ends, also when an
        exception interrupts the printing, so the file is never left open.
        """
        proc_name = self.name
        self.logger.info(('{0}: starting'.format(proc_name)))

        # A string outfile is treated as a path and opened for writing.
        if self.outfile and isinstance(self.outfile, str):
            self.outfile = open(self.outfile, 'w+', encoding="utf-8")

        try:
            while True:
                # A task is a variant dictionary
                self.logger.debug(('{0} fetching next variant'.format(proc_name)))
                variant = self.task_queue.get()

                if self.task_queue.full():
                    self.logger.warning('Variant queue full')

                # None is the stop signal put on the queue by the producer.
                if variant is None:
                    self.logger.info('All variants printed.')
                    break

                self.logger.debug("Printing variant {0}".format(variant.get('variant_id', 'unknown')))

                priority = None
                if self.mode == 'chromosome':
                    priority = get_chromosome_priority(variant['CHROM'])
                elif self.mode == 'score':
                    priority = get_rank_score(variant_dict=variant)

                print_variant(variant_dict=variant, header_line=self.header,
                              priority=priority, outfile=self.outfile,
                              silent=self.silent)
        finally:
            # Close on every exit path; the original only closed on the
            # sentinel and leaked the handle when printing raised.
            if self.outfile:
                self.outfile.close()

        return
Exemple #2
0
    def run(self):
        """Print variants from the task queue until a ``None`` sentinel arrives.

        If ``self.outfile`` is a string it is treated as a path and opened for
        writing with the ``utf-8-sig`` encoding; any other truthy value is
        used directly as the output handle.

        Every item fetched from ``self.task_queue`` is a variant dictionary.
        It is handed to ``print_variant`` together with a priority that
        depends on ``self.mode``:

        * ``'chromosome'`` -> ``get_chromosome_priority(variant['CHROM'])``
        * ``'score'``      -> ``get_rank_score(variant_dict=variant)``
        * anything else    -> ``None``

        The output handle is closed when the loop ends, also when an
        exception interrupts the printing, so the file is never left open.
        """
        proc_name = self.name
        self.logger.info(('{0}: starting'.format(proc_name)))

        # A string outfile is treated as a path and opened for writing.
        if self.outfile and isinstance(self.outfile, str):
            self.outfile = open(self.outfile, 'w+', encoding="utf-8-sig")

        try:
            while True:
                # A task is a variant dictionary
                self.logger.debug(('{0} fetching next variant'.format(proc_name)))
                variant = self.task_queue.get()

                if self.task_queue.full():
                    self.logger.warning('Variant queue full')

                # None is the stop signal put on the queue by the producer.
                if variant is None:
                    self.logger.info('All variants printed.')
                    break

                self.logger.debug("Printing variant {0}".format(variant.get('variant_id', 'unknown')))

                priority = None
                if self.mode == 'chromosome':
                    priority = get_chromosome_priority(variant['CHROM'])
                elif self.mode == 'score':
                    priority = get_rank_score(variant_dict=variant)

                print_variant(variant_dict=variant, header_line=self.header,
                              priority=priority, outfile=self.outfile,
                              silent=self.silent)
        finally:
            # Close on every exit path; the original only closed on the
            # sentinel and leaked the handle when printing raised.
            if self.outfile:
                self.outfile.close()

        return
Exemple #3
0
def annotate(context, variant_file, annotate_regions, region_file, cadd_file,
             thousand_g, exac, spidex, outfile, silent, cadd_raw, cosmic,
             max_af, temp_dir, genome_build):
    """
    Annotate vcf variants.
    
    Annotate variants with a number of different sources.
    Please use --help for more info.
    """
    # Flow:
    #   1. Parse the VCF header lines into a HeaderParser.
    #   2. For every requested source (regions, ExAC, 1000G, Spidex, CADD,
    #      max AF, COSMIC) open the resource, store it in
    #      `annotation_arguments` and register the matching header entry.
    #   3. Print the headers, then print every variant line after passing it
    #      through `annotate_variant`.
    # A TabixError while opening a resource is logged and aborts via
    # `context.abort()`. `temp_dir` is accepted but not used in this function.
    regions = annotate_regions
    logger.info(
        "Running genmod annotate_variant version {0}".format(__version__))

    # Fall back to the bundled region file for the given genome build.
    if not region_file:
        if genome_build == '37':
            region_file = ensembl_path_37
        elif genome_build == '38':
            region_file = ensembl_path_38

    annotation_arguments = {}

    variants = get_file_handle(variant_file)

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # `line` stays None when the input has no lines at all.
    line = None
    for line in variants:
        line = line.rstrip()

        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    # If the input is empty (`line` is None) or holds no variants (the last
    # line read is a header line) there is nothing to annotate: print the
    # headers and stop. The original called `.startswith` on None for an
    # empty input and crashed with AttributeError.
    if line is None or line.startswith('#'):
        print_headers(head, outfile, silent)
        sys.exit(0)

    # Add the first variant back to the iterator
    variants = itertools.chain([line], variants)

    annotation_arguments['header_line'] = head.header

    try:
        if regions:
            logger.info("Loading annotations")
            logger.info("Use annotations file: {0}".format(region_file))
            add_regions(head)
            regions_handle = get_file_handle(region_file)
            logger.debug("Adding region trees to arguments")
            annotation_arguments['region_trees'] = build_region_trees(
                regions_handle, padding=4000)

        if exac:
            logger.info("Annotating ExAC frequencies")
            logger.debug("Using ExAC file: {0}".format(exac))
            annotation_arguments['exac'] = get_tabixhandle(exac)
            add_exac(head)

        if thousand_g:
            logger.info("Annotating 1000G frequencies")
            logger.debug("Using 1000G file: {0}".format(thousand_g))
            annotation_arguments['thousand_g'] = get_tabixhandle(thousand_g)
            add_thousandg(head)

        if spidex:
            logger.info("Annotating Spidex z scores")
            logger.debug("Using Spidex file: {0}".format(spidex))
            annotation_arguments['spidex'] = get_tabixhandle(spidex)
            add_spidex(head)

        if cadd_file:
            logger.info("Annotating CADD scores")
            logger.debug("Using CADD file(s): {0}".format(
                ', '.join(cadd_file)))
            annotation_arguments['cadd_files'] = [
                get_tabixhandle(cadd) for cadd in cadd_file
            ]

            add_cadd(head)

            # Raw CADD scores are only added when CADD files are in use.
            if cadd_raw:
                annotation_arguments['cadd_raw'] = cadd_raw
                add_cadd_raw(head)

        if max_af:
            annotation_arguments['max_af'] = max_af
            if thousand_g:
                add_thousandg_max(head)
            if exac:
                add_exac_max(head)

        if cosmic:
            logger.info("Annotating if variant is in COSMIC")
            logger.debug("Using COSMIC file: {0}".format(cosmic))
            annotation_arguments['cosmic'] = get_tabixhandle(cosmic)
            add_cosmic(head)
    except TabixError as err:
        logger.warning(err)
        context.abort()

    print_headers(head, outfile, silent)

    for variant in variants:
        print_variant(variant_line=annotate_variant(variant,
                                                    annotation_arguments),
                      outfile=outfile,
                      silent=silent)
Exemple #4
0
def models(variant_file, family_file, family_type, reduced_penetrance, vep,
keyword, phased, strict, silent, processes, whole_gene, outfile, temp_dir):
    """
    Annotate genetic models for vcf variants. 
    
    Checks what patterns of inheritance that are followed in a VCF file.
    The analysis is family based so each family that are specified in the family
    file and exists in the variant file will get it's own annotation.
    """
    # Flow:
    #   1. Collect gene ids from the optional reduced penetrance file.
    #   2. Parse the family file; keep only families with an affected member.
    #   3. Parse the VCF header, validate the annotation key / CSQ field and
    #      register the GeneticModels, ModelScore and Compounds INFO entries.
    #   4. Start `processes` VariantAnnotator workers plus one VariantPrinter,
    #      feed them batches through a JoinableQueue and wait for completion.
    #   5. With more than one worker the printer writes to a temp file that is
    #      sorted by chromosome and then printed to `outfile`.
    # The function terminates the program via sys.exit on invalid input.
    logger = logging.getLogger(__name__)
    
    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i+'='+str(values[i]) for i in values if values[i] and 
        i not in ['frame']
    ]
    
    ###########################################################################
    
    logger.info("Running GENMOD annotate version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))
    
    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if not line.startswith('#'):
                nr_reduced_penetrance_genes += 1
                # The gene id is the first whitespace separated column.
                gene_id = line.rstrip().split()[0]
                logger.debug("Adding gene {0} to reduced penetrance genes".format(
                    gene_id
                ))
                reduced_penetrance_genes.add(
                    gene_id
                )
    
        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))
    
    
    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        logger.info("Exiting")
        sys.exit(1)
    
    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")
    
    families = {}
    logger.info("Check if the familys have any affected")
    for family_id in family_parser.families:
        family_obj = family_parser.families[family_id]
        found_affected = any(
            family_obj.individuals[ind_id].affected
            for ind_id in family_obj.individuals
        )
        
        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."\
                           " Skipping family.".format(family_id))
    
    if not families:
        logger.warning("Please provide at least one family with affected individuals")
        sys.exit(0)
    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(
                    ','.join(list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(
                    ','.join(list(family_parser.individuals.keys()))))
    
    
    head = HeaderParser()
    
    # Reset `line`: it may still hold a value from the reduced penetrance
    # loop above, and it stays None when the variant file has no lines.
    line = None
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break
    
    # If the input is empty or holds header lines only there are no variants
    # to annotate: print the headers and stop. The original unconditionally
    # pushed `line` back, which fed a header line (or a stale value) to the
    # workers as if it were the first variant.
    if line is None or line.startswith('#'):
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)
    
    #Add the first variant to the iterator
    variant_file = itertools.chain([line], variant_file)
    
    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning("vep flag is used but there is no CSQ field specified in header")
            logger.info("Please check VCF file")
            logger.info("Exiting...")
            sys.exit(1)
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning("Annotation key {0} could not be found in VCF header".format(keyword))
            logger.info("Please check VCF file")
            logger.info("Exiting...")
            sys.exit(1)
        else:
            logger.info("Using {0} annotation".format(keyword))
        
    
    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf"\
        " header.")
        logger.info("Exiting...")
        sys.exit(1)
    
    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(
                    info_id='genmod',
                    version=__version__,
                    date=datetime.now().strftime("%Y-%m-%d %H:%M"),
                    command_line=' '.join(argument_list)
                )
    
    logger.debug("Version added")
    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant."
    )
    
    logger.debug("Genetic models added")
    logger.info("Adding model score to vcf header")
    add_metadata(
        head,
        'info',
        'ModelScore',
        annotation_number='.',
        entry_type='String',
        description="PHRED score for genotype models."
    )
    logger.debug("Model score added")
    
    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=("List of compound pairs for this variant."
        "The list is splitted on ',' family id is separated with compounds"
        "with ':'. Compounds are separated with '|'.")
    )
    logger.debug("Compounds added")
    
    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(', '.join(vcf_individuals)))
    

    start_time_analysis = datetime.now()
    
    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(
                        ', '.join(family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(', '.join(vcf_individuals)))
        logger.info("Exiting...")
        sys.exit(1)

    analysis_individuals = list(family_parser.individuals.keys())
    
    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))
    
    ###################################################################
    ### The task queue is where all jobs(in this case batches that  ###
    ### represents variants in a region) is put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    #Adapt the number of processes to the machine that run the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))


    # These are the workers that do the heavy part of the analysis
    logger.info('Seting up the workers')
    model_checkers = [
        VariantAnnotator(
            task_queue=variant_queue,
            results_queue=results,
            families=families,
            individuals=analysis_individuals,
            phased=phased,
            strict=strict,
            whole_gene=whole_gene,
            vep=vep,
            reduced_penetrance_genes = reduced_penetrance_genes
        )
        for i in range(num_model_checkers)
    ]
    logger.info('Starting the workers')
    for worker in model_checkers:
        logger.debug('Starting worker {0}'.format(worker))
        worker.start()

    # This process prints the variants to temporary files
    logger.info('Seting up the variant printer')
    if len(model_checkers) == 1:
        # A single worker is printed straight to the final destination, so
        # the headers go out first.
        print_headers(head=head, outfile=outfile, silent=silent)
        variant_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='normal',
                outfile = outfile
        )
    else:
        # We use a temp file to store the processed variants
        logger.debug("Build a tempfile for printing the variants")
        if temp_dir:
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
        else:
            temp_file = NamedTemporaryFile(delete=False)
        # Only the name is needed; the printer opens the path itself.
        temp_file.close()
        
        variant_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='chromosome',
                outfile = temp_file.name
        )
    
    logger.info('Starting the variant printer process')
    variant_printer.start()

    # This process parses the original vcf and create batches to put in the variant queue:
    logger.info('Start parsing the variants')
    get_batches(
        variants = variant_file,
        batch_queue = variant_queue,
        header = head,
        vep = vep,
        annotation_keyword = keyword
    )
    
    # One stop sign per worker, then wait for the queue to drain before
    # telling the printer to stop.
    logger.debug("Put stop signs in the variant queue")
    for i in range(num_model_checkers):
        variant_queue.put(None)
    
    variant_queue.join()
    results.put(None)
    variant_printer.join()
    
    if len(model_checkers) > 1:
        try:
            sort_variants(infile=temp_file.name, mode='chromosome')

            print_headers(head=head, outfile=outfile, silent=silent)

            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(
                        variant_line=line,
                        outfile=outfile,
                        mode='modified',
                        silent=silent
                    )
        finally:
            # The temp file was created with delete=False, so remove it even
            # when sorting or printing fails.
            logger.debug("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analyis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
Exemple #5
0
def annotate(context, variant_file, annotate_regions, region_file, cadd_file, 
             thousand_g, exac, spidex, outfile, silent, cadd_raw, cosmic, 
             max_af, temp_dir, genome_build):
    """
    Annotate vcf variants.
    
    Annotate variants with a number of different sources.
    Please use --help for more info.
    """
    # Flow:
    #   1. Parse the VCF header lines into a HeaderParser.
    #   2. For every requested source (regions, ExAC, 1000G, Spidex, CADD,
    #      max AF, COSMIC) open the resource, store it in
    #      `annotation_arguments` and register the matching header entry.
    #   3. Print the headers, then print every variant line after passing it
    #      through `annotate_variant`.
    # A TabixError while opening a resource is logged and aborts via
    # `context.abort()`. `temp_dir` is accepted but not used in this function.
    regions = annotate_regions
    logger.info("Running genmod annotate_variant version {0}".format(__version__))
    
    # Fall back to the bundled region file for the given genome build.
    if not region_file:
        if genome_build == '37':
            region_file = ensembl_path_37
        elif genome_build == '38':
            region_file = ensembl_path_38
    
    annotation_arguments = {}
    
    variants = get_file_handle(variant_file)
    
    logger.info("Initializing a Header Parser")
    head = HeaderParser()
    
    # `line` stays None when the input has no lines at all.
    line = None
    for line in variants:
        line = line.rstrip()

        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break
    
    # If the input is empty (`line` is None) or holds no variants (the last
    # line read is a header line) there is nothing to annotate: print the
    # headers and stop. The original called `.startswith` on None for an
    # empty input and crashed with AttributeError.
    if line is None or line.startswith('#'):
        print_headers(head, outfile, silent)
        sys.exit(0)
    
    #Add the first variant back to the iterator
    variants = itertools.chain([line], variants)
    
    annotation_arguments['header_line'] = head.header
    
    try:
        if regions:
            logger.info("Loading annotations")
            logger.info("Use annotations file: {0}".format(region_file))
            add_regions(head)
            regions_handle = get_file_handle(region_file)
            logger.debug("Adding region trees to arguments")
            annotation_arguments['region_trees'] = build_region_trees(regions_handle, padding=4000)
        
        if exac:
            logger.info("Annotating ExAC frequencies")
            logger.debug("Using ExAC file: {0}".format(exac))
            annotation_arguments['exac'] = get_tabixhandle(exac)
            add_exac(head)
        
        if thousand_g:
            logger.info("Annotating 1000G frequencies")
            logger.debug("Using 1000G file: {0}".format(thousand_g))
            annotation_arguments['thousand_g'] = get_tabixhandle(thousand_g)
            add_thousandg(head)
        
        if spidex:
            logger.info("Annotating Spidex z scores")
            logger.debug("Using Spidex file: {0}".format(spidex))
            annotation_arguments['spidex'] = get_tabixhandle(spidex)
            add_spidex(head)
        
        if cadd_file:
            logger.info("Annotating CADD scores")
            logger.debug("Using CADD file(s): {0}".format(', '.join(cadd_file)))
            annotation_arguments['cadd_files'] = [get_tabixhandle(cadd) for cadd in cadd_file]
            
            add_cadd(head)
        
            # Raw CADD scores are only added when CADD files are in use.
            if cadd_raw:
                annotation_arguments['cadd_raw'] = cadd_raw
                add_cadd_raw(head)
        
        if max_af:
            annotation_arguments['max_af'] = max_af
            if thousand_g:
                add_thousandg_max(head)
            if exac:
                add_exac_max(head)
        
        if cosmic:
            logger.info("Annotating if variant is in COSMIC")
            logger.debug("Using COSMIC file: {0}".format(cosmic))
            annotation_arguments['cosmic'] = get_tabixhandle(cosmic)
            add_cosmic(head)
    except TabixError as err:
        logger.warning(err)
        context.abort()
    
    print_headers(head, outfile, silent)
    
    for variant in variants:
        print_variant(
            variant_line = annotate_variant(variant, annotation_arguments),
            outfile = outfile,
            silent = silent
        )
def compound(context, variant_file, silent, outfile, vep, processes, temp_dir):
    """
    Score compound variants in a vcf file based on their rank score.
    """
    # Flow:
    #   1. Parse the VCF header; with no variants just print headers and exit.
    #   2. Start `processes` CompoundScorer workers and one VariantPrinter
    #      that writes to a temp file.
    #   3. Feed batches through a JoinableQueue, wait for completion, sort
    #      the temp file by chromosome and print it to `outfile`.
    # Any exception during step 2-3 is logged, the started processes are
    # terminated and the command aborts via `context.abort()`. The temp file
    # is always removed.
    logger.info(
        'Running GENMOD score_compounds, version: {0}'.format(__version__))

    variant_file = get_file_handle(variant_file)

    start_time_analysis = datetime.now()
    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # `line` stays None when the input has no lines at all.
    line = None
    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    logger.info("Headers parsed")

    # Empty input or header lines only: nothing to score. The original
    # crashed on an empty input because it called `.startswith` on None.
    if line is None or line.startswith('#'):
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    # Add the first variant back to the iterator
    variant_file = itertools.chain([line], variant_file)

    individuals = head.individuals

    ###################################################################
    ### The task queue is where all jobs(in this case batches that  ###
    ### represents variants in a region) is put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_scorers = processes
    #Adapt the number of processes to the machine that run the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_scorers))

    # These are the workers that do the heavy part of the analysis
    logger.info('Seting up the workers')
    compound_scorers = [
        CompoundScorer(
            task_queue=variant_queue,
            results_queue=results,
            individuals=individuals,
        ) for i in range(num_scorers)
    ]

    # Bound before the try block so the except/finally handlers can tell
    # whether these were created; the original raised NameError there when
    # a failure happened before they were assigned, hiding the real error.
    temp_file = None
    variant_printer = None

    try:
        logger.info('Starting the workers')
        for worker in compound_scorers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        logger.info('Seting up the variant printer')

        # We use a temp file to store the processed variants
        logger.debug("Build a tempfile for printing the variants")
        if temp_dir:
            temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
        else:
            temp_file = NamedTemporaryFile(delete=False)
        # Only the name is needed; the printer opens the path itself.
        temp_file.close()

        variant_printer = VariantPrinter(task_queue=results,
                                         head=head,
                                         mode='chromosome',
                                         outfile=temp_file.name)

        logger.info('Starting the variant printer process')
        variant_printer.start()

        # This process parses the original vcf and create batches to put in the variant queue:
        get_batches(variants=variant_file,
                    batch_queue=variant_queue,
                    header=head,
                    vep=vep,
                    results_queue=results)

        # One stop sign per worker, then wait for the queue to drain before
        # telling the printer to stop.
        logger.debug("Put stop signs in the variant queue")
        for i in range(num_scorers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        sort_variants(infile=temp_file.name, mode='chromosome')

        print_headers(head=head, outfile=outfile, silent=silent)

        with open(temp_file.name, 'r', encoding='utf-8') as f:
            for line in f:
                print_variant(variant_line=line,
                              outfile=outfile,
                              mode='modified',
                              silent=silent)
    except Exception as e:
        logger.warning(e)
        for worker in compound_scorers:
            worker.terminate()
        if variant_printer is not None:
            variant_printer.terminate()
        context.abort()
    finally:
        if temp_file is not None:
            logger.info("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analyis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
Exemple #7
0
def sort(variant_file, outfile, family_id, silent, position, temp_dir):
    """
    Sort a VCF file based on rank score.
    """    
    # Flow:
    #   1. Write every variant line to a temp file through `print_variant`,
    #      with a priority: the chromosome priority when `position` is set,
    #      otherwise the rank score. Header lines go to a HeaderParser.
    #   2. Sort the temp file with `sort_variants` ('chromosome' or 'rank').
    #   3. Print the headers and the sorted variants to `outfile`.
    # The temp file is created with delete=False and is removed at the end,
    # also when one of the steps raises. `family_id` is accepted but not used
    # in this function.
    logger = logging.getLogger(__name__)
    head = HeaderParser()

    logger.info("Running GENMOD sort version {0}".format(__version__))
    start = datetime.now()
    # Create a temporary variant file for sorting
    logger.debug("Creating temporary file for sorting")
    if temp_dir:
        temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
    else:
        temp_file = NamedTemporaryFile(delete=False)
    # Only the name is needed; the path is reopened in text mode below.
    temp_file.close()

    try:
        nr_variants = 0
        # Reopen the temp file as UTF-8 text; the context manager closes the
        # handle even if parsing or printing a line fails.
        with open(temp_file.name, mode='w', encoding='utf-8',
                  errors='replace') as temp_file_handle:
            logger.debug("Temp file created")
            logger.info("Printing variants to temp file")
            # Print the variants with rank score in first column
            for line in variant_file:
                line = line.rstrip()
                if line.startswith('#'):
                    if line.startswith('##'):
                        head.parse_meta_data(line)
                    else:
                        head.parse_header_line(line)
                else:
                    nr_variants += 1

                    if position:
                        # The chromosome is the first whitespace separated column.
                        chrom = line.split()[0]
                        priority = get_chromosome_priority(chrom)
                    else:
                        priority = get_rank_score(line)

                    print_variant(
                        variant_line=line,
                        priority=priority,
                        outfile=temp_file_handle
                    )

        logger.info("Variants printed to temp file")
        logger.info("Nr or variants in VCF file: {0}".format(nr_variants))

        sort_mode = 'chromosome' if position else 'rank'

        logger.info("Sorting variants")
        sort_variants(
            infile=temp_file.name,
            mode=sort_mode
        )
        logger.info("Variants sorted")

        logger.debug("Printing headers")
        print_headers(
            head=head,
            outfile=outfile,
            silent=silent
        )
        logger.debug("Headers printed")

        logger.info("Printing variants")
        with open(temp_file.name, mode='r', encoding='utf-8', errors='replace') as f:
            for variant_line in f:
                # NOTE(review): `silent` is hard-coded to False here although
                # the headers honour the `silent` argument -- confirm intent.
                print_variant(
                    variant_line=variant_line,
                    outfile=outfile,
                    mode='modified',
                    silent=False
                )
        logger.debug("Variants printed")
    finally:
        # Always clean up: the original left the temp file behind whenever a
        # step above raised.
        logger.info("Removing temp file")
        os.remove(temp_file.name)
        logger.debug("Temp file removed")

    logger.info("Sorting done, time for sorting: {0}".format(datetime.now()-start))
Exemple #8
0
def sort(variant_file, outfile, family_id, silent, position, temp_dir):
    """
    Sort a VCF file based on rank score.
    """
    # Flow:
    #   1. Write every variant line to a temp file through `print_variant`,
    #      with a priority: the chromosome priority when `position` is set,
    #      otherwise the rank score. Header lines go to a HeaderParser.
    #   2. With no variants, print the headers and exit with status 0.
    #   3. Sort the temp file with `sort_variants` ('chromosome' or 'rank').
    #   4. Print the headers and the sorted variants to `outfile`.
    # The temp file is created with delete=False and is removed on every
    # exit path, including the early sys.exit and any exception. `family_id`
    # is accepted but not used in this function.
    head = HeaderParser()
    variant_file = get_file_handle(variant_file)
    logger.info("Running GENMOD sort version {0}".format(__version__))
    start = datetime.now()
    # Create a temporary variant file for sorting
    logger.debug("Creating temporary file for sorting")
    if temp_dir:
        temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
    else:
        temp_file = NamedTemporaryFile(delete=False)
    # Only the name is needed; the path is reopened in text mode below.
    temp_file.close()

    try:
        nr_variants = 0
        # Reopen the temp file as UTF-8 text; the context manager closes the
        # handle even if parsing or printing a line fails.
        with open(temp_file.name,
                  mode='w',
                  encoding='utf-8',
                  errors='replace') as temp_file_handle:
            logger.debug("Temp file created")
            logger.info("Printing variants to temp file")
            # Print the variants with rank score in first column
            for line in variant_file:
                line = line.rstrip()
                if line.startswith('#'):
                    if line.startswith('##'):
                        head.parse_meta_data(line)
                    else:
                        head.parse_header_line(line)
                else:
                    nr_variants += 1

                    if position:
                        # The chromosome is the first whitespace separated column.
                        chrom = line.split()[0]
                        priority = get_chromosome_priority(chrom)
                    else:
                        priority = get_rank_score(line)

                    print_variant(variant_line=line,
                                  priority=priority,
                                  outfile=temp_file_handle)

        logger.info("Variants printed to temp file")
        logger.info("Nr or variants in VCF file: {0}".format(nr_variants))

        # Nothing to sort: emit the headers only. SystemExit still passes
        # through the finally clause below, so the temp file is removed (the
        # original leaked it on this path).
        if nr_variants == 0:
            logger.debug("Printing headers")
            print_headers(head=head, outfile=outfile, silent=silent)
            sys.exit(0)

        sort_mode = 'chromosome' if position else 'rank'

        logger.info("Sorting variants")
        sort_variants(infile=temp_file.name, mode=sort_mode)
        logger.info("Variants sorted")

        logger.debug("Printing headers")
        print_headers(head=head, outfile=outfile, silent=silent)
        logger.debug("Headers printed")

        logger.info("Printing variants")
        with open(temp_file.name, mode='r', encoding='utf-8',
                  errors='replace') as f:
            for variant_line in f:
                # NOTE(review): `silent` is hard-coded to False here although
                # the headers honour the `silent` argument -- confirm intent.
                print_variant(variant_line=variant_line,
                              outfile=outfile,
                              mode='modified',
                              silent=False)
        logger.debug("Variants printed")
    finally:
        logger.info("Removing temp file")
        os.remove(temp_file.name)
        logger.debug("Temp file removed")

    logger.info("Sorting done, time for sorting: {0}".format(datetime.now() -
                                                             start))
Exemple #9
0
def score(variant_file, family_id, family_file, family_type, score_config,
          silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.

    Args:
        variant_file: Iterable of VCF lines (header lines first).
        family_id: Family id written into the RankScore annotation. It is
            replaced by the first family found in ``family_file`` when a
            family file is given.
        family_file: Family source handed to ``FamilyParser``.
        family_type: Family file format handed to ``FamilyParser``.
        score_config: Score configuration handed to ``ConfigParser``.
        silent: Forwarded to ``print_headers`` and ``print_variant``.
        skip_plugin_check: If true, continue even when ``check_plugins``
            reports plugins missing from the vcf header.
        rank_results: If true, also annotate every variant with RankResult
            (the per-category scores joined with '|').
        outfile: Forwarded to ``print_headers`` and ``print_variant``.

    Exits with status 1 when no score config is given, when the config does
    not validate, when required plugins are missing from the header, or when
    the header already declares RankScore.
    """

    logger = logging.getLogger(__name__)

    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]

    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        logger.info("Exiting")
        sys.exit(1)

    logger.debug("Parsing config file")

    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        # Python 3 exceptions have no ``message`` attribute by default, so
        # fall back to the string form instead of failing inside the handler.
        logger.error(getattr(e, 'message', str(e)))
        logger.info("Exiting")
        sys.exit(1)

    score_categories = list(config_parser.categories.keys())

    logger.debug("Config parsed succesfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # ``line`` stays None when the input is completely empty; without this
    # the name would be unbound after the loop.
    line = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            # First variant line reached, the header is complete
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins has to be defined in vcf header")
            logger.info("Exiting")
            sys.exit(1)
    else:
        logger.info("All plugins are defined in vcf")

    # The header loop consumed the first variant, put it back in front of the
    # remaining lines. Nothing is put back for empty or header-only input.
    if line is not None and not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        logger.info("Exiting...")
        sys.exit(1)

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description="The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(
            head,
            'info',
            'RankResult',
            annotation_number='.',
            entry_type='String',
            description='|'.join(score_categories)
        )

    print_headers(
        head=head,
        outfile=outfile,
        silent=silent
    )
    start_scoring = datetime.now()
    last_twenty = datetime.now()
    # Start at zero so the progress and summary logs report the true count
    nr_of_variants = 0

    for line in variant_file:
        if line.startswith('#'):
            continue

        variant = get_variant_dict(line, header_line)
        variant['info_dict'] = get_info_dict(variant['INFO'])
        rank_score = 0
        # This is for printing results to vcf:
        category_scores = []
        for category in score_categories:
            category_score = get_category_score(variant, category, config_parser)
            logger.debug("Adding category score {0} to rank_score".format(category_score))

            rank_score += category_score
            logger.debug("Updating rank score to {0}".format(rank_score))

            category_scores.append(str(category_score))

        variant = add_vcf_info(
            keyword='RankScore',
            variant_dict=variant,
            annotation="{0}:{1}".format(family_id, rank_score)
        )

        if rank_results:
            variant = add_vcf_info(
                keyword='RankResult',
                variant_dict=variant,
                annotation="|".join(category_scores)
            )

        print_variant(
            variant_dict=variant,
            header_line=header_line,
            outfile=outfile,
            silent=silent
        )

        nr_of_variants += 1

        if nr_of_variants % 20000 == 0:
            logger.info("{0} variants scored.".format(nr_of_variants))
            logger.info("Last 20000 took {0} to score.".format(datetime.now()-last_twenty))
            last_twenty = datetime.now()

    logger.info("Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now()-start_scoring))
Exemple #10
0
def score(context, variant_file, family_id, family_file, family_type,
          score_config, silent, skip_plugin_check, rank_results, outfile):
    """
    Score variants in a vcf file using a Weighted Sum Model.

    The specific scores should be defined in a config file, see examples on
    github.

    Args:
        context: Object whose ``abort()`` is called on fatal errors.
        variant_file: VCF source, opened through ``get_file_handle``.
        family_id: Family id written into the RankScore annotation. It is
            replaced by the first family found in ``family_file`` when a
            family file is given.
        family_file: Family source handed to ``FamilyParser``.
        family_type: Family file format handed to ``FamilyParser``.
        score_config: Score configuration handed to ``ConfigParser``.
        silent: Forwarded to ``print_headers`` and ``print_variant``.
        skip_plugin_check: If true, continue even when ``check_plugins``
            reports plugins missing from the vcf header.
        rank_results: If true, also annotate every variant with RankResult
            (the per-category scores joined with '|').
        outfile: Forwarded to ``print_headers`` and ``print_variant``.

    When the input holds no variant lines the parsed headers are printed and
    the process exits with status 0.
    """
    logger.info('Running GENMOD score, version: {0}'.format(__version__))

    logger.info("Checking family id")

    variant_file = get_file_handle(variant_file)

    if family_file:
        logger.info("Setting up a family parser")
        family_parser = FamilyParser(family_file, family_type)
        logger.debug("Family parser done")
        family_id = list(family_parser.families.keys())[0]

    logger.info("Family used in analysis: {0}".format(family_id))

    ## Check the score config:
    if not score_config:
        logger.warning("Please provide a score config file.")
        context.abort()

    logger.debug("Parsing config file")

    try:
        config_parser = ConfigParser(score_config)
    except ValidateError as e:
        # Python 3 exceptions have no ``message`` attribute by default, so
        # fall back to the string form instead of failing inside the handler.
        logger.error(getattr(e, 'message', str(e)))
        context.abort()

    score_categories = list(config_parser.categories.keys())

    logger.debug("Config parsed succesfully")

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # ``line`` stays None when the input is completely empty; without this
    # the name would be unbound after the loop.
    line = None
    for line in variant_file:
        line = line.rstrip()
        if not line.startswith('#'):
            # First variant line reached, the header is complete
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    logger.info("Check if all score plugins exist in vcf ...")
    if not check_plugins(config_parser, head):
        if not skip_plugin_check:
            logger.error("All score plugins has to be defined in vcf header")
            context.abort()
    else:
        logger.info("All plugins are defined in vcf")

    csq_format = head.vep_columns
    # The header loop consumed the first variant, put it back in front of the
    # remaining lines. With no variants at all (empty or header-only input)
    # there is nothing to score: print the headers and stop.
    if line is not None and not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    header_line = head.header

    if "RankScore" in head.info_dict:
        logger.warning("Variants already scored according to VCF header")
        logger.info("Please check VCF file")
        context.abort()

    add_metadata(
        head,
        'info',
        'RankScore',
        annotation_number='.',
        entry_type='String',
        description=
        "The rank score for this variant in this family. family_id:rank_score."
    )

    if rank_results:
        add_metadata(head,
                     'info',
                     'RankResult',
                     annotation_number='.',
                     entry_type='String',
                     description='|'.join(score_categories))

    print_headers(head=head, outfile=outfile, silent=silent)
    start_scoring = datetime.now()
    last_twenty = datetime.now()
    # Start at zero so the progress and summary logs report the true count
    nr_of_variants = 0

    for line in variant_file:
        if line.startswith('#'):
            continue

        variant = get_variant_dict(line, header_line)
        variant['info_dict'] = get_info_dict(variant['INFO'])
        rank_score = 0
        # This is for printing results to vcf:
        category_scores = []
        for category in score_categories:
            category_score = get_category_score(
                variant=variant,
                category=category,
                config_parser=config_parser,
                csq_format=csq_format)
            logger.debug("Adding category score {0} to rank_score".format(
                category_score))

            rank_score += category_score
            logger.debug("Updating rank score to {0}".format(rank_score))

            category_scores.append(str(category_score))

        variant = add_vcf_info(keyword='RankScore',
                               variant_dict=variant,
                               annotation="{0}:{1}".format(
                                   family_id, rank_score))

        if rank_results:
            variant = add_vcf_info(keyword='RankResult',
                                   variant_dict=variant,
                                   annotation="|".join(category_scores))

        print_variant(variant_dict=variant,
                      header_line=header_line,
                      outfile=outfile,
                      silent=silent)

        nr_of_variants += 1

        if nr_of_variants % 20000 == 0:
            logger.info("{0} variants scored.".format(nr_of_variants))
            logger.info(
                "Last 20000 took {0} to score.".format(datetime.now() -
                                                       last_twenty))
            last_twenty = datetime.now()

    logger.info(
        "Variants scored. Number of variants: {0}".format(nr_of_variants))
    logger.info("Time to score variants: {0}".format(datetime.now() -
                                                     start_scoring))
Exemple #11
0
def models(context, variant_file, family_file, family_type, reduced_penetrance,
           vep, keyword, phased, strict, silent, processes, outfile, temp_dir,
           whole_gene):
    """
    Annotate genetic models for vcf variants.

    Checks what patterns of inheritance that are followed in a VCF file.
    The analysis is family based so each family that are specified in the family
    file and exists in the variant file will get it's own annotation.

    Args:
        context: Object whose ``abort()`` is called on fatal errors.
        variant_file: VCF source, opened through ``get_file_handle``.
        family_file: Family source handed to ``FamilyParser`` (required).
        family_type: Family file format handed to ``FamilyParser``.
        reduced_penetrance: Optional iterable of lines; the first
            whitespace separated field of every non-comment line is taken as
            a gene id with reduced penetrance.
        vep: If true the header must declare CSQ; forwarded to the workers
            and to ``get_batches``.
        keyword: Annotation key that must be in the header when ``vep`` is
            not set; forwarded to ``get_batches``.
        phased: Forwarded to the ``VariantAnnotator`` workers.
        strict: Forwarded to the ``VariantAnnotator`` workers.
        silent: Forwarded to ``print_headers`` and ``print_variant``.
        processes: Number of ``VariantAnnotator`` workers to start.
        outfile: Forwarded to ``print_headers`` and ``print_variant``.
        temp_dir: Optional directory for the intermediate temp file.
        whole_gene: Not referenced in this function body.
    """

    ######### This is for logging the command line string #########
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    argument_list = [
        i + '=' + str(values[i]) for i in values
        if values[i] and i not in ['frame']
    ]

    variant_file = get_file_handle(variant_file)
    ###########################################################################

    logger.info(
        "Running GENMOD annotate models version {0}".format(__version__))
    logger.debug("Arguments: {0}".format(', '.join(argument_list)))

    reduced_penetrance_genes = set()
    nr_reduced_penetrance_genes = 0
    if reduced_penetrance:
        logger.info("Found file with genes that have reduced penetrance")
        for line in reduced_penetrance:
            if not line.startswith('#'):
                nr_reduced_penetrance_genes += 1
                gene_id = line.rstrip().split()[0]
                logger.debug(
                    "Adding gene {0} to reduced penetrance genes".format(
                        gene_id))
                reduced_penetrance_genes.add(gene_id)

        logger.info("Found {0} genes with reduced penetrance".format(
            nr_reduced_penetrance_genes))

    if not family_file:
        logger.warning("Please provide a family file with -f/--family_file")
        context.abort()

    logger.info("Setting up a family parser")
    family_parser = FamilyParser(family_file, family_type)
    logger.debug("Family parser done")

    # Only families with at least one affected individual are analysed
    families = {}
    logger.info("Check if the familys have any affected")
    for family_id in family_parser.families:
        family_obj = family_parser.families[family_id]
        found_affected = any(family_obj.individuals[ind_id].affected
                             for ind_id in family_obj.individuals)

        if found_affected:
            families[family_id] = family_obj
        else:
            logger.warning("No affected individuals found for family {0}."\
                           " Skipping family.".format(family_id))

    if not families:
        logger.warning(
            "Please provide at least one family with affected individuals")
        context.abort()
    # The individuals in the ped file must be present in the variant file:
    logger.info("Families used in analysis: {0}".format(','.join(
        list(families.keys()))))
    logger.info("Individuals included in analysis: {0}".format(','.join(
        list(family_parser.individuals.keys()))))

    head = HeaderParser()

    for line in variant_file:
        line = line.rstrip()
        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    #Add the first variant to the iterator
    if not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)
    else:
        print_headers(head=head, outfile=outfile, silent=silent)
        sys.exit(0)

    if vep:
        if not "CSQ" in head.info_dict:
            logger.warning(
                "vep flag is used but there is no CSQ field specified in header"
            )
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using VEP annotation")
    else:
        if not keyword in head.info_dict:
            logger.warning(
                "Annotation key {0} could not be found in VCF header".format(
                    keyword))
            logger.info("Please check VCF file")
            context.abort()
        else:
            logger.info("Using {0} annotation".format(keyword))

    if "GeneticModels" in head.info_dict:
        logger.warning("Genetic models are already annotated according to vcf"\
        " header.")
        context.abort()

    logger.info("Adding genmod version to vcf header")
    head.add_version_tracking(info_id='genmod',
                              version=__version__,
                              date=datetime.now().strftime("%Y-%m-%d %H:%M"),
                              command_line=' '.join(argument_list))

    logger.debug("Version added")
    logger.info("Adding genetic models to vcf header")
    add_metadata(
        head,
        'info',
        'GeneticModels',
        annotation_number='.',
        entry_type='String',
        description="':'-separated list of genetic models for this variant.")

    logger.debug("Genetic models added")
    logger.info("Adding model score to vcf header")
    add_metadata(head,
                 'info',
                 'ModelScore',
                 annotation_number='.',
                 entry_type='String',
                 description="PHRED score for genotype models.")
    logger.debug("Model score added")

    logger.info("Adding Compounds to vcf header")
    add_metadata(
        head,
        'info',
        'Compounds',
        annotation_number='.',
        entry_type='String',
        description=(
            "List of compound pairs for this variant."
            "The list is splitted on ',' family id is separated with compounds"
            "with ':'. Compounds are separated with '|'."))
    logger.debug("Compounds added")

    vcf_individuals = head.individuals
    logger.debug("Individuals found in vcf file: {}".format(
        ', '.join(vcf_individuals)))

    try:
        check_individuals(family_parser.individuals, vcf_individuals)
    except IOError as e:
        logger.error(e)
        logger.info("Individuals in PED file: {0}".format(', '.join(
            family_parser.individuals)))
        logger.info("Individuals in VCF file: {0}".format(
            ', '.join(vcf_individuals)))

        context.abort()

    start_time_analysis = datetime.now()

    analysis_individuals = list(family_parser.individuals.keys())

    logger.info("Individuals used in analysis: {0}".format(
        ', '.join(analysis_individuals)))

    ###################################################################
    ### The task queue is where all jobs(in this case batches that  ###
    ### represents variants in a region) is put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    # One batch consists of all variants from one or several overlapping genes
    # there can be a significant amount of variants in a batch for whole genome
    # data...
    variant_queue = JoinableQueue(maxsize=100)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_model_checkers = processes
    #Adapt the number of processes to the machine that run the analysis
    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_model_checkers))

    # Bind everything the except/finally clauses touch before entering the
    # try block, so an early failure is reported as itself and not hidden
    # behind a NameError raised during cleanup.
    model_checkers = []
    variant_printer = None
    temp_file = None

    # These are the workers that do the heavy part of the analysis
    logger.info('Seting up the workers')
    try:
        model_checkers = [
            VariantAnnotator(task_queue=variant_queue,
                             results_queue=results,
                             families=families,
                             individuals=analysis_individuals,
                             phased=phased,
                             strict=strict,
                             vep=vep,
                             reduced_penetrance_genes=reduced_penetrance_genes)
            for i in range(num_model_checkers)
        ]
        logger.info('Starting the workers')
        for worker in model_checkers:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        logger.info('Seting up the variant printer')
        if len(model_checkers) == 1:
            # A single worker keeps the input order, print directly
            print_headers(head=head, outfile=outfile, silent=silent)
            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='normal',
                                             outfile=outfile)
        else:
            # We use a temp file to store the processed variants
            logger.debug("Build a tempfile for printing the variants")
            if temp_dir:
                temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
            else:
                temp_file = NamedTemporaryFile(delete=False)
            temp_file.close()

            variant_printer = VariantPrinter(task_queue=results,
                                             head=head,
                                             mode='chromosome',
                                             outfile=temp_file.name)

        logger.info('Starting the variant printer process')
        variant_printer.start()

        # This process parses the original vcf and create batches to put in the variant queue:
        logger.info('Start parsing the variants')
        get_batches(variants=variant_file,
                    batch_queue=variant_queue,
                    header=head,
                    vep=vep,
                    annotation_keyword=keyword)

        logger.debug("Put stop signs in the variant queue")
        for i in range(num_model_checkers):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        variant_printer.join()

        if len(model_checkers) > 1:
            sort_variants(infile=temp_file.name, mode='chromosome')

            print_headers(head=head, outfile=outfile, silent=silent)

            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(variant_line=line,
                                  outfile=outfile,
                                  mode='modified',
                                  silent=silent)

    except Exception as err:
        logger.warning(err)
        # Only processes that are actually running can be terminated
        for worker in model_checkers:
            if worker.is_alive():
                worker.terminate()
        if variant_printer is not None and variant_printer.is_alive():
            variant_printer.terminate()
        context.abort()
    finally:
        # Remove the temp file whenever one was created
        if temp_file is not None:
            logger.info("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analyis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
Exemple #12
0
def filter(variant_file, annotation, threshold, discard, greater, silent,
           outfile):
    """
    Filter vcf variants.

    Filter variants based on their annotation

    Args:
        variant_file: VCF source, opened through ``get_file_handle``.
        annotation: INFO key to filter on; must be declared in the header,
            otherwise the process exits with status 1.
        threshold: Value the annotation is compared against.
        discard: If true, variants without a value for ``annotation`` are
            dropped; otherwise they are kept.
        greater: If true keep variants whose value is above ``threshold``,
            otherwise keep those below it.
        silent: Forwarded to ``print_headers`` and ``print_variant``.
        outfile: Forwarded to ``print_headers`` and ``print_variant``.
    """
    logger.info("Running genmod filter version {0}".format(__version__))
    variant_file = get_file_handle(variant_file)

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    # ``line`` stays None when the input is completely empty; without this
    # the name would be unbound after the loop.
    line = None
    for line in variant_file:
        line = line.rstrip()

        if not line.startswith('#'):
            # First variant line reached, the header is complete
            break
        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)

    # The header loop consumed the first variant, put it back in front of the
    # remaining lines. For header-only input the last line seen is a header
    # line and must not be fed to the variant loop below, which has no '#'
    # guard of its own.
    if line is not None and not line.startswith('#'):
        variant_file = itertools.chain([line], variant_file)

    if annotation not in head.info_dict:
        logger.warning(
            "Annotation {0} not specified in header".format(annotation))
        logger.info("Please check VCF file")
        logger.info("Exiting...")
        sys.exit(1)

    logger.info(
        "Building a plugin from extract_vcf for {0}".format(annotation))
    annotation_plugin = Plugin(name=annotation,
                               field='INFO',
                               info_key=annotation,
                               separators=[','],
                               record_rule='min',
                               data_type='float')
    logger.debug(("Plugin=(field={0},info_key={1},separators={2},record_rule={3}"
                  ",data_type={4})").format('INFO', annotation, "','", 'min',
                                            'float'))

    print_headers(head=head, outfile=outfile, silent=silent)

    nr_of_variants = 0
    nr_of_passed_variants = 0
    for variant in variant_file:
        nr_of_variants += 1
        value = annotation_plugin.get_value(variant_line=variant)
        logger.debug("Found value {0}".format(value))
        # A value of 0 is a real annotation and has to be compared against
        # the threshold; only a missing value falls through to ``discard``.
        # NOTE(review): assumes Plugin.get_value returns None when the
        # annotation is absent - confirm against extract_vcf.
        if value is not None:
            if greater:
                keep_variant = value > threshold
            else:
                keep_variant = value < threshold
        else:
            keep_variant = not discard

        if keep_variant:
            logger.debug("Keeping variant")
            nr_of_passed_variants += 1
            print_variant(variant_line=variant,
                          outfile=outfile,
                          mode='vcf',
                          silent=silent)
        else:
            logger.debug("Discarding variant")

    logger.info("Number of variants in file {0}".format(nr_of_variants))
    logger.info(
        "Number of variants passing filter {0}".format(nr_of_passed_variants))
    logger.info(
        "Number of variants filtered {0}".format(nr_of_variants -
                                                 nr_of_passed_variants))
Exemple #13
0
def annotate(variant_file, annotate_regions, cadd_file, thousand_g, exac,
             spidex, annotation_dir, outfile, silent, cadd_raw, cosmic, max_af,
             processes, temp_dir):
    """
    Annotate vcf variants.

    Annotate variants with a number of different sources.
    Please use --help for more info.

    Args:
        variant_file: Iterable of VCF lines (header lines first).
        annotate_regions: If true, load region annotations from
            ``annotation_dir`` and add Annotation/Exonic to the header.
        cadd_file: CADD source(s); joined with ', ' for logging and handed
            to the workers as ``cadd_files``.
        thousand_g: 1000G source handed to the workers.
        exac: ExAC source handed to the workers.
        spidex: Spidex source handed to the workers.
        annotation_dir: Directory passed to ``load_annotations``.
        outfile: Forwarded to ``print_headers`` and ``print_variant``.
        silent: Forwarded to ``print_headers`` and ``print_variant``.
        cadd_raw: If true (together with ``cadd_file``) also annotate the
            raw CADD score.
        cosmic: COSMIC source handed to the workers.
        max_af: If true, add max allele frequency fields for the frequency
            sources that are in use.
        processes: Requested number of annotation workers.
        temp_dir: Optional directory for the intermediate temp file.
    """

    logger.info("Running genmod annotate_variant version {0}".format(__version__))

    start_time_analysis = datetime.now()
    annotator_arguments = {}

    logger.info("Initializing a Header Parser")
    head = HeaderParser()

    line = None
    for line in variant_file:
        line = line.rstrip()

        if line.startswith('#'):
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
        else:
            break

    #Add the first variant to the iterator
    if line:
        variant_file = itertools.chain([line], variant_file)

    header_line = head.header
    annotator_arguments['header_line'] = header_line

    if annotate_regions:
        logger.info("Loading annotations")
        gene_trees, exon_trees = load_annotations(annotation_dir)
        annotator_arguments['gene_trees'] = gene_trees
        annotator_arguments['exon_trees'] = exon_trees

        add_metadata(
            head,
            'info',
            'Annotation',
            annotation_number='.',
            entry_type='String',
            description='Annotates what feature(s) this variant belongs to.'
        )
        add_metadata(
            head,
            'info',
            'Exonic',
            annotation_number='0',
            entry_type='Flag',
            description='Indicates if the variant is exonic.'
        )

    if exac:
        logger.info("Annotating ExAC frequencies")
        logger.debug("Using ExAC file: {0}".format(exac))
        annotator_arguments['exac'] = exac
        add_metadata(
            head,
            'info',
            'ExACAF',
            annotation_number='1',
            entry_type='Float',
            description="Frequency in the ExAC database."
        )

    if thousand_g:
        logger.info("Annotating 1000G frequencies")
        logger.debug("Using 1000G file: {0}".format(thousand_g))
        annotator_arguments['thousand_g'] = thousand_g
        add_metadata(
            head,
            'info',
            '1000GAF',
            annotation_number='1',
            entry_type='Float',
            description="Frequency in the 1000G database."
        )

    if spidex:
        logger.info("Annotating Spidex z scores")
        logger.debug("Using Spidex file: {0}".format(spidex))
        annotator_arguments['spidex'] = spidex
        add_metadata(
            head,
            'info',
            'SPIDEX',
            annotation_number='1',
            entry_type='Float',
            description="Z score from the spidex database."
        )

    if cadd_file:
        logger.info("Annotating CADD scores")
        logger.debug("Using CADD file(s): {0}".format(', '.join(cadd_file)))
        annotator_arguments['cadd_files'] = cadd_file

        add_metadata(
            head,
            'info',
            'CADD',
            annotation_number='1',
            entry_type='Integer',
            description="The CADD relative score for this alternative."
        )
        if cadd_raw:
            annotator_arguments['cadd_raw'] = cadd_raw
            logger.debug("Adding vcf metadata for CADD raw score")
            add_metadata(
                head,
                'info',
                'CADD_raw',
                annotation_number='1',
                entry_type='Float',
                description="The CADD raw score(s) for this alternative(s)."
            )

    if max_af:
        annotator_arguments['max_af'] = max_af
        if thousand_g:
            add_metadata(
                head,
                'info',
                '1000G_MAX_AF',
                annotation_number='1',
                entry_type='Float',
                description="The max af for thousand genomes populations."
            )
        if exac:
            add_metadata(
                head,
                'info',
                'ExAC_MAX_AF',
                annotation_number='1',
                entry_type='Float',
                description="The max af for ExAC populations."
            )

    if cosmic:
        logger.info("Annotating if variant is in COSMIC")
        logger.debug("Using COSMOC file: {0}".format(cosmic))
        annotator_arguments['cosmic'] = cosmic
        add_metadata(
            head,
            'info',
            'COSMIC',
            annotation_number='0',
            entry_type='Flag',
            description="If variant is in COSMIC database."
        )

    ###################################################################
    ### The task queue is where all jobs(in this case batches that  ###
    ### represents variants in a region) is put. The consumers will ###
    ### then pick their jobs from this queue.                       ###
    ###################################################################

    logger.debug("Setting up a JoinableQueue for storing variant batches")
    variant_queue = JoinableQueue(maxsize=1000)
    logger.debug("Setting up a Queue for storing results from workers")
    results = Manager().Queue()

    num_annotators = processes
    #Adapt the number of processes to the machine that run the analysis
    if cadd_file or spidex:
        # We need more power when annotating cadd scores:
        # But if flag is used that overrides
        if num_annotators == min(4, cpu_count()):
            num_annotators = min(8, cpu_count())

    logger.info('Number of CPU:s {}'.format(cpu_count()))
    logger.info('Number of model checkers: {}'.format(num_annotators))

    # These are the workers that do the heavy part of the analysis
    logger.info('Setting up the workers')
    annotators = [
        VariantAnnotator(
            variant_queue,
            results,
            **annotator_arguments
        )
        for i in range(num_annotators)
    ]

    # Bound before the try block so the cleanup clauses can always inspect
    # them, whatever step failed.
    var_printer = None
    temp_file = None

    try:
        logger.info('Starting the workers')
        for worker in annotators:
            logger.debug('Starting worker {0}'.format(worker))
            worker.start()

        # This process prints the variants to temporary files
        # If there is only one annotation process we can print the results as
        # soon as they are done
        logger.info('Setting up the variant printer')
        if len(annotators) == 1:
            print_headers(head, outfile, silent)
            var_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='normal',
                outfile=outfile
            )
        else:
            # We use a temp file to store the processed variants
            logger.debug("Build a tempfile for printing the variants")
            if temp_dir:
                temp_file = NamedTemporaryFile(delete=False, dir=temp_dir)
            else:
                temp_file = NamedTemporaryFile(delete=False)

            temp_file.close()

            var_printer = VariantPrinter(
                task_queue=results,
                head=head,
                mode='chromosome',
                outfile=temp_file.name
            )

        logger.info('Starting the variant printer process')
        var_printer.start()

        start_time_twenty = datetime.now()
        nr_of_lines = 0
        # This process parses the original vcf and create batches to put in the variant queue:
        logger.info('Start parsing the variants')

        for line in variant_file:
            line = line.rstrip()

            if not line.startswith('#'):
                variant_queue.put(line)

                nr_of_lines += 1

                if nr_of_lines % 20000 == 0:
                    logger.info('{0} variants parsed'.format(nr_of_lines))
                    logger.info('Last 20000 took {0} to parse'.format(
                        datetime.now()-start_time_twenty))
                    start_time_twenty = datetime.now()

        logger.info('Put stop signs in the variant queue')

        # One stop sign per worker
        for i in range(num_annotators):
            variant_queue.put(None)

        variant_queue.join()
        results.put(None)
        var_printer.join()

        if len(annotators) > 1:
            logger.info("Start sorting the variants")
            sort_variants(temp_file.name, mode='chromosome')

            logger.info("Print the headers")
            print_headers(head, outfile, silent)

            with open(temp_file.name, 'r', encoding='utf-8') as f:
                for line in f:
                    print_variant(
                        variant_line=line,
                        outfile=outfile,
                        mode='modified',
                        silent=silent
                    )
    except Exception:
        # Do not leave child processes running when the parent fails; the
        # original error is re-raised unchanged for the caller.
        for worker in annotators:
            if worker.is_alive():
                worker.terminate()
        if var_printer is not None and var_printer.is_alive():
            var_printer.terminate()
        raise
    finally:
        # The temp file is created with delete=False, so it has to be removed
        # here on both the success and the failure path.
        if temp_file is not None:
            logger.info("Removing temp file")
            os.remove(temp_file.name)
            logger.debug("Temp file removed")

    logger.info('Time for whole analyis: {0}'.format(
        str(datetime.now() - start_time_analysis)))
Exemple #14
0
def filter(variant_file, annotation, threshold, discard, greater, silent, outfile):
    """
    Filter vcf variants.
    
    Filter variants based on their annotation.
    
    The header lines are parsed first, then every variant line is checked
    against ``threshold`` using the value found for ``annotation`` in the
    INFO field. Variants that pass are printed together with the header.
    
    Args:
        variant_file: Iterable of VCF lines (header lines followed by
            variant lines).
        annotation: INFO key to filter on. Must be declared in the header.
        threshold: Value the annotation is compared with.
        discard: If true, variants that lack the annotation are dropped;
            otherwise they are kept.
        greater: If true, keep variants whose value is strictly greater
            than ``threshold``; otherwise keep those strictly lower.
        silent: Passed on to ``print_headers`` and ``print_variant``.
        outfile: Passed on to ``print_headers`` and ``print_variant``.
    
    Raises:
        SystemExit: With exit code 1 if ``annotation`` is not described in
            the VCF header.
    """
    logger.info("Running genmod filter version {0}".format(__version__))
    
    start_time_analysis = datetime.now()
    
    logger.info("Initializing a Header Parser")
    head = HeaderParser()
    
    # Consume the header. The first non header line has to be remembered
    # since it has already been pulled out of the iterator.
    first_variant = None
    for line in variant_file:
        line = line.rstrip()

        if not line.startswith('#'):
            first_variant = line
            break

        if line.startswith('##'):
            head.parse_meta_data(line)
        else:
            head.parse_header_line(line)
    
    # Add the first variant to the iterator. This must only happen when a
    # variant was actually found: for an empty file, or a file with header
    # lines only, there is nothing to put back (doing it anyway would feed
    # a header line to the filter, or fail on an unbound name).
    if first_variant is not None:
        variant_file = itertools.chain([first_variant], variant_file)
    
    if annotation not in head.info_dict:
        logger.warning("Annotation {0} not specified in header".format(annotation))
        logger.info("Please check VCF file")
        logger.info("Exiting...")
        sys.exit(1)
    
    logger.info("Building a plugin from extract_vcf for {0}".format(annotation))
    # NOTE(review): separators/record_rule presumably make the plugin pick
    # the lowest of several comma separated values -- confirm in extract_vcf.
    annotation_plugin = Plugin(
        name=annotation, 
        field='INFO',
        info_key=annotation,
        separators = [','],
        record_rule = 'min',
        data_type = 'float'
    )
    logger.debug("Plugin=(field={0},info_key={1},separators={2},record_rule={3}"\
    ",data_type={4})".format('INFO', annotation, "','", 'min', 'float'))
    
    print_headers(head=head, outfile=outfile, silent=silent)
    
    nr_of_variants = 0
    nr_of_passed_variants = 0
    for variant in variant_file:
        nr_of_variants += 1
        value = annotation_plugin.get_value(variant_line=variant)
        logger.debug("Found value {0}".format(value))
        
        # A numeric zero is a real annotation value (e.g. a frequency of 0)
        # and must be compared with the threshold. Plain truthiness would
        # classify it as missing. Every other falsy result is still
        # treated as "annotation not found".
        has_value = bool(value) or (value is not False and value == 0)
        
        if has_value:
            if greater:
                keep_variant = value > threshold
            else:
                keep_variant = value < threshold
        else:
            # Variants without the annotation are kept unless the caller
            # asked for them to be discarded.
            keep_variant = not discard
        
        if keep_variant:
            logger.debug("Keeping variant")
            nr_of_passed_variants += 1
            print_variant(
                variant_line=variant, 
                outfile=outfile, 
                mode='vcf', 
                silent=silent
            )
        else:
            logger.debug("Discarding variant")
            

    logger.info("Number of variants in file {0}".format(nr_of_variants))
    logger.info("Number of variants passing filter {0}".format(nr_of_passed_variants))
    logger.info("Number of variants filtered {0}".format(
        nr_of_variants - nr_of_passed_variants))