def load_annotations(annotation_dir, verbose=False):
    """
    Load the annotations found in the indata path.

    The annotations are two pickled files, 'genes.db' and 'exons.db', that
    are looked for in 'annotation_dir'. They are loaded in that order.

    Arguments:
        annotation_dir - path to the directory with the annotation files
        verbose        - if True progress is reported on stderr

    Returns: the tuple (gene_trees, exon_trees). A file that could not be
             opened is represented by an empty dictionary, since it is
             possible to continue the analysis without annotation files.
             If the gene file is read but the exon file is missing the
             gene trees are still returned.
    """
    gene_trees = {}
    exon_trees = {}

    if verbose:
        print('Reading annotations...\n', file=sys.stderr)

    gene_db = os.path.join(annotation_dir, 'genes.db')
    exon_db = os.path.join(annotation_dir, 'exons.db')

    # NOTE(security): unpickling can execute arbitrary code. Only point
    # this function at annotation files that come from a trusted source.
    try:
        with open(gene_db, 'rb') as gene_handle:
            gene_trees = pickle.load(gene_handle)
        with open(exon_db, 'rb') as exon_handle:
            exon_trees = pickle.load(exon_handle)
    except IOError:
        # It is possible to continue the analysis without annotation files
        if verbose:
            warning('No annotations found.')
            warning('You need to build annotations! See documentation.')
    else:
        # Only claim that annotations are used when both files were read
        if verbose:
            print('Annotations used found in: %s, %s\n' % (gene_db, exon_db),
                  file=sys.stderr)

    return gene_trees, exon_trees
def annotate_variant(variant, gene_trees, exon_trees, vep, whole_genes, verbosity):
    """
    Record on a variant which genes it overlaps and whether it can be part
    of a compound.

    Two entries of the variant dictionary are (re)set in place:
        'annotation'     - the overlapping genes. A set built from the gene
                           interval tree, or whatever check_vep_annotation()
                           returns when 'vep' is used. Empty set if the
                           chromosome is missing from the gene trees.
        'comp_candidate' - True if the variant is a compound candidate,
                           otherwise False.

    With 'vep' every variant that has an annotation is a candidate. Without
    it a variant is a candidate if it overlaps an exon or, when
    'whole_genes' is used, any gene (introns included).

    Input: variant dictionary with the entries 'CHROM', 'POS' and 'ALT'

    Returns: None, the variant dictionary is modified in place
    """
    variant['comp_candidate'] = False
    variant['annotation'] = set()

    # The trees are looked up with chromosome names that lack 'chr':
    contig = variant['CHROM']
    if contig.startswith('chr'):
        contig = contig[3:]

    # The queried interval is as long as the longest alternative allele
    widest = max(len(allele) for allele in variant['ALT'].split(','))
    first_base = int(variant['POS'])
    span = [first_base, first_base + widest - 1]

    if vep:
        # The genes come from the vep annotation, no trees are consulted
        genes = check_vep_annotation(variant)
        variant['annotation'] = genes
        if len(genes) > 0:
            variant['comp_candidate'] = True
        return

    def report_unknown_chromosome():
        # Shared by the gene and the exon lookup below
        if verbosity:
            warning(''.join(['Chromosome ', contig, ' is not in annotation file!']))

    try:
        variant['annotation'] = set(gene_trees[contig].find_range(span))
    except KeyError:
        report_unknown_chromosome()

    if whole_genes:
        # Whole genes (including introns) count, any gene overlap will do
        if len(variant['annotation']) > 0:
            variant['comp_candidate'] = True
        return

    # Otherwise the variant has to be exonic
    try:
        if len(exon_trees[contig].find_range(span)):
            variant['comp_candidate'] = True
    except KeyError:
        report_unknown_chromosome()
    return
def run(self):
    """
    Write every variant received on the task queue to the temporary file.

    Batches (dictionaries with variant_id -> variant dictionary) are taken
    from self.task_queue until None is received. Each variant is written to
    self.temp_file as one tab separated line: a priority followed by the
    value of the variant for every entry in self.header ('-' if the variant
    lacks the entry).

    The priority depends on self.mode:
        'score'      - the 'Individual_rank_score' of the variant, '0' if
                       it is missing
        'chromosome' - the chromosome number. X, Y and MT are 23, 24 and
                       25, all other chromosome names are 26

    self.temp_file is closed when None has been received and also when an
    error interrupts the printing, so that lines already written are not
    left in an unclosed file.

    Raises: SyntaxError if self.mode is neither 'score' nor 'chromosome'
    """
    # Priorities for the chromosomes that are not numbers
    named_chromosomes = {'X': 23, 'Y': 24, 'MT': 25}
    proc_name = self.name
    if self.verbosity:
        print(('%s: starting!' % proc_name), file=sys.stderr)

    # Print the results to a temporary file:
    try:
        while True:
            next_result = self.task_queue.get()
            if self.verbosity:
                if self.task_queue.full():
                    warning('Printing queue full')

            if next_result is None:
                if self.verbosity:
                    print('All variants printed!', file=sys.stderr)
                break

            for variant_id in next_result:
                variant = next_result[variant_id]
                if self.mode == 'score':
                    priority = variant.get('Individual_rank_score', '0')
                elif self.mode == 'chromosome':
                    chrom = variant['CHROM']
                    if chrom.startswith('chr'):
                        chrom = chrom[3:]
                    try:
                        priority = int(chrom)
                    except ValueError:
                        priority = named_chromosomes.get(chrom, 26)
                else:
                    raise SyntaxError("""Need to specify priority mode for printing the variants""")

                print_line = [str(priority)] + [
                    variant.get(entry, '-') for entry in self.header
                ]
                self.temp_file.write('\t'.join(print_line) + '\n')
    finally:
        # Close the file on the normal exit as well as on errors
        self.temp_file.close()
    return
def annotate_variant(variant, gene_trees, exon_trees, vep, whole_genes, verbosity):
    """
    Record on a variant which genes it overlaps and whether it can be part
    of a compound.

    Two entries of the variant dictionary are (re)set in place:
        'annotation'     - the overlapping genes. A set built from the gene
                           interval tree, or whatever check_vep_annotation()
                           returns when 'vep' is used. Empty set if the
                           chromosome is missing from the gene trees.
        'comp_candidate' - True if the variant is a compound candidate,
                           otherwise False.

    With 'vep' every variant that has an annotation is a candidate. Without
    it a variant is a candidate if it overlaps an exon or, when
    'whole_genes' is used, any gene (introns included).

    Input: variant dictionary with the entries 'CHROM', 'POS' and 'ALT'

    Returns: None, the variant dictionary is modified in place
    """
    variant['comp_candidate'] = False
    variant['annotation'] = set()

    # The trees are looked up with chromosome names that lack 'chr':
    contig = variant['CHROM']
    if contig.startswith('chr'):
        contig = contig[3:]

    # The queried interval is as long as the longest alternative allele
    widest = max(len(allele) for allele in variant['ALT'].split(','))
    first_base = int(variant['POS'])
    span = [first_base, first_base + widest - 1]

    if vep:
        # The genes come from the vep annotation, no trees are consulted
        genes = check_vep_annotation(variant)
        variant['annotation'] = genes
        if len(genes) > 0:
            variant['comp_candidate'] = True
        return

    def report_unknown_chromosome():
        # Shared by the gene and the exon lookup below
        if verbosity:
            warning(''.join(['Chromosome ', contig, ' is not in annotation file!']))

    try:
        variant['annotation'] = set(gene_trees[contig].find_range(span))
    except KeyError:
        report_unknown_chromosome()

    if whole_genes:
        # Whole genes (including introns) count, any gene overlap will do
        if len(variant['annotation']) > 0:
            variant['comp_candidate'] = True
        return

    # Otherwise the variant has to be exonic
    try:
        if len(exon_trees[contig].find_range(span)):
            variant['comp_candidate'] = True
    except KeyError:
        report_unknown_chromosome()
    return
def get_batches(variant_parser, batch_queue, individuals, gene_trees={},
                exon_trees={}, phased=False, vep=False, whole_genes=False,
                verbosity=False):
    """
    Create batches and put them into the queue.

    Annotate the variants with regions, either from the annotation built by
    genmod or by checking the VEP terms. The variants in one feature will be
    a batch (default feature is a gene). If intergenic, the batch is sent
    when it has grown to 10000 entries. A new chromosome always starts a new
    batch. After one batch is filled it is put on the batch queue.

    A batch is a dictionary {variant_id: variant, ...} with the extra entry
    'haploblocks'. When 'phased' is used that entry holds one interval tree
    of haploblocks per individual, otherwise it is an empty dictionary.

    If the parser does not yield any variants nothing is put on the queue.

    Arguments:
        variant_parser - iterable with variant dictionaries
        batch_queue    - queue that receives the batches
        individuals    - the individual ids, used as keys in the variants
        gene_trees     - passed on to annotate_variant (only read here)
        exon_trees     - passed on to annotate_variant (only read here)
        phased         - if True haploblocks are collected. A '/' in the
                         call of an individual on a variant with FILTER
                         'PASS' starts a new haploblock for that individual
        vep            - passed on to annotate_variant
        whole_genes    - passed on to annotate_variant
        verbosity      - if True progress and timing are logged

    Returns: list with the chromosomes (without 'chr') in the order they
             were parsed, empty if there were no variants
    """
    beginning = True
    # A batch is a dictionary with variants
    batch = {}
    new_chrom = None
    current_chrom = None
    current_features = []
    haploblock_id = 1
    # Haploblocks is a dictionary with list of lists like
    # {ind_id:[[start, stop, id],[start, stop,id],...], ...}
    haploblocks = {ind_id: [] for ind_id in individuals}
    nr_of_batches = 0
    chromosomes = []

    # Parse the vcf file:
    if verbosity:
        start_parsing_time = datetime.now()
        start_chrom_time = start_parsing_time
        start_twenty_time = start_parsing_time
        if batch_queue.full():
            warning('Queue full!!')

    nr_of_variants = 0
    for variant in variant_parser:
        variant_id = variant['variant_id']
        nr_of_variants += 1
        new_chrom = variant['CHROM']
        if new_chrom.startswith('chr'):
            new_chrom = new_chrom[3:]

        # Annotate which features the variant belongs to:
        annotate_variant(variant, gene_trees, exon_trees, vep, whole_genes,
                         verbosity)
        new_features = variant['annotation']

        if verbosity:
            if nr_of_variants % 20000 == 0:
                log.info('%s variants parsed!' % nr_of_variants)
                log.info('Last 20.000 took %s to parse.\n' %
                         str(datetime.now() - start_twenty_time))
                start_twenty_time = datetime.now()

        # If we look at the first variant, setup boundary conditions:
        if beginning:
            current_features = new_features
            # Add the variant to each of its features in a batch
            batch[variant_id] = variant
            current_chrom = new_chrom
            batch['haploblocks'] = {}
            if phased:
                # We collect the starts of the haploblocks
                haploblock_starts = {
                    ind_id: int(variant['POS']) for ind_id in individuals
                }
            beginning = False
        else:
            # If we should put the batch in the queue:
            send = True

            if phased:
                for ind_id in individuals:
                    # A new haploblock is indicated by '/' if the data is phased
                    if '/' in variant.get(ind_id, './.'):
                        # If call is not passed we consider it to be on same
                        # haploblock (GATK recommendations)
                        if variant.get('FILTER', '.') == 'PASS':
                            haploblocks[ind_id].append([
                                haploblock_starts[ind_id],
                                int(variant['POS']) - 1,
                                str(haploblock_id)
                            ])
                            haploblock_id += 1
                            haploblock_starts[ind_id] = int(variant['POS'])

            # Check if we are in a space between features:
            if len(new_features) == 0:
                if len(current_features) == 0:
                    # An intergenic batch is kept open until it has 10000
                    # entries (the 'haploblocks' entry is counted as well)
                    if len(batch) < 10000:
                        send = False
            # If not check if we are in a region with overlapping features
            elif new_features.intersection(current_features):
                send = False

            # If we are at a new chromosome we finish the current batch:
            if new_chrom != current_chrom:
                chromosomes.append(current_chrom)
                # New chromosome means new batch
                send = True
                if verbosity:
                    log.info('Chromosome %s parsed!' % current_chrom)
                    log.info('Time to parse chromosome %s' %
                             str(datetime.now() - start_chrom_time))
                    start_chrom_time = datetime.now()
                current_chrom = new_chrom

            if send:
                if phased:
                    # Create an interval tree for each individual with the
                    # phasing intervals
                    haploblock_id = _finish_haploblocks(
                        batch, individuals, haploblocks, haploblock_starts,
                        int(variant['POS']), haploblock_id)
                    haploblocks = {ind_id: [] for ind_id in individuals}
                # Put the job in the queue
                batch_queue.put(batch)
                nr_of_batches += 1
                # Reset the variables
                current_features = new_features
                batch = {}
                batch[variant_id] = variant
                batch['haploblocks'] = {}
            else:
                current_features = current_features.union(new_features)
                batch[variant_id] = variant

    if beginning:
        # No variants were parsed. There is no chromosome to report, no
        # batch to send and, if phased, no haploblock starts to finish.
        if verbosity:
            log.info('Variants parsed!')
            log.info('Number of variants in variant file:%s\n' % nr_of_variants)
            log.info('Number of batches created:%s\n' % nr_of_batches)
        return chromosomes

    chromosomes.append(current_chrom)
    nr_of_batches += 1

    if verbosity:
        log.info('Chromosome %s parsed!' % current_chrom)
        log.info('Time to parse chromosome %s \n' %
                 str(datetime.now() - start_chrom_time))
        log.info('Variants parsed!')
        log.info('Time to parse variants:%s' %
                 str(datetime.now() - start_parsing_time))
        log.info('Number of variants in variant file:%s\n' % nr_of_variants)
        log.info('Number of batches created:%s\n' % nr_of_batches)

    if phased:
        # Create an interval tree for each individual with the phasing
        # intervals; 'variant' is the last variant of the file here
        haploblock_id = _finish_haploblocks(
            batch, individuals, haploblocks, haploblock_starts,
            int(variant['POS']), haploblock_id)

    batch_queue.put(batch)

    return chromosomes


def _finish_haploblocks(batch, individuals, haploblocks, haploblock_starts,
                        position, haploblock_id):
    """
    Close the open haploblock of each individual at 'position' and store an
    interval tree of the individual's haploblocks in batch['haploblocks'].

    Individuals without any haploblock get no interval tree.

    Returns: the next haploblock id that is free to use
    """
    for ind_id in individuals:
        # Check if we have just finished an interval
        if haploblock_starts[ind_id] != position:
            haploblocks[ind_id].append([
                haploblock_starts[ind_id],
                position,
                str(haploblock_id)
            ])
            haploblock_id += 1
        # Create interval trees of the haploblocks
        try:
            batch['haploblocks'][ind_id] = interval_tree.IntervalTree(
                haploblocks[ind_id],
                haploblocks[ind_id][0][0] - 1,
                haploblocks[ind_id][-1][1] + 1)
        except IndexError:
            # The individual has no haploblocks in this batch
            pass
    return haploblock_id
def get_batches(variant_parser, batch_queue, individuals, gene_trees={},
                exon_trees={}, phased=False, vep=False, whole_genes=False,
                verbosity=False):
    """
    Create batches and put them into the queue.

    Annotate the variants with regions, either from the annotation built by
    genmod or by checking the VEP terms. The variants in one feature will be
    a batch (default feature is a gene). If intergenic, the batch is sent
    when it has grown to 10000 entries. A new chromosome always starts a new
    batch. After one batch is filled it is put on the batch queue.

    A batch is a dictionary {variant_id: variant, ...} with the extra entry
    'haploblocks'. When 'phased' is used that entry holds one interval tree
    of haploblocks per individual, otherwise it is an empty dictionary.

    If the parser does not yield any variants nothing is put on the queue.

    Arguments:
        variant_parser - iterable with variant dictionaries
        batch_queue    - queue that receives the batches
        individuals    - the individual ids, used as keys in the variants
        gene_trees     - passed on to annotate_variant (only read here)
        exon_trees     - passed on to annotate_variant (only read here)
        phased         - if True haploblocks are collected. A '/' in the
                         call of an individual on a variant with FILTER
                         'PASS' starts a new haploblock for that individual
        vep            - passed on to annotate_variant
        whole_genes    - passed on to annotate_variant
        verbosity      - if True progress and timing are logged

    Returns: list with the chromosomes (without 'chr') in the order they
             were parsed, empty if there were no variants
    """
    beginning = True
    # A batch is a dictionary with variants
    batch = {}
    new_chrom = None
    current_chrom = None
    current_features = []
    haploblock_id = 1
    # Haploblocks is a dictionary with list of lists like
    # {ind_id:[[start, stop, id],[start, stop,id],...], ...}
    haploblocks = {ind_id: [] for ind_id in individuals}
    nr_of_batches = 0
    chromosomes = []

    # Parse the vcf file:
    if verbosity:
        start_parsing_time = datetime.now()
        start_chrom_time = start_parsing_time
        start_twenty_time = start_parsing_time
        if batch_queue.full():
            warning('Queue full!!')

    nr_of_variants = 0
    for variant in variant_parser:
        variant_id = variant['variant_id']
        nr_of_variants += 1
        new_chrom = variant['CHROM']
        if new_chrom.startswith('chr'):
            new_chrom = new_chrom[3:]

        # Annotate which features the variant belongs to:
        annotate_variant(variant, gene_trees, exon_trees, vep, whole_genes,
                         verbosity)
        new_features = variant['annotation']

        if verbosity:
            if nr_of_variants % 20000 == 0:
                log.info('%s variants parsed!' % nr_of_variants)
                log.info('Last 20.000 took %s to parse.\n' %
                         str(datetime.now() - start_twenty_time))
                start_twenty_time = datetime.now()

        # If we look at the first variant, setup boundary conditions:
        if beginning:
            current_features = new_features
            # Add the variant to each of its features in a batch
            batch[variant_id] = variant
            current_chrom = new_chrom
            batch['haploblocks'] = {}
            if phased:
                # We collect the starts of the haploblocks
                haploblock_starts = {
                    ind_id: int(variant['POS']) for ind_id in individuals
                }
            beginning = False
        else:
            # If we should put the batch in the queue:
            send = True

            if phased:
                for ind_id in individuals:
                    # A new haploblock is indicated by '/' if the data is phased
                    if '/' in variant.get(ind_id, './.'):
                        # If call is not passed we consider it to be on same
                        # haploblock (GATK recommendations)
                        if variant.get('FILTER', '.') == 'PASS':
                            haploblocks[ind_id].append([
                                haploblock_starts[ind_id],
                                int(variant['POS']) - 1,
                                str(haploblock_id)
                            ])
                            haploblock_id += 1
                            haploblock_starts[ind_id] = int(variant['POS'])

            # Check if we are in a space between features:
            if len(new_features) == 0:
                if len(current_features) == 0:
                    # An intergenic batch is kept open until it has 10000
                    # entries (the 'haploblocks' entry is counted as well)
                    if len(batch) < 10000:
                        send = False
            # If not check if we are in a region with overlapping features
            elif new_features.intersection(current_features):
                send = False

            # If we are at a new chromosome we finish the current batch:
            if new_chrom != current_chrom:
                chromosomes.append(current_chrom)
                # New chromosome means new batch
                send = True
                if verbosity:
                    log.info('Chromosome %s parsed!' % current_chrom)
                    log.info('Time to parse chromosome %s' %
                             str(datetime.now() - start_chrom_time))
                    start_chrom_time = datetime.now()
                current_chrom = new_chrom

            if send:
                if phased:
                    # Create an interval tree for each individual with the
                    # phasing intervals
                    haploblock_id = _finish_haploblocks(
                        batch, individuals, haploblocks, haploblock_starts,
                        int(variant['POS']), haploblock_id)
                    haploblocks = {ind_id: [] for ind_id in individuals}
                # Put the job in the queue
                batch_queue.put(batch)
                nr_of_batches += 1
                # Reset the variables
                current_features = new_features
                batch = {}
                batch[variant_id] = variant
                batch['haploblocks'] = {}
            else:
                current_features = current_features.union(new_features)
                batch[variant_id] = variant

    if beginning:
        # No variants were parsed. There is no chromosome to report, no
        # batch to send and, if phased, no haploblock starts to finish.
        if verbosity:
            log.info('Variants parsed!')
            log.info('Number of variants in variant file:%s\n' % nr_of_variants)
            log.info('Number of batches created:%s\n' % nr_of_batches)
        return chromosomes

    chromosomes.append(current_chrom)
    nr_of_batches += 1

    if verbosity:
        log.info('Chromosome %s parsed!' % current_chrom)
        log.info('Time to parse chromosome %s \n' %
                 str(datetime.now() - start_chrom_time))
        log.info('Variants parsed!')
        log.info('Time to parse variants:%s' %
                 str(datetime.now() - start_parsing_time))
        log.info('Number of variants in variant file:%s\n' % nr_of_variants)
        log.info('Number of batches created:%s\n' % nr_of_batches)

    if phased:
        # Create an interval tree for each individual with the phasing
        # intervals; 'variant' is the last variant of the file here
        haploblock_id = _finish_haploblocks(
            batch, individuals, haploblocks, haploblock_starts,
            int(variant['POS']), haploblock_id)

    batch_queue.put(batch)

    return chromosomes


def _finish_haploblocks(batch, individuals, haploblocks, haploblock_starts,
                        position, haploblock_id):
    """
    Close the open haploblock of each individual at 'position' and store an
    interval tree of the individual's haploblocks in batch['haploblocks'].

    Individuals without any haploblock get no interval tree.

    Returns: the next haploblock id that is free to use
    """
    for ind_id in individuals:
        # Check if we have just finished an interval
        if haploblock_starts[ind_id] != position:
            haploblocks[ind_id].append([
                haploblock_starts[ind_id],
                position,
                str(haploblock_id)
            ])
            haploblock_id += 1
        # Create interval trees of the haploblocks
        try:
            batch['haploblocks'][ind_id] = interval_tree.IntervalTree(
                haploblocks[ind_id],
                haploblocks[ind_id][0][0] - 1,
                haploblocks[ind_id][-1][1] + 1)
        except IndexError:
            # The individual has no haploblocks in this batch
            pass
    return haploblock_id