def _get_puzzle_variant(self, gemini_variant, index):
    """Take a gemini variant and return a basic puzzle variant

    For the overview we only need limited variant information, so no
    genotypes, transcripts or genes are added here.

    Args:
        gemini_variant (GeminiQueryRow): The gemini variant, read by key
        index (int): The index of the variant

    Returns:
        variant (Variant): A Variant object
    """
    chrom = gemini_variant['chrom']
    # Remove a leading 'chr' prefix only. str.lstrip('chrCHR') removes any
    # run of those *characters*, so a contig such as 'hs37d5' lost its 'h'.
    if chrom[:3].lower() == 'chr':
        chrom = chrom[3:]

    variant_dict = {
        'CHROM': chrom,
        'POS': str(gemini_variant['start']),
        'ID': gemini_variant['rs_ids'],
        'REF': gemini_variant['ref'],
        'ALT': gemini_variant['alt'],
        'QUAL': gemini_variant['qual'],
        'FILTER': gemini_variant['filter']
    }

    variant = Variant(**variant_dict)
    variant['index'] = index

    # Use the gemini id for fast search
    variant.update_variant_id(gemini_variant['variant_id'])

    #Add the most severe consequence
    variant['most_severe_consequence'] = gemini_variant['impact_so']

    #Add the impact severity
    variant['impact_severity'] = gemini_variant['impact_severity']

    # Only set the max frequency when gemini reports a truthy value
    max_freq = gemini_variant['max_aaf_all']
    if max_freq:
        variant.set_max_freq(max_freq)

    #### Check the impact annotations ####
    if gemini_variant['cadd_scaled']:
        variant['cadd_score'] = gemini_variant['cadd_scaled']

    return variant
def variant():
    """Build and return a Variant for the SNV 1:100 A>T (rs01, QUAL 100, PASS)."""
    return Variant(
        CHROM='1',
        POS='100',
        ID='rs01',
        REF='A',
        ALT='T',
        QUAL='100',
        FILTER='PASS',
    )
def _format_variant(self, gemini_variant, individual_objs, index=0):
    """Make a puzzle variant from a gemini variant

    Adds genotypes, transcripts, genes, coordinates (including the mate
    position for translocations), cytobands, severity predictions and
    frequencies to the variant.

    Args:
        gemini_variant (GeminiQueryRow): The gemini variant
        individual_objs (list(dict)): A list of Individuals
        index(int): The index of the variant

    Returns:
        variant (dict): A Variant object
    """
    chrom = gemini_variant['chrom']
    # Remove a leading 'chr' prefix only. str.lstrip('chrCHR') removes any
    # run of those *characters*, so a contig such as 'hs37d5' lost its 'h'.
    if chrom[:3].lower() == 'chr':
        chrom = chrom[3:]

    variant_dict = {
        'CHROM': chrom,
        'POS': str(gemini_variant['start']),
        'ID': gemini_variant['rs_ids'],
        'REF': gemini_variant['ref'],
        'ALT': gemini_variant['alt'],
        'QUAL': gemini_variant['qual'],
        'FILTER': gemini_variant['filter']
    }

    variant = Variant(**variant_dict)
    variant['index'] = index

    # Use the gemini id for fast search
    variant.update_variant_id(gemini_variant['variant_id'])

    # Update the individuals
    individual_genotypes = self._get_genotypes(
        gemini_variant=gemini_variant,
        individual_objs=individual_objs
    )

    for individual in individual_genotypes:
        # Add the genotype calls to the variant
        variant.add_individual(individual)

    for transcript in self._get_transcripts(gemini_variant):
        variant.add_transcript(transcript)

    #Add the most severe consequence
    variant['most_severe_consequence'] = gemini_variant['impact_so']

    for gene in self._get_genes(variant):
        variant.add_gene(gene)

    variant['start'] = int(variant_dict['POS'])

    if self.variant_type == 'sv':
        other_chrom = variant['CHROM']
        alt = variant_dict['ALT']
        # If we have a translocation. Symbolic alleles such as
        # '<DUP:TANDEM>' also contain ':' but are not breakends.
        if ':' in alt and '<' not in alt:
            # Breakend ALTs look like 'N[2:321682[' or ']2:321682]N': the
            # mate locus is the text between the two brackets. Trimming
            # with strip('ACGTN[]') would also eat leading contig letters.
            pieces = alt.replace(']', '[').split('[')
            mate_locus = pieces[1] if len(pieces) >= 3 else alt.strip('ACGTN[]')
            # Split on the last ':' since contig names may contain colons
            other_chrom, _, other_position = mate_locus.rpartition(':')
            if other_chrom[:3].lower() == 'chr':
                other_chrom = other_chrom[3:]
            # NOTE(review): the mate position is stored as a string while
            # the other branch stores an int -- confirm what consumers expect
            variant['stop'] = other_position

            #Set 'infinity' to length if translocation
            variant['sv_len'] = float('inf')
            variant['sv_type'] = 'BND'
        else:
            variant['stop'] = int(gemini_variant['end'])
            variant['sv_len'] = variant['stop'] - variant['start']
            variant['sv_type'] = gemini_variant['sub_type']

        variant['stop_chrom'] = other_chrom

    else:
        variant['stop'] = int(variant_dict['POS']) + \
            (len(variant_dict['REF']) - len(variant_dict['ALT']))

    variant['cytoband_start'] = get_cytoband_coord(
        chrom=variant['CHROM'], pos=variant['start'])

    # stop_chrom is only set for structural variants
    if variant.get('stop_chrom'):
        variant['cytoband_stop'] = get_cytoband_coord(
            chrom=variant['stop_chrom'], pos=variant['stop'])

    #### Check the impact annotations ####
    if gemini_variant['cadd_scaled']:
        variant['cadd_score'] = gemini_variant['cadd_scaled']

    # We use the prediction in text
    polyphen = gemini_variant['polyphen_pred']
    if polyphen:
        variant.add_severity('Polyphen', polyphen)

    # We use the prediction in text
    sift = gemini_variant['sift_pred']
    if sift:
        variant.add_severity('SIFT', sift)

    #### Check the frequencies ####
    thousand_g = gemini_variant['aaf_1kg_all']
    if thousand_g:
        variant['thousand_g'] = float(thousand_g)
        variant.add_frequency(name='1000GAF', value=float(thousand_g))

    exac = gemini_variant['aaf_exac_all']
    if exac:
        variant.add_frequency(name='EXaC', value=float(exac))

    esp = gemini_variant['aaf_esp_all']
    if esp:
        variant.add_frequency(name='ESP', value=float(esp))

    max_freq = gemini_variant['max_aaf_all']
    if max_freq:
        variant.set_max_freq(max_freq)

    return variant
def _format_variants(self, variant, index, case_obj, add_all_info=False):
    """Return a Variant object

    Format variant make a variant that includes enough information for
    the variant view.
    If add_all_info then genotype calls, compounds, GMAF and genes are
    added as well.

    Args:
        variant (cython2.Variant): A variant object
        index (int): The index of the variant
        case_obj (puzzle.models.Case): A case object
        add_all_info (bool): Whether to add the extra annotations above

    Returns:
        variant_obj (Variant): The formatted puzzle variant
    """
    #Create a info dict:
    info_dict = dict(variant.INFO)

    chrom = variant.CHROM
    if chrom.startswith(('chr', 'CHR')):
        chrom = chrom[3:]

    variant_obj = Variant(
        CHROM=chrom,
        POS=variant.POS,
        ID=variant.ID,
        REF=variant.REF,
        # Only the first alternative allele is used
        ALT=variant.ALT[0],
        QUAL=variant.QUAL,
        FILTER=variant.FILTER,
    )
    variant_obj._set_variant_id()

    logger.debug("Creating a variant object of variant {0}".format(
        variant_obj.variant_id))

    variant_obj.index = index
    logger.debug("Updating index to: {0}".format(index))

    ########### Get the coordinates for the variant ##############
    variant_obj.start = variant.start
    variant_obj.stop = variant.end

    #SV variants need to be handled a bit differently since they can be huge;
    #it would take too much power to parse all vep/snpeff entries for these.
    if self.variant_type == 'sv':
        # Prefer the INFO END tag; fall back to POS when it is missing
        variant_obj.stop = int(info_dict.get('END', variant_obj.POS))
        self._add_sv_coordinates(variant_obj)
        variant_obj.sv_type = info_dict.get('SVTYPE')

        # Special for FindSV software:
        # SV specific tag for number of occurances
        occurances = info_dict.get('OCC')
        if occurances:
            logger.debug("Updating occurances to: {0}".format(occurances))
            variant_obj['occurances'] = float(occurances)
            # NOTE(review): the raw INFO value (not the float) is stored as
            # the frequency -- confirm this is what set_max_freq expects
            variant_obj.add_frequency('OCC', occurances)

    else:
        self._add_thousand_g(variant_obj, info_dict)
        self._add_cadd_score(variant_obj, info_dict)
        self._add_genetic_models(variant_obj, info_dict)
        self._add_transcripts(variant_obj, info_dict)
        self._add_exac(variant_obj, info_dict)

    self._add_hgnc_symbols(variant_obj)

    if add_all_info:
        self._add_genotype_calls(variant_obj, str(variant), case_obj)
        self._add_compounds(variant_obj, info_dict)
        self._add_gmaf(variant_obj, info_dict)
        self._add_genes(variant_obj)

    ##### Add consequences ####
    self._add_consequences(variant_obj, str(variant))
    self._add_most_severe_consequence(variant_obj)
    self._add_impact_severity(variant_obj)
    self._add_rank_score(variant_obj, info_dict)
    variant_obj.set_max_freq()

    return variant_obj
def _formated_variants(self, raw_variants, case_obj):
    """Yield variant objects

    The vcf header is parsed from case_obj.variant_source, then every
    non-header line in raw_variants is turned into a Variant.

    Args:
        raw_variants (Iterable): An iterable with variant lines
        case_obj (puzzle.models.Case): A case object

    Yields:
        variant (Variant): One formatted variant per variant line
    """
    vcf_file_path = case_obj.variant_source

    logger.info("Parsing file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    # Parse the header; close the handle even if a header line fails to parse
    try:
        for line in handle:
            line = line.rstrip()
            if not line.startswith("#"):
                break
            if line.startswith("##"):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        handle.close()

    header_line = head.header

    # Get the individual ids for individuals in vcf file
    vcf_individuals = set(head.individuals)

    variant_columns = ["CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER"]

    vep_header = head.vep_columns
    snpeff_header = head.snpeff_columns

    index = 0
    for variant_line in raw_variants:
        if variant_line.startswith("#"):
            continue
        index += 1

        # Create a variant dict:
        variant_dict = get_variant_dict(variant_line=variant_line, header_line=header_line)
        # Remove a leading 'chr' prefix only. str.lstrip("chrCHR") removes
        # any run of those *characters*, so 'hs37d5' lost its 'h'.
        chrom = variant_dict["CHROM"]
        if chrom[:3].lower() == "chr":
            variant_dict["CHROM"] = chrom[3:]

        # Create a info dict:
        info_dict = get_info_dict(info_line=variant_dict["INFO"])
        # Check if vep annotation:
        vep_string = info_dict.get("CSQ")
        # Check if snpeff annotation:
        snpeff_string = info_dict.get("ANN")

        if vep_string:
            # Get the vep annotations
            vep_info = get_vep_info(vep_string=vep_string, vep_header=vep_header)
        elif snpeff_string:
            # Get the snpeff annotations
            snpeff_info = get_snpeff_info(snpeff_string=snpeff_string, snpeff_header=snpeff_header)

        variant = Variant(**{column: variant_dict.get(column, ".") for column in variant_columns})

        logger.debug("Creating a variant object of variant {0}".format(variant.get("variant_id")))

        variant["index"] = index
        logger.debug("Updating index to: {0}".format(index))

        variant["start"] = int(variant_dict["POS"])

        if self.variant_type == "sv":
            other_chrom = variant["CHROM"]
            alt = variant_dict["ALT"]
            # If we have a translocation (symbolic alleles like
            # '<DUP:TANDEM>' contain ':' too but are not breakends):
            if ":" in alt and "<" not in alt:
                # Breakend ALTs look like 'N[2:321682[' or ']2:321682]N':
                # the mate locus is the text between the two brackets.
                # strip("ACGTN[]") would also eat leading contig letters.
                pieces = alt.replace("]", "[").split("[")
                mate_locus = pieces[1] if len(pieces) >= 3 else alt.strip("ACGTN[]")
                # Split on the last ':' since contig names may contain colons
                other_chrom, _, other_position = mate_locus.rpartition(":")
                if other_chrom[:3].lower() == "chr":
                    other_chrom = other_chrom[3:]
                # NOTE(review): stored as a string, the other branch
                # stores an int -- confirm what consumers expect
                variant["stop"] = other_position

                # Set 'infinity' to length if translocation
                variant["sv_len"] = float("inf")
            else:
                variant["stop"] = int(info_dict.get("END", variant_dict["POS"]))
                variant["sv_len"] = variant["stop"] - variant["start"]

            variant["stop_chrom"] = other_chrom

        else:
            variant["stop"] = int(variant_dict["POS"]) + (len(variant_dict["REF"]) - len(variant_dict["ALT"]))

        variant["sv_type"] = info_dict.get("SVTYPE")
        variant["cytoband_start"] = get_cytoband_coord(chrom=variant["CHROM"], pos=variant["start"])
        # stop_chrom is only set for structural variants
        if variant.get("stop_chrom"):
            variant["cytoband_stop"] = get_cytoband_coord(chrom=variant["stop_chrom"], pos=variant["stop"])

        # It would be easy to update these keys...
        thousand_g = info_dict.get("1000GAF")
        if thousand_g:
            logger.debug("Updating thousand_g to: {0}".format(thousand_g))
            variant["thousand_g"] = float(thousand_g)
            variant.add_frequency("1000GAF", variant.get("thousand_g"))

        # SV specific tag for number of occurances
        occurances = info_dict.get("OCC")
        if occurances:
            logger.debug("Updating occurances to: {0}".format(occurances))
            variant["occurances"] = float(occurances)
            variant.add_frequency("OCC", occurances)

        cadd_score = info_dict.get("CADD")
        if cadd_score:
            logger.debug("Updating cadd_score to: {0}".format(cadd_score))
            variant["cadd_score"] = float(cadd_score)

        rank_score_entry = info_dict.get("RankScore")
        if rank_score_entry:
            # Entries are comma separated per family; the last one wins
            for family_annotation in rank_score_entry.split(","):
                rank_score = family_annotation.split(":")[-1]
                logger.debug("Updating rank_score to: {0}".format(rank_score))
                variant["rank_score"] = float(rank_score)

        genetic_models_entry = info_dict.get("GeneticModels")
        if genetic_models_entry:
            genetic_models = []
            for family_annotation in genetic_models_entry.split(","):
                genetic_models.extend(family_annotation.split(":")[-1].split("|"))
            # Log the models themselves: rank_score is unbound here when
            # the variant has no RankScore entry
            logger.debug("Updating genetic_models to: {0}".format(genetic_models))
            variant["genetic_models"] = genetic_models

        # Add genotype calls:
        for individual in case_obj.individuals:
            sample_id = individual.ind_id
            if sample_id not in vcf_individuals:
                continue
            raw_call = dict(zip(variant_dict["FORMAT"].split(":"), variant_dict[sample_id].split(":")))
            # AD may be a single value such as '.', so guard the second
            # field; '' matches what a missing AD already yields
            allele_depths = raw_call.get("AD", ",").split(",")
            ref_depth = allele_depths[0]
            alt_depth = allele_depths[1] if len(allele_depths) > 1 else ""
            variant.add_individual(
                Genotype(
                    sample_id=sample_id,
                    genotype=raw_call.get("GT", "./."),
                    case_id=individual.case_name,
                    phenotype=individual.phenotype,
                    ref_depth=ref_depth,
                    alt_depth=alt_depth,
                    genotype_quality=raw_call.get("GQ", "."),
                    depth=raw_call.get("DP", "."),
                    supporting_evidence=raw_call.get("SU", "0"),
                    pe_support=raw_call.get("PE", "0"),
                    sr_support=raw_call.get("SR", "0"),
                )
            )

        # Add transcript information:
        gmaf = None
        if vep_string:
            for transcript_info in vep_info:
                transcript = self._get_vep_transcripts(transcript_info)
                gmaf_raw = transcript_info.get("GMAF")
                if gmaf_raw:
                    gmaf = float(gmaf_raw.split(":")[-1])
                variant.add_transcript(transcript)

            if gmaf:
                variant.add_frequency("GMAF", gmaf)
                # Use GMAF as the 1000G frequency when none was annotated
                if not variant.thousand_g:
                    variant.thousand_g = gmaf

        elif snpeff_string:
            for transcript_info in snpeff_info:
                transcript = self._get_snpeff_transcripts(transcript_info)
                variant.add_transcript(transcript)

        variant["most_severe_consequence"] = get_most_severe_consequence(variant["transcripts"])

        for gene in self._get_genes(variant):
            variant.add_gene(gene)

        self._add_compounds(variant=variant, info_dict=info_dict)

        yield variant
def _format_variant(self, gemini_variant, individual_objs, index=0, add_all_info=False):
    """Build a puzzle Variant out of one gemini query row.

    Args:
        gemini_variant (GeminiQueryRow): The gemini variant
        individual_objs (list(dict)): A list of Individuals
        index(int): The index of the variant
        add_all_info(bool): If true the genotypes are added as well

    Returns:
        variant (dict): A Variant object
    """
    contig = gemini_variant['chrom']
    if contig.startswith(('chr', 'CHR')):
        contig = contig[3:]

    vcf_fields = dict(
        CHROM=contig,
        POS=str(gemini_variant['start']),
        ID=gemini_variant['rs_ids'],
        REF=gemini_variant['ref'],
        ALT=gemini_variant['alt'],
        QUAL=gemini_variant['qual'],
        FILTER=gemini_variant['filter'],
    )
    variant = Variant(**vcf_fields)

    # The gemini id makes later lookups fast
    variant.update_variant_id(gemini_variant['variant_id'])
    logger.debug(
        "Creating a variant object of variant {0}".format(variant.variant_id))

    variant['index'] = index

    # Most severe consequence and impact severity
    self._add_most_severe_consequence(variant, gemini_variant)
    self._add_impact_severity(variant, gemini_variant)

    # Coordinates
    variant.start = int(gemini_variant['start'])
    variant.stop = int(gemini_variant['end'])

    if self.variant_type != 'sv':
        # Transcripts and frequencies
        self._add_transcripts(variant, gemini_variant)
        self._add_thousand_g(variant, gemini_variant)
        self._add_exac(variant, gemini_variant)
        self._add_gmaf(variant, gemini_variant)

        # Impact annotations
        if gemini_variant['cadd_scaled']:
            variant.cadd_score = gemini_variant['cadd_scaled']

        # The textual predictions are used for both tools
        for label, column in (('Polyphen', 'polyphen_pred'),
                              ('SIFT', 'sift_pred')):
            prediction = gemini_variant[column]
            if prediction:
                variant.add_severity(label, prediction)
    else:
        # Structural variants get their own coordinate handling
        variant.sv_type = gemini_variant['sub_type']
        variant.stop = int(gemini_variant['end'])
        self._add_sv_coordinates(variant)

    # Genes are derived from the hgnc symbols
    self._add_hgnc_symbols(variant)
    if self.variant_type == 'snv':
        self._add_genes(variant)

    self._add_consequences(variant)

    # Genotypes are only fetched on request
    if add_all_info:
        self._add_genotypes(variant, gemini_variant, individual_objs)
        if self.variant_type == 'sv':
            self._add_genes(variant)

    return variant
def _formated_variants(self, raw_variants, case_obj):
    """Yield variant objects

    The vcf header is parsed from case_obj.variant_source, then every
    non-header line in raw_variants is turned into a Variant.

    Args:
        raw_variants (Iterable): An iterable with variant lines
        case_obj (puzzle.models.Case): A case object

    Yields:
        variant (Variant): One formatted variant per variant line
    """
    vcf_file_path = case_obj.variant_source

    logger.info("Parsing file {0}".format(vcf_file_path))
    head = HeaderParser()
    handle = get_vcf_handle(infile=vcf_file_path)
    # Parse the header; close the handle even if a header line fails to parse
    try:
        for line in handle:
            line = line.rstrip()
            if not line.startswith('#'):
                break
            if line.startswith('##'):
                head.parse_meta_data(line)
            else:
                head.parse_header_line(line)
    finally:
        handle.close()

    header_line = head.header

    # Get the individual ids for individuals in vcf file
    vcf_individuals = set(head.individuals)

    variant_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']

    vep_header = head.vep_columns
    snpeff_header = head.snpeff_columns

    index = 0
    for variant_line in raw_variants:
        if variant_line.startswith('#'):
            continue
        index += 1

        #Create a variant dict:
        variant_dict = get_variant_dict(
            variant_line=variant_line,
            header_line=header_line
        )
        # Remove a leading 'chr' prefix only. str.lstrip('chrCHR') removes
        # any run of those *characters*, so 'hs37d5' lost its 'h'.
        chrom = variant_dict['CHROM']
        if chrom[:3].lower() == 'chr':
            variant_dict['CHROM'] = chrom[3:]

        #Create a info dict:
        info_dict = get_info_dict(
            info_line=variant_dict['INFO']
        )
        #Check if vep annotation:
        vep_string = info_dict.get('CSQ')
        #Check if snpeff annotation:
        snpeff_string = info_dict.get('ANN')

        if vep_string:
            #Get the vep annotations
            vep_info = get_vep_info(
                vep_string=vep_string,
                vep_header=vep_header
            )
        elif snpeff_string:
            #Get the snpeff annotations
            snpeff_info = get_snpeff_info(
                snpeff_string=snpeff_string,
                snpeff_header=snpeff_header
            )

        variant = Variant(
            **{column: variant_dict.get(column, '.')
               for column in variant_columns}
        )

        logger.debug("Creating a variant object of variant {0}".format(
            variant.get('variant_id')))

        variant['index'] = index
        logger.debug("Updating index to: {0}".format(
            index))

        variant['start'] = int(variant_dict['POS'])

        if self.variant_type == 'sv':
            other_chrom = variant['CHROM']
            alt = variant_dict['ALT']
            # If we have a translocation (symbolic alleles like
            # '<DUP:TANDEM>' contain ':' too but are not breakends):
            if ':' in alt and '<' not in alt:
                # Breakend ALTs look like 'N[2:321682[' or ']2:321682]N':
                # the mate locus is the text between the two brackets.
                # strip('ACGTN[]') would also eat leading contig letters.
                pieces = alt.replace(']', '[').split('[')
                mate_locus = pieces[1] if len(pieces) >= 3 else alt.strip('ACGTN[]')
                # Split on the last ':' since contig names may contain colons
                other_chrom, _, other_position = mate_locus.rpartition(':')
                if other_chrom[:3].lower() == 'chr':
                    other_chrom = other_chrom[3:]
                # NOTE(review): stored as a string, the other branch
                # stores an int -- confirm what consumers expect
                variant['stop'] = other_position

                #Set 'infinity' to length if translocation
                variant['sv_len'] = float('inf')
            else:
                variant['stop'] = int(info_dict.get('END', variant_dict['POS']))
                variant['sv_len'] = variant['stop'] - variant['start']

            variant['stop_chrom'] = other_chrom

        else:
            variant['stop'] = int(variant_dict['POS']) + \
                (len(variant_dict['REF']) - len(variant_dict['ALT']))

        variant['sv_type'] = info_dict.get('SVTYPE')
        variant['cytoband_start'] = get_cytoband_coord(
            chrom=variant['CHROM'], pos=variant['start'])
        # stop_chrom is only set for structural variants
        if variant.get('stop_chrom'):
            variant['cytoband_stop'] = get_cytoband_coord(
                chrom=variant['stop_chrom'], pos=variant['stop'])

        # It would be easy to update these keys...
        thousand_g = info_dict.get('1000GAF')
        if thousand_g:
            logger.debug("Updating thousand_g to: {0}".format(
                thousand_g))
            variant['thousand_g'] = float(thousand_g)
            variant.add_frequency('1000GAF', variant.get('thousand_g'))

        #SV specific tag for number of occurances
        occurances = info_dict.get('OCC')
        if occurances:
            logger.debug("Updating occurances to: {0}".format(
                occurances))
            variant['occurances'] = float(occurances)
            variant.add_frequency('OCC', occurances)

        cadd_score = info_dict.get('CADD')
        if cadd_score:
            logger.debug("Updating cadd_score to: {0}".format(
                cadd_score))
            variant['cadd_score'] = float(cadd_score)

        rank_score_entry = info_dict.get('RankScore')
        if rank_score_entry:
            # Entries are comma separated per family; the last one wins
            for family_annotation in rank_score_entry.split(','):
                rank_score = family_annotation.split(':')[-1]
                logger.debug("Updating rank_score to: {0}".format(
                    rank_score))
                variant['rank_score'] = float(rank_score)

        genetic_models_entry = info_dict.get('GeneticModels')
        if genetic_models_entry:
            genetic_models = []
            for family_annotation in genetic_models_entry.split(','):
                genetic_models.extend(
                    family_annotation.split(':')[-1].split('|'))
            # Log the models themselves: rank_score is unbound here when
            # the variant has no RankScore entry
            logger.debug("Updating genetic_models to: {0}".format(
                genetic_models))
            variant['genetic_models'] = genetic_models

        #Add genotype calls:
        for individual in case_obj.individuals:
            sample_id = individual.ind_id
            if sample_id not in vcf_individuals:
                continue
            raw_call = dict(zip(
                variant_dict['FORMAT'].split(':'),
                variant_dict[sample_id].split(':'))
            )
            # AD may be a single value such as '.', so guard the second
            # field; '' matches what a missing AD already yields
            allele_depths = raw_call.get('AD', ',').split(',')
            ref_depth = allele_depths[0]
            alt_depth = allele_depths[1] if len(allele_depths) > 1 else ''
            variant.add_individual(Genotype(
                sample_id=sample_id,
                genotype=raw_call.get('GT', './.'),
                case_id=individual.case_name,
                phenotype=individual.phenotype,
                ref_depth=ref_depth,
                alt_depth=alt_depth,
                genotype_quality=raw_call.get('GQ', '.'),
                depth=raw_call.get('DP', '.'),
                supporting_evidence=raw_call.get('SU', '0'),
                pe_support=raw_call.get('PE', '0'),
                sr_support=raw_call.get('SR', '0'),
            ))

        # Add transcript information:
        if vep_string:
            for transcript in self._get_vep_transcripts(variant, vep_info):
                variant.add_transcript(transcript)

        elif snpeff_string:
            for transcript in self._get_snpeff_transcripts(variant, snpeff_info):
                variant.add_transcript(transcript)

        variant['most_severe_consequence'] = get_most_severe_consequence(
            variant['transcripts']
        )

        for gene in self._get_genes(variant):
            variant.add_gene(gene)

        self._add_compounds(variant=variant, info_dict=info_dict)

        yield variant
def _format_variant(self, case_id, gemini_variant, individual_objs, index=0, add_all_info=False):
    """Build a puzzle Variant out of one gemini query row.

    Args:
        case_id (str): related case id
        gemini_variant (GeminiQueryRow): The gemini variant
        individual_objs (list(dict)): A list of Individuals
        index(int): The index of the variant
        add_all_info(bool): If true the genotypes are added as well

    Returns:
        variant (dict): A Variant object
    """
    contig = gemini_variant['chrom']
    if contig.startswith(('chr', 'CHR')):
        contig = contig[3:]

    vcf_fields = dict(
        CHROM=contig,
        POS=str(gemini_variant['start']),
        ID=gemini_variant['rs_ids'],
        REF=gemini_variant['ref'],
        ALT=gemini_variant['alt'],
        QUAL=gemini_variant['qual'],
        FILTER=gemini_variant['filter'],
    )
    variant = Variant(**vcf_fields)

    # The gemini id makes later lookups fast
    variant.update_variant_id(gemini_variant['variant_id'])
    logger.debug(
        "Creating a variant object of variant {0}".format(variant.variant_id))

    variant['index'] = index

    # Most severe consequence and impact severity
    self._add_most_severe_consequence(variant, gemini_variant)
    self._add_impact_severity(variant, gemini_variant)

    # Coordinates
    variant.start = int(gemini_variant['start'])
    variant.stop = int(gemini_variant['end'])

    if self.variant_type != 'sv':
        # Transcripts and frequencies
        self._add_transcripts(variant, gemini_variant)
        self._add_thousand_g(variant, gemini_variant)
        self._add_exac(variant, gemini_variant)
        self._add_gmaf(variant, gemini_variant)

        # Impact annotations
        if gemini_variant['cadd_scaled']:
            variant.cadd_score = gemini_variant['cadd_scaled']

        # The textual predictions are used for both tools
        for label, column in (('Polyphen', 'polyphen_pred'),
                              ('SIFT', 'sift_pred')):
            prediction = gemini_variant[column]
            if prediction:
                variant.add_severity(label, prediction)
    else:
        # Structural variants get their own coordinate handling
        variant.sv_type = gemini_variant['sub_type']
        variant.stop = int(gemini_variant['end'])
        self._add_sv_coordinates(variant)

    # Genes are derived from the hgnc symbols
    self._add_hgnc_symbols(variant)
    if self.variant_type == 'snv':
        self._add_genes(variant)

    self._add_consequences(variant)

    # Genotypes are only fetched on request
    if add_all_info:
        self._add_genotypes(variant, gemini_variant, case_id, individual_objs)
        if self.variant_type == 'sv':
            self._add_genes(variant)

    return variant
def _format_variant(self, variant_line, index, case_obj, head):
    """Return a variant object built from one vcf variant line

    Args:
        variant_line (str): A variant line from the vcf
        index (int): The index of the variant
        case_obj (puzzle.models.Case): A case object
        head: The parsed vcf header (presumably a HeaderParser); its
            header, individuals, vep_columns and snpeff_columns are read

    Returns:
        variant (Variant): The formatted variant
    """
    header_line = head.header

    # Get the individual ids for individuals in vcf file
    vcf_individuals = set(head.individuals)

    variant_columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER']

    vep_header = head.vep_columns
    snpeff_header = head.snpeff_columns

    #Create a variant dict:
    variant_dict = get_variant_dict(
        variant_line=variant_line,
        header_line=header_line
    )
    # Remove a leading 'chr' prefix only. str.lstrip('chrCHR') removes any
    # run of those *characters*, so a contig such as 'hs37d5' lost its 'h'.
    chrom = variant_dict['CHROM']
    if chrom[:3].lower() == 'chr':
        variant_dict['CHROM'] = chrom[3:]

    #Create a info dict:
    info_dict = get_info_dict(
        info_line=variant_dict['INFO']
    )
    #Check if vep annotation:
    vep_string = info_dict.get('CSQ')
    #Check if snpeff annotation:
    snpeff_string = info_dict.get('ANN')

    if vep_string:
        #Get the vep annotations
        vep_info = get_vep_info(
            vep_string=vep_string,
            vep_header=vep_header
        )
    elif snpeff_string:
        #Get the snpeff annotations
        snpeff_info = get_snpeff_info(
            snpeff_string=snpeff_string,
            snpeff_header=snpeff_header
        )

    variant = Variant(
        **{column: variant_dict.get(column, '.')
           for column in variant_columns}
    )

    logger.debug("Creating a variant object of variant {0}".format(
        variant.get('variant_id')))

    variant['index'] = index
    logger.debug("Updating index to: {0}".format(
        index))

    variant['start'] = int(variant_dict['POS'])

    if self.variant_type == 'sv':
        other_chrom = variant['CHROM']
        alt = variant_dict['ALT']
        # If we have a translocation (symbolic alleles like
        # '<DUP:TANDEM>' contain ':' too but are not breakends):
        if ':' in alt and '<' not in alt:
            # Breakend ALTs look like 'N[2:321682[' or ']2:321682]N': the
            # mate locus is the text between the two brackets. Trimming
            # with strip('ACGTN[]') would also eat leading contig letters.
            pieces = alt.replace(']', '[').split('[')
            mate_locus = pieces[1] if len(pieces) >= 3 else alt.strip('ACGTN[]')
            # Split on the last ':' since contig names may contain colons
            other_chrom, _, other_position = mate_locus.rpartition(':')
            if other_chrom[:3].lower() == 'chr':
                other_chrom = other_chrom[3:]
            # NOTE(review): stored as a string, the other branch stores
            # an int -- confirm what consumers expect
            variant['stop'] = other_position

            #Set 'infinity' to length if translocation
            variant['sv_len'] = float('inf')
        else:
            variant['stop'] = int(info_dict.get('END', variant_dict['POS']))
            variant['sv_len'] = variant['stop'] - variant['start']

        variant['stop_chrom'] = other_chrom

    else:
        variant['stop'] = int(variant_dict['POS']) + \
            (len(variant_dict['REF']) - len(variant_dict['ALT']))

    variant['sv_type'] = info_dict.get('SVTYPE')
    variant['cytoband_start'] = get_cytoband_coord(
        chrom=variant['CHROM'], pos=variant['start'])
    # stop_chrom is only set for structural variants
    if variant.get('stop_chrom'):
        variant['cytoband_stop'] = get_cytoband_coord(
            chrom=variant['stop_chrom'], pos=variant['stop'])

    # It would be easy to update these keys...
    thousand_g = info_dict.get('1000GAF')
    if thousand_g:
        logger.debug("Updating thousand_g to: {0}".format(
            thousand_g))
        variant['thousand_g'] = float(thousand_g)
        variant.add_frequency('1000GAF', variant.get('thousand_g'))

    #SV specific tag for number of occurances
    occurances = info_dict.get('OCC')
    if occurances:
        logger.debug("Updating occurances to: {0}".format(
            occurances))
        variant['occurances'] = float(occurances)
        variant.add_frequency('OCC', occurances)

    cadd_score = info_dict.get('CADD')
    if cadd_score:
        logger.debug("Updating cadd_score to: {0}".format(
            cadd_score))
        variant['cadd_score'] = float(cadd_score)

    rank_score_entry = info_dict.get('RankScore')
    if rank_score_entry:
        # Entries are comma separated per family; the last one wins
        for family_annotation in rank_score_entry.split(','):
            rank_score = family_annotation.split(':')[-1]
            logger.debug("Updating rank_score to: {0}".format(
                rank_score))
            variant['rank_score'] = float(rank_score)

    genetic_models_entry = info_dict.get('GeneticModels')
    if genetic_models_entry:
        genetic_models = []
        for family_annotation in genetic_models_entry.split(','):
            genetic_models.extend(
                family_annotation.split(':')[-1].split('|'))
        # Log the models themselves: rank_score is unbound here when the
        # variant has no RankScore entry
        logger.debug("Updating genetic_models to: {0}".format(
            genetic_models))
        variant['genetic_models'] = genetic_models

    #Add genotype calls:
    for individual in case_obj.individuals:
        sample_id = individual.ind_id
        if sample_id not in vcf_individuals:
            continue
        # Pair every FORMAT key with this sample's value
        raw_call = dict(zip(
            variant_dict['FORMAT'].split(':'),
            variant_dict[sample_id].split(':'))
        )
        genotype = Genotype(**raw_call)

        variant.add_individual(puzzle_genotype(
            sample_id=sample_id,
            genotype=genotype.genotype,
            case_id=individual.case_name,
            phenotype=individual.phenotype,
            ref_depth=genotype.ref_depth,
            alt_depth=genotype.alt_depth,
            genotype_quality=genotype.genotype_quality,
            depth=genotype.depth_of_coverage,
            supporting_evidence=genotype.supporting_evidence,
            pe_support=genotype.pe_support,
            sr_support=genotype.sr_support,
        ))

    # Add transcript information:
    gmaf = None
    if vep_string:
        for transcript_info in vep_info:
            transcript = self._get_vep_transcripts(transcript_info)
            gmaf_raw = transcript_info.get('GMAF')
            if gmaf_raw:
                gmaf = float(gmaf_raw.split(':')[-1])
            variant.add_transcript(transcript)

        if gmaf:
            variant.add_frequency('GMAF', gmaf)
            # Use GMAF as the 1000G frequency when none was annotated
            if not variant.thousand_g:
                variant.thousand_g = gmaf

    elif snpeff_string:
        for transcript_info in snpeff_info:
            transcript = self._get_snpeff_transcripts(transcript_info)
            variant.add_transcript(transcript)

    most_severe_consequence = get_most_severe_consequence(
        variant['transcripts']
    )
    if most_severe_consequence:
        variant['most_severe_consequence'] = most_severe_consequence
        variant['impact_severity'] = IMPACT_SEVERITIES.get(most_severe_consequence)

    for gene in self._get_genes(variant):
        variant.add_gene(gene)

    self._add_compounds(variant=variant, info_dict=info_dict)

    return variant