def b37_hgvs_id(self):
    """Return the HGVS identifier string for this variant.

    The identifier is produced by ``myvariant.format_hgvs`` from the value
    returned by ``self.get_chromosome_display()`` together with this
    object's ``pos``, ``ref_allele`` and ``var_allele`` attributes.

    NOTE(review): the ``b37`` prefix presumably means the coordinates are on
    genome build 37 -- confirm against the owning class.
    """
    chromosome = self.get_chromosome_display()
    return myvariant.format_hgvs(
        chromosome,
        self.pos,
        self.ref_allele,
        self.var_allele,
    )
def getvariant(chromosome, start, ref, var):
    """Query myvariant.info for a single variant.

    Args:
        chromosome: Chromosome name, passed through to ``format_hgvs``.
        start: Start position; converted with ``int()`` before use.
        ref: Reference allele.
        var: Variant (alternate) allele.

    Returns:
        Whatever ``MyVariantInfo.getvariant`` returns for the HGVS id built
        from the four arguments (no ``fields`` filter is applied).
    """
    client = myvariant.MyVariantInfo()
    hgvs_id = myvariant.format_hgvs(chromosome, int(start), ref, var)
    return client.getvariant(hgvs_id)
def _parse_single_variant_record(cls, normed_headers_list, curr_line_fields_list, sample_names_list):
    """Turn one annotated-variant text line into ``(hgvs_id, annotations)``.

    Layout assumption: the VCF format string and the per-sample genotype
    strings are the LAST fields of every line and have no headers of their
    own; the final header only marks "other info".  Simplified example:

        chr  start end ref alt func_knowngene      otherinfo
        chrM 146   146 T   C   upstream;downstream 1 61.74 AC=2;AF=1.00 GT:AD:DP:GQ:PL 1/1:0,22:22:66:794,66,0 ./.:0,0

    Anything between the position of the last header and the format string
    is ignored.

    Args:
        normed_headers_list: Header names for the line; the last one is not
            paired with any content.
        curr_line_fields_list: The split fields of the current line.
        sample_names_list: Sample names, in the order their genotype strings
            appear at the end of the line.

    Returns:
        A 2-tuple of the HGVS id and the dictionary built by
        ``AnnovarAnnotatedVariant.make_per_variant_annotation_dict``.
    """
    # Pair every named header except the last with the field in the same
    # position on this line.
    named_count = len(normed_headers_list) - 1
    raw_fields_dict = dict(
        zip(normed_headers_list[:named_count],
            curr_line_fields_list[:named_count]))

    # TODO: someday: perhaps stop limiting this to only the fields in
    # ANNOVAR_OUTPUT_COLS instead of all
    # Keep only the expected output columns, dropping "." placeholders and
    # letting the class rewrite values for headers it treats specially.
    cleaned_fields_dict = {}
    for header in cls._ANNOVAR_OUTPUT_COLS:
        raw_value = raw_fields_dict[header]
        if raw_value == ".":
            continue
        cleaned_fields_dict[header] = cls._rewrite_value_if_special_header(
            header, raw_value)

    # Generate the hgvs id for this variant from the cleaned core columns.
    hgvs_id = myvariant.format_hgvs(
        cleaned_fields_dict[cls.CHR_HEADER],
        cleaned_fields_dict[cls.START_HEADER],
        cleaned_fields_dict[cls.REF_HEADER],
        cleaned_fields_dict[cls.ALT_HEADER])

    # Counting back from the END of the line: the last len(samples) fields
    # are the genotype strings, and the field just before them is the format
    # string.
    sample_count = len(sample_names_list)
    format_string = curr_line_fields_list[-(sample_count + 1)]
    trailing_genotype_fields = curr_line_fields_list[-sample_count:]
    genotype_field_strings_by_sample_name = dict(
        zip(sample_names_list, trailing_genotype_fields))

    # Build the per-variant annotation dictionary, including the nested
    # sample-specific genotype information.
    annotations = AnnovarAnnotatedVariant.make_per_variant_annotation_dict(
        cleaned_fields_dict, hgvs_id, format_string,
        genotype_field_strings_by_sample_name)
    return hgvs_id, annotations
def get_mv_data(chrom, pos, ref_allele, var_allele):
    """Fetch myvariant.info data and an allele frequency for one variant.

    Args:
        chrom: Chromosome value; converted with ``get_chrom_display`` before
            the HGVS id is built.
        pos: Variant position.
        ref_allele: Reference allele.
        var_allele: Variant allele; also passed to ``get_allele_freq``.

    Returns:
        A 4-tuple ``(hgvs_format, mv_data, allele_freq, freq_url)``.
        ``mv_data`` is the (possibly falsy) result of the
        ``clinvar``/``dbsnp``/``exac`` lookup; when it is falsy both
        frequency values are ``None``.  When ClinVar ``rcv`` data is
        present it is always returned as a list.
    """
    hgvs_format = myvariant.format_hgvs(
        get_chrom_display(chrom), pos, ref_allele, var_allele)
    mv = myvariant.MyVariantInfo()
    mv_data = mv.getvariant(hgvs_format, fields=['clinvar', 'dbsnp', 'exac'])

    if not mv_data:
        return hgvs_format, mv_data, None, None

    # A single RCV record arrives as a bare value; wrap it so callers can
    # always iterate.  isinstance (rather than an exact type() comparison)
    # keeps list subclasses from being wrapped a second time.
    if 'clinvar' in mv_data and 'rcv' in mv_data['clinvar']:
        clinvar = mv_data['clinvar']
        if not isinstance(clinvar['rcv'], list):
            clinvar['rcv'] = [clinvar['rcv']]

    allele_freq, freq_url = get_allele_freq(mv_data, var_allele)
    return hgvs_format, mv_data, allele_freq, freq_url
def test_format_hgvs(self):
    """Check ``myvariant.format_hgvs`` output for SNV, ins, del and delins inputs."""
    # Each entry: ((chrom, pos, ref, alt), expected HGVS id).
    cases = [
        (("1", 35366, "C", "T"), 'chr1:g.35366C>T'),
        (("chr2", 17142, "G", "GA"), 'chr2:g.17142_17143insA'),
        (("1", 10019, "TA", "T"), 'chr1:g.10020del'),
        (("MT", 8270, "CACCCCCTCT", "C"), 'chrMT:g.8271_8279del'),
        (("7", 15903, "G", "GC"), 'chr7:g.15903_15904insC'),
        (("X", 107930849, "GGA", "C"), 'chrX:g.107930849_107930851delinsC'),
        (("20", 1234567, "GTC", "GTCT"), 'chr20:g.1234569_1234570insT'),
    ]
    for call_args, expected_id in cases:
        self.assertEqual(myvariant.format_hgvs(*call_args), expected_id)
def _get_hgvs_ids_from_vcf(vcf_file_obj, chunk_index, chunk_size):
    """Return the HGVS ids for one chunk of records in a VCF file.

    Args:
        vcf_file_obj: Open file object handed to ``vcf.Reader``.
        chunk_index: Zero-based index of the chunk to read.
        chunk_size: Number of records per chunk.

    Returns:
        A list with one HGVS id per record in the half-open record range
        ``[chunk_index * chunk_size, (chunk_index + 1) * chunk_size)``.
        Only the first ALT allele of each record is used.
    """
    first_record = chunk_index * chunk_size
    stop_record = (chunk_index + 1) * chunk_size
    chunk_records = itertools.islice(
        vcf.Reader(vcf_file_obj), first_record, stop_record)

    hgvs_ids = []
    for record in chunk_records:
        hgvs_id = myvariant.format_hgvs(
            record.CHROM, record.POS, record.REF, str(record.ALT[0]))

        # ensure syntax consistency for chromosome M variants
        if AnnovarTxtParser.RAW_CHR_MT_SUFFIX_VAL in hgvs_id:
            pieces = hgvs_id.split(':')
            chrom_part, change_part = pieces[0], pieces[1]
            if AnnovarTxtParser.STANDARDIZED_CHR_MT_SUFFIX_VAL not in chrom_part:
                chrom_part = AnnovarTxtParser.STANDARDIZED_CHR_MT_VAL
            hgvs_id = chrom_part + ':' + change_part

        hgvs_ids.append(hgvs_id)
    return hgvs_ids
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.

    The genome is matched against the latest b37 ClinVar VCF found under
    FILESDIR.  A match is kept only if it passes the VCF filters and at
    least one ClinVar record has significance code '4' or '5'.  Kept
    variants are looked up on myvariant.info for ClinVar and ExAC data.

    Args:
        inputfile: Genome VCF input, handed to vcf2clinvar.match_to_clinvar.
        outputfile: Path of the CSV report to write.
        inputfilename: Name written in the first column of every row.

    Each row holds: input file name, HGVS id, preferred name, disease
    name(s), ClinVar URL, ExAC URL, allele frequency, frequency source and
    GET-Evidence URL.

    Raises:
        IOError: if the ClinVar file name does not end in '.vcf', '.vcf.gz'
            or '.vcf.bz2'.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    # The matches are consumed entirely inside the try block so the ClinVar
    # file can be closed as soon as matching is done, even on error.
    try:
        # Run vcf2clinvar on genome data.
        clinvar_matches = vcf2clinvar.match_to_clinvar(
            inputfile, input_clinvar_file)

        # Iterate through all ClinVar matches.
        for genome_vcf_line, allele, zygosity in clinvar_matches:
            # Discard low quality data.
            if genome_vcf_line.filters and 'PASS' not in genome_vcf_line.filters:
                continue

            # Check significance. Only keep this as a notable variant if one
            # of the submissions has reported "pathogenic" or "likely
            # pathogenic" effect.
            sigs = [rec.sig for rec in allele.records]
            if not ('4' in sigs or '5' in sigs):
                continue

            # Store data in a dict according to HGVS position.
            poskey = myvariant.format_hgvs(
                genome_vcf_line.chrom,
                genome_vcf_line.start,
                genome_vcf_line.ref_allele,
                allele.sequence)
            data[poskey] = {'genome_vcf_line': genome_vcf_line,
                            'clinvar_allele': allele,
                            'zygosity': zygosity}
    finally:
        input_clinvar_file.close()

    # Set up to get myvariant.info data (mainly for ExAC data.)
    mv = myvariant.MyVariantInfo()

    # Add data from myvariant.info using the HGVS positions.  A real list is
    # required here: Python 3 dict views cannot be indexed or reliably
    # passed as a sequence.
    variants = list(data)
    mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
    for variant, mv_record in zip(variants, mv_output):
        if 'clinvar' in mv_record:
            data[variant]['mv_clinvar'] = mv_record['clinvar']
        if 'exac' in mv_record:
            data[variant]['mv_exac'] = mv_record['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            entry = data[var]

            # Clinvar URL for variant.
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                entry['clinvar_allele'].records[0].acc)

            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in entry:
                mv_clinvar = entry['mv_clinvar']
                cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/variation/{}/'.format(
                    mv_clinvar['variant_id'])
                try:
                    disease_name = mv_clinvar['rcv']['conditions']['name']
                    preferred_name = mv_clinvar['rcv']['preferred_name']
                except TypeError:
                    # 'rcv' is a list of records rather than a single one.
                    disease_name = ', '.join(
                        set([rcv['conditions']['name']
                             for rcv in mv_clinvar['rcv']]))
                    preferred_name = mv_clinvar['rcv'][0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)

            # chrom[3:] drops the first three characters of the chromosome
            # name (presumably a 'chr' prefix -- verify against vcf2clinvar).
            exac_url = 'http://exac.broadinstitute.org/variant/{}-{}-{}-{}'.format(
                entry['genome_vcf_line'].chrom[3:],
                entry['genome_vcf_line'].start,
                entry['genome_vcf_line'].ref_allele,
                entry['clinvar_allele'].sequence)

            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in entry:
                total_freq = (entry['mv_exac']['ac']['ac'] * 1.0 /
                              entry['mv_exac']['an']['an'])
                total_freq = str(total_freq)
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                try:
                    total_freq = str(entry['clinvar_allele'].frequency)
                    freq_source = 'ClinVar'
                except KeyError:
                    # If that fails, give up on frequency.
                    total_freq = ''
                    freq_source = 'Unknown'

            data_row = [
                inputfilename, var, preferred_name, disease_name, cv_url,
                exac_url, total_freq, freq_source, getev_url]
            csv_out.writerow(data_row)
def _hgvs_id(chrom, pos, ref, var):
    """Return the HGVS id for a variant after passing it through ``parse.unquote``.

    NOTE(review): ``parse`` is presumably ``urllib.parse``, in which case
    percent-escapes in the formatted id are decoded -- confirm against the
    file's imports.
    """
    formatted_id = myvariant.format_hgvs(chrom, pos, ref, var)
    return parse.unquote(formatted_id)
p.add_run('Time Processed ' + '\t' + '\t').bold = True p.add_run(str(currentDT.strftime("%I:%M:%S %p"))) # Start running list for variant counts processed = 0 clinical_count = 0 # Iterate through variant list and pull information for i, row in somatic_variants.iterrows(): processed += 1 chrom = row['Chromosome'] start = int(row['Start']) ref = row['Ref'] var = row['Var'] variant = myvariant.format_hgvs(chrom, start, ref, var) directory = mv.getvariant(variant) # Pull CIViC Data if directory: if 'civic' in directory: variant_descriptions = [] assertions = [] # Add count to clinical clinical_count += 1 # Pull general information for variant gene = directory['civic']['entrez_name'] ENST = directory['civic']['coordinates'][ 'representative_transcript'] protein_change = directory['civic']['name']
def match_genome(inputfile, outputfile, inputfilename):
    """
    Produce a CSV genome report at outputfile for a given VCF inputfile.

    The genome is matched against the latest b37 ClinVar VCF found under
    FILESDIR.  A match is kept only if it passes the VCF filters and at
    least one ClinVar record has significance code '4' or '5'.  Kept
    variants are looked up on myvariant.info for ClinVar and ExAC data.

    Args:
        inputfile: Genome VCF input, handed to vcf2clinvar.match_to_clinvar.
        outputfile: Path of the CSV report to write.
        inputfilename: Name written in the first column of every row.

    Each row holds: input file name, HGVS id, preferred name, disease
    name(s), ClinVar URL, ExAC URL, allele frequency, frequency source and
    GET-Evidence URL.

    Raises:
        IOError: if the ClinVar file name does not end in '.vcf', '.vcf.gz'
            or '.vcf.bz2'.
    """
    data = dict()

    # Set up ClinVar data.
    clinvar_filepath = clinvar_update.get_latest_vcf_file(FILESDIR, 'b37')
    if clinvar_filepath.endswith('.vcf'):
        input_clinvar_file = open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.gz'):
        input_clinvar_file = gzip.open(clinvar_filepath)
    elif clinvar_filepath.endswith('.vcf.bz2'):
        input_clinvar_file = bz2.BZ2File(clinvar_filepath)
    else:
        raise IOError("ClinVar filename expected to end with '.vcf'," +
                      " '.vcf.gz', or '.vcf.bz2'.")

    # The matches are consumed entirely inside the try block so the ClinVar
    # file can be closed as soon as matching is done, even on error.
    try:
        # Run vcf2clinvar on genome data.
        clinvar_matches = vcf2clinvar.match_to_clinvar(inputfile,
                                                       input_clinvar_file)

        # Iterate through all ClinVar matches.
        for genome_vcf_line, allele, zygosity in clinvar_matches:
            # Discard low quality data.
            if genome_vcf_line.filters and 'PASS' not in genome_vcf_line.filters:
                continue

            # Check significance. Only keep this as a notable variant if one
            # of the submissions has reported "pathogenic" or "likely
            # pathogenic" effect.
            sigs = [rec.sig for rec in allele.records]
            if not ('4' in sigs or '5' in sigs):
                continue

            # Store data in a dict according to HGVS position.
            poskey = myvariant.format_hgvs(genome_vcf_line.chrom,
                                           genome_vcf_line.start,
                                           genome_vcf_line.ref_allele,
                                           allele.sequence)
            data[poskey] = {
                'genome_vcf_line': genome_vcf_line,
                'clinvar_allele': allele,
                'zygosity': zygosity
            }
    finally:
        input_clinvar_file.close()

    # Set up to get myvariant.info data (mainly for ExAC data.)
    mv = myvariant.MyVariantInfo()

    # Add data from myvariant.info using the HGVS positions.  A real list is
    # required here: Python 3 dict views cannot be indexed or reliably
    # passed as a sequence.
    variants = list(data)
    mv_output = mv.getvariants(variants, fields=['clinvar', 'exac'])
    for variant, mv_record in zip(variants, mv_output):
        if 'clinvar' in mv_record:
            data[variant]['mv_clinvar'] = mv_record['clinvar']
        if 'exac' in mv_record:
            data[variant]['mv_exac'] = mv_record['exac']

    # Write report as CSV.
    with open(outputfile, 'w') as f:
        csv_out = csv.writer(f)
        for var in variants:
            entry = data[var]

            # Clinvar URL for variant.
            cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/{}/'.format(
                entry['clinvar_allele'].records[0].acc)

            disease_name = ''
            preferred_name = ''
            getev_url = ''
            # Disease name, preferred name, and GET-Evidence URL if we have
            # myvariant.info information with ClinVar data.
            if 'mv_clinvar' in entry:
                mv_clinvar = entry['mv_clinvar']
                cv_url = 'http://www.ncbi.nlm.nih.gov/clinvar/variation/{}/'.format(
                    mv_clinvar['variant_id'])
                try:
                    disease_name = mv_clinvar['rcv']['conditions']['name']
                    preferred_name = mv_clinvar['rcv']['preferred_name']
                except TypeError:
                    # 'rcv' is a list of records rather than a single one.
                    disease_name = ', '.join(
                        set([
                            rcv['conditions']['name']
                            for rcv in mv_clinvar['rcv']
                        ]))
                    preferred_name = mv_clinvar['rcv'][0]['preferred_name']
                getev_url = guess_getevidence_url(preferred_name)

            # chrom[3:] drops the first three characters of the chromosome
            # name (presumably a 'chr' prefix -- verify against vcf2clinvar).
            exac_url = 'http://exac.broadinstitute.org/variant/{}-{}-{}-{}'.format(
                entry['genome_vcf_line'].chrom[3:],
                entry['genome_vcf_line'].start,
                entry['genome_vcf_line'].ref_allele,
                entry['clinvar_allele'].sequence)

            # Allele frequency using ExAC data, if myvariant.info had that.
            if 'mv_exac' in entry:
                total_freq = (entry['mv_exac']['ac']['ac'] * 1.0 /
                              entry['mv_exac']['an']['an'])
                total_freq = str(total_freq)
                freq_source = 'ExAC'
            else:
                # If not, try to get it from our ClinVar data.
                try:
                    total_freq = str(entry['clinvar_allele'].frequency)
                    freq_source = 'ClinVar'
                except KeyError:
                    # If that fails, give up on frequency.
                    total_freq = ''
                    freq_source = 'Unknown'

            data_row = [
                inputfilename, var, preferred_name, disease_name, cv_url,
                exac_url, total_freq, freq_source, getev_url
            ]
            csv_out.writerow(data_row)
def annotate_mutations(file, assembly='hg19'):
    """Write a PDF annotation report for the variants listed in a TSV file.

    Args:
        file: Path of a tab-separated file with 'Chromosome', 'Start', 'Ref'
            and 'Var' columns.
        assembly: Genome assembly name forwarded to the myvariant.info
            lookup and to the report writer (default 'hg19').

    The report is saved next to the input as
    ``<input path without its final extension>_AIV_Report.pdf``.
    """
    import os  # local import: only needed to derive the report file name

    # Open variant file with pandas
    identified_variants = pd.read_csv(file, sep='\t')

    # Name the output after the input.  os.path.splitext removes only the
    # final extension, so dots in directory names ('./data/x.tsv') or in
    # the file stem no longer truncate the name.
    doc_name = os.path.splitext(file)[0] + '_AIV_Report.pdf'

    # Create a sample document and sample style sheet
    report = SimpleDocTemplate(doc_name)
    style_ = getSampleStyleSheet()
    # Add a paragraph style to justify text
    style_.add(ParagraphStyle('Justified', alignment=TA_JUSTIFY))

    # All content that will be written into the report, in order
    content = []
    # Put main title of the annotation report
    content.append(
        Paragraph("Annotation of Identified Variants", style_['Heading1']))
    # Add given input file name
    content.append(
        Paragraph('File Name: ' + '\t' + '\t' + '\t' + str(file) + '\n',
                  style_['BodyText']))

    # Get a myvariant info instance
    mv = myvariant.MyVariantInfo()

    # Counters: rows seen in the input vs. rows with a myvariant.info hit
    total_variants = 0
    annotated_variants = 0

    # Loop through identified variants and get annotations
    for _, row in identified_variants.iterrows():
        total_variants += 1

        # Get chromosome, start, reference and variant columns
        chrom_ = row['Chromosome']
        start_ = row['Start']
        ref_ = row['Ref']
        var_ = row['Var']

        # Get variant information
        v = myvariant.format_hgvs(chrom_, int(start_), ref_, var_)
        dir_ = mv.getvariant(v, assembly=assembly)
        if not dir_:
            continue

        # NOTE(review): any non-empty hit is counted as annotated here;
        # whether 'civic' data is actually present is left to _pull_data.
        variant_annotations, gene_, protein_change_, info, evidence_items = _pull_data(
            dir_, 'civic')
        annotated_variants += 1

        # Add content to the report: general info, annotations & evidence
        # statements
        _add_variant_info(variant_annotations, annotated_variants, gene_,
                          protein_change_, info, v, content, style_,
                          evidence_items, assembly)

    # Add processing information: total processed variants and number of
    # annotated variants
    _add_additional_info(total_variants, annotated_variants, content, style_)

    # Save report
    report.build(content)
def annotate_mutations(file):
    """Write 'aim_report.pdf' with CIViC annotations for variants in a TSV file.

    Args:
        file: Path of a tab-separated file with 'Chromosome', 'Start', 'Ref'
            and 'Var' columns.

    Each variant whose myvariant.info record contains 'civic' data gets its
    own section (gene name, protein change, coordinates, description).  A
    closing section reports how many variants were processed and how many
    were annotated.
    """
    # Open variant file with pandas
    variants_table = pd.read_csv(file, sep='\t')

    report = SimpleDocTemplate('aim_report.pdf')
    style_ = getSampleStyleSheet()

    # Report header: title followed by the input file name
    content = [
        Paragraph("Annotation of Identified Variants", style_['Heading1']),
        Paragraph(
            'Variant Call File Name: ' + '\t' + '\t' + '\t' + str(file) + '\n',
            style_['BodyText']),
    ]

    # Get a myvariant info instance
    mv = myvariant.MyVariantInfo()

    # Counters: rows seen in the input vs. rows carrying CIViC data
    total_variants = 0
    annotated_variants = 0

    for _, row in variants_table.iterrows():
        total_variants += 1

        # Build the HGVS id from the row and look it up on myvariant.info
        v = myvariant.format_hgvs(
            row['Chromosome'], int(row['Start']), row['Ref'], row['Var'])
        hit = mv.getvariant(v)

        # Only variants with CIViC data are reported
        if not (hit and 'civic' in hit):
            continue
        annotated_variants += 1

        civic = hit['civic']
        gene_ = civic['entrez_name']
        protein_change_ = civic['name']
        variant_annotations = (
            [civic['description']] if 'description' in civic else [])

        # General information about the variant
        content.extend([
            Paragraph('Clinical Variant: ' + str(annotated_variants),
                      style_['Heading2']),
            Paragraph(
                'Gene Name: ' + '\t' + '\t' + '\t' + str(gene_) + '\n',
                style_['BodyText']),
            Paragraph(
                'Protein Change: ' + '\t' + '\t' + str(protein_change_) + '\n',
                style_['BodyText']),
            Paragraph('Coordinates: ' + '\t' + '\t' + '\t' + str(v) + '\n',
                      style_['BodyText']),
            Paragraph('Variant Annotation: ', style_['Heading3']),
        ])

        # Annotation text, or a placeholder when CIViC has no description
        if variant_annotations:
            content.extend(
                Paragraph(str(annot), style_['BodyText'])
                for annot in variant_annotations)
        else:
            content.append(
                Paragraph('Not found...' + '\n', style_['BodyText']))

    # Closing section with the processing totals
    content.extend([
        Paragraph('Additional information', style_['Heading3']),
        Paragraph(
            'Total Number of Variants Processed: ' + str(total_variants) + '\n',
            style_['BodyText']),
        Paragraph(
            'The Number of Clinical Annotations: ' + str(annotated_variants) +
            '\n', style_['BodyText']),
    ])

    # Save report
    report.build(content)