def mk_hgvs(chrom, zero_based_start, transcript_name, ref, alt, use_gene=True):
    '''
    Build the HGVS description of a VCF-style record on a named transcript.

    The start coordinate is zero-based; one is added to it before it is
    passed to ``hgvs.format_hgvs_name``. An empty string is returned when
    ``get_transcript`` yields a falsy value for ``transcript_name``.
    '''
    transcript = get_transcript(transcript_name)
    if transcript:
        return hgvs.format_hgvs_name(
            chrom, zero_based_start + 1, ref, alt, __GENOME__, transcript,
            use_gene=use_gene)
    # HGVS names are never generated without a transcript.
    return ''
def convertGenomicPosToTranscriptPos(genomicPos, chrom, genome, transcript):
    """
    Map a genomic position onto the given transcript.

    Takes the genomic position, chrom (in format "chrN"), genome
    (SequenceFileDB for genome), and transcript (pyhgvs transcript object).
    Returns the transcript position at that genomic position as a string.
    """
    # The transcript position does not depend on the alleles, so a fixed
    # "T" -> "A" substitution is used purely to obtain a formatted name.
    placeholder_ref, placeholder_alt = "T", "A"
    formatted_name = pyhgvs.format_hgvs_name(
        chrom, genomicPos, placeholder_ref, placeholder_alt, genome, transcript)
    # Re-parse the formatted name and read the cDNA start back out of it.
    parsed_name = pyhgvs.HGVSName(str(formatted_name))
    return str(parsed_name.cdna_start)
def mk_hgvs(chrom, zero_based_start, transcript_name, ref, alt, use_gene=True):
    '''
    Return the HGVS description of a VCF record.

    ``zero_based_start`` is shifted to a one-based position for
    ``hgvs.format_hgvs_name``. When ``get_transcript`` returns nothing
    usable for ``transcript_name`` the result is an empty string, because
    HGVS names are not generated without a transcript.
    '''
    found = get_transcript(transcript_name)
    return (
        hgvs.format_hgvs_name(
            chrom, zero_based_start + 1, ref, alt, __GENOME__, found,
            use_gene=use_gene)
        if found
        else ''
    )
def vcf_to_hgvs(self, reference_transcript, vcf_notation):
    """
    Express one VCF-notation variant as an HGVS name on a given transcript.

    Acceptable input formats are described by Counsyl's HGVS library:
    https://github.com/counsyl/hgvs.

    Args:
        reference_transcript (str): the refseq id of the reference
            transcript to use for HGVS notation
        vcf_notation (tuple of str): chromosome_number, coordinate, ref
            and alt, in that order

    Returns:
        str: hgvs notation of the variant in format
            reference_transcript:hgvs_description
    """
    chrom, raw_position, ref_allele, alt_allele = vcf_notation
    # The coordinate arrives as text and is converted with int().
    position = int(raw_position)
    target_transcript = self._get_transcript(reference_transcript)
    return pyhgvs.format_hgvs_name(
        chrom, position, ref_allele, alt_allele, self.genome,
        target_transcript)
def to_cDNA(self, chrom, offset, ref, alt, refseq_acc):
    """
    Convert a genomic variant to HGVS nomenclature on ``refseq_acc``.

    ``chrom`` is given a 'chr' prefix when it lacks one. Returns None when
    the chromosome is not in CHROMOSOMES or not among the genome's keys, or
    when pyhgvs produces an empty name. Otherwise returns the part of the
    HGVS name after the first ':' (the whole name if it contains no ':').
    """
    transcript = self.get_transcript(refseq_acc)

    if not chrom.startswith('chr'):
        chrom = 'chr%s' % chrom
    if chrom not in CHROMOSOMES or chrom not in self.genome.keys():
        return None

    full_name = pyhgvs.format_hgvs_name(
        chrom, offset, ref, alt, self.genome, transcript)
    if not full_name:
        return None

    # "<transcript>:<description>" -> keep only the description when present.
    pieces = full_name.split(':')
    return pieces[1] if len(pieces) > 1 else full_name
return transcripts.get(name) # Parse the HGVS name into genomic coordinates and alleles. chrom, offset, ref, alt = hgvs.parse_hgvs_name('NM_000352.3:c.215A>G', genome, get_transcript=get_transcript) print(chrom, offset, ref, alt) # Returns variant in VCF style: ('chr11', 17496508, 'T', 'C') # Notice that since the transcript is on the negative strand, the alleles # are reverse complemented during conversion. # Format an HGVS name. chrom, offset, ref, alt = ('chr11', 17496508, 'T', 'C') transcript = get_transcript('NM_000352.3') hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript) print(hgvs_name) # Returns 'NM_000352.3(ABCC8):c.215A>G' hgvs_name = hgvs.HGVSName('NM_000352.3:c.215-10A>G') # fields of the HGVS name are available as attributes: # # hgvs_name.transcript = 'NM_000352.3' # hgvs_name.kind = 'c' # hgvs_name.mutation_type = '>' # hgvs_name.cdna_start = hgvs.CDNACoord(215, -10) # hgvs_name.cdna_end = hgvs.CDNACoord(215, -10) # hgvs_name.ref_allele = 'A' # hgvs_name.alt_allele = 'G' print((hgvs_name.transcript, hgvs_name.kind, hgvs_name.mutation_type,
def main(args):
    """Append pyhgvs-derived coordinate, cDNA and protein columns to a BRCA TSV.

    Reads the tab-delimited variant file named by the parsed options, and for
    every row:

    * forces the reference sequence to NM_007294.3 (BRCA1) or NM_000059.3
      (BRCA2);
    * formats the GRCh38 Chr/Pos/Ref/Alt as a cDNA HGVS name and parses that
      name back against the hg38, hg19 and hg18 genomes;
    * extends the ``Synonyms`` column with the cDNA name on alternate
      transcripts, for the variant itself and for each ``HGVS_cDNA_LOVD``
      entry not already covered by ``HGVS_cDNA``;
    * optionally (``calcProtein``) derives the protein change;
    * writes the row plus the new ``pyhgvs_*`` columns to the output file.

    Rows whose reference transcript is unknown to the hg38 transcript table
    are reported on stdout and skipped.

    ``args`` is accepted for interface compatibility; options come from
    ``parse_args()``.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)

    # NOTE(review): plain concatenation, so artifacts_dir is presumably
    # expected to end with a path separator -- confirm against callers.
    log_file_path = artifacts_dir + "brca-pseudonym-generator.log"
    logging.basicConfig(filename=log_file_path, filemode="w",
                        level=logging.DEBUG)

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    # next() works on both Python 2 and Python 3 iterators.
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = [
        "pyhgvs_Genomic_Coordinate_36",
        "pyhgvs_Genomic_Coordinate_37",
        "pyhgvs_Genomic_Coordinate_38",
        "pyhgvs_Hg37_Start",
        "pyhgvs_Hg37_End",
        "pyhgvs_Hg36_Start",
        "pyhgvs_Hg36_End",
        "pyhgvs_cDNA",
        "pyhgvs_Protein",
    ]
    output_header_row = input_header_row + new_columns_to_append
    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns. Some are not read below, but
    # the lookups still fail fast (ValueError) when a column is missing.
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")
    chrIndex = input_header_row.index("Chr")
    posIndex = input_header_row.index("Pos")
    refIndex = input_header_row.index("Ref")
    altIndex = input_header_row.index("Alt")

    # Position of every generated column in an output row.
    out_index = dict(
        (name, output_header_row.index(name))
        for name in new_columns_to_append)

    # Alternate transcripts on which synonyms are generated, per gene.
    synonym_transcripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1',
        ],
        'BRCA2': ['U43746.1'],
    }

    genomic_template = '{0}:g.{1}:{2}>{3}'
    error_template = "An exception of type {0} occured. Arguments:\n{1!r}"

    def iter_cdna_synonyms(gene_symbol, chrom, offset, ref, alt):
        """Yield the variant's cDNA name on each alternate transcript."""
        for transcript_name in synonym_transcripts.get(gene_symbol, ()):
            yield str(pyhgvs.format_hgvs_name(
                chrom, int(offset), ref, alt, genome38,
                get_transcript38(transcript_name),
                use_gene=False, max_allele_length=100))

    for line in input_file:
        gene_symbol = line[geneSymbolIndex]
        if gene_symbol == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif gene_symbol == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        # Store for reference and debugging
        oldHgvsGenomic38 = (
            line[refSeqIndex] + ':' + line[hgvsG38Index].split(',')[0])
        chrom38 = line[chrIndex]
        offset38 = line[posIndex]
        ref38 = line[refIndex]
        alt38 = line[altIndex]

        # Edge cases to correct variant string formats for indels in order
        # to be accepted by the counsyl parser
        if ref38 == '-':
            ref38 = ''
        if alt38 in ('-', 'None'):
            alt38 = ''

        transcript38 = get_transcript38(line[refSeqIndex])
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" % (line))
            continue

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser
        # determines to be the correct format
        cdna_coord = str(pyhgvs.format_hgvs_name(
            "chr" + chrom38, int(offset38), ref38, alt38, genome38,
            transcript38, use_gene=False, max_allele_length=100))

        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] in ("-", ""):
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")

        for cdna_synonym in iter_cdna_synonyms(
                gene_symbol, chrom38, offset38, ref38, alt38):
            synonymString.append(cdna_synonym)

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Skip if blank
            if cdna_coord_LOVD in ("-", ""):
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already
            # equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = (
                pyhgvs.parse_hgvs_name(
                    cdna_coord_LOVD, genome38,
                    get_transcript=get_transcript38))
            for cdna_synonym in iter_cdna_synonyms(
                    gene_symbol, chrom38LOVD, offset38LOVD, ref38LOVD,
                    alt38LOVD):
                if cdna_synonym not in synonymString:
                    synonymString.append(cdna_synonym)

        # Reset per row so a failed conversion can never reuse the protein
        # computed for a previous row (or hit an unbound name on row one).
        protein_coord = None
        if calcProtein == True:
            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                # The message is built from the exception actually caught.
                message = error_template.format(type(e).__name__, e.args)
                genomicChange = genomic_template.format(
                    chrom38, offset38, ref38, alt38)
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)
            # Catch parse errors thrown by ometa.runtime.ParseError.
            except ParseError as ex:
                message = error_template.format(type(ex).__name__, ex.args)
                genomicChange = genomic_template.format(
                    chrom38, offset38, ref38, alt38)
                print(message)
                print('ometa.runtime.ParseError', ex)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

        # Add empty data for each new column to prepare for data insertion
        # by index
        line.extend(['-'] * len(new_columns_to_append))

        line[out_index["pyhgvs_Genomic_Coordinate_36"]] = (
            genomic_template.format(chrom36, offset36, ref36, alt36))
        line[out_index["pyhgvs_Genomic_Coordinate_37"]] = (
            genomic_template.format(chrom37, offset37, ref37, alt37))
        line[out_index["pyhgvs_Genomic_Coordinate_38"]] = (
            genomic_template.format(chrom38, offset38, ref38, alt38))
        line[out_index["pyhgvs_Hg37_Start"]] = str(offset37)
        # End coordinates are the start plus the hg38 reference allele length.
        line[out_index["pyhgvs_Hg37_End"]] = str(
            int(offset37) + len(ref38) - 1)
        line[out_index["pyhgvs_Hg36_Start"]] = str(offset36)
        line[out_index["pyhgvs_Hg36_End"]] = str(
            int(offset36) + len(ref38) - 1)
        line[out_index["pyhgvs_cDNA"]] = '{0}'.format(cdna_coord)
        if calcProtein == True:
            # Written as 'None' when the protein change could not be derived.
            line[out_index["pyhgvs_Protein"]] = '{0}'.format(
                str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
def main(args):
    """Append pyhgvs-derived coordinate, cDNA and protein columns to a BRCA TSV.

    Reads the tab-delimited variant file named by the parsed options, and for
    every row:

    * forces the reference sequence to NM_007294.3 (BRCA1) or NM_000059.3
      (BRCA2);
    * formats the GRCh38 Chr/Pos/Ref/Alt as a cDNA HGVS name and parses that
      name back against the hg38, hg19 and hg18 genomes;
    * extends the ``Synonyms`` column with the cDNA name on alternate
      transcripts, for the variant itself and for each ``HGVS_cDNA_LOVD``
      entry not already covered by ``HGVS_cDNA`` (LOVD entries that fail to
      convert are reported on stdout and skipped);
    * optionally (``calcProtein``) parses, normalizes and maps the cDNA name
      to a protein change, logging any failure;
    * writes the row plus the new ``pyhgvs_*`` columns to the output file.

    Rows whose reference transcript is unknown to the hg38 transcript table
    are reported on stdout and skipped.

    ``args`` is accepted for interface compatibility; options come from
    ``parse_args()``.

    Raises:
        EnvironmentError: when the protein conversion fails with an exception
            whose type is named ``DatabaseError``.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)

    # NOTE(review): plain concatenation, so artifacts_dir is presumably
    # expected to end with a path separator -- confirm against callers.
    log_file_path = artifacts_dir + "brca-pseudonym-generator.log"
    logging.basicConfig(filename=log_file_path, filemode="w",
                        level=logging.DEBUG)

    hgvs_parser = hgvs.parser.Parser()
    hgvs_dp = hgvs.dataproviders.uta.connect()
    hgvs_norm = hgvs.normalizer.Normalizer(hgvs_dp)
    hgvs_am = hgvs.assemblymapper.AssemblyMapper(
        hgvs_dp, assembly_name='GRCh38')

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    # next() works on both Python 2 and Python 3 iterators.
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = [
        "pyhgvs_Genomic_Coordinate_36",
        "pyhgvs_Genomic_Coordinate_37",
        "pyhgvs_Genomic_Coordinate_38",
        "pyhgvs_Hg37_Start",
        "pyhgvs_Hg37_End",
        "pyhgvs_Hg36_Start",
        "pyhgvs_Hg36_End",
        "pyhgvs_cDNA",
        "pyhgvs_Protein",
    ]
    output_header_row = input_header_row + new_columns_to_append
    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns. Some are not read below, but
    # the lookups still fail fast (ValueError) when a column is missing.
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")
    chrIndex = input_header_row.index("Chr")
    posIndex = input_header_row.index("Pos")
    refIndex = input_header_row.index("Ref")
    altIndex = input_header_row.index("Alt")

    # Position of every generated column in an output row.
    out_index = dict(
        (name, output_header_row.index(name))
        for name in new_columns_to_append)

    # Alternate transcripts on which synonyms are generated, per gene.
    synonym_transcripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1',
        ],
        'BRCA2': ['U43746.1'],
    }

    # Exceptions related to invalid data
    data_errors = frozenset([
        'HGVSParseError', 'HGVSError', 'HGVSInvalidVariantError',
        'HGVSUnsupportedOperationError',
    ])

    genomic_template = '{0}:g.{1}:{2}>{3}'

    def iter_cdna_synonyms(gene_symbol, chrom, offset, ref, alt):
        """Yield the variant's cDNA name on each alternate transcript."""
        for transcript_name in synonym_transcripts.get(gene_symbol, ()):
            yield str(pyhgvs.format_hgvs_name(
                chrom, int(offset), ref, alt, genome38,
                get_transcript38(transcript_name),
                use_gene=False, max_allele_length=100))

    for line in input_file:
        gene_symbol = line[geneSymbolIndex]
        if gene_symbol == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif gene_symbol == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        chrom38 = line[chrIndex]
        offset38 = line[posIndex]
        ref38 = line[refIndex]
        alt38 = line[altIndex]

        # Edge cases to correct variant string formats for indels in order
        # to be accepted by the counsyl parser
        if ref38 == '-':
            ref38 = ''
        if alt38 in ('-', 'None'):
            alt38 = ''

        transcript38 = get_transcript38(line[refSeqIndex])
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" % (line))
            continue

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser
        # determines to be the correct format
        cdna_coord = str(pyhgvs.format_hgvs_name(
            "chr" + chrom38, int(offset38), ref38, alt38, genome38,
            transcript38, use_gene=False, max_allele_length=100))

        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] in ("-", ""):
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")

        for cdna_synonym in iter_cdna_synonyms(
                gene_symbol, chrom38, offset38, ref38, alt38):
            synonymString.append(cdna_synonym)

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Strip BEFORE the blank test: entries such as " " or " -"
            # (whitespace after the separating comma) must be skipped
            # rather than reach the ':' split below and raise IndexError.
            cdna_coord_LOVD = cdna_coord_LOVD.strip()
            if cdna_coord_LOVD in ("-", ""):
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already
            # equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            # Best effort: an unparsable LOVD entry is reported and skipped;
            # synonyms already appended for it are kept.
            try:
                chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = (
                    pyhgvs.parse_hgvs_name(
                        cdna_coord_LOVD, genome38,
                        get_transcript=get_transcript38))
                for cdna_synonym in iter_cdna_synonyms(
                        gene_symbol, chrom38LOVD, offset38LOVD, ref38LOVD,
                        alt38LOVD):
                    if cdna_synonym not in synonymString:
                        synonymString.append(cdna_synonym)
            except Exception as e:
                print('parse error: {}'.format(cdna_coord_LOVD))
                print(e)

        protein_coord = None
        if calcProtein:
            genomic_change = genomic_template.format(
                chrom38, offset38, ref38, alt38)
            try:
                var_c1 = hgvs_parser.parse_hgvs_variant(cdna_coord)
                # doing normalization explicitly to get a useful error message
                var_c1_norm = hgvs_norm.normalize(var_c1)
                protein_coord = hgvs_am.c_to_p(var_c1_norm)
            except Exception as e:
                error_name = type(e).__name__
                message = (
                    "An error of type {0} occured. Arguments:{1!r}".format(
                        error_name, e.args))
                logging.error(message)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomic_change)
                logging.error(line)

                if error_name not in data_errors:
                    # output some more if exception doesn't seem to be
                    # related to invalid data
                    logging.error("Non data error raised")
                    logging.exception(message)

                    if error_name == "DatabaseError":
                        # Aborting, as it is a transient error in principle,
                        # i.e. in one run we might be able to obtain a
                        # protein change, in another not, messing up the
                        # data diffs
                        raise EnvironmentError(
                            "Issue with UTA database. Aborting")

        # Add empty data for each new column to prepare for data insertion
        # by index
        line.extend(['-'] * len(new_columns_to_append))

        line[out_index["pyhgvs_Genomic_Coordinate_36"]] = (
            genomic_template.format(chrom36, offset36, ref36, alt36))
        line[out_index["pyhgvs_Genomic_Coordinate_37"]] = (
            genomic_template.format(chrom37, offset37, ref37, alt37))
        line[out_index["pyhgvs_Genomic_Coordinate_38"]] = (
            genomic_template.format(chrom38, offset38, ref38, alt38))
        line[out_index["pyhgvs_Hg37_Start"]] = str(offset37)
        # End coordinates are the start plus the hg38 reference allele length.
        line[out_index["pyhgvs_Hg37_End"]] = str(
            int(offset37) + len(ref38) - 1)
        line[out_index["pyhgvs_Hg36_Start"]] = str(offset36)
        line[out_index["pyhgvs_Hg36_End"]] = str(
            int(offset36) + len(ref38) - 1)
        line[out_index["pyhgvs_cDNA"]] = '{0}'.format(cdna_coord)
        # Same truthiness test as the computation above, so a protein that
        # was computed is always written. 'None' marks a failed conversion.
        if calcProtein:
            line[out_index["pyhgvs_Protein"]] = '{0}'.format(
                str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
def main(args):
    """Normalize the variants of a VCF file and write them to a new VCF.

    For every record the first ALT allele is formatted as a cDNA HGVS name on
    the transcript chosen from the chromosome (chr13 -> NM_000059.3,
    chr17 -> NM_007294.4). Single-base substitutions are normalized with the
    hgvs Normalizer and parsed back with pyhgvs; everything else is mapped to
    a genomic variant and converted back to VCF values with Babelfish. The
    record's POS, REF and ALT are then overwritten and the record is written.

    When normalization raises one of the handled errors, or the record is on
    a chromosome with no configured transcript, a message is printed and the
    record is written with its original position and alleles (first ALT
    only).

    ``args`` is accepted for interface compatibility; options come from
    ``parse_args()``.
    """
    options = parse_args()

    hdp = hgvs.dataproviders.uta.connect()
    am38 = hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name='GRCh38')
    hn = hgvs.normalizer.Normalizer(hdp)
    hp = hgvs.parser.Parser()

    # Read genome sequence using pyfaidx
    genome = Fasta(options.refFASTA)

    # Read RefSeq transcripts into a python dict.
    with open(options.refSEQ) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    babelfish38 = Babelfish(hdp, assembly_name="GRCh38")

    # extract base variant representation
    with open(options.inVCF, 'rb') as in_vcf, open(options.outVCF, 'w') as out_vcf:
        vcf_reader = vcf.Reader(in_vcf)
        vcf_writer = vcf.Writer(out_vcf, vcf_reader)

        for record in vcf_reader:
            # Convert variants for indel HGVS representation
            chrom, offset, ref, alt = (str(record.CHROM), record.POS,
                                       str(record.REF), str(record.ALT[0]))
            print('chrom: {}, offset: {}, ref: {}, alt: {}'.format(
                chrom, offset, ref, alt))

            on_chr17 = 'chr17' in record.CHROM
            if 'chr13' in record.CHROM:
                transcript_id = "NM_000059.3"
            elif on_chr17:
                transcript_id = "NM_007294.4"
            else:
                # Without this branch the name would be unbound on the first
                # record, or silently reuse the previous record's transcript.
                transcript_id = None

            if transcript_id is None:
                print('No reference transcript configured for chromosome {}; '
                      'record left un-normalized'.format(chrom))
            else:
                transcript = get_transcript(transcript_id)
                try:
                    hgvs_name = pyhgvs.format_hgvs_name(
                        chrom, offset, ref, alt, genome, transcript,
                        use_gene=False, max_allele_length=50000)
                    hgvs_c = hp.parse_hgvs_variant(hgvs_name)

                    # For chr17 the accession is switched to NM_007294.3
                    # before the hgvs calls and, on the SNP path, back to
                    # NM_007294.4 before pyhgvs parses the result.
                    if on_chr17:
                        hgvs_c.ac = 'NM_007294.3'

                    if len(ref) == len(alt) and len(ref) == 1:
                        # Variant is a SNP, normalize using hgvs Normalizer
                        # function
                        norm_hgvs_c = hn.normalize(hgvs_c)
                        if on_chr17:
                            norm_hgvs_c.ac = 'NM_007294.4'
                        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                            str(norm_hgvs_c), genome, normalize=False,
                            get_transcript=get_transcript)
                    else:
                        # Variant is an INDEL, normalize using hgvs
                        # babelfish38.hgvs_to_vcf function
                        hgvs_g = am38.c_to_g(hgvs_c)
                        vcf_values = babelfish38.hgvs_to_vcf(hgvs_g)
                        chrom, offset, ref, alt = (
                            'chr{}'.format(vcf_values[0]), vcf_values[1],
                            vcf_values[2], vcf_values[3])
                except hgvs.exceptions.HGVSUnsupportedOperationError as e:
                    print('hgvs.exceptions.HGVSUnsupportedOperationError: {}'.format(e))
                except hgvs.exceptions.HGVSInvalidIntervalError as e:
                    print('hgvs.exceptions.HGVSInvalidIntervalError: {}'.format(e))
                except hgvs.exceptions.HGVSInvalidVariantError as e:
                    print('hgvs.exceptions.HGVSInvalidVariantError: {}'.format(e))
                except AttributeError as e:
                    print('AttributeError: {}'.format(e))
                except KeyError as e:
                    print('KeyError: {}'.format(e))

            # Update and write the new normalized record
            record.POS = offset
            record.REF = ref
            record.ALT = [alt]
            vcf_writer.write_record(record)
def main(args):
    """Rewrite the coordinate, cDNA, protein and synonym columns of a BRCA TSV.

    Reads the tab-delimited variant file named by the parsed options, and for
    every row:

    * forces the reference sequence to NM_007294.3 (BRCA1) or NM_000059.3
      (BRCA2);
    * takes chromosome, position and alleles from the first entry of the
      ``Genomic_Coordinate_hg38`` column, formats them as a cDNA HGVS name
      and parses that name back against the hg38, hg19 and hg18 genomes;
    * replaces the ``Synonyms`` column with the cDNA name on the alternate
      transcripts of the gene;
    * optionally (``calcProtein``) derives the protein change;
    * overwrites the three genomic-coordinate columns and ``HGVS_cDNA``
      (and ``HGVS_Protein`` when a protein change was derived) and writes
      the row to the output file.

    ``args`` is accepted for interface compatibility; options come from
    ``parse_args()``.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsPColumnName = 'HGVS_Protein'

    # Only the line terminator is stripped: a bare rstrip() would also eat
    # trailing tabs and silently drop empty columns at the end of a row.
    labelLine = brcaFile.readline().rstrip('\r\n').split('\t')
    outputFile.write('\t'.join(labelLine) + '\n')

    # Store indexes of the relevant columns
    hgvsG36Index = labelLine.index(hgvsG36ColumnName)
    hgvsG37Index = labelLine.index(hgvsG37ColumnName)
    hgvsG38Index = labelLine.index(hgvsG38ColumnName)
    refSeqIndex = labelLine.index(refSeqColumnName)
    hgvsCDNAIndex = labelLine.index(hgvsCDNAColumnName)
    hgvsPIndex = labelLine.index(hgvsPColumnName)
    geneSymbolIndex = labelLine.index("Gene_Symbol")
    synonymIndex = labelLine.index("Synonyms")

    # Alternate transcripts on which synonyms are generated, per gene.
    synonym_transcripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1',
        ],
        'BRCA2': ['U43746.1'],
    }

    genomic_template = '{0}:{1}:{2}>{3}'

    for line in brcaFile:
        parsedLine = line.rstrip('\r\n').split('\t')
        gene_symbol = parsedLine[geneSymbolIndex]
        if gene_symbol == 'BRCA1':
            parsedLine[refSeqIndex] = 'NM_007294.3'
        elif gene_symbol == 'BRCA2':
            parsedLine[refSeqIndex] = 'NM_000059.3'

        # Prefix the first hg38 coordinate with the refseq id, then split the
        # "<refseq>:<chrom>:<offset>:<ref>><alt>" string into its fields.
        oldHgvsGenomic38 = (
            parsedLine[refSeqIndex] + ':'
            + parsedLine[hgvsG38Index].split(',')[0])
        genomic_fields = oldHgvsGenomic38.split(':')
        chrom38 = genomic_fields[1]
        offset38 = genomic_fields[2]
        alleles = genomic_fields[3].split('>')
        ref38 = alleles[0]
        alt38 = alleles[1]

        # Edge cases to correct variant string formats for indels in order
        # to be accepted by the counsyl parser
        if ref38 == '-':
            ref38 = ''
        if alt38 in ('-', 'None'):
            alt38 = ''

        transcript38 = get_transcript38(parsedLine[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser
        # determines to be the correct format
        cdna_coord = str(pyhgvs.format_hgvs_name(
            chrom38, int(offset38), ref38, alt38, genome38, transcript38,
            use_gene=False, max_allele_length=100))

        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        synonymString = []
        for transcriptName in synonym_transcripts.get(gene_symbol, ()):
            synonymString.append(str(pyhgvs.format_hgvs_name(
                chrom38, int(offset38), ref38, alt38, genome38,
                get_transcript38(transcriptName),
                use_gene=False, max_allele_length=100)))

        # Reset per row so a failed conversion can never reuse the protein
        # computed for a previous row (or hit an unbound name on row one).
        protein_coord = None
        if calcProtein == True:
            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('GRCh38 Genomic change: ',
                      genomic_template.format(chrom38, offset38, ref38, alt38))
                print('')

        # write new data into line
        parsedLine[hgvsG36Index] = genomic_template.format(
            chrom36, offset36, ref36, alt36)
        parsedLine[hgvsG37Index] = genomic_template.format(
            chrom37, offset37, ref37, alt37)
        parsedLine[hgvsG38Index] = genomic_template.format(
            chrom38, offset38, ref38, alt38)
        parsedLine[hgvsCDNAIndex] = '{0}'.format(cdna_coord)
        # The input's protein value is kept when no new one could be derived.
        if calcProtein == True and protein_coord is not None:
            parsedLine[hgvsPIndex] = '{0}'.format(str(protein_coord))
        parsedLine[synonymIndex] = ','.join(synonymString)

        outputFile.write('\t'.join(parsedLine) + '\n')

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
    outputFile.close()
return transcripts.get(name) # Parse the HGVS name into genomic coordinates and alleles. chrom, offset, ref, alt = hgvs.parse_hgvs_name( 'NM_000352.3:c.215A>G', genome, get_transcript=get_transcript) print chrom, offset, ref, alt # Returns variant in VCF style: ('chr11', 17496508, 'T', 'C') # Notice that since the transcript is on the negative strand, the alleles # are reverse complemented during conversion. # Format an HGVS name. chrom, offset, ref, alt = ('chr11', 17496508, 'T', 'C') transcript = get_transcript('NM_000352.3') hgvs_name = hgvs.format_hgvs_name( chrom, offset, ref, alt, genome, transcript) print hgvs_name # Returns 'NM_000352.3(ABCC8):c.215A>G' hgvs_name = hgvs.HGVSName('NM_000352.3:c.215-10A>G') # fields of the HGVS name are available as attributes: # # hgvs_name.transcript = 'NM_000352.3' # hgvs_name.kind = 'c' # hgvs_name.mutation_type = '>' # hgvs_name.cdna_start = hgvs.CDNACoord(215, -10) # hgvs_name.cdna_end = hgvs.CDNACoord(215, -10) # hgvs_name.ref_allele = 'A' # hgvs_name.alt_allele = 'G'