Example #1
0
def mk_hgvs(chrom, zero_based_start, transcript_name, ref, alt, use_gene=True):
    """Format the HGVS name of a VCF-style variant against a named transcript.

    The start coordinate is supplied zero-based and is shifted by one before
    it is handed to ``hgvs.format_hgvs_name``.  An empty string is returned
    when ``get_transcript`` yields nothing for ``transcript_name``, because
    HGVS names are not generated without a transcript.
    """
    transcript = get_transcript(transcript_name)
    if transcript:
        one_based_start = zero_based_start + 1
        return hgvs.format_hgvs_name(
            chrom, one_based_start, ref, alt, __GENOME__, transcript,
            use_gene=use_gene)
    # No transcript: nothing to name.
    return ''
Example #2
0
def convertGenomicPosToTranscriptPos(genomicPos, chrom, genome, transcript):
    """
    Translate a genomic position into the matching transcript position.

    Inputs are the genomic position, chrom (in format "chrN"), genome
    (SequenceFileDB for genome), and transcript (pyhgvs transcript object).
    The result is the ``cdna_start`` of the formatted HGVS name, as a string.
    """
    # The alleles are placeholders: the transcript position does not depend
    # on which ref/alt values are supplied.
    placeholder_ref, placeholder_alt = "T", "A"
    formatted_name = pyhgvs.format_hgvs_name(
        chrom, genomicPos, placeholder_ref, placeholder_alt, genome, transcript)
    # Re-parse the full HGVS name to pull out just the transcript coordinate.
    parsed_name = pyhgvs.HGVSName(str(formatted_name))
    return str(parsed_name.cdna_start)
Example #3
0
def mk_hgvs(chrom, zero_based_start, transcript_name, ref, alt, use_gene=True):
    """Return the HGVS description of a VCF record for the given transcript.

    ``zero_based_start`` is converted to a one-based position before the
    name is formatted.  When ``get_transcript`` finds no transcript for
    ``transcript_name`` the result is an empty string, since HGVS names are
    not generated without transcripts.
    """
    transcript = get_transcript(transcript_name)
    if not transcript:
        return ''
    position = zero_based_start + 1
    hgvs_name = hgvs.format_hgvs_name(chrom, position, ref, alt, __GENOME__,
                                      transcript, use_gene=use_gene)
    return hgvs_name
Example #4
0
    def vcf_to_hgvs(self, reference_transcript, vcf_notation):
        """
        Express one VCF-notation variant as an HGVS name on the given transcript.

        Acceptable input formats are described by Counsyl's HGVS library:
        https://github.com/counsyl/hgvs.

        Args:
            reference_transcript (str): the refseq id of the reference transcript to use for HGVS notation
            vcf_notation (tuple of str): chromosome_number, coordinate, ref and alt, in that order

        Returns:
            str: hgvs notation of variant in format reference_transcript:hgvs_description

        """
        chrom, raw_position, ref_allele, alt_allele = vcf_notation
        # The coordinate arrives as text; the formatter is given an integer.
        position = int(raw_position)
        transcript = self._get_transcript(reference_transcript)
        hgvs_name = pyhgvs.format_hgvs_name(
            chrom, position, ref_allele, alt_allele, self.genome, transcript)
        return hgvs_name
Example #5
0
 def to_cDNA(self, chrom, offset, ref, alt, refseq_acc):
     """Convert a genomic variant to HGVS nomenclature on ``refseq_acc``.

     Returns the part of the formatted HGVS name after the first ':' (or the
     whole name when it contains no ':').  Returns None when the chromosome
     is not recognised, is absent from ``self.genome``, or no name is
     produced.
     """
     transcript = self.get_transcript(refseq_acc)

     # Chromosomes given without the 'chr' prefix get it added and are then
     # validated against the known chromosome list.
     if not chrom.startswith('chr'):
       chrom = 'chr%s'%chrom
       if chrom not in CHROMOSOMES:
         return None

     if chrom not in self.genome.keys():
       return None

     hgvs_name = pyhgvs.format_hgvs_name(chrom, offset, ref, alt, self.genome, transcript)
     if not hgvs_name:
       return None

     parts = hgvs_name.split(':')
     return parts[1] if len(parts) > 1 else hgvs_name
Example #6
0
    def vcf_to_hgvs(self, reference_transcript, vcf_notation):
        """
        Translate a single variant from VCF notation into HGVS notation.

        The HGVS name is formatted relative to ``reference_transcript``.  See
        Counsyl's HGVS library for acceptable input formats:
        https://github.com/counsyl/hgvs.

        Args:
            reference_transcript (str): the refseq id of the reference transcript to use for HGVS notation
            vcf_notation (tuple of str): a tuple of chromosome_number, coordinate, ref, and alt in that order

        Returns:
            str: hgvs notation of variant in format reference_transcript:hgvs_description

        """
        chromosome, coordinate_text, ref_base, alt_base = vcf_notation
        # Convert the textual coordinate before looking up the transcript.
        coordinate_value = int(coordinate_text)
        resolved_transcript = self._get_transcript(reference_transcript)
        return pyhgvs.format_hgvs_name(
            chromosome,
            coordinate_value,
            ref_base,
            alt_base,
            self.genome,
            resolved_transcript,
        )
Example #7
0
    return transcripts.get(name)


# Parse the HGVS name into genomic coordinates and alleles.
chrom, offset, ref, alt = hgvs.parse_hgvs_name('NM_000352.3:c.215A>G',
                                               genome,
                                               get_transcript=get_transcript)
print(chrom, offset, ref, alt)
# Expected variant in VCF style: ('chr11', 17496508, 'T', 'C')
# Notice that since the transcript is on the negative strand, the alleles
# are reverse complemented during conversion.

# Format an HGVS name from VCF-style coordinates and alleles (the reverse of
# the parse step above).
chrom, offset, ref, alt = ('chr11', 17496508, 'T', 'C')
transcript = get_transcript('NM_000352.3')
hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript)
print(hgvs_name)
# Expected name: 'NM_000352.3(ABCC8):c.215A>G'

# Build an HGVSName object to inspect the individual fields of a name.
hgvs_name = hgvs.HGVSName('NM_000352.3:c.215-10A>G')
# fields of the HGVS name are available as attributes:
#
# hgvs_name.transcript = 'NM_000352.3'
# hgvs_name.kind = 'c'
# hgvs_name.mutation_type = '>'
# hgvs_name.cdna_start = hgvs.CDNACoord(215, -10)
# hgvs_name.cdna_end = hgvs.CDNACoord(215, -10)
# hgvs_name.ref_allele = 'A'
# hgvs_name.alt_allele = 'G'

print((hgvs_name.transcript, hgvs_name.kind, hgvs_name.mutation_type,
def main(args):
    """Add pyhgvs-derived coordinates, cDNA/protein names and synonyms to a BRCA table.

    All inputs come from ``parse_args()``: a tab-separated variant table,
    FASTA and RefSeq files for hg18/hg19/hg38, an output handle, a flag that
    enables protein calculation and an artifacts directory for the log file.

    For every input row the function:
      * forces the reference sequence to NM_007294.3 (BRCA1) or NM_000059.3
        (BRCA2),
      * formats the row's Chr/Pos/Ref/Alt as a cDNA HGVS name on hg38 and
        parses that name back into genomic coordinates on all three genomes,
      * extends the "Synonyms" column with the cDNA name on a fixed list of
        alternative transcripts, for both the variant itself and any
        HGVS_cDNA_LOVD entries not already covered by HGVS_cDNA,
      * optionally maps the cDNA name to a protein change,
      * writes the row with nine new ``pyhgvs_*`` columns appended.

    Rows whose reference transcript is missing from the hg38 RefSeq data are
    reported on stdout and skipped.

    Args:
        args: unused; options are re-read through ``parse_args()``.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # Join path components instead of concatenating strings so the log file
    # ends up inside artifacts_dir even without a trailing separator.
    log_file_path = os.path.join(artifacts_dir, "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path,
                        filemode="w",
                        level=logging.DEBUG)

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    # Per-genome lookup callbacks handed to pyhgvs.parse_hgvs_name.
    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    # The next() builtin works on Python 2.6+ and Python 3, whereas the
    # reader's .next() method exists on Python 2 only.
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = [
        "pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
        "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
        "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"
    ]

    output_header_row = input_header_row + new_columns_to_append

    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns.  Not every index is used below,
    # but each lookup raises ValueError when the column is missing, so they
    # are kept as a check on the input header.
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")

    refSeqBRCA1Transcripts = [
        'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
        'NM_007297.3', 'U14680.1'
    ]
    refSeqBRCA2Transcripts = ['U43746.1']
    # Alternative transcripts on which synonyms are generated, per gene.
    # Genes other than BRCA1/BRCA2 get no generated synonyms.
    synonymTranscriptsByGene = {
        'BRCA1': refSeqBRCA1Transcripts,
        'BRCA2': refSeqBRCA2Transcripts,
    }

    for line in input_file:

        if line[geneSymbolIndex] == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif line[geneSymbolIndex] == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        synonymTranscripts = synonymTranscriptsByGene.get(
            line[geneSymbolIndex], [])

        # Store for reference and debugging
        oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(
            ',')[0]

        chrom38 = line[input_header_row.index("Chr")]
        offset38 = line[input_header_row.index("Pos")]
        ref38 = line[input_header_row.index("Ref")]
        alt38 = line[input_header_row.index("Alt")]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-': ref38 = ''
        if alt38 == '-': alt38 = ''
        if alt38 == 'None': alt38 = ''
        transcript38 = get_transcript38(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" %
                  (line))
            continue
        cdna_coord = str(
            pyhgvs.format_hgvs_name("chr" + chrom38,
                                    int(offset38),
                                    ref38,
                                    alt38,
                                    genome38,
                                    transcript38,
                                    use_gene=False,
                                    max_allele_length=100))
        # Round-trip the cDNA name through each genome build to obtain the
        # build-specific genomic coordinates and alleles.
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] in ("-", ""):
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")
        for transcriptName in synonymTranscripts:
            cdna_synonym = str(
                pyhgvs.format_hgvs_name(chrom38,
                                        int(offset38),
                                        ref38,
                                        alt38,
                                        genome38,
                                        get_transcript38(transcriptName),
                                        use_gene=False,
                                        max_allele_length=100))
            synonymString.append(cdna_synonym)

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Skip if blank
            if cdna_coord_LOVD in ("-", ""):
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(
                cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
            for transcriptName in synonymTranscripts:
                cdna_synonym = str(
                    pyhgvs.format_hgvs_name(chrom38LOVD,
                                            int(offset38LOVD),
                                            ref38LOVD,
                                            alt38LOVD,
                                            genome38,
                                            get_transcript38(transcriptName),
                                            use_gene=False,
                                            max_allele_length=100))
                if cdna_synonym not in synonymString:
                    synonymString.append(cdna_synonym)

        # Reset per row: if the protein mapping below fails, the row must not
        # inherit the previous row's value (or hit an unbound name on the
        # first row).
        protein_coord = None
        if calcProtein == True:

            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                # Use the exception bound by this handler; the name `ex`
                # is not defined in this branch.
                message = template.format(type(e).__name__, e.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

            # Catch parse errors thrown by ometa.runtime.ParseError.
            except ParseError as ex:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print(message)
                print('ometa.runtime.ParseError', ex)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

        # Add empty data for each new column to prepare for data insertion by index
        line.extend(['-'] * len(new_columns_to_append))

        line[output_header_row.index(
            "pyhgvs_Genomic_Coordinate_36")] = '{0}:g.{1}:{2}>{3}'.format(
                chrom36, offset36, ref36, alt36)
        line[output_header_row.index(
            "pyhgvs_Genomic_Coordinate_37")] = '{0}:g.{1}:{2}>{3}'.format(
                chrom37, offset37, ref37, alt37)
        line[output_header_row.index(
            "pyhgvs_Genomic_Coordinate_38")] = '{0}:g.{1}:{2}>{3}'.format(
                chrom38, offset38, ref38, alt38)
        line[output_header_row.index("pyhgvs_Hg37_Start")] = str(offset37)
        line[output_header_row.index("pyhgvs_Hg37_End")] = str(
            int(offset37) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_Hg36_Start")] = str(offset36)
        line[output_header_row.index("pyhgvs_Hg36_End")] = str(
            int(offset36) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_cDNA")] = '{0}'.format(cdna_coord)
        if calcProtein == True:
            # A failed protein mapping is written as the text 'None'.
            line[output_header_row.index("pyhgvs_Protein")] = '{0}'.format(
                str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
def main(args):
    """Add pyhgvs-derived coordinates, cDNA/protein names and synonyms to a BRCA table.

    All inputs come from ``parse_args()``: a tab-separated variant table,
    FASTA and RefSeq files for hg18/hg19/hg38, an output handle, a flag that
    enables protein calculation and an artifacts directory for the log file.

    For every input row the function:
      * forces the reference sequence to NM_007294.3 (BRCA1) or NM_000059.3
        (BRCA2),
      * formats the row's Chr/Pos/Ref/Alt as a cDNA HGVS name on hg38 and
        parses that name back into genomic coordinates on all three genomes,
      * extends the "Synonyms" column with the cDNA name on a fixed list of
        alternative transcripts, for both the variant itself and any
        HGVS_cDNA_LOVD entries not already covered by HGVS_cDNA (LOVD
        entries that fail to convert are reported on stdout and skipped),
      * optionally normalizes the cDNA name and maps it to a protein change,
      * writes the row with nine new ``pyhgvs_*`` columns appended.

    Rows whose reference transcript is missing from the hg38 RefSeq data are
    reported on stdout and skipped.

    Args:
        args: unused; options are re-read through ``parse_args()``.

    Raises:
        EnvironmentError: when the protein calculation fails with an
            exception named ``DatabaseError``; the run is aborted because
            such failures are treated as transient.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # Join path components instead of concatenating strings so the log file
    # ends up inside artifacts_dir even without a trailing separator.
    log_file_path = os.path.join(artifacts_dir, "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path, filemode="w", level=logging.DEBUG)

    hgvs_parser = hgvs.parser.Parser()
    hgvs_dp = hgvs.dataproviders.uta.connect()
    hgvs_norm = hgvs.normalizer.Normalizer(hgvs_dp)
    hgvs_am = hgvs.assemblymapper.AssemblyMapper(hgvs_dp, assembly_name='GRCh38')

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    # Per-genome lookup callbacks handed to pyhgvs.parse_hgvs_name.
    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    # The next() builtin works on Python 2.6+ and Python 3, whereas the
    # reader's .next() method exists on Python 2 only.
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = ["pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
                          "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
                          "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"]

    output_header_row = input_header_row + new_columns_to_append

    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns.  Not every index is used below,
    # but each lookup raises ValueError when the column is missing, so they
    # are kept as a check on the input header.
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")

    refSeqBRCA1Transcripts = ['NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3', 'NM_007297.3', 'U14680.1']
    refSeqBRCA2Transcripts = ['U43746.1']
    # Alternative transcripts on which synonyms are generated, per gene.
    # Genes other than BRCA1/BRCA2 get no generated synonyms.
    synonymTranscriptsByGene = {
        'BRCA1': refSeqBRCA1Transcripts,
        'BRCA2': refSeqBRCA2Transcripts,
    }

    for line in input_file:
        if line[geneSymbolIndex] == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif line[geneSymbolIndex] == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        synonymTranscripts = synonymTranscriptsByGene.get(line[geneSymbolIndex], [])

        # Store for reference and debugging
        oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(',')[0]

        chrom38 = line[input_header_row.index("Chr")]
        offset38 = line[input_header_row.index("Pos")]
        ref38 = line[input_header_row.index("Ref")]
        alt38 = line[input_header_row.index("Alt")]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-': ref38 = ''
        if alt38 == '-': alt38 = ''
        if alt38 == 'None': alt38 = ''
        transcript38 = get_transcript38(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" % (line))
            continue
        cdna_coord = str(pyhgvs.format_hgvs_name("chr" + chrom38, int(offset38), ref38, alt38, genome38, transcript38, use_gene=False, max_allele_length=100))
        # Round-trip the cDNA name through each genome build to obtain the
        # build-specific genomic coordinates and alleles.
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] in ("-", ""):
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")
        for transcriptName in synonymTranscripts:
            cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38, genome38, get_transcript38(transcriptName), use_gene=False, max_allele_length=100))
            synonymString.append(cdna_synonym)

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Strip first so that entries consisting only of whitespace (or a
            # padded "-") count as blank instead of reaching split(':')[1].
            cdna_coord_LOVD = cdna_coord_LOVD.strip()

            # Skip if blank
            if cdna_coord_LOVD in ("-", ""):
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            # Best effort: an LOVD name that cannot be converted is reported
            # and skipped rather than aborting the whole run.
            try:
                chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
                for transcriptName in synonymTranscripts:
                    cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38LOVD, int(offset38LOVD), ref38LOVD, alt38LOVD, genome38, get_transcript38(transcriptName), use_gene=False, max_allele_length=100))
                    if cdna_synonym not in synonymString:
                        synonymString.append(cdna_synonym)
            except Exception as e:
                print('parse error: {}'.format(cdna_coord_LOVD))
                print(e)

        # Reset per row so a failed mapping never reuses an earlier value.
        protein_coord = None
        if calcProtein:
            # Built before the try block so the error handler can always
            # refer to it.
            genomic_change = '{0}:g.{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38)
            try:
                var_c1 = hgvs_parser.parse_hgvs_variant(cdna_coord)
                var_c1_norm = hgvs_norm.normalize(var_c1) # doing normalization explicitly to get a useful error message
                protein_coord = hgvs_am.c_to_p(var_c1_norm)
            except Exception as e:
                template = "An error of type {0} occured. Arguments:{1!r}"
                error_name = type(e).__name__
                message = template.format(error_name, e.args)
                logging.error(message)
                logging.error('Proposed GRCh38 Genomic change for error: %s', genomic_change)
                logging.error(line)

                # Exceptions related to invalid data
                data_errors = set(['HGVSParseError', 'HGVSError', 'HGVSInvalidVariantError', 'HGVSUnsupportedOperationError'])
                if error_name not in data_errors:
                    # output some more if exception doesn't seem to be related to invalid data
                    logging.error("Non data error raised")
                    logging.exception(message)

                if error_name == "DatabaseError":
                    # Aborting, as it is a transient error in principle, i.e. in one run we might be able to obtain a protein change, in another not, messing up the data diffs
                    raise EnvironmentError("Issue with UTA database. Aborting")

        # Add empty data for each new column to prepare for data insertion by index
        line.extend(['-'] * len(new_columns_to_append))

        line[output_header_row.index("pyhgvs_Genomic_Coordinate_36")] = '{0}:g.{1}:{2}>{3}'.format(chrom36,offset36,ref36,alt36)
        line[output_header_row.index("pyhgvs_Genomic_Coordinate_37")] = '{0}:g.{1}:{2}>{3}'.format(chrom37,offset37,ref37,alt37)
        line[output_header_row.index("pyhgvs_Genomic_Coordinate_38")] = '{0}:g.{1}:{2}>{3}'.format(chrom38,offset38,ref38,alt38)
        line[output_header_row.index("pyhgvs_Hg37_Start")] = str(offset37)
        line[output_header_row.index("pyhgvs_Hg37_End")] = str(int(offset37) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_Hg36_Start")] = str(offset36)
        line[output_header_row.index("pyhgvs_Hg36_End")] = str(int(offset36) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_cDNA")] = '{0}'.format(cdna_coord)
        # Same truthiness test as the calculation above, so a value that was
        # computed is always written.
        if calcProtein:
            line[output_header_row.index("pyhgvs_Protein")] = '{0}'.format(str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
Example #10
0
def main(args):
    """Normalize the variants of a BRCA VCF file and write them to a new VCF.

    Options come from ``parse_args()``: the reference FASTA, a RefSeq
    transcript file, the input VCF and the output VCF.

    Each record is formatted as a cDNA HGVS name on the transcript chosen for
    its chromosome (chr13: NM_000059.3, chr17: NM_007294.4).  Single-base
    substitutions are normalized with the hgvs Normalizer and parsed back to
    VCF fields; everything else is mapped to a genomic variant and converted
    with Babelfish.  When normalization fails with one of the handled
    exceptions, or the chromosome has no configured transcript, a message is
    printed and the record is written with its original position and alleles.

    Args:
        args: unused; options are re-read through ``parse_args()``.
    """
    options = parse_args()

    hdp = hgvs.dataproviders.uta.connect()
    am38 = hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name='GRCh38')
    hn = hgvs.normalizer.Normalizer(hdp)
    hp = hgvs.parser.Parser()
    # Read genome sequence using pyfaidx
    genome = Fasta(options.refFASTA)

    # Read RefSeq transcripts into a python dict.
    with open(options.refSEQ) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    babelfish38 = Babelfish(hdp, assembly_name="GRCh38")

    ## extract base variant representation
    with open(options.inVCF, 'rb') as in_vcf, open(options.outVCF,
                                                   'w') as out_vcf:
        vcf_reader = vcf.Reader(in_vcf)
        vcf_writer = vcf.Writer(out_vcf, vcf_reader)
        for record in vcf_reader:
            # Convert variants for indel HGVS representation
            chrom, offset, ref, alt = (str(record.CHROM), record.POS,
                                       str(record.REF), str(record.ALT[0]))
            print('chrom: {}, offset: {}, ref: {}, alt: {}'.format(
                chrom, offset, ref, alt))

            on_chr17 = 'chr17' in record.CHROM
            if 'chr13' in record.CHROM:
                transcript_id = "NM_000059.3"
            elif on_chr17:
                transcript_id = "NM_007294.4"
            else:
                # Make the "no transcript" case explicit so the id is never
                # unbound (first record) or carried over from an earlier
                # record on another chromosome.
                transcript_id = None

            if transcript_id is None:
                print('No reference transcript configured for {}; '
                      'writing record without normalization'.format(chrom))
            else:
                transcript = get_transcript(transcript_id)
                try:
                    hgvs_name = pyhgvs.format_hgvs_name(chrom,
                                                        offset,
                                                        ref,
                                                        alt,
                                                        genome,
                                                        transcript,
                                                        use_gene=False,
                                                        max_allele_length=50000)
                    hgvs_c = hp.parse_hgvs_variant(hgvs_name)
                    # For chr17 the hgvs calls are made against accession
                    # NM_007294.3 rather than the .4 used with pyhgvs.
                    if on_chr17: hgvs_c.ac = 'NM_007294.3'
                    if len(ref) == len(alt) and len(ref) == 1:
                        # Variant is a SNP, normalize using hgvs Normalizer function
                        norm_hgvs_c = hn.normalize(hgvs_c)
                        # Switch back to the accession pyhgvs was given above.
                        if on_chr17: norm_hgvs_c.ac = 'NM_007294.4'
                        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                            str(norm_hgvs_c),
                            genome,
                            normalize=False,
                            get_transcript=get_transcript)
                    else:
                        # Variant is an INDEL, normalize using hgvs babelfish38.hgvs_to_vcf function
                        hgvs_g = am38.c_to_g(hgvs_c)
                        vcf_values = babelfish38.hgvs_to_vcf(hgvs_g)
                        chrom, offset, ref, alt = 'chr{}'.format(
                            vcf_values[0]
                        ), vcf_values[1], vcf_values[2], vcf_values[3]
                except hgvs.exceptions.HGVSUnsupportedOperationError as e:
                    print(
                        'hgvs.exceptions.HGVSUnsupportedOperationError: {}'.format(
                            e))
                except hgvs.exceptions.HGVSInvalidIntervalError as e:
                    print('hgvs.exceptions.HGVSInvalidIntervalError: {}'.format(e))
                except hgvs.exceptions.HGVSInvalidVariantError as e:
                    print('hgvs.exceptions.HGVSInvalidVariantError: {}'.format(e))
                except AttributeError as e:
                    print('AttributeError: {}'.format(e))
                except KeyError as e:
                    print('KeyError: {}'.format(e))
            # Update and write the new normalized record (or the original
            # values when normalization was skipped or failed).
            record.POS = offset
            record.REF = ref
            record.ALT = [alt]
            vcf_writer.write_record(record)
Example #11
0
def main(args):
    """Re-normalize the HGVS columns of a tab-separated BRCA variant table.

    The first line of the input is a header and is copied to the output.
    For every following row:

    * ``Reference_Sequence`` is forced to ``NM_007294.3`` for BRCA1 rows and
      ``NM_000059.3`` for BRCA2 rows.
    * The first (comma-separated) ``Genomic_Coordinate_hg38`` entry, written
      as ``chrom:offset:ref>alt``, is formatted as a cDNA HGVS name with
      pyhgvs; that name replaces ``HGVS_cDNA``.
    * The cDNA name is parsed back against the hg38, hg19 and hg18 genomes
      and the three ``Genomic_Coordinate_*`` columns are rewritten.
    * ``Synonyms`` is replaced by the same variant formatted on a fixed list
      of alternative transcripts for the row's gene.
    * When ``options.calcProtein`` equals ``True``, ``HGVS_Protein`` is
      recomputed from the cDNA name. If the cDNA name cannot be parsed, the
      error is printed and the row keeps the protein value it came in with.

    Args:
        args: Unused; all inputs are taken from ``parse_args()``.

    Raises:
        ValueError: if a required column name is missing from the header.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein

    genomicFormat = '{0}:{1}:{2}>{3}'

    # Reference sequence forced onto rows of each gene.
    canonicalRefSeq = {'BRCA1': 'NM_007294.3', 'BRCA2': 'NM_000059.3'}
    # Alternative transcripts reported in the Synonyms column, per gene.
    synonymTranscripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1'
        ],
        'BRCA2': ['U43746.1'],
    }

    try:
        hdp = hgvs_dataproviders_uta.connect()
        variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
        hgvsparser = hgvs_parser.Parser()

        genome36 = SequenceFileDB(hg18_fa.name)
        genome37 = SequenceFileDB(hg19_fa.name)
        genome38 = SequenceFileDB(hg38_fa.name)

        # Transcript lookups handed to pyhgvs as its get_transcript callback.
        get_transcript36 = pyhgvs_utils.read_transcripts(refSeq18).get
        get_transcript37 = pyhgvs_utils.read_transcripts(refSeq19).get
        get_transcript38 = pyhgvs_utils.read_transcripts(refSeq38).get

        def format_cdna(chrom, offset, ref, alt, transcript):
            # cDNA HGVS name of an hg38 variant on the given transcript.
            return str(
                pyhgvs.format_hgvs_name(chrom,
                                        int(offset),
                                        ref,
                                        alt,
                                        genome38,
                                        transcript,
                                        use_gene=False,
                                        max_allele_length=100))

        # Strip only the line terminator: a bare rstrip() would also remove
        # trailing tabs and silently drop empty trailing columns.
        labelLine = brcaFile.readline().rstrip('\r\n').split('\t')
        outputFile.write('\t'.join(labelLine) + '\n')

        # Store indexes of the relevant columns
        hgvsG36Index = labelLine.index('Genomic_Coordinate_hg36')
        hgvsG37Index = labelLine.index('Genomic_Coordinate_hg37')
        hgvsG38Index = labelLine.index('Genomic_Coordinate_hg38')
        refSeqIndex = labelLine.index('Reference_Sequence')
        hgvsCDNAIndex = labelLine.index('HGVS_cDNA')
        hgvsPIndex = labelLine.index('HGVS_Protein')
        geneSymbolIndex = labelLine.index("Gene_Symbol")
        synonymIndex = labelLine.index("Synonyms")

        for line in brcaFile:
            parsedLine = line.rstrip('\r\n').split('\t')

            geneSymbol = parsedLine[geneSymbolIndex]
            if geneSymbol in canonicalRefSeq:
                parsedLine[refSeqIndex] = canonicalRefSeq[geneSymbol]

            # Only the first comma-separated hg38 coordinate is used.
            fields38 = parsedLine[hgvsG38Index].split(',')[0].split(':')
            chrom38, offset38 = fields38[0], fields38[1]
            alleles38 = fields38[2].split('>')
            ref38, alt38 = alleles38[0], alleles38[1]

            # Edge cases to correct variant string formats for indels in
            # order to be accepted by the counsyl parser
            if ref38 == '-':
                ref38 = ''
            if alt38 in ('-', 'None'):
                alt38 = ''

            # Normalize hgvs cdna string to fit what the counsyl hgvs parser
            # determines to be the correct format
            cdna_coord = format_cdna(chrom38, offset38, ref38, alt38,
                                     get_transcript38(parsedLine[refSeqIndex]))

            chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome38, get_transcript=get_transcript38)
            chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome37, get_transcript=get_transcript37)
            chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
                cdna_coord, genome36, get_transcript=get_transcript36)

            # Generate transcript hgvs cdna synonym strings from the
            # re-parsed hg38 coordinates.
            synonymString = [
                format_cdna(chrom38, offset38, ref38, alt38,
                            get_transcript38(transcriptName))
                for transcriptName in synonymTranscripts.get(geneSymbol, ())
            ]

            # write new data into line
            parsedLine[hgvsG36Index] = genomicFormat.format(
                chrom36, offset36, ref36, alt36)
            parsedLine[hgvsG37Index] = genomicFormat.format(
                chrom37, offset37, ref37, alt37)
            parsedLine[hgvsG38Index] = genomicFormat.format(
                chrom38, offset38, ref38, alt38)
            parsedLine[hgvsCDNAIndex] = '{0}'.format(cdna_coord)
            parsedLine[synonymIndex] = ','.join(synonymString)

            # Explicit comparison kept on purpose: how parse_args() types
            # this option is not visible from here.
            if calcProtein == True:
                try:
                    var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                    protein_coord = variantmapper.c_to_p(var_c1)
                except hgvs.exceptions.HGVSParseError as e:
                    print('hgvs.exceptions.HGVSParseError: ', e)
                    print(
                        'GRCh38 Genomic change: ',
                        genomicFormat.format(chrom38, offset38, ref38, alt38))
                    print('')
                else:
                    # Only overwrite the protein column on success, so a
                    # failed row never inherits the previous row's value.
                    parsedLine[hgvsPIndex] = '{0}'.format(str(protein_coord))

            outputFile.write('\t'.join(parsedLine) + '\n')
    finally:
        # Release every handle even when a row fails part-way through.
        brcaFile.close()
        hg18_fa.close()
        hg19_fa.close()
        hg38_fa.close()
        refSeq18.close()
        refSeq19.close()
        refSeq38.close()
        outputFile.close()
Example #12
0
    return transcripts.get(name)


# Parse the HGVS name into genomic coordinates and alleles.
chrom, offset, ref, alt = hgvs.parse_hgvs_name(
    'NM_000352.3:c.215A>G', genome, get_transcript=get_transcript)
# Formatted explicitly so the printed text is identical on Python 2 and 3
# (the bare `print a, b` statement is a syntax error on Python 3).
print('{0} {1} {2} {3}'.format(chrom, offset, ref, alt))
# Returns variant in VCF style: ('chr11', 17496508, 'T', 'C')
# Notice that since the transcript is on the negative strand, the alleles
# are reverse complemented during conversion.


# Format an HGVS name.
chrom, offset, ref, alt = ('chr11', 17496508, 'T', 'C')
transcript = get_transcript('NM_000352.3')
hgvs_name = hgvs.format_hgvs_name(
    chrom, offset, ref, alt, genome, transcript)
print(hgvs_name)
# Returns 'NM_000352.3(ABCC8):c.215A>G'


hgvs_name = hgvs.HGVSName('NM_000352.3:c.215-10A>G')
# fields of the HGVS name are available as attributes:
#
# hgvs_name.transcript = 'NM_000352.3'
# hgvs_name.kind = 'c'
# hgvs_name.mutation_type = '>'
# hgvs_name.cdna_start = hgvs.CDNACoord(215, -10)
# hgvs_name.cdna_end = hgvs.CDNACoord(215, -10)
# hgvs_name.ref_allele = 'A'
# hgvs_name.alt_allele = 'G'