def check_HGVS_conversion_error():
    df = pd.read_csv(ORIGINAL_FILE, sep="\t")
    n_diff = 0
    n_real_diff = 0
    for row in df.iterrows():
        original_genome_coor = [row[1].Chrom, row[1].Pos,
                                row[1].Ref.replace("-", ""),
                                row[1].Alt.replace("-", "")]
        try:
            chrm, pos, ref, alt = pyhgvs.parse_hgvs_name(
                row[1].HGVS, GENOME, get_transcript=get_transcript)
            chrm = chrm[3:]
            pos = str(pos)
            converted_genome_coor = [chrm, pos, ref, alt]
        except (pyhgvs.InvalidHGVSName, NotImplementedError, ValueError, AssertionError):
            converted_genome_coor = ["NA"]

        if converted_genome_coor == original_genome_coor:
            continue
        elif "NA" in converted_genome_coor or "None" in original_genome_coor:
            continue
        else:
            n_diff += 1
            if not string_comp.variant_equal(original_genome_coor, converted_genome_coor):
                n_real_diff += 1


    print "total number of rows in file: ", len(df)
    print "number of mismatching result by direct comparison: ", n_diff
    print "number of mismatching result by string comparison: ", n_real_diff
Beispiel #2
0
def translate(variant,transcripts,get_transcript):
    """Convert an HGVS name to genomic coordinates using the 'hg19.fa' genome.

    Args:
        variant: HGVS name handed to ``hgvs.parse_hgvs_name``.
        transcripts: Not used by this function; kept so existing callers
            keep working.
        get_transcript: Transcript-lookup callback forwarded to the parser.

    Returns:
        ``(chrom, offset, ref, alt)`` on success, or the integer ``1`` when
        parsing raises an exception.
    """
    genome = SequenceFileDB('hg19.fa')  # pip install bsddb3 is required
    try:
        chrom, offset, ref, alt = hgvs.parse_hgvs_name(
            variant, genome, get_transcript=get_transcript)
    except Exception:
        # Best-effort conversion: any parser error maps to the sentinel 1.
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 1
    return chrom, offset, ref, alt
Beispiel #3
0
    def hgvs_to_vcf(self, hgvs_variant):
        """
        Converts a single variant provided in HGVS notation to genomic coordinate notation.

        See Counsyl's HGVS library for more information on acceptable input formats: https://github.com/counsyl/hgvs.

        Args:
            hgvs_variant (str): HGVS description of variant, such as NM_001100.3:c.137T>C.
                The portion prior to the colon is the refseqID used as the reference for the variant. The portion after
                the colon is an HGVS-style description of the mutation (a SNP from T to C at location 137 in the example above).

        Returns:
            tuple of str: (chromosome_number, coordinate, ref, alt) in that order denoting the VCF notation of the variant.
                A leading "chr" is removed from the chromosome name when present; a name without that prefix is
                returned unchanged.

        """

        # Library requires string not unicode, ensure format is correct
        hgvs_variant = str(hgvs_variant)

        chromosome_number, coordinate, ref, alt = pyhgvs.parse_hgvs_name(
            hgvs_variant, self.genome, get_transcript=self._get_transcript)

        # re.match returns None when the name has no "chr" prefix; calling
        # .group() on that would raise AttributeError, so only strip on a match.
        prefixed = re.match('chr(.+)', chromosome_number)
        if prefixed is not None:
            chromosome_number = prefixed.group(1)
        coordinate = str(coordinate)

        return chromosome_number, coordinate, ref, alt
Beispiel #4
0
    def to_chrom_coordinate(self, cDNA):
        """Convert an HGVS cDNA name to genomic coordinates.

        Args:
            cDNA: HGVS name passed to ``pyhgvs.parse_hgvs_name`` together with
                ``self.genome`` and ``self.get_transcript``.

        Returns:
            ``(chrom, offset, ref, alt)`` on success. When the conversion
            raises, a message is printed and ``(None, None, None, None)`` is
            returned instead.
        """
        try:
            chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                cDNA, self.genome, get_transcript=self.get_transcript)
        except Exception:
            # Best-effort: report and return placeholders. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
            # Single-argument print(...) emits the same text on Python 2 and 3.
            print("[%s] cannot be coverted to chromosome coordinate" % cDNA)
            return None, None, None, None
        return chrom, offset, ref, alt
Beispiel #5
0
def parse_hgvs(hgvs_name, fasta, genes):
    """Parse an HGVS name against a FASTA genome and a transcript file.

    Args:
        hgvs_name: HGVS name to parse.
        fasta: Path handed to ``Fasta``; sequence keys are exposed with a
            "chr" prefix.
        genes: Path of the file read by ``hgvs_utils.read_transcripts``.

    Returns:
        Whatever ``hgvs.parse_hgvs_name`` returns for the given name.
    """
    def with_chr_prefix(contig):
        return 'chr{}'.format(contig)

    reference = Fasta(fasta, key_function=with_chr_prefix)

    with open(genes) as handle:
        transcript_table = hgvs_utils.read_transcripts(handle)

    # The table's own ``get`` serves as the lookup callback.
    return hgvs.parse_hgvs_name(
        hgvs_name, reference, get_transcript=transcript_table.get)
Beispiel #6
0
def get_genome_coor(
        hgvs_c,
        genome_path='data/hg19.fa',
        refgene_path="/Users/Molly/Desktop/web-dev/hgvs_counsyl/hgvs/pyhgvs/data/genes.refGene"):
    """Convert an HGVS name into a ``chrom:offset:ref>alt`` string.

    Args:
        hgvs_c: HGVS name passed to ``pyhgvs.parse_hgvs_name``.
        genome_path: Genome file opened with ``SequenceFileDB``. Defaults to
            the location the function originally hard-coded.
        refgene_path: Transcript file read with
            ``pyhgvs_utils.read_transcripts``. The default is the original
            hard-coded, machine-specific path; pass a real path on any other
            machine.

    Returns:
        str: ``chrom + ":" + offset + ":" + ref + ">" + alt``.
    """
    genome = SequenceFileDB(genome_path)
    with open(refgene_path) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return transcripts.get(name)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        hgvs_c, genome, get_transcript=get_transcript)
    return chrom + ":" + str(offset) + ":" + ref + ">" + alt
def HGVS_to_GenomeCoor(HGVS):
    """Translate an HGVS name to ``chrom:offset:ref>alt`` with Counsyl's pyhgvs.

    The genome is opened from '../data/hg19.fa' and the transcripts are read
    from '../data/BRCA12.refGene.txt' on every call.
    """
    reference = SequenceFileDB('../data/hg19.fa')
    with open("../data/BRCA12.refGene.txt") as handle:
        transcript_table = pyhgvs_utils.read_transcripts(handle)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        HGVS, reference, get_transcript=transcript_table.get)
    return ":".join([chrom, str(offset), ref + ">" + alt])
Beispiel #8
0
def get_genome_coor(
        hgvs_c,
        genome_path='data/hg19.fa',
        refgene_path="/Users/Molly/Desktop/web-dev/hgvs_counsyl/hgvs/pyhgvs/data/genes.refGene"):
    """Return the genomic coordinate of an HGVS name as ``chrom:offset:ref>alt``.

    Args:
        hgvs_c: HGVS name passed to ``pyhgvs.parse_hgvs_name``.
        genome_path: Genome file opened with ``SequenceFileDB``; defaults to
            the previously hard-coded value.
        refgene_path: Transcript file read with
            ``pyhgvs_utils.read_transcripts``; defaults to the previously
            hard-coded, machine-specific path, so override it elsewhere.

    Returns:
        str: The four parsed fields joined as ``chrom:offset:ref>alt``.
    """
    genome = SequenceFileDB(genome_path)
    with open(refgene_path) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return transcripts.get(name)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        hgvs_c, genome, get_transcript=get_transcript)
    return chrom + ":" + str(offset) + ":" + ref + ">" + alt
def HGVS_to_genome_coor(HGVS):
    """Convert an HGVS name into a ``chrom_pos_ref_alt`` string.

    The first three characters of the chromosome name are dropped and any
    "-" is removed from ref and alt. Returns "not translated" when pyhgvs
    raises InvalidHGVSName, NotImplementedError, ValueError or AssertionError.
    """
    try:
        chrm, pos, ref, alt = pyhgvs.parse_hgvs_name(
            HGVS, GENOME, get_transcript=get_transcript)
        fields = [
            chrm[3:],
            str(pos),
            ref.replace("-", ""),
            alt.replace("-", ""),
        ]
        return "_".join(fields)
    except(pyhgvs.InvalidHGVSName, NotImplementedError, ValueError, AssertionError):
        return "not translated"
Beispiel #10
0
def egl_variants(filename):
    """Load the EGL export and return its LAMA2 variants with VCF coordinates.

    Args:
        filename: CSV file without a header row; its first nine columns are
            read.

    Returns:
        pandas.DataFrame with columns ID (prefixed with "egl_"), hgvs, sig
        (values mapped through ``dsig``), CHROM, POS, REF and ALT. CHROM and
        POS are int32. Rows whose HGVS name could not be converted are
        reported on stdout and left out, because they have no coordinates
        that could be cast to integers.
    """
    print('\
	\n----------------------------------\
	\n\
	\n           EGL Database           \
	\n\
	\n----------------------------------')

    # Load file
    egl = pd.read_csv(filename, usecols=range(0, 9), header=None)

    # Rename columns accordingly
    egl.columns = [
        'ID', 'gene', '', 'exon', 'hgvs', 'protein_change', 'sig',
        'last reviewed', 'alias listing'
    ]

    # Filter LAMA2. na=False treats a missing gene name as "no match";
    # otherwise the mask holds NaN and cannot be used for boolean indexing.
    lama2_egl = egl[egl.iloc[:, 1].str.match(r'LAMA2', na=False)]
    lama2_egl = lama2_egl.loc[:, ['ID', 'hgvs', 'sig']]
    lama2_egl = lama2_egl.replace({'sig': dsig})
    lama2_egl['ID'] = 'egl_' + lama2_egl['ID'].astype(str)

    genome = Fasta('reference/chr6.fa')

    # Create the coordinate columns up front so they exist even when no
    # variant converts successfully.
    for column in ('CHROM', 'POS', 'REF', 'ALT'):
        lama2_egl[column] = None

    # Converting HGVS to VCF genomic coordinates
    count = 0
    transcript = get_transcript('NM_000426')

    for i in lama2_egl.index:
        try:
            CHROM, POS, REF, ALT = pyhgvs.parse_hgvs_name(
                lama2_egl.loc[i, 'hgvs'], genome, transcript)
            # The chromosome is recorded as the constant 6 rather than the
            # parser's CHROM value.
            lama2_egl.loc[i, "CHROM"] = 6
            lama2_egl.loc[i, "POS"] = POS
            lama2_egl.loc[i, "REF"] = REF
            lama2_egl.loc[i, "ALT"] = ALT

        except Exception:
            # Best-effort conversion: report the offending name and carry on.
            print('\nException:', lama2_egl.loc[i, 'hgvs'])
            count += 1
            print('Number of exceptions:', count)

    # Unconverted rows have no CHROM/POS; casting those to int32 would raise,
    # so drop them before the cast.
    lama2_egl = lama2_egl.dropna(subset=['CHROM', 'POS'])
    lama2_egl = lama2_egl.astype({'CHROM': 'int32', 'POS': 'int32'})

    return lama2_egl
	def test_pyhgvs_cdna_coordinate_correct(self):
		"""Check that each row's pyhgvs cDNA maps back to its stored hg38 coordinate.

		For every entry of ``self.data`` the ``pyhgvs_cDNA`` value is parsed
		against the hg38 genome and the result, formatted as
		``chrom:g.offset:ref>alt``, must equal
		``pyhgvs_Genomic_Coordinate_38``.
		"""
		def lookup_transcript(name):
			# The RefGene file is re-read on every lookup.
			with open("../refgene38_brca.txt") as handle:
				table = pyhgvs_utils.read_transcripts(handle)
			return table.get(name)

		for entry in self.data:
			expected_coord = entry['pyhgvs_Genomic_Coordinate_38']
			cdna_name = entry['pyhgvs_cDNA']
			reference = SequenceFileDB('../reference_genome/hg38/hg38.fa')

			chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
				cdna_name, reference, get_transcript=lookup_transcript)
			actual_coord = ":".join([chrom, "g." + str(offset), ref + ">" + alt])

			self.assertEqual(expected_coord, actual_coord)
    def test_pyhgvs_cdna_coordinate_correct(self):
        """Verify that every pyhgvs cDNA name maps back to its hg38 coordinate.

        Each entry of ``self.data`` supplies ``pyhgvs_cDNA`` and the expected
        ``pyhgvs_Genomic_Coordinate_38``. The cDNA name is parsed against the
        hg38 genome and compared with the expected value in the form
        ``chrom:g.offset:ref>alt``.
        """
        # The genome handle and transcript table do not depend on the row,
        # so load them once instead of per row and per transcript lookup.
        genome = SequenceFileDB('../reference_genome/hg38/hg38.fa')
        with open("../refgene38_brca.txt") as infile:
            transcripts = pyhgvs_utils.read_transcripts(infile)

        def get_transcript(name):
            return transcripts.get(name)

        for i in self.data:
            pyhgvs_coord = i['pyhgvs_Genomic_Coordinate_38']
            pyhgvs_cDNA = i['pyhgvs_cDNA']

            chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                pyhgvs_cDNA, genome, get_transcript=get_transcript)
            test_coord = chrom + ":" + "g." + str(
                offset) + ":" + ref + ">" + alt

            self.assertEqual(pyhgvs_coord, test_coord)
Beispiel #13
0
 def check_hgvs(hgvs, submitterteam, submitter, file):
     try:
         chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
             str(hgvs), genome, get_transcript=get_transcript)
     except ValueError, e:  # 'falsche' HGVS-Codes überspringen und anzeigen
         print e, hgvs
         if file not in overview[submitterteam]['incorrect JSONs'].keys():
             overview[submitterteam]['incorrect JSONs'][file] = {}
         if 'falscher HGVS-Code' not in overview[submitterteam][
                 'incorrect JSONs'][file]:
             overview[submitterteam]['incorrect JSONs'][file][
                 'falscher HGVS-Code'] = {}
             overview[submitterteam]['incorrect JSONs'][file][
                 'submitter'] = submitter
             overview[submitterteam]['incorrect JSONs'][file][
                 'falscher HGVS-Code'][hgvs] = str(e)
         else:
             overview[submitterteam]['incorrect JSONs'][file][
                 'falscher HGVS-Code'][hgvs] = str(e)
Beispiel #14
0
    def hgvs_to_vcf(self, hgvs_variant):
        """
        Converts a single variant provided in HGVS notation to genomic coordinate notation.

        See Counsyl's HGVS library for more information on acceptable input formats: https://github.com/counsyl/hgvs.

        Args:
            hgvs_variant (str): HGVS description of variant, such as NM_001100.3:c.137T>C.
                The portion prior to the colon is the refseqID used as the reference for the variant. The portion after
                the colon is an HGVS-style description of the mutation (a SNP from T to C at location 137 in the example above).

        Returns:
            tuple of str: (chromosome_number, coordinate, ref, alt) in that order denoting the VCF notation of the variant.
                The "chr" prefix is stripped from the chromosome name if it is there; otherwise the name is passed
                through as-is.

        """

        # Library requires string not unicode, ensure format is correct
        hgvs_variant = str(hgvs_variant)

        chromosome_number, coordinate, ref, alt = pyhgvs.parse_hgvs_name(
            hgvs_variant, self.genome, get_transcript=self._get_transcript)

        # Guard against names without a "chr" prefix: re.match would return
        # None and the unconditional .group(1) call would raise AttributeError.
        chr_match = re.match('chr(.+)', chromosome_number)
        if chr_match is not None:
            chromosome_number = chr_match.group(1)
        coordinate = str(coordinate)

        return chromosome_number, coordinate, ref, alt
Beispiel #15
0
 else:
     print 'keine Mutationen: ', file
     continue
 genotype = mutation['Test Information']['Genotype']
 if genotype == 'Hemizygous':
     genotype = '1'
 elif genotype == 'Homozygous':
     genotype = '1/1'
 elif genotype == 'Heterozygous' or genotype == 'Compound Heterozygous':
     genotype = '0/1'
 else:
     genotype = './1'
 for hgvscode in hgvslist:
     try:
         chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
             str(hgvscode),
             genome,
             get_transcript=get_transcript)
         if hgvscode in multivcf['NM'].tolist():
             index = multivcf['NM'].tolist().index(hgvscode)
             multivcf.set_value(index, caseID, genotype)
             if caseID not in jsonlist2:
                 jsonlist2.append(caseID)
         else:
             chromo = chrom.split('chr')[1]
             multivcf.set_value(x, '#CHROM', str(chromo))
             try:
                 multivcf.set_value(x, 'sort', int(chromo))
             except ValueError, e:
                 multivcf.set_value(x, 'sort', 30)
             multivcf.set_value(x, 'NM', hgvscode)
             multivcf.set_value(x, 'POS', offset)
Beispiel #16
0
def main(args):
    """Rewrite the coordinate, cDNA, protein and synonym columns of a BRCA TSV.

    Configuration comes from ``parse_args()``; ``args`` itself is not used.
    The header line of the input is copied to the output unchanged. For each
    following row the function:

    1. forces the reference sequence to NM_007294.3 (BRCA1) or NM_000059.3
       (BRCA2);
    2. splits the first comma-separated hg38 genomic coordinate
       (``chrom:offset:ref>alt``) into its parts;
    3. formats a cDNA HGVS name with pyhgvs and parses it back against the
       hg38, hg19 and hg18 genomes;
    4. formats the same variant against a fixed list of alternative
       transcripts to build the synonym column;
    5. optionally maps the cDNA name to a protein name;
    6. overwrites the corresponding columns and writes the row.

    If the protein mapping fails with ``HGVSParseError`` the failure is
    printed and the row keeps the HGVS_Protein value it had in the input.
    """

    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsPColumnName = 'HGVS_Protein'

    labelLine = brcaFile.readline().rstrip().split('\t')
    outputFile.write('\t'.join(labelLine) + '\n')

    # Store indexes of the relevant columns
    hgvsG36Index = labelLine.index(hgvsG36ColumnName)
    hgvsG37Index = labelLine.index(hgvsG37ColumnName)
    hgvsG38Index = labelLine.index(hgvsG38ColumnName)
    refSeqIndex = labelLine.index(refSeqColumnName)
    hgvsCDNAIndex = labelLine.index(hgvsCDNAColumnName)
    hgvsPIndex = labelLine.index(hgvsPColumnName)
    geneSymbolIndex = labelLine.index("Gene_Symbol")
    synonymIndex = labelLine.index("Synonyms")

    # Alternative transcripts used to generate cDNA synonyms, per gene.
    synonymTranscripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1'
        ],
        'BRCA2': ['U43746.1'],
    }

    for line in brcaFile:
        parsedLine = line.rstrip().split('\t')
        geneSymbol = parsedLine[geneSymbolIndex]

        if geneSymbol == 'BRCA1':
            parsedLine[refSeqIndex] = 'NM_007294.3'
        elif geneSymbol == 'BRCA2':
            parsedLine[refSeqIndex] = 'NM_000059.3'

        # Prefix the first hg38 coordinate with the refseq so the string has
        # the shape refseq:chrom:offset:ref>alt, then pull the fields apart.
        oldHgvsGenomic38 = parsedLine[refSeqIndex] + ':' + parsedLine[
            hgvsG38Index].split(',')[0]
        genomicFields = oldHgvsGenomic38.split(':')
        chrom38 = genomicFields[1]
        offset38 = genomicFields[2]
        alleles = genomicFields[3].split('>')
        ref38 = alleles[0]
        alt38 = alleles[1]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-': ref38 = ''
        if alt38 == '-': alt38 = ''
        if alt38 == 'None': alt38 = ''

        transcript38 = get_transcript38(parsedLine[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        cdna_coord = str(
            pyhgvs.format_hgvs_name(chrom38,
                                    int(offset38),
                                    ref38,
                                    alt38,
                                    genome38,
                                    transcript38,
                                    use_gene=False,
                                    max_allele_length=100))

        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        synonymString = []
        for transcriptName in synonymTranscripts.get(geneSymbol, []):
            cdna_synonym = str(
                pyhgvs.format_hgvs_name(chrom38,
                                        int(offset38),
                                        ref38,
                                        alt38,
                                        genome38,
                                        get_transcript38(transcriptName),
                                        use_gene=False,
                                        max_allele_length=100))
            synonymString.append(cdna_synonym)

        # Reset per row: without this a failed mapping would either raise
        # NameError (first row) or silently reuse the previous row's protein.
        protein_coord = None
        proteinComputed = False
        if calcProtein == True:
            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
                proteinComputed = True
            except hgvs.exceptions.HGVSParseError as e:
                print('hgvs.exceptions.HGVSParseError: ', e)
                print(
                    'GRCh38 Genomic change: ',
                    '{0}:{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38))
                print('')

        # write new data into line
        parsedLine[hgvsG36Index] = '{0}:{1}:{2}>{3}'.format(
            chrom36, offset36, ref36, alt36)
        parsedLine[hgvsG37Index] = '{0}:{1}:{2}>{3}'.format(
            chrom37, offset37, ref37, alt37)
        parsedLine[hgvsG38Index] = '{0}:{1}:{2}>{3}'.format(
            chrom38, offset38, ref38, alt38)
        parsedLine[hgvsCDNAIndex] = '{0}'.format(cdna_coord)
        if proteinComputed:
            parsedLine[hgvsPIndex] = '{0}'.format(str(protein_coord))
        parsedLine[synonymIndex] = ','.join(synonymString)
        outputFile.write('\t'.join(parsedLine) + '\n')

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
    outputFile.close()
Beispiel #17
0
def main(args):
    """Normalize the chr13/chr17 variants of a VCF and write a new VCF.

    Configuration comes from ``parse_args()``; ``args`` itself is not used.
    Each record's first ALT allele is formatted as a cDNA HGVS name on
    NM_000059.3 (chr13) or NM_007294.4 (chr17). Single-base substitutions
    are normalized with the hgvs ``Normalizer`` and parsed back with pyhgvs;
    everything else goes through ``AssemblyMapper.c_to_g`` and
    ``Babelfish.hgvs_to_vcf``. The record's POS, REF and ALT are then
    replaced and the record is written.

    If normalization raises one of the handled exceptions, the error is
    printed and the record is written with whatever coordinate values were
    current at that point. Records on any other chromosome have no
    transcript assigned here and are written through unchanged.
    """

    options = parse_args()

    hdp = hgvs.dataproviders.uta.connect()
    am38 = hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name='GRCh38')
    hn = hgvs.normalizer.Normalizer(hdp)
    hp = hgvs.parser.Parser()
    # Read genome sequence using pyfaidx
    genome = Fasta(options.refFASTA)

    # Read RefSeq transcripts into a python dict.
    with open(options.refSEQ) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    babelfish38 = Babelfish(hdp, assembly_name="GRCh38")

    ## extract base variant representation
    with open(options.inVCF, 'rb') as in_vcf, open(options.outVCF,
                                                   'w') as out_vcf:
        vcf_reader = vcf.Reader(in_vcf)
        vcf_writer = vcf.Writer(out_vcf, vcf_reader)
        for record in vcf_reader:
            # Convert variants for indel HGVS representation
            chrom, offset, ref, alt = (str(record.CHROM), record.POS,
                                       str(record.REF), str(record.ALT[0]))
            print('chrom: {}, offset: {}, ref: {}, alt: {}'.format(
                chrom, offset, ref, alt))
            on_chr17 = 'chr17' in record.CHROM
            if 'chr13' in record.CHROM:
                transcript_id = "NM_000059.3"
            elif on_chr17:
                transcript_id = "NM_007294.4"
            else:
                # No transcript is defined for this chromosome. Previously
                # transcript_id was unbound here (NameError on a first
                # record) or left over from an earlier record.
                vcf_writer.write_record(record)
                continue
            transcript = get_transcript(transcript_id)
            try:
                hgvs_name = pyhgvs.format_hgvs_name(chrom,
                                                    offset,
                                                    ref,
                                                    alt,
                                                    genome,
                                                    transcript,
                                                    use_gene=False,
                                                    max_allele_length=50000)
                hgvs_c = hp.parse_hgvs_variant(hgvs_name)
                # On chr17 the accession is switched to NM_007294.3 before
                # the hgvs calls in both branches; the SNP branch switches it
                # back to NM_007294.4 before parsing with pyhgvs.
                if len(ref) == len(alt) and len(ref) == 1:
                    # Variant is a SNP, normalize using hgvs Normalizer function
                    if on_chr17: hgvs_c.ac = 'NM_007294.3'
                    norm_hgvs_c = hn.normalize(hgvs_c)
                    if on_chr17: norm_hgvs_c.ac = 'NM_007294.4'
                    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                        str(norm_hgvs_c),
                        genome,
                        normalize=False,
                        get_transcript=get_transcript)
                else:
                    # Variant is an INDEL, normalize using hgvs babelfish38.hgvs_to_vcf function
                    if on_chr17: hgvs_c.ac = 'NM_007294.3'
                    hgvs_g = am38.c_to_g(hgvs_c)
                    vcf_values = babelfish38.hgvs_to_vcf(hgvs_g)
                    chrom, offset, ref, alt = 'chr{}'.format(
                        vcf_values[0]
                    ), vcf_values[1], vcf_values[2], vcf_values[3]
            except hgvs.exceptions.HGVSUnsupportedOperationError as e:
                print(
                    'hgvs.exceptions.HGVSUnsupportedOperationError: {}'.format(
                        e))
            except hgvs.exceptions.HGVSInvalidIntervalError as e:
                print('hgvs.exceptions.HGVSInvalidIntervalError: {}'.format(e))
            except hgvs.exceptions.HGVSInvalidVariantError as e:
                print('hgvs.exceptions.HGVSInvalidVariantError: {}'.format(e))
            except AttributeError as e:
                print('AttributeError: {}'.format(e))
            except KeyError as e:
                print('KeyError: {}'.format(e))
            # Update and write the new normalized record
            record.POS = offset
            record.REF = ref
            record.ALT = [alt]
            vcf_writer.write_record(record)
def main(args):
    """Append pyhgvs-derived columns and cDNA synonyms to a BRCA TSV.

    Configuration comes from ``parse_args()``; ``args`` itself is not used.
    A log file named ``brca-pseudonym-generator.log`` is written inside
    ``options.artifacts_dir`` (created if missing).

    The input header is extended with the ``pyhgvs_*`` columns. For every
    data row the function:

    1. forces the reference sequence to NM_007294.3 (BRCA1) or NM_000059.3
       (BRCA2);
    2. formats a cDNA HGVS name from the row's Chr/Pos/Ref/Alt values and
       parses it back against the hg38, hg19 and hg18 genomes;
    3. extends the Synonyms column with the variant formatted against a
       fixed list of alternative transcripts, plus any HGVS_cDNA_LOVD names
       not already covered by HGVS_cDNA;
    4. optionally maps the cDNA name to a protein name;
    5. fills the new columns and writes the row.

    Rows whose reference sequence has no hg38 transcript are reported and
    skipped. When the protein mapping fails, the error is printed and
    logged and ``pyhgvs_Protein`` keeps its "-" placeholder.
    """

    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # os.path.join puts the log inside artifacts_dir whether or not the
    # option was given with a trailing path separator.
    log_file_path = os.path.join(artifacts_dir,
                                 "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path,
                        filemode="w",
                        level=logging.DEBUG)

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    # The next() builtin works on Python 2.6+ and Python 3 alike.
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = [
        "pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
        "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
        "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"
    ]

    output_header_row = input_header_row + new_columns_to_append

    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns. The lookups also act as a check
    # that the expected columns are present (list.index raises ValueError).
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")
    chrIndex = input_header_row.index("Chr")
    posIndex = input_header_row.index("Pos")
    refIndex = input_header_row.index("Ref")
    altIndex = input_header_row.index("Alt")

    # Position of every generated column in the output row, resolved once.
    newColumnIndex = dict(
        (name, output_header_row.index(name))
        for name in new_columns_to_append)

    # Alternative transcripts used to generate cDNA synonyms, per gene.
    synonymTranscripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1'
        ],
        'BRCA2': ['U43746.1'],
    }

    def format_synonyms(gene, chrom, offset, ref, alt):
        """Format the variant against each synonym transcript of *gene*."""
        return [
            str(
                pyhgvs.format_hgvs_name(chrom,
                                        int(offset),
                                        ref,
                                        alt,
                                        genome38,
                                        get_transcript38(transcriptName),
                                        use_gene=False,
                                        max_allele_length=100))
            for transcriptName in synonymTranscripts.get(gene, [])
        ]

    for line in input_file:
        geneSymbol = line[geneSymbolIndex]

        if geneSymbol == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif geneSymbol == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        # Store for reference and debugging
        oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(
            ',')[0]

        chrom38 = line[chrIndex]
        offset38 = line[posIndex]
        ref38 = line[refIndex]
        alt38 = line[altIndex]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-': ref38 = ''
        if alt38 == '-': alt38 = ''
        if alt38 == 'None': alt38 = ''
        transcript38 = get_transcript38(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" %
                  (line))
            continue
        cdna_coord = str(
            pyhgvs.format_hgvs_name("chr" + chrom38,
                                    int(offset38),
                                    ref38,
                                    alt38,
                                    genome38,
                                    transcript38,
                                    use_gene=False,
                                    max_allele_length=100))
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string, starting from any
        # synonyms already present in the row ("-" and "" mean none).
        existingSynonyms = line[synonymIndex]
        if existingSynonyms in ("-", ""):
            synonymString = []
        else:
            synonymString = existingSynonyms.split(",")
        synonymString.extend(
            format_synonyms(geneSymbol, chrom38, offset38, ref38, alt38))

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Skip if blank
            if cdna_coord_LOVD in ("-", ""):
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(
                cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
            for cdna_synonym in format_synonyms(geneSymbol, chrom38LOVD,
                                                offset38LOVD, ref38LOVD,
                                                alt38LOVD):
                if cdna_synonym not in synonymString:
                    synonymString.append(cdna_synonym)

        # Reset per row: without this a failed mapping would either raise
        # NameError (first row) or silently reuse the previous row's protein.
        protein_coord = None
        proteinComputed = False
        if calcProtein == True:

            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
                proteinComputed = True
            except hgvs.exceptions.HGVSParseError as e:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                # This handler binds the exception as ``e``; the message was
                # previously built from an undefined name ``ex``.
                message = template.format(type(e).__name__, e.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

            # Catch parse errors thrown by ometa.runtime.ParseError.
            except ParseError as ex:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print(message)
                print('ometa.runtime.ParseError', ex)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

        # Add empty data for each new column to prepare for data insertion by index
        line.extend('-' for _ in new_columns_to_append)

        line[newColumnIndex[
            "pyhgvs_Genomic_Coordinate_36"]] = '{0}:g.{1}:{2}>{3}'.format(
                chrom36, offset36, ref36, alt36)
        line[newColumnIndex[
            "pyhgvs_Genomic_Coordinate_37"]] = '{0}:g.{1}:{2}>{3}'.format(
                chrom37, offset37, ref37, alt37)
        line[newColumnIndex[
            "pyhgvs_Genomic_Coordinate_38"]] = '{0}:g.{1}:{2}>{3}'.format(
                chrom38, offset38, ref38, alt38)
        # End positions are computed from the length of the hg38 reference
        # allele for both the hg37 and hg36 columns.
        line[newColumnIndex["pyhgvs_Hg37_Start"]] = str(offset37)
        line[newColumnIndex["pyhgvs_Hg37_End"]] = str(
            int(offset37) + len(ref38) - 1)
        line[newColumnIndex["pyhgvs_Hg36_Start"]] = str(offset36)
        line[newColumnIndex["pyhgvs_Hg36_End"]] = str(
            int(offset36) + len(ref38) - 1)
        line[newColumnIndex["pyhgvs_cDNA"]] = '{0}'.format(cdna_coord)
        if proteinComputed:
            line[newColumnIndex["pyhgvs_Protein"]] = '{0}'.format(
                str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
def main(args):
    """Add pyhgvs-derived coordinates, names and synonyms to a BRCA variant TSV.

    Reads the tab-delimited file ``options.inBRCA`` and, for every row whose
    reference transcript is found in the GRCh38 refSeq table, writes the row
    to ``options.outBRCA`` with the ``pyhgvs_*`` columns appended and the
    ``Synonyms`` column rewritten. Rows whose transcript cannot be found are
    reported on stdout and dropped from the output.

    Args:
        args: Unused; every setting is taken from ``parse_args()``.

    Raises:
        ValueError: If a required column is missing from the input header.
        EnvironmentError: If the protein conversion fails with an exception
            whose type is named ``DatabaseError``.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # Join with a path separator so the log ends up inside artifacts_dir even
    # when the directory is given without a trailing slash.
    log_file_path = os.path.join(artifacts_dir, "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path, filemode="w", level=logging.DEBUG)

    hgvs_parser = hgvs.parser.Parser()
    hgvs_dp = hgvs.dataproviders.uta.connect()
    hgvs_norm = hgvs.normalizer.Normalizer(hgvs_dp)
    hgvs_am = hgvs.assemblymapper.AssemblyMapper(hgvs_dp, assembly_name='GRCh38')

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    # Transcript lookup callbacks handed to pyhgvs, one per genome build.
    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    # next() works on both Python 2 and Python 3 readers (.next() is 2-only).
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = ["pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
                             "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
                             "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"]

    output_header_row = input_header_row + new_columns_to_append

    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns. Some of these are not used below,
    # but the lookup still fails fast (ValueError) when a column is absent.
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")
    chromIndex = input_header_row.index("Chr")
    posIndex = input_header_row.index("Pos")
    refIndex = input_header_row.index("Ref")
    altIndex = input_header_row.index("Alt")

    # Position of every generated column in the output row, resolved once
    # instead of once per row.
    out_idx = dict((name, output_header_row.index(name)) for name in new_columns_to_append)

    refSeqBRCA1Transcripts = ['NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3', 'NM_007297.3', 'U14680.1']
    refSeqBRCA2Transcripts = ['U43746.1']
    # Alternative transcripts used to generate cDNA synonyms, per gene.
    synonym_transcripts = {'BRCA1': refSeqBRCA1Transcripts, 'BRCA2': refSeqBRCA2Transcripts}

    # Exception type names that indicate invalid variant data.
    data_errors = set(['HGVSParseError', 'HGVSError', 'HGVSInvalidVariantError', 'HGVSUnsupportedOperationError'])
    genomic_template = '{0}:g.{1}:{2}>{3}'

    try:
        for line in input_file:
            gene_symbol = line[geneSymbolIndex]
            if gene_symbol == 'BRCA1':
                line[refSeqIndex] = 'NM_007294.3'
            elif gene_symbol == 'BRCA2':
                line[refSeqIndex] = 'NM_000059.3'

            # Store for reference and debugging
            oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(',')[0]

            chrom38 = line[chromIndex]
            offset38 = line[posIndex]
            ref38 = line[refIndex]
            alt38 = line[altIndex]

            # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
            if ref38 == '-':
                ref38 = ''
            if alt38 in ('-', 'None'):
                alt38 = ''

            # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
            transcript38 = get_transcript38(line[refSeqIndex])
            if transcript38 is None:
                print("ERROR: could not parse transcript38 for variant: %s \n" % (line))
                continue
            cdna_coord = str(pyhgvs.format_hgvs_name("chr" + chrom38, int(offset38), ref38, alt38, genome38, transcript38, use_gene=False, max_allele_length=100))
            chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(cdna_coord, genome38, get_transcript=get_transcript38)
            chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(cdna_coord, genome37, get_transcript=get_transcript37)
            chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(cdna_coord, genome36, get_transcript=get_transcript36)

            # Generate transcript hgvs cdna synonym string
            existing_synonyms = line[synonymIndex]
            if existing_synonyms in ("-", ""):
                synonymString = []
            else:
                synonymString = existing_synonyms.split(",")
            gene_transcripts = synonym_transcripts.get(gene_symbol, [])
            for transcriptName in gene_transcripts:
                synonym_transcript = get_transcript38(transcriptName)
                cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38, genome38, synonym_transcript, use_gene=False, max_allele_length=100))
                synonymString.append(cdna_synonym)

            # Add hgvs_cDNA values from LOVD to synonyms if not already present
            for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
                # Strip first so entries such as " -" are recognised as blank
                cdna_coord_LOVD = cdna_coord_LOVD.strip()
                if cdna_coord_LOVD in ("-", ""):
                    continue

                # Don't add to synonyms if main hgvs_cDNA field is already equivalent to hgvs_cDNA value from LOVD.
                # Entries without a "transcript:" prefix are compared as a whole
                # instead of raising IndexError.
                lovd_parts = cdna_coord_LOVD.split(':')
                if len(lovd_parts) > 1:
                    cdna_coord_LOVD_for_comparison = lovd_parts[1]
                else:
                    cdna_coord_LOVD_for_comparison = lovd_parts[0]
                if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                    continue

                try:
                    chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
                    for transcriptName in gene_transcripts:
                        synonym_transcript = get_transcript38(transcriptName)
                        cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38LOVD, int(offset38LOVD), ref38LOVD, alt38LOVD, genome38, synonym_transcript, use_gene=False, max_allele_length=100))
                        if cdna_synonym not in synonymString:
                            synonymString.append(cdna_synonym)
                except Exception as e:
                    # Best effort: an unparsable LOVD name only loses its synonyms
                    print('parse error: {}'.format(cdna_coord_LOVD))
                    print(e)

            protein_coord = None
            if calcProtein:
                # Built outside the try so it is always available to the handler
                genomic_change = genomic_template.format(chrom38, offset38, ref38, alt38)
                try:
                    var_c1 = hgvs_parser.parse_hgvs_variant(cdna_coord)
                    var_c1_norm = hgvs_norm.normalize(var_c1)  # doing normalization explicitly to get a useful error message
                    protein_coord = hgvs_am.c_to_p(var_c1_norm)
                except Exception as e:
                    template = "An error of type {0} occured. Arguments:{1!r}"
                    error_name = type(e).__name__
                    message = template.format(error_name, e.args)
                    logging.error(message)
                    logging.error('Original GRCh38 Genomic Coordinate: %s', oldHgvsGenomic38)
                    logging.error('Proposed GRCh38 Genomic change for error: %s', genomic_change)
                    logging.error(line)

                    if error_name not in data_errors:
                        # output some more if exception doesn't seem to be related to invalid data
                        logging.error("Non data error raised")
                        logging.exception(message)

                    if error_name == "DatabaseError":
                        # Aborting, as it is a transient error in principle, i.e. in one run we might be able to obtain a protein change, in another not, messing up the data diffs
                        raise EnvironmentError("Issue with UTA database. Aborting")

            # Add empty data for each new column to prepare for data insertion by index
            line.extend(['-'] * len(new_columns_to_append))

            line[out_idx["pyhgvs_Genomic_Coordinate_36"]] = genomic_template.format(chrom36, offset36, ref36, alt36)
            line[out_idx["pyhgvs_Genomic_Coordinate_37"]] = genomic_template.format(chrom37, offset37, ref37, alt37)
            line[out_idx["pyhgvs_Genomic_Coordinate_38"]] = genomic_template.format(chrom38, offset38, ref38, alt38)
            # NOTE(review): both End columns use the GRCh38 reference allele
            # length (ref38), not ref37/ref36 -- confirm this is intended.
            line[out_idx["pyhgvs_Hg37_Start"]] = str(offset37)
            line[out_idx["pyhgvs_Hg37_End"]] = str(int(offset37) + len(ref38) - 1)
            line[out_idx["pyhgvs_Hg36_Start"]] = str(offset36)
            line[out_idx["pyhgvs_Hg36_End"]] = str(int(offset36) + len(ref38) - 1)
            line[out_idx["pyhgvs_cDNA"]] = '{0}'.format(cdna_coord)
            if calcProtein == True:
                line[out_idx["pyhgvs_Protein"]] = '{0}'.format(str(protein_coord))
            line[synonymIndex] = ','.join(synonymString)

            output_file.writerow(line)
    finally:
        # Release the input handles even when a row aborts the run
        for handle in (hg18_fa, hg19_fa, hg38_fa, refSeq18, refSeq19, refSeq38):
            handle.close()
Beispiel #20
0
#!/usr/bin/env python

import argparse
import pyhgvs as hgvs
import hgvs.utils as hgvs_utils

from pygr.seqdb import SequenceFileDB


if __name__ == "__main__":
    # Command-line options, declared as (short flag, long flag, help text).
    parser = argparse.ArgumentParser()
    for short_flag, long_flag, description in (
            ('-i', '--infile', "Input file"),
            ('-g', '--genome', "Genome reference FASTA"),
            ('-r', '--refgene', "RefGene Transcripts"),
            ('-o', '--outfile', "Output file name")):
        parser.add_argument(short_flag, long_flag, help=description)

    args = parser.parse_args()

    # Reference genome, then the transcript table.
    genome = SequenceFileDB(args.genome)

    with open(args.refgene) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        """Look up a transcript by name in the loaded table (None if absent)."""
        return transcripts.get(name)

    # Convert one fixed HGVS name into chromosome, offset and alleles.
    chrom, offset, ref, alt = hgvs.parse_hgvs_name(
        'NM_000352.3:c.215A>G', genome, get_transcript=get_transcript)
Beispiel #21
0
def lovd_variants(filename):
    """Load LOVD variants from a tab-separated export and add VCF-style columns.

    Reads 2055 rows of ``filename`` after skipping its first 4110 lines,
    de-duplicates on the ``DNA/hg38`` column, prefixes that column with
    ``NC_000006.12:`` and converts each resulting HGVS name with pyhgvs.
    Rows that fail to convert are reported on stdout and, because their
    coordinate cells stay empty, removed by the final ``dropna``.

    Args:
        filename: Path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        DataFrame with columns ID, hgvs, sig, type, CHROM, POS, REF, ALT;
        CHROM and POS are cast to int32.
    """
    print('\t\n--------------------------------------'
          '\t\n'
          '\t\n             LOVD Database            '
          '\t\n'
          '\t\n--------------------------------------')

    # Load the file
    lovd_ori = pd.read_csv(filename,
                           header=0,
                           quotechar='"',
                           doublequote=True,
                           sep="\t",
                           skiprows=4110,
                           nrows=2055)

    # Cleaning up the names of columns: drop braces, quotes and commas, then
    # the "VariantOnGenome/" prefix. regex= is given explicitly because the
    # default of str.replace differs between pandas versions.
    lovd_ori.columns = lovd_ori.columns.str.replace(r"[{}',]", '', regex=True)
    lovd_ori.columns = lovd_ori.columns.str.replace("VariantOnGenome/", '', regex=False)
    lovd_ori = lovd_ori.drop_duplicates(subset='DNA/hg38')
    lovd_ori['DNA/hg38'] = "NC_000006.12:" + lovd_ori['DNA/hg38'].astype(str)

    lovd = lovd_ori.rename(columns={
        'DBID': 'ID',
        'DNA/hg38': 'hgvs',
        'ClinicalClassification': 'sig'
    })

    # Selecting important columns
    lovd = lovd.loc[:, ['ID', 'hgvs', 'sig', 'type']]
    lovd = lovd.replace({'type': dtype})

    genome = Fasta('reference/NC_000006.fa')

    # Converting HGVS to VCF genomic coordinates
    count = 0

    for i in lovd.index:
        try:
            CHROM, POS, REF, ALT = pyhgvs.parse_hgvs_name(
                lovd.loc[i, 'hgvs'], genome)
            # The chromosome is fixed to 6 regardless of what pyhgvs returns
            lovd.loc[i, 'CHROM'] = 6
            lovd.loc[i, "POS"] = POS
            lovd.loc[i, "REF"] = REF
            lovd.loc[i, "ALT"] = ALT
        except Exception:
            # Best effort: report the failing name and carry on. Catching
            # Exception (not a bare except) lets Ctrl-C and SystemExit through.
            print('\nException:', lovd.loc[i, 'hgvs'])
            count += 1
            print('Number of exceptions:', count)

    # Rows that could not be converted have empty coordinate cells
    lovd.dropna(inplace=True)
    lovd = lovd.astype({'CHROM': 'int32', 'POS': 'int32'})

    return lovd
import pyximport
pyximport.install()
import pyhgvs as hgvs
import pyhgvs.utils as hgvs_utils 
import pygr
from pygr.seqdb import SequenceFileDB  
from optparse import OptionParser

# Command line: a single option carrying the HGVS name to convert.
parser = OptionParser()
parser.add_option(
    "-i",
    "--input_string",
    dest="instr",
    help="format NM_007294.3:c.8T>G",
    default="",
)
(options, args) = parser.parse_args()
instr = options.instr

# Reference data locations (hard-coded) and the pygr genome handle.
fn_hg19 = '/raw/human_sequence_reference/UCSC/hg19.fa'
fn_refgene = '/home/david/software/hgvs-master/pyhgvs/data/genes.refGene'
genome = SequenceFileDB(fn_hg19)


def get_transcript(name):
    """Return the transcript registered under ``name`` (None if unknown)."""
    return transcripts.get(name)


# The transcript table is loaded after the callback is defined; the callback
# only reads it when pyhgvs invokes it below.
with open(fn_refgene) as infile:
    transcripts = hgvs_utils.read_transcripts(infile)

# Convert and emit the result as one tab-separated line.
out = hgvs.parse_hgvs_name(instr, genome, get_transcript=get_transcript)
print("\t".join(map(str, out)))



def main():
    """Convert a tab-delimited BRCA variant file into VCF text.

    Settings come from ``parse_args()``. The annotation file supplies the
    ``##INFO`` header lines; every input column of a row is emitted as a
    ``field=value`` INFO entry after passing through ``normalize``. Rows
    with an ``hgvs_nucleotide`` of ``-``, an unknown gene symbol, or an HGVS
    name that cannot be converted are logged at DEBUG level and skipped.
    """
    options = parse_args()
    inputFile = options.input
    annotFile_path = options.inAnnot
    vcfFile = options.out
    genome_path = options.gpath
    refseq_path = options.rpath
    source = options.source
    logfile = options.logfile

    logging_level = logging.DEBUG if options.verbose else logging.CRITICAL
    logging.basicConfig(filename=logfile, filemode="w", level=logging_level)

    with open(refseq_path) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # open and store annotation fields in a dictionary
    # (a plain dict: the former defaultdict() had no default factory)
    annotDict = {}
    with open(annotFile_path) as inAnnotFile:
        for line in inAnnotFile:
            line = line.strip().split('\t')
            annotDict[line[0]] = line[1]

    # print header lines to vcf file
    print('##fileformat=VCFv4.0', file=vcfFile)
    print('##source={0}'.format(source), file=vcfFile)
    print('##reference=GRCh37', file=vcfFile)
    for annotation, description in annotDict.items():
        print('##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(
            annotation.replace(' ', '_'), description),
              file=vcfFile)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcfFile)

    # extract INFO field column indicies for annotation terms
    headerline = inputFile.readline().strip().replace(' ', '_').replace(
        '"', '').split('\t')

    fieldIdxDict = {}
    for index, field in enumerate(headerline):
        fieldIdxDict[field] = index

    # Gene symbol (lower case) -> reference transcript used to build the name
    gene_transcripts = {'brca1': 'NM_007294.3', 'brca2': 'NM_000059.3'}

    # extract info from each line of the flat file
    for line in inputFile:
        line = line.replace('"', '')
        if not line.strip():
            # A blank line carries no variant
            continue
        # Trim spaces and the newline only: stripping tabs as well would drop
        # empty leading/trailing columns and misalign or truncate the row.
        parsedLine = line.strip(' \r\n').split('\t')
        INFO_field = list()
        for field in headerline:
            field_value = normalize(field, parsedLine[fieldIdxDict[field]])
            INFO_field.append('{0}={1}'.format(field, field_value))

        # extract hgvs cDNA term for variant and cleanup formatting
        hgvsName = parsedLine[fieldIdxDict['hgvs_nucleotide']]
        if hgvsName == '-':
            logging.debug("hgvs name == '-' for line: %s", parsedLine)
            continue
        gene_symbol = parsedLine[fieldIdxDict['gene_symbol']].lower()
        transcript = gene_transcripts.get(gene_symbol)
        if transcript is None:
            logging.debug("improper gene symbol: %s", gene_symbol)
            continue
        queryHgvsName = transcript + ':' + hgvsName.rstrip().split(';')[0]
        INFO_field_string = ';'.join(INFO_field)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                queryHgvsName, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            print('{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(
                chrom, offset, queryHgvsName, ref, alt, INFO_field_string),
                  file=vcfFile)
        except Exception as e:
            # Keep going, but record why the conversion failed
            logging.debug("could not parse hgvs field: %s (%s)",
                          queryHgvsName, e)
Beispiel #24
0
# Read RefSeq transcripts into a python dict.
# The RefSeq transcripts can be downloaded from: https://github.com/counsyl/hgvs/blob/master/pyhgvs/data/genes.refGene
with open('references/genes.refGene') as infile:
    transcripts = read_transcripts(infile)


# Provide a callback for fetching a transcript by its name.
def get_transcript(name):
    """Return the entry stored under ``name`` in ``transcripts`` (None if absent)."""
    return transcripts.get(name)


# Store the variant information in a list: one [chrom, offset, ref, alt] entry
# per element of ``data``, in the same order. ``data``, ``gene`` and ``genome``
# are defined before this fragment; each element of ``data`` is appended to
# ``gene`` with a ':' separator to form the HGVS name that is parsed.
# A name that pyhgvs cannot parse raises and stops the loop (no error handling).
vcf = []
for v in data:
    chrom, offset, ref, alt = hgvs.parse_hgvs_name(
        gene + ':' + v, genome, get_transcript=get_transcript)
    vcf.append([chrom, offset, ref, alt])

# Define reference genome
ref_genome = [
    'reference=GRCh37/hg19', 'contig=<ID=1,length=249250621>',
    'contig=<ID=2,length=243199373>', 'contig=<ID=3,length=198022430>',
    'contig=<ID=4,length=191154276>', 'contig=<ID=5,length=180915260>',
    'contig=<ID=6,length=171115067>', 'contig=<ID=7,length=159138663>',
    'contig=<ID=8,length=146364022>', 'contig=<ID=9,length=141213431>',
    'contig=<ID=10,length=135534747>', 'contig=<ID=11,length=135006516>',
    'contig=<ID=12,length=133851895>', 'contig=<ID=13,length=115169878.',
    'contig=<ID=14,length=107349540>', 'contig=<ID=15,length=102531392>',
    'contig=<ID=16,length=90354753>', 'contig=<ID=17,length=81195210>',
    'contig=<ID=18,length=78077248>', 'contig=<ID=19,length=59128983>',
    'contig=<ID=20,length=63025520>', 'contig=<ID=21,length=48129895>',
Beispiel #25
0
    if '_' in cdot.split(':')[1]:
        return fix_alt_range(chrom, offset, ref, cdot)
    return fix_alt_single(chrom, offset, ref, cdot)


dat_file = '../../data/raw/UC_all_panel_variants_01_20_2016.xlsx'
df_pre = pandas.read_excel(dat_file)
# Keep only rows whose Alt mentions an insertion and whose gene is in FOCUS_GENES.
crit = df_pre.apply(
    lambda row: 'ins' in row['Alt'] and row['Gene Symbol'] in FOCUS_GENES,
    axis=1)
# Work on an explicit copy: adding a column to a filtered selection of df_pre
# triggers pandas' chained-assignment warning.
df = df_pre[crit].copy()
# HGVS-style identifier: "<Transcript>:<Pos><Alt>"
df['id'] = df.apply(
    lambda row: row['Transcript'] + ':' + row['Pos'] + row['Alt'], axis=1)
vals = list(df[['id', 'Classification']].values)
for val in vals:
    v, c = val
    # Printed before parsing so a failing name is visible in the output
    print(v, c)
    chrom, offset, ref, alt = hgvs.parse_hgvs_name(
        str(v), genome, get_transcript=get_transcript)
    print(v, c, chrom, offset, ref, alt)
    # h = str(hgvs.HGVSName(v)).split("'")[1]
#    print(v)
#    try:

# except:
#     print('FAIL', v, c)

# next convert to g. https://github.com/counsyl/hgvs
# need hg19
def main():
    """Write a VCF built from a tab-delimited BRCA variant file.

    All settings are read from ``parse_args()``. ``##INFO`` header lines are
    taken from the annotation file, and each data row contributes one VCF
    record whose INFO column lists every input column as ``field=value``
    (values pass through ``normalize``). A row is skipped, with a DEBUG log
    entry, when its ``hgvs_nucleotide`` is ``-``, its gene symbol is neither
    BRCA1 nor BRCA2, or pyhgvs cannot convert its HGVS name.
    """
    options = parse_args()
    inputFile = options.input
    annotFile_path = options.inAnnot
    vcfFile = options.out
    genome_path = options.gpath
    refseq_path = options.rpath
    source = options.source
    logfile = options.logfile

    if options.verbose:
        logging_level = logging.DEBUG
    else:
        logging_level = logging.CRITICAL

    logging.basicConfig(filename=logfile, filemode="w", level=logging_level)

    with open(refseq_path) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # open and store annotation fields in a dictionary
    # (defaultdict() without a factory behaves like a plain dict, so use one)
    annotDict = {}
    with open(annotFile_path) as inAnnotFile:
        for line in inAnnotFile:
            line = line.strip().split('\t')
            annotDict[line[0]] = line[1]

    # print header lines to vcf file
    print('##fileformat=VCFv4.0', file=vcfFile)
    print('##source={0}'.format(source), file=vcfFile)
    print('##reference=GRCh37', file=vcfFile)
    for annotation, description in annotDict.items():
        print('##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(annotation.replace(' ', '_'), description), file=vcfFile)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcfFile)

    # extract INFO field column indicies for annotation terms
    headerline = inputFile.readline().strip().replace(' ', '_').replace('"', '').split('\t')

    fieldIdxDict = {}
    for index, field in enumerate(headerline):
        fieldIdxDict[field] = index

    # extract info from each line of the flat file
    for line in inputFile:
        line = line.replace('"', '')
        if not line.strip():
            # Nothing to convert on an empty line
            continue
        # Only spaces and line endings are trimmed; tabs are kept so that
        # empty first/last columns still occupy their position in the row.
        parsedLine = line.strip(' \r\n').split('\t')
        INFO_field = list()
        for field in headerline:
            field_index = fieldIdxDict[field]
            field_value = normalize(field, parsedLine[field_index])
            INFO_field.append('{0}={1}'.format(field, field_value))

        # extract hgvs cDNA term for variant and cleanup formatting
        hgvsName = parsedLine[fieldIdxDict['hgvs_nucleotide']]
        if hgvsName == '-':
            logging.debug("hgvs name == '-' for line: %s", parsedLine)
            continue
        gene_symbol = parsedLine[fieldIdxDict['gene_symbol']].lower()
        if gene_symbol == 'brca1':
            transcript = 'NM_007294.3'
        elif gene_symbol == 'brca2':
            transcript = 'NM_000059.3'
        else:
            logging.debug("improper gene symbol: %s", gene_symbol)
            continue
        queryHgvsName = transcript + ':' + hgvsName.rstrip().split(';')[0]
        INFO_field_string = ';'.join(INFO_field)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(queryHgvsName, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            print('{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(chrom, offset, queryHgvsName, ref, alt, INFO_field_string), file=vcfFile)
        except Exception as e:
            # The row is dropped; include the reason in the log entry
            logging.debug("could not parse hgvs field: %s (%s)", queryHgvsName, e)
Beispiel #27
0
def main(args):
    """Convert an exLOVD tab-delimited export into VCF text.

    Settings are read from ``parse_args()``. The HGVS name of a row is taken
    from the ``cDNA`` column when the header has one, otherwise from
    ``dna_change``; when neither exists the program exits. Rows whose name is
    ``-`` are printed to stdout and skipped; names pyhgvs cannot convert are
    reported to the errors file.

    Args:
        args: Unused; every setting comes from ``parse_args()``.
    """
    options = parse_args()
    exLOVDFile = options.inEXLOVD
    annotFile_path = options.inAnnot
    vcfFile = options.out
    genome_path = options.gpath
    refseq_path = options.rpath
    errorsFile = options.errors

    with open(refseq_path) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # open and store annotation fields in a dictionary
    # (plain dict: defaultdict() without a factory adds nothing)
    annotDict = {}
    with open(annotFile_path) as inAnnotFile:
        for line in inAnnotFile:
            line = line.strip().split('\t')
            annotDict[line[0]] = line[1]

    # print header lines to vcf file
    print('##fileformat=VCFv4.0', file=vcfFile)
    print('##source=exLOVD', file=vcfFile)
    print('##reference=GRCh37', file=vcfFile)
    for annotation, description in annotDict.items():
        print('##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(
            annotation.replace(' ', '_'), description),
              file=vcfFile)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcfFile)

    # extract INFO field column indicies for annotation terms
    headerline = exLOVDFile.readline().strip().replace(' ', '_').replace(
        '"', '').split('\t')

    fieldIdxDict = {}
    for index, field in enumerate(headerline):
        fieldIdxDict[field] = index

    # extract info from each line of the exLOVD flat file
    for line in exLOVDFile:
        line = line.replace('"', '')
        if not line.strip():
            # Skip blank lines instead of failing on the missing columns
            continue
        # Trim spaces/newlines but keep tabs, so empty columns at either end
        # of the row are not lost.
        parsedLine = line.strip(' \r\n').split('\t')
        INFO_field = list()
        for field in headerline:
            field_value = normalize(field, parsedLine[fieldIdxDict[field]])
            INFO_field.append('{0}={1}'.format(field, field_value))

        # extract hgvs cDNA term for variant and cleanup formatting
        # Sometimes dna_change is in the field cDNA, sometimes it's labeled dna_change.
        if 'cDNA' in fieldIdxDict:
            hgvsName = parsedLine[fieldIdxDict['cDNA']]
        elif 'dna_change' in fieldIdxDict:
            hgvsName = parsedLine[fieldIdxDict['dna_change']]
        else:
            sys.exit("ERROR: could not parse hgvs name.")
        if hgvsName == '-':
            print(parsedLine)
            continue
        queryHgvsName = hgvsName.rstrip().split(';')[0]
        INFO_field_string = ';'.join(INFO_field)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                queryHgvsName, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            print('{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(
                chrom, offset, queryHgvsName, ref, alt, INFO_field_string),
                  file=vcfFile)
        except Exception as e:
            # Best effort: record the failure and continue with the next row
            print(str(e) + ': could not parse hgvs field ' + queryHgvsName,
                  file=errorsFile)
from pygr.seqdb import SequenceFileDB


if __name__ == "__main__":
    # Convert the HGVS name in the second column of a tab-delimited file into
    # "chrom<TAB>offset<TAB>ref<TAB>alt" lines written to the output file.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', help="Input file")
    parser.add_argument('-g', '--genome', help="Genome reference FASTA")
    parser.add_argument('-r', '--refgene', help="RefGene Transcripts")
    parser.add_argument('-o', '--outfile', help="Output file name")

    args = parser.parse_args()

    genome = SequenceFileDB(args.genome)

    with open(args.refgene) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    with open(args.infile, 'r') as infile:
        with open(args.outfile, 'w') as outfile:
            reader = csv.reader(infile, dialect='excel-tab')
            # Skip the header row; next() runs on Python 2 and 3, and the
            # default keeps an empty input from raising StopIteration.
            next(reader, None)
            for row in reader:
                if len(row) < 2:
                    # Blank or short line: there is no HGVS column to convert
                    continue
                # Parenthesised so the statement is valid on Python 3 as well
                print(row[1])
                chrom, offset, ref, alt = hgvs.parse_hgvs_name(row[1], genome, get_transcript=get_transcript)
                outfile.write("{}\t{}\t{}\t{}\n".format(chrom, offset, ref, alt))
Beispiel #29
0
def main():
    """Convert a LOVD or exLOVD tab-delimited export into VCF text.

    Settings are read from ``parse_args()``. The HGVS name of a row comes
    from the ``cDNA`` column when present, otherwise from ``dna_change``;
    the program exits when the header has neither. Rows whose name is ``-``
    are printed to stdout and skipped; names pyhgvs cannot convert are
    reported to the errors file.

    Raises:
        ValueError: If ``options.source`` is neither ``LOVD`` nor ``exLOVD``.
    """
    options = parse_args()
    inputFile = options.input
    annotFile_path = options.inAnnot
    vcfFile = options.out
    genome_path = options.gpath
    refseq_path = options.rpath
    errorsFile = options.errors
    source = options.source

    with open(refseq_path) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcripts.get(name)

    # open and store annotation fields in a dictionary
    # (plain dict: defaultdict() without a factory adds nothing)
    annotDict = {}
    with open(annotFile_path) as inAnnotFile:
        for line in inAnnotFile:
            line = line.strip().split('\t')
            annotDict[line[0]] = line[1]

    # print header lines to vcf file
    print('##fileformat=VCFv4.0', file=vcfFile)
    if source not in ("exLOVD", "LOVD"):
        raise ValueError('Source is %s, must be either LOVD or exLOVD' % (source))
    print('##source={0}'.format(source), file=vcfFile)
    print('##reference=GRCh37', file=vcfFile)
    for annotation, description in annotDict.items():
        print('##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(annotation.replace(' ', '_'), description), file=vcfFile)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcfFile)

    # extract INFO field column indicies for annotation terms
    headerline = inputFile.readline().strip().replace(' ', '_').replace('"', '').split('\t')

    fieldIdxDict = {}
    for index, field in enumerate(headerline):
        fieldIdxDict[field] = index

    # extract info from each line of the flat file
    for line in inputFile:
        line = line.replace('"', '')
        if not line.strip():
            # Skip blank lines instead of failing on the missing columns
            continue
        # Trim spaces/newlines but keep tabs, so empty columns at either end
        # of the row keep their position.
        parsedLine = line.strip(' \r\n').split('\t')
        INFO_field = list()
        for field in headerline:
            field_value = normalize(field, parsedLine[fieldIdxDict[field]])
            INFO_field.append('{0}={1}'.format(field, field_value))

        # extract hgvs cDNA term for variant and cleanup formatting
        # Sometimes dna_change is in the field cDNA, sometimes it's labeled dna_change.
        if 'cDNA' in fieldIdxDict:
            hgvsName = parsedLine[fieldIdxDict['cDNA']]
        elif 'dna_change' in fieldIdxDict:
            hgvsName = parsedLine[fieldIdxDict['dna_change']]
        else:
            sys.exit("ERROR: could not parse hgvs name.")
        if hgvsName == '-':
            print(parsedLine)
            continue
        queryHgvsName = hgvsName.rstrip().split(';')[0]
        INFO_field_string = ';'.join(INFO_field)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(queryHgvsName, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            print('{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(chrom, offset, queryHgvsName, ref, alt, INFO_field_string), file=vcfFile)
        except Exception as e:
            # Best effort: record the failure and continue with the next row
            print(str(e)+': could not parse hgvs field '+queryHgvsName, file=errorsFile)
def convert_HGVS(hgvs_c, GENOME):
    """Translate a cDNA HGVS name into a genomic string plus a protein result.

    Args:
        hgvs_c: HGVS name handed to ``pyhgvs.parse_hgvs_name``.
        GENOME: Genome object handed to ``pyhgvs.parse_hgvs_name``.

    Returns:
        Tuple ``(genome_coor, HGVS_p)`` where ``genome_coor`` has the form
        ``chrom:offset:ref>alt`` and ``HGVS_p`` is whatever
        ``HGVS_cDNA_to_protein(hgvs_c)`` returns.
    """
    variant = pyhgvs.parse_hgvs_name(
        hgvs_c, GENOME, get_transcript=get_transcript)
    chrom, offset, ref, alt = variant
    # Build the coordinate string before asking for the protein conversion
    genome_coor = ":".join([chrom, str(offset), ref]) + ">" + alt
    return genome_coor, HGVS_cDNA_to_protein(hgvs_c)
Beispiel #31
0
# Example: converting between HGVS names and VCF-style coordinates with pyhgvs.

# Read genome sequence using pyfaidx.
genome = Genome('/tmp/hg19.fa')

# Read RefSeq transcripts into a python dict.
with open('pyhgvs/data/genes.refGene') as infile:
    transcripts = hgvs_utils.read_transcripts(infile)


# Provide a callback for fetching a transcript by its name.
def get_transcript(name):
    """Return the entry stored under ``name`` in ``transcripts`` (None if absent)."""
    return transcripts.get(name)


# Parse the HGVS name into genomic coordinates and alleles.
chrom, offset, ref, alt = hgvs.parse_hgvs_name('NM_000352.3:c.215A>G',
                                               genome,
                                               get_transcript=get_transcript)
print(chrom, offset, ref, alt)
# Returns variant in VCF style: ('chr11', 17496508, 'T', 'C')
# Notice that since the transcript is on the negative strand, the alleles
# are reverse complemented during conversion.

# Format an HGVS name.
# The coordinates are re-assigned literally here, so this step does not depend
# on the parse result above.
chrom, offset, ref, alt = ('chr11', 17496508, 'T', 'C')
transcript = get_transcript('NM_000352.3')
hgvs_name = hgvs.format_hgvs_name(chrom, offset, ref, alt, genome, transcript)
print(hgvs_name)
# Returns 'NM_000352.3(ABCC8):c.215A>G'

# Build an HGVSName object directly from a name; note this rebinds hgvs_name
# from the formatted string above to the object.
hgvs_name = hgvs.HGVSName('NM_000352.3:c.215-10A>G')
# fields of the HGVS name are available as attributes:
from pygr.seqdb import SequenceFileDB

if __name__ == "__main__":
    # Read HGVS names from the second column of a tab-delimited input file and
    # write one "chrom<TAB>offset<TAB>ref<TAB>alt" line per name.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', help="Input file")
    parser.add_argument('-g', '--genome', help="Genome reference FASTA")
    parser.add_argument('-r', '--refgene', help="RefGene Transcripts")
    parser.add_argument('-o', '--outfile', help="Output file name")

    args = parser.parse_args()

    genome = SequenceFileDB(args.genome)

    with open(args.refgene) as infile:
        transcripts = hgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    with open(args.infile, 'r') as infile:
        with open(args.outfile, 'w') as outfile:
            reader = csv.reader(infile, dialect='excel-tab')
            # Discard the header row. The builtin next() is portable across
            # Python 2 and 3; the default avoids StopIteration on empty input.
            next(reader, None)
            for row in reader:
                if len(row) < 2:
                    # No HGVS column on this line (blank or truncated row)
                    continue
                # Function-call form is accepted by both Python 2 and 3
                print(row[1])
                chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                    row[1], genome, get_transcript=get_transcript)
                outfile.write("{}\t{}\t{}\t{}\n".format(
                    chrom, offset, ref, alt))