Python read_transcripts Beispiele, pyhgvs.utils.read_transcripts Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: enigma-processing.py Projekt: rcurrie/brca-exchange

def get_transcript(name):
    """Look up a single transcript by name in the refGene file.

    The file named by the module-level ``REFGENE`` is parsed again on every
    call. If ``REFGENE`` is ``None`` the process exits with an explanatory
    message.

    Returns the result of ``.get(name)`` on the mapping produced by
    ``pyhgvs_utils.read_transcripts``.
    """
    refgene_path = REFGENE
    if refgene_path is None:
        sys.exit("No reference genome was provided. Try to locate hg38.BRCA.refGene.txt.")

    with open(refgene_path) as refgene_file:
        transcript_by_name = pyhgvs_utils.read_transcripts(refgene_file)
    return transcript_by_name.get(name)

Beispiel #2

0

Datei anzeigen

def parse_hgvs(hgvs_name, fasta, genes):
    """Parse ``hgvs_name`` against a FASTA genome and a refGene transcript file.

    ``fasta`` is opened with ``Fasta`` using a key function that prefixes each
    sequence name with ``chr``; ``genes`` is the path of the transcript file
    handed to ``hgvs_utils.read_transcripts``.

    Returns whatever ``hgvs.parse_hgvs_name`` returns.
    """
    def _chr_prefixed(x):
        return 'chr{}'.format(x)

    reference = Fasta(fasta, key_function=_chr_prefixed)

    with open(genes) as refgene_file:
        transcript_by_name = hgvs_utils.read_transcripts(refgene_file)

    return hgvs.parse_hgvs_name(
        hgvs_name,
        reference,
        get_transcript=lambda name: transcript_by_name.get(name),
    )

Beispiel #3

0

Datei anzeigen

Datei: enigma-processing.py Projekt: TIM245-W16/brca

def get_genome_coor(
        hgvs_c,
        genome_path='data/hg19.fa',
        refgene_path="/Users/Molly/Desktop/web-dev/hgvs_counsyl/hgvs/pyhgvs/data/genes.refGene"):
    """Convert an HGVS cDNA name into a ``chrom:offset:ref>alt`` string.

    Args:
        hgvs_c: HGVS name passed to ``pyhgvs.parse_hgvs_name``.
        genome_path: path of the genome FASTA opened with ``SequenceFileDB``.
            Defaults to the location that used to be hard-coded.
        refgene_path: path of the refGene transcript file. The default is the
            previously hard-coded, machine-specific path; pass an explicit
            path to run this anywhere else.

    Returns:
        The genomic coordinate formatted as ``chrom:offset:ref>alt``.
    """
    genome = SequenceFileDB(genome_path)
    with open(refgene_path) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return transcripts.get(name)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        hgvs_c, genome, get_transcript=get_transcript)
    return chrom + ":" + str(offset) + ":" + ref + ">" + alt

Beispiel #4

0

Datei anzeigen

Datei: clinvar_txt_udpate.py Projekt: uterald/brca-pipeline

def HGVS_to_GenomeCoor(HGVS):
    """Translate an HGVS name into ``chrom:offset:ref>alt`` using counsyl pyhgvs.

    The genome is read from ``../data/hg19.fa`` and the transcripts from
    ``../data/BRCA12.refGene.txt``; both are loaded on every call.
    """
    genome = SequenceFileDB('../data/hg19.fa')
    with open("../data/BRCA12.refGene.txt") as refgene_file:
        transcript_by_name = pyhgvs_utils.read_transcripts(refgene_file)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        HGVS,
        genome,
        get_transcript=lambda name: transcript_by_name.get(name),
    )
    # Assemble chrom:offset:ref>alt.
    return ":".join([chrom, str(offset), ref + ">" + alt])

Beispiel #5

0

Datei anzeigen

def get_genome_coor(
        hgvs_c,
        genome_path='data/hg19.fa',
        refgene_path="/Users/Molly/Desktop/web-dev/hgvs_counsyl/hgvs/pyhgvs/data/genes.refGene"):
    """Convert an HGVS cDNA name into a ``chrom:offset:ref>alt`` string.

    Args:
        hgvs_c: HGVS name passed to ``pyhgvs.parse_hgvs_name``.
        genome_path: path of the genome FASTA opened with ``SequenceFileDB``.
            Defaults to the location that used to be hard-coded.
        refgene_path: path of the refGene transcript file. The default is the
            previously hard-coded, machine-specific path; pass an explicit
            path to run this anywhere else.

    Returns:
        The genomic coordinate formatted as ``chrom:offset:ref>alt``.
    """
    genome = SequenceFileDB(genome_path)
    with open(refgene_path) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    def get_transcript(name):
        return transcripts.get(name)

    chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
        hgvs_c, genome, get_transcript=get_transcript)
    return chrom + ":" + str(offset) + ":" + ref + ">" + alt

Beispiel #6

0

Datei anzeigen

Datei: hgvs_bfx.py Projekt: jjevans/me

	def __init__(self,ref_fa,ref_tr):
		"""Open the reference genome and load the transcript table.

		ref_fa is a reference (genome) file in fasta format.
		ref_tr is a file of transcripts formatted like the refGene.txt
		file from ucsc.
		"""
		# reference genome: keep the path and the opened sequence database
		self.ref_fa = ref_fa
		self.ref = SequenceFileDB(ref_fa)

		# refseq transcripts: keep the path and the parsed transcripts
		self.ref_tr = ref_tr
		with open(ref_tr) as transcript_file:
			self.tr = util.read_transcripts(transcript_file)

Beispiel #7

0

Datei anzeigen

Datei: calcVarPriors.py Projekt: jcasalet/brca-exchange

def calc_all(variants, priors, genome, transcripts, processes):
    """Run ``calc_one`` over every variant row and write the results as TSV.

    Args:
        variants: open tab-delimited file with a header row, read through
            ``csv.DictReader``.
        priors: writable file object that receives the tab-delimited output.
        genome: not used inside this function.
        transcripts: open transcript file handed to
            ``pyhgvs_utils.read_transcripts``.
        processes: number of worker processes; values above 1 fan the rows
            out over a ``multiprocessing.Pool``.

    Side effects:
        Sets the module globals ``brca1Transcript`` and ``brca2Transcript``
        and reads extra column names from ``headers.tsv`` in the working
        directory.
    """
    global brca1Transcript, brca2Transcript

    inputData = csv.DictReader(variants, delimiter="\t")
    fieldnames = inputData.fieldnames
    # Extra output columns; the with-block closes the handle, which the
    # previous bare open(...).read() never did.
    with open("headers.tsv", "r") as headerFile:
        newHeaders = headerFile.read().split()
    # Extends the reader's own fieldnames list in place, so the header row
    # written below includes the new columns.
    fieldnames.extend(newHeaders)
    outputData = csv.DictWriter(priors, delimiter="\t", lineterminator="\n", fieldnames=fieldnames)
    outputData.writeheader()

    # read RefSeq transcripts
    transcripts = pyhgvs_utils.read_transcripts(transcripts)

    brca1Transcript = transcripts.get(BRCA1_RefSeq)
    brca2Transcript = transcripts.get(BRCA2_RefSeq)

    def sortKey(d):
        # Rows are ordered by the text of their genomic coordinate string.
        return "{0}:g.{1}:{2}>{3}".format(d["Chr"], d["Pos"], d["Ref"], d["Alt"])

    if processes > 1:
        # Create a pool of processes and calculate in parallel
        click.echo("Processing using {} processes".format(processes), err=True)
        pool = multiprocessing.Pool(processes)
        try:
            # Normal map has a bug if there is no timeout that prevents Keyboard interrupts:
            # https://stackoverflow.com/questions/1408356/keyboard-interrupts-with-pythons-multiprocessing-pool/1408476#1408476
            calculatedVariants = pool.map_async(calc_one, list(inputData)).get(99999999)

            # Sort output as the order of p.map is not deterministic
            outputData.writerows(sorted(calculatedVariants, key=sortKey))
        except KeyboardInterrupt:
            pool.terminate()
        else:
            # All work is done; let the workers exit normally.
            pool.close()
        # Reap the worker processes in both the normal and interrupted case.
        pool.join()
    else:
        outputData.writerows(sorted(map(calc_one, inputData), key=sortKey))

Beispiel #8

0

Datei anzeigen

def add_HGVS_c_counsyl(in_path, out_path):
    """Fill in missing HGVS cDNA names in a tab-delimited variant file.

    The first line of ``in_path`` is copied to ``out_path`` unchanged. For
    every other line, column 3 (``chrom:offset:ref>alt``) and the transcript
    id in column 5 are turned into an HGVS name with counsyl pyhgvs. If
    column 6 is ``"-"`` it is replaced by that name; if it already holds a
    different value the line is written unchanged and counted as unmatching.

    Args:
        in_path: path of the input file.
        out_path: path of the output file (overwritten).

    Progress (the current line number) and the final number of unmatching
    cases are printed to stdout.
    """
    # counsyl pyhgvs setup
    with open('../data/BRCA12.refGene.txt') as infile:
        transcripts_counsyl = pyhgvs_utils.read_transcripts(infile)

    unmatching_cases = 0
    # Context managers guarantee both files are closed (and the output is
    # flushed) even when a row fails to parse.
    with open(in_path, "r") as f_in, open(out_path, "w") as f_out:
        for line_num, line in enumerate(f_in, 1):
            print(line_num)
            if line_num == 1:
                # header row is passed through untouched
                f_out.write(line)
                continue
            items = line.strip().split("\t")
            genome_coors = items[2].split(":")
            chrom = genome_coors[0]
            offset = int(genome_coors[1])
            ref_alt = genome_coors[2].split(">")
            ref = ref_alt[0]
            alt = ref_alt[1]
            transcript = transcripts_counsyl.get(items[4])
            hgvs_name_with_transcript = pyhgvs.variant_to_hgvs_name(
                chrom, offset, ref, alt, GENOME, transcript)
            hgvs_c = hgvs_name_with_transcript.format(use_prefix=False,
                                                      use_gene=False)
            if items[5] == "-":
                items[5] = hgvs_c
            elif items[5] != hgvs_c:
                unmatching_cases += 1
            f_out.write("\t".join(items) + "\n")
    # Two spaces after the colon reproduce the output of the original
    # Python 2 statement `print "unmatching cases: ", n`.
    print("unmatching cases:  {0}".format(unmatching_cases))

Beispiel #9

0

Datei anzeigen

Datei: lovd2vcf.py Projekt: BD2KGenomics/brca-exchange

def main():
    """Convert an LOVD / exLOVD tab-delimited export into a VCF file.

    Options come from ``parse_args()``: the input flat file, the annotation
    description file, the output VCF, the genome and RefSeq paths, an errors
    file and the source name (``LOVD`` or ``exLOVD``).

    Rows whose HGVS name is ``-`` are printed to stdout and skipped. Rows
    whose HGVS name cannot be converted are reported in the errors file.
    """
    opts = parse_args()
    flat_file = opts.input
    annotation_path = opts.inAnnot
    vcf_out = opts.out
    genome_path = opts.gpath
    refseq_path = opts.rpath
    errors_out = opts.errors
    source = opts.source

    with open(refseq_path) as refseq_file:
        transcript_by_name = hgvs_utils.read_transcripts(refseq_file)

    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcript_by_name.get(name)

    # annotation name -> description, taken from the first two columns
    annotations = defaultdict()
    with open(annotation_path) as annotation_file:
        for row in annotation_file:
            columns = row.strip().split('\t')
            annotations[columns[0]] = columns[1]

    # VCF header lines
    print('##fileformat=VCFv4.0', file=vcf_out)
    if source == "exLOVD":
        print('##source=exLOVD', file=vcf_out)
    elif source == "LOVD":
        print('##source=LOVD', file=vcf_out)
    else:
        raise ValueError('Source is %s, must be either LOVD or exLOVD' % (source))
    print('##reference=GRCh37', file=vcf_out)
    info_template = '##INFO=<ID={0},Number=.,Type=String,Description="{1}">'
    for annotation, description in annotations.items():
        info_id = annotation.replace(' ', '_')
        print(info_template.format(info_id, description), file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # column name -> column index, from the header row of the flat file
    header_text = flat_file.readline().strip()
    column_names = header_text.replace(' ', '_').replace('"', '').split('\t')
    column_index = {field: idx for idx, field in enumerate(column_names)}

    record_template = '{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'
    for raw_line in flat_file:
        values = raw_line.replace('"', '').strip().split('\t')
        info_entries = [
            '{0}={1}'.format(field, normalize(field, values[column_index[field]]))
            for field in column_names
        ]

        # The HGVS cDNA term lives in a column named either cDNA or dna_change.
        if 'cDNA' in column_index:
            hgvs_name = values[column_index['cDNA']]
        elif 'dna_change' in column_index:
            hgvs_name = values[column_index['dna_change']]
        else:
            sys.exit("ERROR: could not parse hgvs name.")
        if hgvs_name == '-':
            print(values)
            continue

        # Only the part before the first ';' is used as the query.
        query_name = hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_entries)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query_name, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            print(record_template.format(
                chrom, offset, query_name, ref, alt, info_string), file=vcf_out)
        except Exception as e:
            print(str(e)+': could not parse hgvs field '+query_name, file=errors_out)

Beispiel #10

0

Datei anzeigen

# Load the variant sheet and collect its cDNA variant names.
# `path`, `genename` and `variants` are not defined in this excerpt —
# presumably they are set earlier in the script.
df = pd.read_excel(io=path,
                   sheet_name=genename + '_' + variants,
                   engine='openpyxl')
data = []
for name in df['cDNA variant']:
    data.append(name)

# Read the reference genome.
# The reference genome can be downloaded from: http://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/
genome = Fasta('references/hg19.fa')

# Read RefSeq transcripts into a python dict.
# The RefSeq transcripts can be downloaded from: https://github.com/counsyl/hgvs/blob/master/pyhgvs/data/genes.refGene
with open('references/genes.refGene') as infile:
    transcripts = read_transcripts(infile)


# Provide a callback for fetching a transcript by its name.
def get_transcript(name):
    return transcripts.get(name)


# Convert every variant name into [chrom, offset, ref, alt] and collect the
# results in a list.
# NOTE(review): the name is prefixed with `gene`, while the sheet name above
# uses `genename` — confirm both are defined and that this is intended.
vcf = []
for v in data:
    chrom, offset, ref, alt = hgvs.parse_hgvs_name(
        gene + ':' + v, genome, get_transcript=get_transcript)
    vcf.append([chrom, offset, ref, alt])

# Define reference genome

Beispiel #11

0

Datei anzeigen

Datei: lovd2vcf.py Projekt: rcurrie/brca-exchange

def main(args):
    """Convert an exLOVD tab-delimited export into a VCF file.

    ``args`` is not used; all settings are read from ``parse_args()``: the
    exLOVD flat file, the annotation description file, the output VCF, the
    genome and RefSeq paths and an errors file.

    Rows whose HGVS name is ``-`` are printed to stdout and skipped. Rows
    whose HGVS name cannot be converted are reported in the errors file.
    """
    opts = parse_args()
    flat_file = opts.inEXLOVD
    annotation_path = opts.inAnnot
    vcf_out = opts.out
    genome_path = opts.gpath
    refseq_path = opts.rpath
    errors_out = opts.errors

    with open(refseq_path) as refseq_file:
        transcript_by_name = hgvs_utils.read_transcripts(refseq_file)

    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcript_by_name.get(name)

    # annotation name -> description, taken from the first two columns
    annotations = defaultdict()
    with open(annotation_path) as annotation_file:
        for row in annotation_file:
            columns = row.strip().split('\t')
            annotations[columns[0]] = columns[1]

    # VCF header lines
    print('##fileformat=VCFv4.0', file=vcf_out)
    print('##source=exLOVD', file=vcf_out)
    print('##reference=GRCh37', file=vcf_out)
    info_template = '##INFO=<ID={0},Number=.,Type=String,Description="{1}">'
    for annotation, description in annotations.items():
        info_id = annotation.replace(' ', '_')
        print(info_template.format(info_id, description), file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # column name -> column index, from the header row of the flat file
    header_text = flat_file.readline().strip()
    column_names = header_text.replace(' ', '_').replace('"', '').split('\t')
    column_index = {field: idx for idx, field in enumerate(column_names)}

    record_template = '{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'
    for raw_line in flat_file:
        values = raw_line.replace('"', '').strip().split('\t')
        info_entries = [
            '{0}={1}'.format(field, normalize(field, values[column_index[field]]))
            for field in column_names
        ]

        # The HGVS cDNA term lives in a column named either cDNA or dna_change.
        if 'cDNA' in column_index:
            hgvs_name = values[column_index['cDNA']]
        elif 'dna_change' in column_index:
            hgvs_name = values[column_index['dna_change']]
        else:
            sys.exit("ERROR: could not parse hgvs name.")
        if hgvs_name == '-':
            print(values)
            continue

        # Only the part before the first ';' is used as the query.
        query_name = hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_entries)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query_name, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            print(record_template.format(
                chrom, offset, query_name, ref, alt, info_string), file=vcf_out)
        except Exception as e:
            print(str(e) + ': could not parse hgvs field ' + query_name,
                  file=errors_out)

Beispiel #12

0

Datei anzeigen

Datei: testloadingscript.py Projekt: tydymy/brca-exchange

 def get_transcript(name):
     """Look up ``name`` in ``../refgene38_brca.txt``.

     The transcript file is parsed again on every call.
     """
     refgene_path = "../refgene38_brca.txt"
     with open(refgene_path) as refgene_file:
         transcript_by_name = pyhgvs_utils.read_transcripts(refgene_file)
     return transcript_by_name.get(name)

Beispiel #13

0

Datei anzeigen

Datei: functional_assays_to_vcf.py Projekt: jcasalet/brca-exchange

def main():
    """Convert a functional-assay tab-delimited file into a VCF file.

    Options come from ``parse_args()``: the input flat file, the annotation
    description file, the output VCF, the genome and RefSeq paths, the source
    name written to the VCF header, a log file and a verbosity flag.

    Rows with an HGVS name of ``-``, an unknown gene symbol, or an HGVS name
    that cannot be converted are skipped and logged at debug level.
    """
    opts = parse_args()
    assay_file = opts.input
    annotation_path = opts.inAnnot
    vcf_out = opts.out
    genome_path = opts.gpath
    refseq_path = opts.rpath
    source = opts.source
    logfile = opts.logfile

    logging_level = logging.DEBUG if opts.verbose else logging.CRITICAL
    logging.basicConfig(filename=logfile, filemode="w", level=logging_level)

    with open(refseq_path) as refseq_file:
        transcript_by_name = hgvs_utils.read_transcripts(refseq_file)

    genome = SequenceFileDB(genome_path)

    def get_transcript(name):
        return transcript_by_name.get(name)

    # annotation name -> description, taken from the first two columns
    annotations = defaultdict()
    with open(annotation_path) as annotation_file:
        for row in annotation_file:
            columns = row.strip().split('\t')
            annotations[columns[0]] = columns[1]

    # VCF header lines
    print('##fileformat=VCFv4.0', file=vcf_out)
    print('##source={0}'.format(source), file=vcf_out)
    print('##reference=GRCh37', file=vcf_out)
    info_template = '##INFO=<ID={0},Number=.,Type=String,Description="{1}">'
    for annotation, description in annotations.items():
        info_id = annotation.replace(' ', '_')
        print(info_template.format(info_id, description), file=vcf_out)
    print('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO', file=vcf_out)

    # column name -> column index, from the header row of the flat file
    header_text = assay_file.readline().strip()
    column_names = header_text.replace(' ', '_').replace('"', '').split('\t')
    column_index = {field: idx for idx, field in enumerate(column_names)}

    # lower-cased gene symbol -> transcript used to qualify the HGVS name
    transcript_for_gene = {'brca1': 'NM_007294.3', 'brca2': 'NM_000059.3'}
    record_template = '{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'

    for raw_line in assay_file:
        values = raw_line.replace('"', '').strip().split('\t')
        info_entries = [
            '{0}={1}'.format(field, normalize(field, values[column_index[field]]))
            for field in column_names
        ]

        hgvs_name = values[column_index['hgvs_nucleotide']]
        if hgvs_name == '-':
            logging.debug("hgvs name == '-' for line: %s", values)
            continue

        gene_symbol = values[column_index['gene_symbol']].lower()
        transcript = transcript_for_gene.get(gene_symbol)
        if transcript is None:
            logging.debug("improper gene symbol: %s", gene_symbol)
            continue

        # Only the part before the first ';' is used as the query.
        query_name = transcript + ':' + hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_entries)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(
                query_name, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            print(record_template.format(
                chrom, offset, query_name, ref, alt, info_string), file=vcf_out)
        except Exception:
            logging.debug("could not parse hgvs field: %s", query_name)

Beispiel #14

0

Datei anzeigen

# Load config.ini from BinPath; every resource path below is taken from its
# DEFAULT section and resolved relative to BinPath.
config = configparser.ConfigParser()
config.read(BinPath + '/config.ini')
# Lookup tables built by reader helpers that are defined outside this excerpt.
omim_dict = read_morbidmap(BinPath + '/' + config['DEFAULT']['morbidmap'])
pathogenic_dict, pathogenic_dict2 = read_pathogenic_site(
    BinPath + '/' + config['DEFAULT']['pathogenic_ref'])
ba1_exception = read_ba1_exception(BinPath + '/' +
                                   config['DEFAULT']['ba1_exception'])
pvs1_levels = read_pvs1_levels(BinPath + '/' + config['DEFAULT']['pvs1levels'])

# Region tables, each built with create_bed_dict from the configured file.
domain_bed = create_bed_dict(BinPath + '/' + config['DEFAULT']['domain'])
hotspot_bed = create_bed_dict(BinPath + '/' + config['DEFAULT']['hotspot'])
curated_region = create_bed_dict(BinPath + '/' +
                                 config['DEFAULT']['curated_region'])
exon_lof_frequent = create_bed_dict(BinPath + '/' +
                                    config['DEFAULT']['exon_lof_frequent'])

# Reference genome and the transcripts parsed from the 'trans' file.
genome = Fasta(BinPath + '/' + config['DEFAULT']['ref'])
with open(BinPath + '/' + config['DEFAULT']['trans']) as gpefile:
    transcripts = read_transcripts(gpefile)

# Second pass over the same 'trans' file: map the value in tab-separated
# column 12 (stored as `gene`) to the value in column 1 (stored as `trans`)
# and back. When a key occurs on several lines the last line wins, because
# each assignment overwrites the previous entry.
gene_trans = {}
trans_gene = {}
with open(BinPath + '/' + config['DEFAULT']['trans']) as f:
    for line in f:
        record = line.strip().split("\t")
        gene = record[12]
        trans = record[1]
        gene_trans[gene] = trans
        trans_gene[trans] = gene

Beispiel #15

0

Datei anzeigen

def main(args):
    """Normalize the variant coordinates of a BRCA table across genome builds.

    ``args`` is not used; all settings are read from ``parse_args()``.

    For every data row of the tab-delimited input the function:

    * forces the reference sequence to ``NM_007294.3`` (BRCA1) or
      ``NM_000059.3`` (BRCA2);
    * derives a normalized HGVS cDNA name from the hg38 genomic coordinate;
    * re-derives the hg36, hg37 and hg38 genomic coordinates from that cDNA
      name;
    * builds the comma-separated cDNA synonyms for the alternative
      transcripts of the gene;
    * when ``calcProtein`` is true, maps the cDNA name to a protein name.

    If the protein mapping raises ``HGVSParseError`` the error is printed and
    the row's existing ``HGVS_Protein`` value is left untouched. Previously
    this path either crashed with ``NameError`` (first row) or silently wrote
    the protein of the preceding row.

    The genome, RefSeq and output files taken from the options are closed at
    the end.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    # Kept as an equality test so non-boolean option values behave as before.
    wantProtein = options.calcProtein == True

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    def format_cdna38(chrom, offset, ref, alt, transcript):
        # hg38 genomic change -> HGVS cDNA name on the given transcript.
        return str(
            pyhgvs.format_hgvs_name(chrom,
                                    int(offset),
                                    ref,
                                    alt,
                                    genome38,
                                    transcript,
                                    use_gene=False,
                                    max_allele_length=100))

    # Copy the header row through unchanged.
    labelLine = brcaFile.readline().rstrip().split('\t')
    outputFile.write('\t'.join(labelLine) + '\n')

    # Store indexes of the relevant columns
    hgvsG36Index = labelLine.index('Genomic_Coordinate_hg36')
    hgvsG37Index = labelLine.index('Genomic_Coordinate_hg37')
    hgvsG38Index = labelLine.index('Genomic_Coordinate_hg38')
    refSeqIndex = labelLine.index('Reference_Sequence')
    hgvsCDNAIndex = labelLine.index('HGVS_cDNA')
    hgvsPIndex = labelLine.index('HGVS_Protein')
    geneSymbolIndex = labelLine.index("Gene_Symbol")
    synonymIndex = labelLine.index("Synonyms")

    # gene symbol -> (primary reference sequence, synonym transcripts)
    primaryRefSeq = {'BRCA1': 'NM_007294.3', 'BRCA2': 'NM_000059.3'}
    synonymTranscripts = {
        'BRCA1': [
            'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
            'NM_007297.3', 'U14680.1'
        ],
        'BRCA2': ['U43746.1'],
    }

    for line in brcaFile:
        parsedLine = line.rstrip().split('\t')
        geneSymbol = parsedLine[geneSymbolIndex]

        if geneSymbol in primaryRefSeq:
            parsedLine[refSeqIndex] = primaryRefSeq[geneSymbol]

        # "<refseq>:<chrom>:<offset>:<ref>><alt>"; only the first
        # comma-separated hg38 coordinate is used.
        oldHgvsGenomic38 = parsedLine[refSeqIndex] + ':' + parsedLine[
            hgvsG38Index].split(',')[0]
        genomicFields = oldHgvsGenomic38.split(':')
        chrom38 = genomicFields[1]
        offset38 = genomicFields[2]
        refAlt = genomicFields[3].split('>')
        ref38 = refAlt[0]
        alt38 = refAlt[1]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-':
            ref38 = ''
        if alt38 in ('-', 'None'):
            alt38 = ''

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        cdna_coord = format_cdna38(
            chrom38, offset38, ref38, alt38,
            get_transcript38(parsedLine[refSeqIndex]))

        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        synonymString = [
            format_cdna38(chrom38, offset38, ref38, alt38,
                          get_transcript38(transcriptName))
            for transcriptName in synonymTranscripts.get(geneSymbol, [])
        ]

        # Reset per row so a failed mapping can never reuse an earlier value.
        protein_coord = None
        if wantProtein:
            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                print('hgvs.exceptions.HGVSParseError: ', e)
                print(
                    'GRCh38 Genomic change: ',
                    '{0}:{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38))
                print('')

        # write new data into line
        parsedLine[hgvsG36Index] = '{0}:{1}:{2}>{3}'.format(
            chrom36, offset36, ref36, alt36)
        parsedLine[hgvsG37Index] = '{0}:{1}:{2}>{3}'.format(
            chrom37, offset37, ref37, alt37)
        parsedLine[hgvsG38Index] = '{0}:{1}:{2}>{3}'.format(
            chrom38, offset38, ref38, alt38)
        parsedLine[hgvsCDNAIndex] = '{0}'.format(cdna_coord)
        if wantProtein and protein_coord is not None:
            parsedLine[hgvsPIndex] = '{0}'.format(str(protein_coord))
        parsedLine[synonymIndex] = ','.join(synonymString)
        outputFile.write('\t'.join(parsedLine) + '\n')

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()
    outputFile.close()

Beispiel #16

0

Datei anzeigen

Datei: brca_pseudonym_generator.py Projekt: meredith705/brca-exchange

def main(args):
    """Recompute genomic coordinates, cDNA names and synonyms for BRCA variants.

    Reads the tab-separated variant table ``options.inBRCA``. For every row
    the cDNA HGVS name is derived from the row's ``Chr``/``Pos``/``Ref``/
    ``Alt`` columns against the hg38 genome, that name is mapped back onto
    the hg38, hg19 and hg18 genomes, the ``Synonyms`` column is extended with
    the variant's name on alternative transcripts, and (when
    ``options.calcProtein`` is set) the protein change is derived.  The
    results are stored in newly appended ``pyhgvs_*`` columns and each row is
    written to ``options.outBRCA``.

    Rows whose reference transcript cannot be found in the hg38 transcript
    table are reported on stdout and skipped.

    Args:
        args: Unused; all options are obtained from ``parse_args()``.
    """
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # os.path.join keeps the log inside artifacts_dir even when the directory
    # was given without a trailing separator.
    log_file_path = os.path.join(artifacts_dir,
                                 "brca-pseudonym-generator.log")
    logging.basicConfig(filename=log_file_path,
                        filemode="w",
                        level=logging.DEBUG)

    hdp = hgvs_dataproviders_uta.connect()
    variantmapper = hgvs_variantmapper.EasyVariantMapper(hdp)
    hgvsparser = hgvs_parser.Parser()

    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    def cdna_synonyms(chrom, offset, ref, alt, transcript_names):
        """Name the given hg38 variant on each of ``transcript_names``."""
        return [
            str(
                pyhgvs.format_hgvs_name(chrom,
                                        int(offset),
                                        ref,
                                        alt,
                                        genome38,
                                        get_transcript38(transcript_name),
                                        use_gene=False,
                                        max_allele_length=100))
            for transcript_name in transcript_names
        ]

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    # next() works on both Python 2 and Python 3 iterators.
    input_header_row = next(input_file)

    # The following new columns will contain data generated by this file
    new_columns_to_append = [
        "pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
        "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
        "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"
    ]

    output_header_row = input_header_row + new_columns_to_append

    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns.  Some of these are not read
    # below, but looking them up still fails fast (ValueError) when the input
    # lacks an expected column.
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")
    chrIndex = input_header_row.index("Chr")
    posIndex = input_header_row.index("Pos")
    refIndex = input_header_row.index("Ref")
    altIndex = input_header_row.index("Alt")

    refSeqBRCA1Transcripts = [
        'NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3',
        'NM_007297.3', 'U14680.1'
    ]
    refSeqBRCA2Transcripts = ['U43746.1']
    # Alternative transcripts on which synonyms are generated, per gene.
    synonymTranscriptsByGene = {
        'BRCA1': refSeqBRCA1Transcripts,
        'BRCA2': refSeqBRCA2Transcripts,
    }

    for line in input_file:

        if line[geneSymbolIndex] == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif line[geneSymbolIndex] == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'
        synonymTranscripts = synonymTranscriptsByGene.get(
            line[geneSymbolIndex], [])

        # Store for reference and debugging
        oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(
            ',')[0]

        chrom38 = line[chrIndex]
        offset38 = line[posIndex]
        ref38 = line[refIndex]
        alt38 = line[altIndex]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-': ref38 = ''
        if alt38 == '-': alt38 = ''
        if alt38 == 'None': alt38 = ''
        transcript38 = get_transcript38(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" %
                  (line))
            continue
        cdna_coord = str(
            pyhgvs.format_hgvs_name("chr" + chrom38,
                                    int(offset38),
                                    ref38,
                                    alt38,
                                    genome38,
                                    transcript38,
                                    use_gene=False,
                                    max_allele_length=100))
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(
            cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        if line[synonymIndex] == "-" or line[synonymIndex] == "":
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")
        synonymString.extend(
            cdna_synonyms(chrom38, offset38, ref38, alt38,
                          synonymTranscripts))

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Skip if blank
            if cdna_coord_LOVD == "-" or cdna_coord_LOVD is None or cdna_coord_LOVD == "":
                continue

            # Don't add to synonyms if main hgvs_cDNA field is already equivalent to hgvs_cDNA value from LOVD
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(
                cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
            for cdna_synonym in cdna_synonyms(chrom38LOVD, offset38LOVD,
                                              ref38LOVD, alt38LOVD,
                                              synonymTranscripts):
                if cdna_synonym not in synonymString:
                    synonymString.append(cdna_synonym)

        # Reset per row so that a failed protein calculation can never leak
        # the previous row's value (or an unbound name) into this row.
        protein_coord = None
        if calcProtein == True:

            try:
                var_c1 = hgvsparser.parse_hgvs_variant(cdna_coord)
                protein_coord = variantmapper.c_to_p(var_c1)
            except hgvs.exceptions.HGVSParseError as e:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(e).__name__, e.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print('hgvs.exceptions.HGVSParseError: ', e)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

            # Catch parse errors thrown by ometa.runtime.ParseError.
            except ParseError as ex:
                template = "An exception of type {0} occured. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                genomicChange = '{0}:g.{1}:{2}>{3}'.format(
                    chrom38, offset38, ref38, alt38)
                print(message)
                print('ometa.runtime.ParseError', ex)
                print('Original GRCh38 Genomic Coordinate: ', oldHgvsGenomic38)
                print('GRCh38 Genomic change: ', genomicChange)
                logging.error(message)
                logging.error(line)
                logging.error('Proposed GRCh38 Genomic change for error: %s',
                              genomicChange)

        # Add empty data for each new column to prepare for data insertion by index
        line.extend(['-'] * len(new_columns_to_append))

        line[output_header_row.index(
            "pyhgvs_Genomic_Coordinate_36")] = '{0}:g.{1}:{2}>{3}'.format(
                chrom36, offset36, ref36, alt36)
        line[output_header_row.index(
            "pyhgvs_Genomic_Coordinate_37")] = '{0}:g.{1}:{2}>{3}'.format(
                chrom37, offset37, ref37, alt37)
        line[output_header_row.index(
            "pyhgvs_Genomic_Coordinate_38")] = '{0}:g.{1}:{2}>{3}'.format(
                chrom38, offset38, ref38, alt38)
        line[output_header_row.index("pyhgvs_Hg37_Start")] = str(offset37)
        line[output_header_row.index("pyhgvs_Hg37_End")] = str(
            int(offset37) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_Hg36_Start")] = str(offset36)
        line[output_header_row.index("pyhgvs_Hg36_End")] = str(
            int(offset36) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_cDNA")] = '{0}'.format(cdna_coord)
        # When the protein change could not be computed the column keeps its
        # '-' placeholder.
        if calcProtein == True and protein_coord is not None:
            line[output_header_row.index("pyhgvs_Protein")] = '{0}'.format(
                str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()

Beispiel #17

0

Datei anzeigen

Datei: testloadingscript.py Projekt: BD2KGenomics/brca-exchange

			def get_transcript(name):
			    """Look up ``name`` in the transcripts read from the BRCA refGene file.

			    The file is opened and parsed again on every call.
			    """
			    refgene_path = "../refgene38_brca.txt"
			    with open(refgene_path) as refgene_handle:
			        transcripts_by_name = pyhgvs_utils.read_transcripts(refgene_handle)
			    transcript = transcripts_by_name.get(name)
			    return transcript

Beispiel #18

0

Datei anzeigen

def main(args):
    """Normalize the variants of a BRCA VCF file and write them to a new VCF.

    Every record of ``options.inVCF`` is converted to a cDNA HGVS name on the
    BRCA2 (chr13) or BRCA1 (chr17) transcript.  Single-base substitutions are
    normalized with the hgvs ``Normalizer``; all other variants are mapped to
    a genomic HGVS name and converted back to VCF fields with ``Babelfish``.
    The record's POS/REF/ALT are then overwritten and the record is written
    to ``options.outVCF``.  Only the first ALT allele of a record is kept.

    If normalization raises one of the handled errors, or the record is on a
    chromosome other than chr13/chr17, the record is written with its
    original position and alleles.

    Args:
        args: Unused; all options are obtained from ``parse_args()``.
    """
    options = parse_args()

    hdp = hgvs.dataproviders.uta.connect()
    am38 = hgvs.assemblymapper.AssemblyMapper(hdp, assembly_name='GRCh38')
    hn = hgvs.normalizer.Normalizer(hdp)
    hp = hgvs.parser.Parser()
    # Read genome sequence using pyfaidx
    genome = Fasta(options.refFASTA)

    # Read RefSeq transcripts into a python dict.
    with open(options.refSEQ) as infile:
        transcripts = pyhgvs_utils.read_transcripts(infile)

    # Provide a callback for fetching a transcript by its name.
    def get_transcript(name):
        return transcripts.get(name)

    babelfish38 = Babelfish(hdp, assembly_name="GRCh38")

    ## extract base variant representation
    with open(options.inVCF, 'rb') as in_vcf, open(options.outVCF,
                                                   'w') as out_vcf:
        vcf_reader = vcf.Reader(in_vcf)
        vcf_writer = vcf.Writer(out_vcf, vcf_reader)
        for record in vcf_reader:
            # Convert variants for indel HGVS representation
            chrom, offset, ref, alt = (str(record.CHROM), record.POS,
                                       str(record.REF), str(record.ALT[0]))
            print('chrom: {}, offset: {}, ref: {}, alt: {}'.format(
                chrom, offset, ref, alt))
            on_chr17 = 'chr17' in record.CHROM
            if 'chr13' in record.CHROM:
                transcript_id = "NM_000059.3"
            elif on_chr17:
                transcript_id = "NM_007294.4"
            else:
                # Previously this case reused the transcript of the preceding
                # record (or failed on the first record with an unbound
                # name); such records are now passed through untouched.
                transcript_id = None

            if transcript_id is None:
                print('No BRCA transcript for chromosome {}; '
                      'record left unnormalized'.format(record.CHROM))
            else:
                transcript = get_transcript(transcript_id)
                try:
                    hgvs_name = pyhgvs.format_hgvs_name(
                        chrom,
                        offset,
                        ref,
                        alt,
                        genome,
                        transcript,
                        use_gene=False,
                        max_allele_length=50000)
                    hgvs_c = hp.parse_hgvs_variant(hgvs_name)
                    # The hgvs calls below are made with accession
                    # NM_007294.3 instead of NM_007294.4 -- presumably
                    # because the data provider lacks the newer version;
                    # TODO confirm.
                    if on_chr17: hgvs_c.ac = 'NM_007294.3'
                    if len(ref) == len(alt) and len(ref) == 1:
                        # Variant is a SNP, normalize using hgvs Normalizer function
                        norm_hgvs_c = hn.normalize(hgvs_c)
                        if on_chr17: norm_hgvs_c.ac = 'NM_007294.4'
                        chrom, offset, ref, alt = pyhgvs.parse_hgvs_name(
                            str(norm_hgvs_c),
                            genome,
                            normalize=False,
                            get_transcript=get_transcript)
                    else:
                        # Variant is an INDEL, normalize using hgvs babelfish38.hgvs_to_vcf function
                        hgvs_g = am38.c_to_g(hgvs_c)
                        vcf_values = babelfish38.hgvs_to_vcf(hgvs_g)
                        chrom, offset, ref, alt = 'chr{}'.format(
                            vcf_values[0]
                        ), vcf_values[1], vcf_values[2], vcf_values[3]
                except hgvs.exceptions.HGVSUnsupportedOperationError as e:
                    print('hgvs.exceptions.HGVSUnsupportedOperationError: {}'.
                          format(e))
                except hgvs.exceptions.HGVSInvalidIntervalError as e:
                    print('hgvs.exceptions.HGVSInvalidIntervalError: {}'.
                          format(e))
                except hgvs.exceptions.HGVSInvalidVariantError as e:
                    print('hgvs.exceptions.HGVSInvalidVariantError: {}'.
                          format(e))
                except AttributeError as e:
                    print('AttributeError: {}'.format(e))
                except KeyError as e:
                    print('KeyError: {}'.format(e))
            # Update and write the new normalized record
            record.POS = offset
            record.REF = ref
            record.ALT = [alt]
            vcf_writer.write_record(record)

Beispiel #19

0

Datei anzeigen

Datei: enigma-processing.py Projekt: BD2KGenomics/brca-pipeline

def get_transcript(name):
    """Look up ``name`` in the transcripts read from the hg38 BRCA refGene file.

    The file is opened and parsed again on every call.
    """
    refgene_path = "../resources/refseq/hg38.BRCA.refGene.txt"
    with open(refgene_path) as refgene_handle:
        transcripts_by_name = pyhgvs_utils.read_transcripts(refgene_handle)
    transcript = transcripts_by_name.get(name)
    return transcript

Beispiel #20

0

Datei anzeigen

import toolshed as ts
from collections import defaultdict
import pyhgvs as hgvs
from pygr.seqdb import SequenceFileDB
from pyhgvs.utils import read_transcripts
import sys


# Command-line arguments; the trailing comments show example values.
# argv[1] and argv[2] are kept as given, argv[3] is opened for writing.
var=sys.argv[1] #'ogfiles/fetalgenomevariants.txt'
trans=sys.argv[2] #'ogfiles/fetaltranscripts.txt'
f=open(sys.argv[3],'w') #'fetalvariants.vcf'

# Transcript definitions are read once at import time into the module-level
# ``transcripts`` object used by get_transcript().
infile='Homo_sapiens.GRCh37.75.genePred' #obtained by running gtfToGenePred UCSC binary on the GTF of GRCh37 from ENSEMBL. added 0 as placeholder undocumented id field to the front of each line using sed
with open(infile) as reffile:
    transcripts=read_transcripts(reffile)
def get_transcript(name):
    """Fetch the transcript called ``name`` from the module-level ``transcripts``."""
    transcript = transcripts.get(name)
    return transcript
def translate(variant,transcripts,get_transcript):
    """Convert an HGVS name into ``(chrom, offset, ref, alt)`` on hg19.

    Args:
        variant: HGVS name to parse.
        transcripts: Unused; kept so existing callers keep working.
        get_transcript: Callback passed to ``parse_hgvs_name`` for looking up
            a transcript by name.

    Returns:
        The ``(chrom, offset, ref, alt)`` tuple, or the integer ``1`` when
        the name could not be parsed.
    """
    genome = SequenceFileDB('hg19.fa') #pip install bsddb3 is required
    try:
        chrom, offset, ref, alt = hgvs.parse_hgvs_name(variant, genome, get_transcript=get_transcript)
    except Exception:
        # Any parsing failure is reported through the sentinel value 1.
        # ``except Exception`` (instead of a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return 1
    return chrom, offset, ref, alt

def readgenes(trans):
    """Map the first column of a tab-separated file to its second column.

    ``trans`` is opened with ``ts.nopen``.  When a first-column value occurs
    on several lines, the last line wins.  The result is a
    ``defaultdict(str)``.
    """
    mapping = defaultdict(str)
    for raw_line in ts.nopen(trans):
        columns = raw_line.rstrip('\r\n').split("\t")
        symbol, transcript_id = columns[0], columns[1]
        mapping[symbol] = transcript_id
    return mapping

Beispiel #21

0

Datei anzeigen

Datei: functional_assays_to_vcf.py Projekt: BD2KGenomics/brca-exchange

def main():
    """Convert a tab-separated functional-assay table into VCF lines.

    Options come from ``parse_args()``.  The VCF header is built from the
    annotation file (one ``name<TAB>description`` pair per line).  Every data
    row whose ``hgvs_nucleotide`` value can be parsed on the BRCA1/BRCA2
    reference transcript becomes one VCF line carrying all of the row's
    fields in its INFO column; rows that are skipped are noted in the debug
    log.
    """
    opts = parse_args()
    assay_table = opts.input
    annotation_path = opts.inAnnot
    vcf_out = opts.out
    fasta_path = opts.gpath
    refgene_path = opts.rpath
    source_name = opts.source
    log_path = opts.logfile

    logging_level = logging.DEBUG if opts.verbose else logging.CRITICAL
    logging.basicConfig(filename=log_path, filemode="w", level=logging_level)

    with open(refgene_path) as refgene_handle:
        transcripts = hgvs_utils.read_transcripts(refgene_handle)

    genome = SequenceFileDB(fasta_path)

    def get_transcript(name):
        return transcripts.get(name)

    def emit(text):
        """Write one line to the output VCF."""
        print(text, file=vcf_out)

    # annotation name -> description, taken from the first two columns
    annotations = defaultdict()
    with open(annotation_path) as annotation_handle:
        for raw_annotation in annotation_handle:
            name_and_description = raw_annotation.strip().split('\t')
            annotations[name_and_description[0]] = name_and_description[1]

    # VCF header
    emit('##fileformat=VCFv4.0')
    emit('##source={0}'.format(source_name))
    emit('##reference=GRCh37')
    for annotation_name, description in annotations.items():
        info_id = annotation_name.replace(' ', '_')
        emit('##INFO=<ID={0},Number=.,Type=String,Description="{1}">'.format(info_id, description))
    emit('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO')

    # Column names of the input table (spaces become underscores, quotes are
    # dropped) and the position of each column.
    header = assay_table.readline().strip().replace(' ', '_').replace('"', '').split('\t')

    column_of = defaultdict()
    column_of.update((column, position) for position, column in enumerate(header))

    reference_transcripts = {'brca1': 'NM_007294.3', 'brca2': 'NM_000059.3'}

    for raw_row in assay_table:
        cells = raw_row.replace('"', '').strip().split('\t')
        info_items = [
            '{0}={1}'.format(column, normalize(column, cells[column_of[column]]))
            for column in header
        ]

        # extract hgvs cDNA term for variant and cleanup formatting
        hgvs_name = cells[column_of['hgvs_nucleotide']]
        if hgvs_name == '-':
            logging.debug("hgvs name == '-' for line: %s", cells)
            continue
        gene = cells[column_of['gene_symbol']].lower()
        if gene not in reference_transcripts:
            logging.debug("improper gene symbol: %s", gene)
            continue
        query_name = reference_transcripts[gene] + ':' + hgvs_name.rstrip().split(';')[0]
        info_string = ';'.join(info_items)
        try:
            chrom, offset, ref, alt = hgvs.parse_hgvs_name(query_name, genome, get_transcript=get_transcript)
            chrom = chrom.replace('chr', '')
            emit('{0}\t{1}\t{2}\t{3}\t{4}\t.\t.\t{5}'.format(chrom, offset, query_name, ref, alt, info_string))
        except Exception:
            logging.debug("could not parse hgvs field: %s", query_name)

Beispiel #22

0

Datei anzeigen

Datei: HGVS_conversion.py Projekt: BD2KGenomics/brca-exchange

import pyhgvs
import pyhgvs.utils as pyhgvs_utils
from pygr.seqdb import SequenceFileDB
import sys
import os
import json
import string_comp
# Hard-coded data file paths, relative to the working directory.
# NOTE(review): their exact roles are not visible in this excerpt -- judging
# by the names, ORIGINAL_FILE is the ClinVar input, FILE the pre-processed
# table and ERROR the list of rows with wrong genome coordinates; confirm.
ORIGINAL_FILE = "../BRCA_selectedLabs_only/ClinVarBRCA.selectedLabsOnly.txt"


ERROR = "../BRCA_selectedLabs_only/BRCA.wrong_genome_Coor"
FILE = "../BRCA_selectedLabs_only/BRCA.pre-processed"

# The hg19 genome and the BRCA transcript table are loaded once, at import
# time.
GENOME = SequenceFileDB("../reference_files/hg19.fa")
with open('../reference_files/genes.refGene.BRCA.txt') as infile:
    transcripts = pyhgvs_utils.read_transcripts(infile)
def get_transcript(name):
    """Fetch the transcript called ``name`` from the module-level ``transcripts``."""
    transcript = transcripts.get(name)
    return transcript

def main():
    """Run the HGVS conversion error check; the conversion step is disabled."""
    check_HGVS_conversion_error()
    #HGVS_conversion()



def HGVS_to_genome_coor(HGVS):
    try:
        chrm, pos, ref, alt = pyhgvs.parse_hgvs_name(
            HGVS, GENOME, get_transcript=get_transcript)
        chrm = chrm[3:]
        pos = str(pos)

Beispiel #23

0

Datei anzeigen

  ('NM_000352.3', 'c', '>', CDNACoord(215, -10), CDNACoord(215, -10), 'A', 'G')

"""
from __future__ import print_function

from __future__ import unicode_literals
import pyhgvs as hgvs
import pyhgvs.utils as hgvs_utils
from pyfaidx import Fasta

# Read genome sequence using pyfaidx.  ``Fasta`` is the class imported above
# from pyfaidx; this line previously called ``Genome``, a name the script
# never imports, which raised NameError.
genome = Fasta('/tmp/hg19.fa')

# Read RefSeq transcripts into a python dict.
with open('pyhgvs/data/genes.refGene') as infile:
    transcripts = hgvs_utils.read_transcripts(infile)


# Provide a callback for fetching a transcript by its name.
def get_transcript(name):
    """Fetch the transcript called ``name`` from the module-level ``transcripts``."""
    transcript = transcripts.get(name)
    return transcript


# Parse the HGVS name into genomic coordinates and alleles.
chrom, offset, ref, alt = hgvs.parse_hgvs_name('NM_000352.3:c.215A>G',
                                               genome,
                                               get_transcript=get_transcript)
print(chrom, offset, ref, alt)
# Prints the variant in VCF style: ('chr11', 17496508, 'T', 'C')
# Notice that since the transcript is on the negative strand, the alleles
# are reverse complemented during conversion.

Beispiel #24

0

Datei anzeigen

Datei: brca_pseudonym_generator.py Projekt: BD2KGenomics/brca-exchange

def main(args):
    """Append pyhgvs-derived coordinate, cDNA, protein and synonym data to a BRCA file.

    Reads the tab-delimited input named by the parsed options, and for every
    row:

    * overwrites the reference-sequence column with a fixed transcript chosen
      by gene symbol (BRCA1 or BRCA2),
    * formats a cDNA HGVS name from the row's Chr/Pos/Ref/Alt values on the
      hg38 genome and parses it back against the hg38, hg19 and hg18 genomes,
    * extends the synonym column with the variant expressed on the alternate
      transcripts and with any differing LOVD cDNA names,
    * optionally computes a protein change through the ``hgvs`` package, and
    * writes the row, extended by the new ``pyhgvs_*`` columns, to the output.

    Rows whose transcript cannot be found in the hg38 transcript set are
    reported on stdout and skipped.

    Args:
        args: Not used in the body; options come from ``parse_args()``.

    Raises:
        EnvironmentError: If the protein computation fails with an exception
            whose type name is ``DatabaseError``.
        ValueError: If a required column name is missing from the input
            header (raised by ``list.index``).
    """

    # NOTE(review): parse_args() is called without ``args`` -- confirm it reads
    # the command line itself.
    options = parse_args()
    brcaFile = options.inBRCA
    hg18_fa = options.inHg18
    hg19_fa = options.inHg19
    hg38_fa = options.inHg38
    refSeq18 = options.inRefSeq18
    refSeq19 = options.inRefSeq19
    refSeq38 = options.inRefSeq38
    outputFile = options.outBRCA
    calcProtein = options.calcProtein
    artifacts_dir = options.artifacts_dir

    if not os.path.exists(artifacts_dir):
        os.makedirs(artifacts_dir)
    # NOTE(review): plain string concatenation -- only yields a path inside
    # artifacts_dir when artifacts_dir already ends with a path separator.
    log_file_path = artifacts_dir + "brca-pseudonym-generator.log"
    logging.basicConfig(filename=log_file_path, filemode="w", level=logging.DEBUG)

    # Objects from the ``hgvs`` package, used only for the optional protein
    # computation further down.
    hgvs_parser = hgvs.parser.Parser()
    hgvs_dp = hgvs.dataproviders.uta.connect()
    hgvs_norm = hgvs.normalizer.Normalizer(hgvs_dp)
    hgvs_am = hgvs.assemblymapper.AssemblyMapper(hgvs_dp, assembly_name='GRCh38')

    # One reference genome and one transcript table per build (hg18/19/38).
    genome36 = SequenceFileDB(hg18_fa.name)
    genome37 = SequenceFileDB(hg19_fa.name)
    genome38 = SequenceFileDB(hg38_fa.name)

    transcripts36 = pyhgvs_utils.read_transcripts(refSeq18)
    transcripts37 = pyhgvs_utils.read_transcripts(refSeq19)
    transcripts38 = pyhgvs_utils.read_transcripts(refSeq38)

    # Per-build lookup callbacks handed to pyhgvs.parse_hgvs_name.
    def get_transcript36(name):
        return transcripts36.get(name)

    def get_transcript37(name):
        return transcripts37.get(name)

    def get_transcript38(name):
        return transcripts38.get(name)

    hgvsG36ColumnName = 'Genomic_Coordinate_hg36'
    hgvsG37ColumnName = 'Genomic_Coordinate_hg37'
    hgvsG38ColumnName = 'Genomic_Coordinate_hg38'
    refSeqColumnName = 'Reference_Sequence'
    hgvsCDNAColumnName = 'HGVS_cDNA'
    hgvsCDNALOVDColumnName = 'HGVS_cDNA_LOVD'
    hgvsPColumnName = 'HGVS_Protein'

    # Set up header for output file
    input_file = csv.reader(brcaFile, delimiter='\t')
    output_file = csv.writer(outputFile, delimiter='\t')
    # NOTE(review): ``.next()`` is the Python 2 iterator method; this line
    # ties the function to Python 2.
    input_header_row = input_file.next()

    # The following new columns will contain data generated by this file
    new_columns_to_append = ["pyhgvs_Genomic_Coordinate_36", "pyhgvs_Genomic_Coordinate_37",
                          "pyhgvs_Genomic_Coordinate_38", "pyhgvs_Hg37_Start", "pyhgvs_Hg37_End",
                          "pyhgvs_Hg36_Start", "pyhgvs_Hg36_End", "pyhgvs_cDNA", "pyhgvs_Protein"]

    output_header_row = input_header_row + new_columns_to_append

    output_file.writerow(output_header_row)

    # Store indexes of the relevant columns
    hgvsG36Index = input_header_row.index(hgvsG36ColumnName)
    hgvsG37Index = input_header_row.index(hgvsG37ColumnName)
    hgvsG38Index = input_header_row.index(hgvsG38ColumnName)
    refSeqIndex = input_header_row.index(refSeqColumnName)
    hgvsCDNAIndex = input_header_row.index(hgvsCDNAColumnName)
    hgvsPIndex = input_header_row.index(hgvsPColumnName)
    hgvsCDNALOVDIndex = input_header_row.index(hgvsCDNALOVDColumnName)
    geneSymbolIndex = input_header_row.index("Gene_Symbol")
    synonymIndex = input_header_row.index("Synonyms")

    # Alternate transcripts on which each variant is additionally expressed
    # to build the synonym list.
    refSeqBRCA1Transcripts = ['NM_007294.2', 'NM_007300.3', 'NM_007299.3', 'NM_007298.3', 'NM_007297.3', 'U14680.1']
    refSeqBRCA2Transcripts = ['U43746.1']

    for line in input_file:
        # Force a single reference transcript per gene, replacing whatever
        # the input row carried.
        if line[geneSymbolIndex] == 'BRCA1':
            line[refSeqIndex] = 'NM_007294.3'
        elif line[geneSymbolIndex] == 'BRCA2':
            line[refSeqIndex] = 'NM_000059.3'

        # Store for reference and debugging
        # NOTE(review): not read again anywhere in this function.
        oldHgvsGenomic38 = line[refSeqIndex] + ':' + line[hgvsG38Index].split(',')[0]

        chrom38 = line[input_header_row.index("Chr")]
        offset38 = line[input_header_row.index("Pos")]
        ref38 = line[input_header_row.index("Ref")]
        alt38 = line[input_header_row.index("Alt")]

        # Edge cases to correct variant string formats for indels in order to be accepted by the counsyl parser
        if ref38 == '-': ref38 = ''
        if alt38 == '-': alt38 = ''
        if alt38 == 'None': alt38 = ''
        transcript38 = get_transcript38(line[refSeqIndex])
        transcript37 = get_transcript37(line[refSeqIndex])
        transcript36 = get_transcript36(line[refSeqIndex])

        # Normalize hgvs cdna string to fit what the counsyl hgvs parser determines to be the correct format
        if transcript38 is None:
            print("ERROR: could not parse transcript38 for variant: %s \n" % (line))
            continue
        cdna_coord = str(pyhgvs.format_hgvs_name("chr" + chrom38, int(offset38), ref38, alt38, genome38, transcript38, use_gene=False, max_allele_length=100))
        # Round-trip the cDNA name through each build; from here on chrom38,
        # offset38, ref38 and alt38 hold the values returned by the parser,
        # not the raw column values.
        chrom38, offset38, ref38, alt38 = pyhgvs.parse_hgvs_name(cdna_coord, genome38, get_transcript=get_transcript38)
        chrom37, offset37, ref37, alt37 = pyhgvs.parse_hgvs_name(cdna_coord, genome37, get_transcript=get_transcript37)
        chrom36, offset36, ref36, alt36 = pyhgvs.parse_hgvs_name(cdna_coord, genome36, get_transcript=get_transcript36)

        # Generate transcript hgvs cdna synonym string
        # ("-" and "" both mean there are no existing synonyms.)
        if line[synonymIndex] == "-":
            synonymString = []
        elif line[synonymIndex] == "":
            synonymString = []
        else:
            synonymString = line[synonymIndex].split(",")
        # NOTE(review): transcript38 is rebound inside these loops, and a
        # transcript name missing from transcripts38 makes it None before it
        # is passed to format_hgvs_name -- confirm every alternate transcript
        # is present in the hg38 refSeq file.
        if line[geneSymbolIndex] == 'BRCA1':
            for transcriptName in refSeqBRCA1Transcripts:
                transcript38 = get_transcript38(transcriptName)
                cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38, genome38, transcript38, use_gene=False, max_allele_length=100))
                synonymString.append(cdna_synonym)
        elif line[geneSymbolIndex] == 'BRCA2':
            for transcriptName in refSeqBRCA2Transcripts:
                transcript38 = get_transcript38(transcriptName)
                cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38, int(offset38), ref38, alt38, genome38, transcript38, use_gene=False, max_allele_length=100))
                synonymString.append(cdna_synonym)

        # Add hgvs_cDNA values from LOVD to synonyms if not already present
        for cdna_coord_LOVD in line[hgvsCDNALOVDIndex].split(','):
            # Skip if blank
            # NOTE(review): this check runs before strip(), so a value such
            # as " -" is not skipped here.
            if cdna_coord_LOVD == "-" or cdna_coord_LOVD is None or cdna_coord_LOVD == "":
                continue

            cdna_coord_LOVD = cdna_coord_LOVD.strip()

            # Don't add to synonyms if main hgvs_cDNA field is already equivalent to hgvs_cDNA value from LOVD
            # NOTE(review): this split sits outside the try below, so a value
            # without ':' raises IndexError and aborts the run.
            cdna_coord_LOVD_for_comparison = cdna_coord_LOVD.split(':')[1]
            if cdna_coord_LOVD_for_comparison in line[hgvsCDNAIndex]:
                continue

            # Best effort: a LOVD name that cannot be parsed or re-formatted
            # is printed and skipped rather than failing the row.
            try:
                chrom38LOVD, offset38LOVD, ref38LOVD, alt38LOVD = pyhgvs.parse_hgvs_name(cdna_coord_LOVD, genome38, get_transcript=get_transcript38)
                if line[geneSymbolIndex] == 'BRCA1':
                    for transcriptName in refSeqBRCA1Transcripts:
                        transcript38 = get_transcript38(transcriptName)
                        cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38LOVD, int(offset38LOVD), ref38LOVD, alt38LOVD, genome38, transcript38, use_gene=False, max_allele_length=100))
                        if cdna_synonym not in synonymString:
                            synonymString.append(cdna_synonym)
                elif line[geneSymbolIndex] == 'BRCA2':
                    for transcriptName in refSeqBRCA2Transcripts:
                        transcript38 = get_transcript38(transcriptName)
                        cdna_synonym = str(pyhgvs.format_hgvs_name(chrom38LOVD, int(offset38LOVD), ref38LOVD, alt38LOVD, genome38, transcript38, use_gene=False, max_allele_length=100))
                        if cdna_synonym not in synonymString:
                            synonymString.append(cdna_synonym)
            except Exception as e:
                print('parse error: {}'.format(cdna_coord_LOVD))
                print(e)

        # Stays None when the protein computation is disabled or fails.
        protein_coord = None
        if calcProtein:
            try:
                # Built first so the error handler below can always log it.
                genomic_change = '{0}:g.{1}:{2}>{3}'.format(chrom38, offset38, ref38, alt38)
                var_c1 = hgvs_parser.parse_hgvs_variant(cdna_coord)
                var_c1_norm = hgvs_norm.normalize(var_c1) # doing normalization explicitly to get a useful error message
                protein_coord = hgvs_am.c_to_p(var_c1_norm)
            except Exception as e:
                template = "An error of type {0} occured. Arguments:{1!r}"
                error_name = type(e).__name__
                message = template.format(error_name, e.args)
                logging.error(message)
                logging.error('Proposed GRCh38 Genomic change for error: %s', genomic_change)
                logging.error(line)

                # Exceptions related to invalid data
                data_errors = set(['HGVSParseError', 'HGVSError', 'HGVSInvalidVariantError', 'HGVSUnsupportedOperationError'])
                if error_name not in data_errors:
                    # output some more if exception doesn't seem to be related to invalid data
                    logging.error("Non data error raised")
                    logging.exception(message)

                if error_name == "DatabaseError":
                    # Aborting, as it is a transient error in principle, i.e. in one run we might be able to obtain a protein change, in another not, messing up the data diffs
                    raise EnvironmentError("Issue with UTA database. Aborting")

        # Add empty data for each new column to prepare for data insertion by index
        for i in range(len(new_columns_to_append)):
            line.append('-')

        line[output_header_row.index("pyhgvs_Genomic_Coordinate_36")] = '{0}:g.{1}:{2}>{3}'.format(chrom36,offset36,ref36,alt36)
        line[output_header_row.index("pyhgvs_Genomic_Coordinate_37")] = '{0}:g.{1}:{2}>{3}'.format(chrom37,offset37,ref37,alt37)
        line[output_header_row.index("pyhgvs_Genomic_Coordinate_38")] = '{0}:g.{1}:{2}>{3}'.format(chrom38,offset38,ref38,alt38)
        # NOTE(review): both End columns are derived from len(ref38) rather
        # than the reference allele of their own build -- confirm intended.
        line[output_header_row.index("pyhgvs_Hg37_Start")] = str(offset37)
        line[output_header_row.index("pyhgvs_Hg37_End")] = str(int(offset37) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_Hg36_Start")] = str(offset36)
        line[output_header_row.index("pyhgvs_Hg36_End")] = str(int(offset36) + len(ref38) - 1)
        line[output_header_row.index("pyhgvs_cDNA")] = '{0}'.format(cdna_coord)
        # When protein calculation is disabled the column keeps its '-'
        # placeholder; when it failed, the text "None" is written.
        if calcProtein == True:
            line[output_header_row.index("pyhgvs_Protein")] = '{0}'.format(str(protein_coord))
        line[synonymIndex] = ','.join(synonymString)

        output_file.writerow(line)

    # NOTE(review): these closes only run when the loop finishes normally; an
    # exception above leaves the files to be closed at interpreter exit.
    hg18_fa.close()
    hg19_fa.close()
    hg38_fa.close()
    refSeq18.close()
    refSeq19.close()
    refSeq38.close()