def main(inFileName, geneList=None):
    """Collapse a tab-separated mutation table into one line per variant.

    NOTE(review): a second ``main`` with the same logic is defined further
    down in this file and shadows this one at import time -- confirm which
    copy is meant to survive.

    The first line of ``inFileName`` is a header; the columns listed in
    ``nameL`` below are located by name, so their order in the file does not
    matter.  (The column names look like a COSMIC export -- TODO confirm.)

    A row is skipped when
      * ``geneList`` is non-empty and the gene (with any ``_ENST...`` suffix
        removed) is not in it,
      * the genome position is empty, or
      * the somatic-status text does not contain ``'somatic'``.

    Remaining rows are grouped by ``(chrom, start, end, strand, ref, alt)``
    and each group is written to stdout as one tab-separated line:
    ``chr<chrom>, start, end, strand, ref, alt, genes, cds, aa, desc`` with
    the last four columns comma-joined.  Gene names starting with ``ENSG``
    are left out of the gene column.

    Args:
        inFileName: path of the tab-separated input file.
        geneList: optional collection of gene names to keep; ``None`` or an
            empty collection keeps every gene.

    Raises:
        ValueError: a required column is missing from the header.
        AttributeError: a non-empty genome position is not of the form
            ``chrom:start-end``.
    """
    # nameL = ('Mutation GRCh37 genome position', 'Mutation GRCh37 strand','Gene name','ID_sample','ID_tumour','Primary site', \
    #     'Site subtype','Primary histology','Histology subtype','Genome-wide screen','Mutation ID','Mutation CDS','Mutation AA', \
    #     'Mutation Description','Mutation zygosity','Mutation somatic status','Pubmed_PMID','Sample source','Tumor origin','Comments')
    nameL = ('Gene name', 'Mutation CDS', 'Mutation AA', 'Mutation Description',
             'Mutation GRCh37 genome position', 'Mutation GRCh37 strand',
             'Mutation somatic status')

    # Numeric chromosome codes that are rewritten to letters on output.
    chromAliasH = {'23': 'X', '24': 'Y', '25': 'M'}

    # Compiled once instead of once per row; raw strings keep the
    # backslashes as regex escapes rather than (invalid) string escapes.
    coordRe = re.compile(r'([^:-]+):([^:-]+)-([^:-]+)')
    cdsRe = re.compile(r'c\.[\+\-_0-9]+([atgcATGC]*)(>|ins|del)([atgcATGC]*)')

    dataH = {}

    # 'with' guarantees the handle is closed even if a row fails to parse.
    with open(inFileName) as inFile:
        # Strip only the line terminator: slicing off the last character
        # would damage a final line that has no trailing newline.
        headerL = inFile.readline().rstrip('\r\n').split('\t')
        idxH = dict((x, headerL.index(x)) for x in nameL)

        for line in inFile:
            valueL = line.rstrip('\r\n').split('\t')

            geneN = valueL[idxH['Gene name']]
            if '_ENST' in geneN:
                geneN = geneN.split('_ENST')[0]
            if geneList and geneN not in geneList:
                continue

            coord = valueL[idxH['Mutation GRCh37 genome position']]
            if not coord:
                continue

            somatic = valueL[idxH['Mutation somatic status']]
            if 'somatic' not in somatic:
                continue

            (chrNum, chrSta, chrEnd) = coordRe.search(coord).groups()
            chrNum = chromAliasH.get(chrNum, chrNum)

            cds = valueL[idxH['Mutation CDS']]
            aa = valueL[idxH['Mutation AA']]
            desc = valueL[idxH['Mutation Description']]
            strand = valueL[idxH['Mutation GRCh37 strand']]

            # ref/alt are whatever bases the CDS string spells out around
            # '>', 'ins' or 'del'; both stay empty when it does not match.
            rm = cdsRe.match(cds)
            if rm:
                (ref, _vtype, alt) = rm.groups()
            else:
                ref, alt = '', ''

            if strand == '-':
                ref = mybasic.rc(ref)
                alt = mybasic.rc(alt)

            # NOTE: a disabled step used to follow here that, for deletions,
            # compared the deleted bases with the hg19 reference via
            # `samtools faidx` and shifted the start coordinate by one.

            key = (chrNum, chrSta, chrEnd, strand, ref, alt)
            if key in dataH:
                mybasic.pushHash(dataH[key], 'geneN', geneN)
                mybasic.pushHash(dataH[key], 'cds', cds)
                mybasic.pushHash(dataH[key], 'aa', aa)
                mybasic.pushHash(dataH[key], 'desc', desc)
            else:
                dataH[key] = {
                    'geneN': set([geneN]),
                    'cds': set([cds]),
                    'aa': set([aa]),
                    'desc': set([desc]),
                }

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for ((chrNum, chrSta, chrEnd, strand, ref, alt), infoH) in dataH.items():
        geneS = ','.join([g for g in infoH['geneN'] if not g.startswith('ENSG')])
        sys.stdout.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
            chrNum, chrSta, chrEnd, strand, ref, alt, geneS,
            ','.join(infoH['cds']), ','.join(infoH['aa']), ','.join(infoH['desc'])))
def parse_info(info, ref, indexH):
    """Summarise a multi-entry annotation string into a per-gene dict.

    ``info`` is split on ``','`` into entries and each entry on ``'|'`` into
    fields; ``indexH`` maps a field name to its position in that list.  The
    field names used here are ``Feature_type``, ``Consequence``, ``SYMBOL``,
    ``HGVSc``, ``Protein_position``, ``Amino_acids``, ``CANONICAL`` and
    ``Codons`` (these look like VEP CSQ fields -- TODO confirm).

    Entries are handled as follows:
      * ``RegulatoryFeature``; ``MotifFeature`` whose consequence is
        ``TF_binding_site_variant`` or contains ``TFBS_``; or an empty
        feature type with consequence ``intergenic_variant``: recorded under
        the pseudo-gene ``'-'`` with strand ``'*'``.
      * ``Transcript`` whose consequence mentions up/downstream_gene_variant
        and no ``splice_``: recorded under ``'-'`` as ``intergenic_variant``.
      * any other ``Transcript``: recorded under its ``SYMBOL`` with the
        shortened consequence (``ch_type``), protein change (``ch_aa``) and
        cDNA change (``ch_dna``); ``*_C`` variants of those keys are also
        filled when the ``CANONICAL`` field is non-empty.  Strand is ``'+'``
        when ``ref`` equals the upper-case part of the reference codon and
        ``'-'`` otherwise.
      * everything else is ignored.

    Values are accumulated through ``mybasic.pushHash``; the container type
    it builds is not visible from this function.

    Args:
        info: comma-separated annotation entries, each ``'|'``-delimited.
        ref: reference allele, compared with the changed codon bases.
        indexH: mapping of field name to index within an entry.

    Returns:
        dict mapping gene symbol (or ``'-'``) to a dict of accumulated values.
    """
    # Captures the upper-case (changed) bases on each side of the '/'.
    codonRe = re.compile('[nacgt]*([-ACGT]*)[nacgt]*/[nacgt]*([-ACGT]*)[nacgt]*')

    resH = {}

    for item in info.split(','):
        arr = item.split('|')
        featureType = arr[indexH['Feature_type']]

        # The Consequence field is only read for feature types that need it,
        # so entries of an unrelated type never touch that index.
        if featureType == 'RegulatoryFeature':
            nonGenic = True
        elif featureType == 'MotifFeature':
            motifCsq = arr[indexH['Consequence']]
            nonGenic = motifCsq == 'TF_binding_site_variant' or 'TFBS_' in motifCsq
        elif featureType == '':
            nonGenic = arr[indexH['Consequence']] == 'intergenic_variant'
        else:
            nonGenic = False

        if nonGenic:
            gene = '-'
            if gene not in resH:
                resH[gene] = {'ch_type': ''}
            mybasic.pushHash(resH[gene], 'ch_type', arr[indexH['Consequence']])
            mybasic.pushHash(resH[gene], 'strand', '*')
            continue

        if featureType != 'Transcript':
            continue

        csq = arr[indexH['Consequence']]

        # if ('non_coding_exon_variant' in csq or 'nc_transcript_variant' in csq) and 'splice_' not in csq and 'miRNA' not in csq:
        #     continue ## non-coding scripts other than miRNA

        if ('upstream_gene_variant' in csq or 'downstream_gene_variant' in csq) and 'splice_' not in csq:
            gene = '-'
            if gene not in resH:
                resH[gene] = {'ch_type': ''}
            ## treat up-, down-stream gene variants as intergenic
            mybasic.pushHash(resH[gene], 'strand', '*')
            mybasic.pushHash(resH[gene], 'ch_type', 'intergenic_variant')
            continue

        # if 'intron_variant' in csq and 'splice_' not in csq:
        #     continue ## intron

        csq = shorten_csq(csq)

        gene = arr[indexH['SYMBOL']]
        if gene not in resH:
            resH[gene] = {'ch_type': ''}

        # Keep only the part after 'transcript:' when a prefix is present.
        ch_dna = arr[indexH['HGVSc']]
        if len(ch_dna.split(':')) > 1:
            ch_dna = ch_dna.split(':')[1]

        prot_pos = arr[indexH['Protein_position']]
        canonical = arr[indexH['CANONICAL']]

        # Amino-acid changes are reported for 'YES' canonical entries only.
        aa = arr[indexH['Amino_acids']]
        if canonical != 'YES':
            aa = ''

        if len(aa) < 1:
            ch_aa = ''
        elif '/' not in aa:
            # Single residue: same amino acid on both sides of the position.
            ch_aa = 'p.%s%s%s' % (aa, prot_pos, aa)
        else:
            (aa1, aa2) = aa.split('/')
            ch_aa = 'p.%s%s%s' % (aa1, prot_pos, aa2)

        codon = arr[indexH['Codons']]
        if len(codon) > 0:
            nt1 = codonRe.match(codon).group(1)
            if ref != nt1:
                strand = '-'
            else:
                strand = '+'

            if 'strand' in resH[gene]:
                # Best effort: drop the '*' placeholder once a real strand is
                # known.  Only ordinary errors are ignored, so Ctrl-C and
                # SystemExit are no longer swallowed here.
                try:
                    resH[gene]['strand'].remove('*')
                except Exception:
                    pass

            mybasic.pushHash(resH[gene], 'strand', strand)
        else:
            if 'strand' not in resH[gene]:
                mybasic.pushHash(resH[gene], 'strand', '*')

        # The *_C keys use "CANONICAL field is non-empty", which is a looser
        # test than the == 'YES' check applied to the amino acids above.
        hasCanonical = len(canonical) > 0
        isCoding = 'nc_transcript_' not in csq and 'non_coding_' not in csq

        if len(csq) > 0:
            if hasCanonical:
                mybasic.pushHash(resH[gene], 'ch_type_C', csq)
            mybasic.pushHash(resH[gene], 'ch_type', csq)

        if len(ch_aa) > 0 and isCoding:
            if hasCanonical:
                mybasic.pushHash(resH[gene], 'ch_aa_C', ch_aa)
            mybasic.pushHash(resH[gene], 'ch_aa', ch_aa)

        if len(ch_dna) > 0 and isCoding:
            if hasCanonical:
                mybasic.pushHash(resH[gene], 'ch_dna_C', ch_dna)
            mybasic.pushHash(resH[gene], 'ch_dna', ch_dna)

    return resH
def main(inFileName, geneList=None):
    """Write one tab-separated line per distinct variant found in a table.

    NOTE(review): an earlier ``main`` with the same logic is defined above in
    this file; being the later definition, this is the one bound to the name
    after import -- confirm the earlier copy can go.

    ``inFileName`` is a tab-separated file whose first line is a header.  The
    columns named in ``nameL`` are looked up by name in that header.  (The
    names look like a COSMIC mutation export -- TODO confirm.)

    Rows are dropped when the gene is not in a non-empty ``geneList`` (after
    cutting any ``_ENST...`` suffix from the gene name), when the genome
    position is empty, or when the somatic-status text lacks ``'somatic'``.

    Surviving rows are keyed by ``(chrom, start, end, strand, ref, alt)``;
    gene, CDS, amino-acid and description values are accumulated per key and
    printed to stdout comma-joined, after the six key columns.  The
    chromosome is printed with a ``chr`` prefix and gene names that start
    with ``ENSG`` are omitted from the gene column.

    Args:
        inFileName: path of the tab-separated input file.
        geneList: optional collection of gene names to keep; ``None`` or an
            empty collection disables the filter.

    Raises:
        ValueError: the header lacks one of the required columns.
        AttributeError: a non-empty genome position does not look like
            ``chrom:start-end``.
    """
    # nameL = ('Mutation GRCh37 genome position', 'Mutation GRCh37 strand','Gene name','ID_sample','ID_tumour','Primary site', \
    #     'Site subtype','Primary histology','Histology subtype','Genome-wide screen','Mutation ID','Mutation CDS','Mutation AA', \
    #     'Mutation Description','Mutation zygosity','Mutation somatic status','Pubmed_PMID','Sample source','Tumor origin','Comments')
    nameL = ('Gene name', 'Mutation CDS', 'Mutation AA', 'Mutation Description',
             'Mutation GRCh37 genome position', 'Mutation GRCh37 strand',
             'Mutation somatic status')

    # Chromosomes given as 23 / 24 / 25 are emitted as X / Y / M.
    chromAliasH = {'23': 'X', '24': 'Y', '25': 'M'}

    # Patterns are loop-invariant, so compile them once; raw strings stop
    # '\.', '\+' and '\-' from being read as (invalid) string escapes.
    coordRe = re.compile(r'([^:-]+):([^:-]+)-([^:-]+)')
    cdsRe = re.compile(r'c\.[\+\-_0-9]+([atgcATGC]*)(>|ins|del)([atgcATGC]*)')

    dataH = {}

    # The context manager closes the file on every exit path.
    with open(inFileName) as inFile:
        # rstrip removes just the terminator; line[:-1] would also remove a
        # real character from a last line that lacks a newline.
        headerL = inFile.readline().rstrip('\r\n').split('\t')
        idxH = dict((x, headerL.index(x)) for x in nameL)

        for line in inFile:
            valueL = line.rstrip('\r\n').split('\t')

            geneN = valueL[idxH['Gene name']]
            if '_ENST' in geneN:
                geneN = geneN.split('_ENST')[0]
            if geneList and geneN not in geneList:
                continue

            coord = valueL[idxH['Mutation GRCh37 genome position']]
            if not coord:
                continue

            somatic = valueL[idxH['Mutation somatic status']]
            if 'somatic' not in somatic:
                continue

            (chrNum, chrSta, chrEnd) = coordRe.search(coord).groups()
            chrNum = chromAliasH.get(chrNum, chrNum)

            cds = valueL[idxH['Mutation CDS']]
            aa = valueL[idxH['Mutation AA']]
            desc = valueL[idxH['Mutation Description']]
            strand = valueL[idxH['Mutation GRCh37 strand']]

            # Bases written before/after '>', 'ins' or 'del' in the CDS
            # notation; left empty when the notation does not match.
            rm = cdsRe.match(cds)
            if rm:
                (ref, _vtype, alt) = rm.groups()
            else:
                ref, alt = '', ''

            if strand == '-':
                ref = mybasic.rc(ref)
                alt = mybasic.rc(alt)

            # NOTE: a disabled block used to sit here that checked deleted
            # bases against the hg19 reference with `samtools faidx` and
            # moved the start coordinate back by one on a match.

            key = (chrNum, chrSta, chrEnd, strand, ref, alt)
            if key in dataH:
                mybasic.pushHash(dataH[key], 'geneN', geneN)
                mybasic.pushHash(dataH[key], 'cds', cds)
                mybasic.pushHash(dataH[key], 'aa', aa)
                mybasic.pushHash(dataH[key], 'desc', desc)
            else:
                dataH[key] = {
                    'geneN': set([geneN]),
                    'cds': set([cds]),
                    'aa': set([aa]),
                    'desc': set([desc]),
                }

    # items() instead of the Python 2-only iteritems().
    for ((chrNum, chrSta, chrEnd, strand, ref, alt), infoH) in dataH.items():
        geneS = ','.join([g for g in infoH['geneN'] if not g.startswith('ENSG')])
        sys.stdout.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
            chrNum, chrSta, chrEnd, strand, ref, alt, geneS,
            ','.join(infoH['cds']), ','.join(infoH['aa']), ','.join(infoH['desc'])))
def parse_info_old(info, indexH):
    """Older per-gene summariser for a multi-entry annotation string.

    ``info`` is split on ``','`` into entries and each entry on ``'|'`` into
    fields; ``indexH`` maps field names to positions.  Entries whose
    ``Consequence`` is exactly ``regulatory_region_variant`` are skipped.

    Remaining entries are grouped by their ``Gene`` field.  Each group keeps
    a ``'tid'`` dict that numbers the distinct non-empty ``Feature`` values
    in order of appearance (``{feature: '<n>:<feature>'}``).  Every other
    non-empty field is accumulated with ``mybasic.pushHash``:

      * fields listed in the module-level ``excField`` are ignored,
      * fields listed in ``sglField`` are stored as-is,
      * all others are stored as-is when the entry has no feature id, and as
        ``'<n>=<value>'`` otherwise; for ``HGVSc`` / ``HGVSp`` only the part
        after the first ``':'`` is kept as the value.

    If the same feature id appears twice for one gene, ``info`` is printed
    and the process exits with status 1.

    Args:
        info: comma-separated annotation entries, each ``'|'``-delimited.
        indexH: mapping of field name to index within an entry; must contain
            at least ``Consequence``, ``Gene`` and ``Feature``.

    Returns:
        dict mapping gene id to its accumulated field dict.
    """
    resH = {}

    for item in info.split(','):
        arr = item.split('|')

        if arr[indexH['Consequence']] == 'regulatory_region_variant':
            ## exclude variants in regulatory region (for now)
            continue

        key = arr[indexH['Gene']]
        tid = arr[indexH['Feature']]

        if key in resH:
            if tid != '' and tid in resH[key]['tid']:
                ## something wrong!! the same transcript was seen twice
                # print(...) with one argument behaves the same on Python 2
                # and 3; the bare print statement does not parse on 3.
                print(info)
                sys.exit(1)
        else:
            resH[key] = {}
            resH[key]['tid'] = {}

        # tnum is only meaningful (and only read below) when the entry
        # carries a feature id.
        if tid != '':
            tnum = len(resH[key]['tid']) + 1
            resH[key]['tid'][tid] = str(tnum) + ":" + tid

        for k in indexH:
            rawVal = arr[indexH[k]]
            if rawVal == '':
                continue
            if k in excField:
                continue

            if k in sglField or tid == '':
                mybasic.pushHash(resH[key], k, rawVal)
            elif k == 'HGVSc' or k == 'HGVSp':
                # Drop the 'identifier:' prefix and keep the change itself.
                val = rawVal.split(':')[1]
                mybasic.pushHash(resH[key], k, str(tnum) + "=" + val)
            else:
                mybasic.pushHash(resH[key], k, str(tnum) + "=" + rawVal)

    return resH