Beispiel #1
0
def main(inFileName, geneList=None):
    """Collapse somatic mutation rows by genomic key and print them as TSV.

    Reads a tab-separated table whose first line is a header, keeps the rows
    that have a GRCh37 genome position and whose somatic-status field
    contains the substring ``'somatic'``, groups them by
    ``(chromosome, start, end, strand, ref, alt)`` and writes one line per
    group to ``sys.stdout``::

        chr<chrom> start end strand ref alt genes cds aa desc

    where the last four columns are comma-joined sets of the values seen for
    that key (gene names starting with ``ENSG`` are left out of the gene
    column).

    Args:
        inFileName: path of the tab-separated input file.
        geneList: optional iterable of gene names; when non-empty, only rows
            whose gene name (with any ``_ENST...`` suffix removed) is in it
            are kept. ``None`` or an empty collection disables the filter.

    Raises:
        ValueError: if one of the required header columns is missing.
    """
    nameL = ('Gene name', 'Mutation CDS', 'Mutation AA',
             'Mutation Description', 'Mutation GRCh37 genome position',
             'Mutation GRCh37 strand', 'Mutation somatic status')

    # A set gives O(1) membership tests inside the row loop.
    geneFilter = set(geneList) if geneList else None

    # Compiled once; raw strings avoid invalid-escape warnings.
    coordRe = re.compile(r'([^:-]+):([^:-]+)-([^:-]+)')
    cdsRe = re.compile(
        r'c\.[\+\-_0-9]+([atgcATGC]*)(>|ins|del)([atgcATGC]*)')

    # Numeric chromosome codes used by the input for the non-autosomes.
    chromAlias = {'23': 'X', '24': 'Y', '25': 'M'}

    dataH = {}

    # The context manager guarantees the file is closed, even on error.
    with open(inFileName) as inFile:

        # rstrip (not [:-1]) so a final line without a newline keeps its
        # last character.
        headerL = inFile.readline().rstrip('\r\n').split('\t')
        idxH = dict((x, headerL.index(x)) for x in nameL)

        for line in inFile:

            valueL = line.rstrip('\r\n').split('\t')

            geneN = valueL[idxH['Gene name']]
            if '_ENST' in geneN:
                geneN = geneN.split('_ENST')[0]

            if geneFilter is not None and geneN not in geneFilter:
                continue

            coord = valueL[idxH['Mutation GRCh37 genome position']]
            if not coord:
                continue

            if 'somatic' not in valueL[idxH['Mutation somatic status']]:
                continue

            coordM = coordRe.search(coord)
            if coordM is None:
                # Report and skip instead of dying on None.groups().
                sys.stderr.write(
                    'skipping unparsable genome position: %s\n' % coord)
                continue
            (chrNum, chrSta, chrEnd) = coordM.groups()
            chrNum = chromAlias.get(chrNum, chrNum)

            cds = valueL[idxH['Mutation CDS']]
            aa = valueL[idxH['Mutation AA']]
            desc = valueL[idxH['Mutation Description']]
            strand = valueL[idxH['Mutation GRCh37 strand']]

            # Pull ref/alt bases out of the CDS notation; rows that do not
            # fit the pattern are kept with empty alleles.
            cdsM = cdsRe.match(cds)
            if cdsM:
                ref, alt = cdsM.group(1), cdsM.group(3)
            else:
                ref, alt = '', ''

            if strand == '-':
                # mybasic.rc is defined elsewhere; presumably it returns the
                # reverse complement -- confirm against that module.
                ref = mybasic.rc(ref)
                alt = mybasic.rc(alt)

            # NOTE: a disabled step that re-anchored deletions against the
            # hg19 reference through `samtools faidx` used to sit here; it
            # was commented out and never ran.

            key = (chrNum, chrSta, chrEnd, strand, ref, alt)

            if key in dataH:
                mybasic.pushHash(dataH[key], 'geneN', geneN)
                mybasic.pushHash(dataH[key], 'cds', cds)
                mybasic.pushHash(dataH[key], 'aa', aa)
                mybasic.pushHash(dataH[key], 'desc', desc)
            else:
                dataH[key] = {
                    'geneN': {geneN},
                    'cds': {cds},
                    'aa': {aa},
                    'desc': {desc},
                }

    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for ((chrNum, chrSta, chrEnd, strand, ref, alt),
         infoH) in dataH.items():

        geneS = ','.join(
            g for g in infoH['geneN'] if not g.startswith('ENSG'))

        sys.stdout.write(
            'chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                chrNum, chrSta, chrEnd, strand, ref, alt, geneS,
                ','.join(infoH['cds']), ','.join(infoH['aa']),
                ','.join(infoH['desc'])))
Beispiel #2
0
def parse_info(info, ref, indexH):
	"""Summarise per-gene annotation from a multi-entry annotation string.

	``info`` is split on ',' into entries and each entry on '|' into fields;
	``indexH`` maps a field name to its position in an entry. The field names
	read here are Feature_type, Consequence, SYMBOL, HGVSc, Protein_position,
	Amino_acids, CANONICAL and Codons (this looks like a VEP CSQ layout --
	confirm against the caller).

	Entries are handled by feature type:

	* RegulatoryFeature, MotifFeature with a TF-binding consequence, and
	  feature-less ``intergenic_variant`` entries are recorded under the
	  pseudo-gene ``'-'`` with strand ``'*'``.
	* Transcript entries whose consequence is up-/downstream (and not a
	  splice consequence) are recorded under ``'-'`` as
	  ``'intergenic_variant'``.
	* Every other Transcript entry is recorded under its SYMBOL with the
	  shortened consequence, the cDNA change, the protein change (only when
	  CANONICAL is exactly ``'YES'``) and a strand inferred from the codon.
	* Any other feature type is ignored.

	Values are accumulated through ``mybasic.pushHash``, which is defined
	elsewhere; the container type it builds is not visible from here.

	Args:
		info: the raw annotation string.
		ref: reference allele; the strand is '+' when it equals the
			upper-case allele on the reference side of the codon, else '-'.
		indexH: dict mapping field name -> index within an entry.

	Returns:
		dict mapping gene symbol (or ``'-'``) to its accumulated record.
	"""
	resH = {}

	def _record(gene):
		# Fetch the record for *gene*, creating it on first use.
		if gene not in resH:
			resH[gene] = {'ch_type':''}
		return resH[gene]

	for item in info.split(','):
		arr = item.split('|')
		ftype = arr[indexH['Feature_type']]

		# Unknown feature types are ignored without touching other fields.
		if ftype not in ('RegulatoryFeature', 'MotifFeature', '', 'Transcript'):
			continue

		csq = arr[indexH['Consequence']]

		if (ftype == 'RegulatoryFeature'
				or (ftype == 'MotifFeature' and (csq == 'TF_binding_site_variant' or 'TFBS_' in csq))
				or (ftype == '' and csq == 'intergenic_variant')):
			rec = _record('-')
			mybasic.pushHash(rec, 'ch_type', csq)
			mybasic.pushHash(rec, 'strand', '*')
			continue

		if ftype != 'Transcript':
			continue

		if ('upstream_gene_variant' in csq or 'downstream_gene_variant' in csq) and 'splice_' not in csq:
			## treat up-, down-stream gene variants as intergenic
			rec = _record('-')
			mybasic.pushHash(rec, 'strand', '*')
			mybasic.pushHash(rec, 'ch_type', 'intergenic_variant')
			continue

		csq = shorten_csq(csq)

		rec = _record(arr[indexH['SYMBOL']])

		# Keep only the part after the transcript id ("ENST...:c.123A>G").
		ch_dna = arr[indexH['HGVSc']]
		dna_parts = ch_dna.split(':')
		if len(dna_parts) > 1:
			ch_dna = dna_parts[1]

		prot_pos = arr[indexH['Protein_position']]
		aa = arr[indexH['Amino_acids']]
		canonical = arr[indexH['CANONICAL']]

		# The protein change is reported only for canonical transcripts.
		if canonical != 'YES':
			aa = ''
		if len(aa) < 1:
			ch_aa = ''
		elif '/' not in aa:
			# A single residue means the amino acid is unchanged.
			ch_aa = 'p.%s%s%s' % (aa, prot_pos, aa)
		else:
			(aa1, aa2) = aa.split('/')
			ch_aa = 'p.%s%s%s' % (aa1, prot_pos, aa2)

		codon = arr[indexH['Codons']]
		codon_m = None
		if len(codon) > 0:
			# Upper-case letters mark the changed bases within the codon.
			codon_m = re.match(r'[nacgt]*([-ACGT]*)[nacgt]*/[nacgt]*([-ACGT]*)[nacgt]*', codon)

		if codon_m is not None:
			strand = '+' if ref == codon_m.group(1) else '-'
			if 'strand' in rec:
				# A real strand supersedes an earlier '*' placeholder.
				# list.remove raises ValueError, set.remove KeyError, and a
				# non-container value AttributeError; anything else (e.g.
				# KeyboardInterrupt) must propagate.
				try:
					rec['strand'].remove('*')
				except (ValueError, KeyError, AttributeError):
					pass
			mybasic.pushHash(rec, 'strand', strand)
		elif 'strand' not in rec:
			# No codon, or one that does not fit the pattern (which used to
			# crash on None.group): strand unknown.
			mybasic.pushHash(rec, 'strand', '*')

		has_canonical = len(canonical) > 0
		is_coding = 'nc_transcript_' not in csq and 'non_coding_' not in csq

		if len(csq) > 0:
			if has_canonical:
				mybasic.pushHash(rec, 'ch_type_C', csq)
			mybasic.pushHash(rec, 'ch_type', csq)
		if len(ch_aa) > 0 and is_coding:
			if has_canonical:
				mybasic.pushHash(rec, 'ch_aa_C', ch_aa)
			mybasic.pushHash(rec, 'ch_aa', ch_aa)
		if len(ch_dna) > 0 and is_coding:
			if has_canonical:
				mybasic.pushHash(rec, 'ch_dna_C', ch_dna)
			mybasic.pushHash(rec, 'ch_dna', ch_dna)

	return resH
Beispiel #3
0
def main(inFileName,geneList=None):
	"""Collapse somatic mutation rows by genomic key and print them as TSV.

	Reads a tab-separated table whose first line is a header, keeps the rows
	that have a GRCh37 genome position and whose somatic-status field
	contains the substring ``'somatic'``, groups them by
	``(chromosome, start, end, strand, ref, alt)`` and writes one line per
	group to ``sys.stdout``::

		chr<chrom> start end strand ref alt genes cds aa desc

	where the last four columns are comma-joined sets of the values seen for
	that key (gene names starting with ``ENSG`` are left out of the gene
	column).

	Args:
		inFileName: path of the tab-separated input file.
		geneList: optional iterable of gene names; when non-empty, only rows
			whose gene name (with any ``_ENST...`` suffix removed) is in it
			are kept. ``None`` or an empty collection disables the filter.

	Raises:
		ValueError: if one of the required header columns is missing.
	"""
	nameL = ('Gene name','Mutation CDS','Mutation AA','Mutation Description','Mutation GRCh37 genome position','Mutation GRCh37 strand','Mutation somatic status')

	# A set gives O(1) membership tests inside the row loop.
	geneFilter = set(geneList) if geneList else None

	# Compiled once; raw strings avoid invalid-escape warnings.
	coordRe = re.compile(r'([^:-]+):([^:-]+)-([^:-]+)')
	cdsRe = re.compile(r'c\.[\+\-_0-9]+([atgcATGC]*)(>|ins|del)([atgcATGC]*)')

	# Numeric chromosome codes used by the input for the non-autosomes.
	chromAlias = {'23':'X', '24':'Y', '25':'M'}

	dataH = {}

	# The context manager guarantees the file is closed, even on error.
	with open(inFileName) as inFile:

		# rstrip (not [:-1]) so a final line without a newline keeps its
		# last character.
		headerL = inFile.readline().rstrip('\r\n').split('\t')
		idxH = dict((x, headerL.index(x)) for x in nameL)

		for line in inFile:

			valueL = line.rstrip('\r\n').split('\t')

			geneN = valueL[idxH['Gene name']]
			if '_ENST' in geneN:
				geneN = geneN.split('_ENST')[0]

			if geneFilter is not None and geneN not in geneFilter:
				continue

			coord = valueL[idxH['Mutation GRCh37 genome position']]
			if not coord:
				continue

			if 'somatic' not in valueL[idxH['Mutation somatic status']]:
				continue

			coordM = coordRe.search(coord)
			if coordM is None:
				# Report and skip instead of dying on None.groups().
				sys.stderr.write('skipping unparsable genome position: %s\n' % coord)
				continue
			(chrNum,chrSta,chrEnd) = coordM.groups()
			chrNum = chromAlias.get(chrNum, chrNum)

			cds = valueL[idxH['Mutation CDS']]
			aa = valueL[idxH['Mutation AA']]
			desc = valueL[idxH['Mutation Description']]
			strand = valueL[idxH['Mutation GRCh37 strand']]

			# Pull ref/alt bases out of the CDS notation; rows that do not
			# fit the pattern are kept with empty alleles.
			cdsM = cdsRe.match(cds)
			if cdsM:
				ref,alt = cdsM.group(1),cdsM.group(3)
			else:
				ref,alt = '',''

			if strand == '-':
				# mybasic.rc is defined elsewhere; presumably it returns the
				# reverse complement -- confirm against that module.
				ref = mybasic.rc(ref)
				alt = mybasic.rc(alt)

			# NOTE: a disabled step that re-anchored deletions against the
			# hg19 reference through `samtools faidx` used to sit here; it
			# was commented out and never ran.

			key = (chrNum,chrSta,chrEnd,strand,ref,alt)

			if key in dataH:
				mybasic.pushHash(dataH[key],'geneN',geneN)
				mybasic.pushHash(dataH[key],'cds',cds)
				mybasic.pushHash(dataH[key],'aa',aa)
				mybasic.pushHash(dataH[key],'desc',desc)
			else:
				dataH[key] = {'geneN':{geneN}, 'cds':{cds}, 'aa':{aa}, 'desc':{desc}}

	# .items() works on both Python 2 and 3 (iteritems is Python 2 only).
	for ((chrNum,chrSta,chrEnd,strand,ref,alt),infoH) in dataH.items():

		geneS = ','.join(g for g in infoH['geneN'] if not g.startswith('ENSG'))

		sys.stdout.write('chr%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (chrNum,chrSta,chrEnd,strand, ref,alt,
			geneS, ','.join(infoH['cds']), ','.join(infoH['aa']), ','.join(infoH['desc'])))
Beispiel #4
0
def parse_info_old(info, indexH):
	"""Group the fields of a multi-entry annotation string by gene.

	``info`` is split on ',' into entries and each entry on '|' into fields;
	``indexH`` maps a field name to its position in an entry and must contain
	at least 'Consequence', 'Gene' and 'Feature'.

	For every entry that is not a ``regulatory_region_variant``:

	* the record for its Gene is created on first sight, with a ``'tid'``
	  dict that numbers the gene's transcripts (Feature values) in order of
	  appearance as ``"<n>:<transcript id>"``;
	* every non-empty field not listed in ``excField`` is accumulated into
	  the record through ``mybasic.pushHash`` -- unchanged for fields in
	  ``sglField`` or when the entry has no transcript id, otherwise
	  prefixed with ``"<n>="``; for HGVSc/HGVSp only the part after the
	  first ':' is kept.

	``excField``, ``sglField`` and ``mybasic`` are defined elsewhere in the
	project.

	Seeing the same transcript id twice for one gene is treated as fatal:
	the offending ``info`` is written to stderr and the process exits with
	status 1.

	Args:
		info: the raw annotation string.
		indexH: dict mapping field name -> index within an entry.

	Returns:
		dict mapping gene id to its accumulated record.
	"""
	resH = {}
	for item in info.split(','):
		arr = item.split('|')
		if arr[indexH['Consequence']] == 'regulatory_region_variant': ## exclude variants in regulatory region (for now)
			continue
		key = arr[indexH['Gene']]
		tid = arr[indexH['Feature']]

		# Direct dict membership; `in resH.keys()` scans a list on Python 2.
		if key not in resH:
			resH[key] = {'tid': {}}
		tidH = resH[key]['tid']

		tnum = 1
		if tid != '':
			if tid in tidH:
				##something wrong!!
				# Diagnostics go to stderr so they cannot mix with results;
				# a print statement would not even parse on Python 3.
				sys.stderr.write('%s\n' % info)
				sys.exit(1)
			tnum = len(tidH) + 1
			tidH[tid] = str(tnum)+":"+tid

		for k in indexH:
			val = arr[indexH[k]]
			if val == '' or k in excField:
				continue
			if k in sglField or tid == '':
				mybasic.pushHash(resH[key], k, val)
				continue
			if k == 'HGVSc' or k == 'HGVSp':
				# Drop the leading "<id>:" when present; a value without ':'
				# is kept whole instead of raising IndexError.
				parts = val.split(':')
				if len(parts) > 1:
					val = parts[1]
			mybasic.pushHash(resH[key], k, str(tnum)+"="+val)
	return resH