def averagesFromFile(VEPannotatedVCF, column2retain, lSOTerm):
    """Return the mean consequence-allele frequency per Sequence Ontology term.

    Reads a gzip-compressed, VEP-annotated VCF.  For every CSQ entry of every
    record the allele frequencies are obtained from
    ``gp.AnnotateFreqCSQ_REF_ALT`` and the first element of that result (the
    consequence-allele frequency) is accumulated per consequence term.

    Besides its parameters, this function uses names that are not defined in
    this block and are expected to exist at module level: ``gp``,
    ``args.n`` (number of sample columns to read), ``sampleToConsider``
    (sample names looked up in the ``#CHROM`` header line) and
    ``listOfErrors`` (collects records whose frequency is ``'NA'``).

    Args:
        VEPannotatedVCF: path to the gzip-compressed VCF annotated with VEP.
        column2retain: list of column indexes of the samples to consider.
            It is extended in place with the index of every name in
            ``sampleToConsider`` when the ``#CHROM`` line is read.
        lSOTerm: list of Sequence Ontology terms that define variant
            consequences; one mean is reported per term, in this order.

    Returns:
        list: ``[chrom, freq[4], mean_1, ..., mean_n]``.  ``chrom`` is the
        chromosome of the last record read and ``freq[4]`` the fifth element
        of the last result returned by ``gp.AnnotateFreqCSQ_REF_ALT``.  A term
        that was never observed is reported as the string ``'na'``.

    Raises:
        ValueError: if the file contains no variant record.
        KeyError: if a record has no ``CSQ`` entry in its INFO column.
    """
    # SO term -> [number of observations, summed consequence-allele frequency]
    dSOT = {}
    mychr = None
    freqCSQ_REF_ALT = None

    with gzip.open(VEPannotatedVCF, 'r') as vcf:
        for line in vcf:
            # gzip yields bytes in python3, so every line has to be decoded.
            decodedLine = line.decode()

            if decodedLine.startswith('##'):
                if 'ID=CSQ' in decodedLine:
                    # Field names follow "Format:" in the description; strip
                    # the closing quote (single or double) and the ">" so the
                    # last field name is clean.
                    csqHeader = (decodedLine.rstrip().split(':')[1]
                                 .lstrip().rstrip('\'">').split('|'))
                continue

            if decodedLine.startswith('#'):
                headerColumns = decodedLine.split()
                for ind in sampleToConsider:
                    column2retain.append(headerColumns.index(ind))
                continue

            linesplit = decodedLine.rstrip().split()
            mychr = linesplit[0]
            mypos = linesplit[1]
            myref = linesplit[3]
            myalt = linesplit[4]

            # Parse INFO into a fresh dict so that nothing leaks from the
            # previous record.  Flag entries (no "=") are ignored and values
            # that themselves contain "=" are kept whole.
            dInfo = {}
            for field in linesplit[7].split(';'):
                key, sep, value = field.partition('=')
                if sep:
                    dInfo[key] = value

            # Several consequences for one record are separated by ",".
            multipleCsq = dInfo['CSQ'].split(',')
            GTfields = [linesplit[column2retain[col]] for col in range(args.n)]

            for mcsq in multipleCsq:
                dCsq = dict(zip(csqHeader, mcsq.split('|')))
                freqCSQ_REF_ALT = gp.AnnotateFreqCSQ_REF_ALT(
                    dCsq['Allele'], myref, myalt, GTfields)

                # A single CSQ entry may carry several terms joined by "&";
                # each term is counted on its own so that it can be matched
                # against lSOTerm below.
                for cons in dCsq['Consequence'].split('&'):
                    if freqCSQ_REF_ALT[0] != 'NA':
                        counter = dSOT.setdefault(cons, [0, 0])
                        counter[0] += 1
                        counter[1] += float(freqCSQ_REF_ALT[0])
                    else:
                        # kept for the error report, to compare allele matching
                        listOfErrors.append(
                            (mychr, mypos, myref, myalt, dCsq['Allele']))

    if freqCSQ_REF_ALT is None:
        raise ValueError(
            'no variant record found in %s' % VEPannotatedVCF)

    CsqMeans = [mychr, freqCSQ_REF_ALT[4]]
    for vcsq in lSOTerm:
        if vcsq in dSOT:
            CsqMeans.append(dSOT[vcsq][1] / float(dSOT[vcsq][0]))
        else:
            CsqMeans.append('na')
    return CsqMeans
def main():
    """Tabulate allele frequencies for every VEP consequence in a VCF.

    Command line:
        -f  gzip-compressed, VEP-annotated VCF to read.
        -v  tab-separated table of VEP consequences with a header row that
            contains the columns ``SO term`` and ``IMPACT``.
        -o  output table.
        -e  error file; the option is required but nothing is written to it
            by this function.

    One output row is written per record, per CSQ entry and per consequence
    term of that entry.  Frequencies come from ``gp.AnnotateFreqCSQ_REF_ALT``
    (defined outside this block) and are computed over all sample columns
    (column 10 onwards).  The population column is the constant ``GREP``.

    Raises:
        KeyError: if a record has no ``CSQ`` entry in its INFO column, or the
            consequence table has an IMPACT value other than HIGH, MODERATE,
            LOW or MODIFIER.
        ValueError: if a consequence term is missing from the table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", help="path to input file ", type=str,
                        required=True)
    parser.add_argument("-v", help="path to table of vep consequences ",
                        type=str, required=True)
    parser.add_argument("-o", help="path to output file ", type=str,
                        required=True)
    parser.add_argument("-e", help="path to error file", type=str,
                        required=True)
    args = parser.parse_args()

    # ---- read the VEP consequence table --------------------------------
    # Identity mapping: it only serves to reject unknown IMPACT values.
    dRank = {
        "HIGH": "HIGH",
        "LOW": "LOW",
        "MODERATE": "MODERATE",
        "MODIFIER": "MODIFIER"
    }
    dSOTermRank = {}
    lSOTerm = []  # first column of the table, in file order
    csqTitle = None
    with open(args.v, 'r') as csqTable:
        for csqLine in csqTable:
            myRowList = csqLine.rstrip().split('\t')
            if csqTitle is None:
                csqTitle = myRowList  # first line is the column header
                continue
            dCsq = dict(zip(csqTitle, myRowList))
            dSOTermRank[dCsq['SO term']] = dRank[dCsq['IMPACT']]
            lSOTerm.append(myRowList[0])

    # ---- loop over the VCF lines ---------------------------------------
    columns = ("Chr", "Pos", "VariantClass", "CSQallele", "CSQrank",
               "Consequence", "CSQfreq", "REFfreq", "ALTfreq", "MAF",
               "Population")
    # Fields are separated by a tab surrounded by single spaces; this is the
    # established layout of the output table and is kept unchanged.
    separator = " \t "

    # The output is written through its own handle so that sys.stdout stays
    # usable, and both files are closed even when an error is raised.
    with open(args.o, 'w') as out, gzip.open(args.f, 'r') as vcf:
        out.write(separator.join(columns) + "\n")

        for line in vcf:
            # gzip yields bytes in python3, so every line has to be decoded.
            decodedLine = line.decode()

            if decodedLine.startswith('#'):
                if "ID=CSQ" in decodedLine:
                    # field names follow "Format:" in the description
                    csqHeader = (decodedLine.rstrip().split(":")[1]
                                 .lstrip().rstrip("\">").split("|"))
                continue

            linesplit = decodedLine.rstrip().split()
            myref = linesplit[3]
            myalt = linesplit[4]

            # Fresh dict per record so that no INFO value leaks from the
            # previous one; flag entries (no "=") are ignored and values
            # containing "=" are kept whole.
            dInfo = {}
            for field in linesplit[7].split(";"):
                key, sep, value = field.partition("=")
                if sep:
                    dInfo[key] = value

            GTfields = linesplit[9:]
            nbAploidSamples = len(GTfields) * 2

            # several consequences for one record are separated by ","
            for mcsq in dInfo["CSQ"].split(","):
                dCsq = dict(zip(csqHeader, mcsq.split("|")))
                freqCSQ_REF_ALT = gp.AnnotateFreqCSQ_REF_ALT(
                    dCsq["Allele"], myref, myalt, nbAploidSamples, GTfields)

                for c in dCsq['Consequence'].split("&"):
                    # NOTE(review): every term is ranked on its own, not by
                    # the most severe term of the CSQ entry - confirm this is
                    # the intended meaning of CSQrank.  index() is kept so a
                    # term missing from the table still raises ValueError.
                    mostSevereCsq = lSOTerm[lSOTerm.index(c)]
                    row = (linesplit[0], linesplit[1], dCsq["VARIANT_CLASS"],
                           dCsq["Allele"], dSOTermRank[mostSevereCsq], c,
                           freqCSQ_REF_ALT[0], freqCSQ_REF_ALT[1],
                           freqCSQ_REF_ALT[2], freqCSQ_REF_ALT[3], "GREP")
                    out.write(separator.join(map(str, row)) + "\n")
def main():
    """Annotate a VEP VCF with per-consequence allele frequencies.

    Command line:
        -f  gzip-compressed, VEP-annotated VCF to read.
        -v  tab-separated table of VEP consequences with a header row that
            contains the columns ``SO term`` and ``IMPACT``.
        -o  output VCF (plain text).
        -e  error file; the option is required but nothing is written to it
            by this function.

    Meta lines (``##``) are copied unchanged, and the extra ``##INFO``
    declarations are inserted just before the ``#CHROM`` line.  Every record
    is written once per CSQ entry and per consequence term of that entry,
    with the annotation appended to the original INFO column.  Frequencies
    come from ``gp.AnnotateFreqCSQ_REF_ALT`` (defined outside this block) and
    are computed over all sample columns (column 10 onwards).

    Raises:
        KeyError: if a record has no ``CSQ`` entry in its INFO column, or the
            consequence table has an IMPACT value other than HIGH, MODERATE,
            LOW or MODIFIER.
        ValueError: if a consequence term is missing from the table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", help="path to input file ", type=str,
                        required=True)
    parser.add_argument("-v", help="path to table of vep consequences ",
                        type=str, required=True)
    parser.add_argument("-o", help="path to output file ", type=str,
                        required=True)
    parser.add_argument("-e", help="path to error file", type=str,
                        required=True)
    args = parser.parse_args()

    # ---- read the VEP consequence table --------------------------------
    # Identity mapping: it only serves to reject unknown IMPACT values.
    dRank = {
        "HIGH": "HIGH",
        "LOW": "LOW",
        "MODERATE": "MODERATE",
        "MODIFIER": "MODIFIER"
    }
    dSOTermRank = {}
    lSOTerm = []  # first column of the table, in file order
    csqTitle = None
    with open(args.v, 'r') as csqTable:
        for csqLine in csqTable:
            myRowList = csqLine.rstrip().split('\t')
            if csqTitle is None:
                csqTitle = myRowList  # first line is the column header
                continue
            dCsq = dict(zip(csqTitle, myRowList))
            dSOTermRank[dCsq['SO term']] = dRank[dCsq['IMPACT']]
            lSOTerm.append(myRowList[0])

    # INFO declarations added to the header.  The IDs ExistingVariation and
    # VariantClass are spelled exactly like the keys written on the records.
    extraHeaderLines = (
        '##INFO=<ID=CSQfreq,Number=A,Type=Float,Description="Frequency of CSQ allele in set of samples">',
        '##INFO=<ID=REFfreq,Number=A,Type=Float,Description="Frequency of REF allele in set of samples">',
        '##INFO=<ID=ALTfreq,Number=A,Type=Float,Description="Frequency of ALT allele in set of samples">',
        '##INFO=<ID=MAF,Number=A,Type=Float,Description="Frequency of Minor allele in set of samples">',
        '##INFO=<ID=nbCSQ,Number=A,Type=Integer,Description="1st, 2nd .... CSQ allele">',
        '##INFO=<ID=CSQallele,Number=A,Type=String,Description="Describe CSQ allele">',
        '##INFO=<ID=IMPACT,Number=A,Type=String,Description="Impact of CSQ allele">',
        '##INFO=<ID=ExistingVariation,Number=A,Type=String,Description="RS of CSQallele">',
        '##INFO=<ID=VariantClass,Number=A,Type=String,Description="type of variant : MODIFIER ecc..">',
        '##INFO=<ID=Consequence,Number=A,Type=String,Description="type of variant : SNP, indels, ecc...">',
        '##INFO=<ID=CSQrank,Number=A,Type=String,Description="Rank of Consequence: HIGH, MODERATE, LOW, MODIFIER">',
    )

    # ---- loop over the VCF lines ---------------------------------------
    # Both files are closed (and the output flushed) even on error.
    with open(args.o, 'w') as filemyres, gzip.open(args.f, 'r') as vcf:
        for line in vcf:
            # gzip yields bytes in python3, so every line has to be decoded.
            decodedLine = line.decode()

            if decodedLine.startswith('##'):
                if "ID=CSQ" in decodedLine:
                    # field names follow "Format:" in the description
                    csqHeader = (decodedLine.rstrip().split(":")[1]
                                 .lstrip().rstrip("\">").split("|"))
                filemyres.write(decodedLine)
                continue

            if decodedLine.startswith('#'):
                for headerLine in extraHeaderLines:
                    filemyres.write(headerLine + "\n")
                filemyres.write(decodedLine)
                continue

            linesplit = decodedLine.rstrip().split()
            myref = linesplit[3]
            myalt = linesplit[4]
            tempinfo = linesplit[7]

            # Fresh dict per record so that no INFO value leaks from the
            # previous one.  Flag entries (no "=") are ignored instead of
            # raising IndexError, and values containing "=" are kept whole.
            dInfo = {}
            for field in tempinfo.split(";"):
                key, sep, value = field.partition("=")
                if sep:
                    dInfo[key] = value

            GTfields = linesplit[9:]
            nbAploidSamples = len(GTfields) * 2

            # several consequences for one record are separated by ","
            multipleCsq = dInfo["CSQ"].split(",")
            for CSQcount, mcsq in enumerate(multipleCsq, start=1):
                dCsq = dict(zip(csqHeader, mcsq.split("|")))
                freqCSQ_REF_ALT = gp.AnnotateFreqCSQ_REF_ALT(
                    dCsq["Allele"], myref, myalt, nbAploidSamples, GTfields)

                for c in dCsq['Consequence'].split("&"):
                    # NOTE(review): every term is ranked on its own, not by
                    # the most severe term of the CSQ entry - confirm this is
                    # the intended meaning of CSQrank.  index() is kept so a
                    # term missing from the table still raises ValueError.
                    mostSevereCsq = lSOTerm[lSOTerm.index(c)]

                    # Start again from the original INFO column for every
                    # output line and append the annotation to it.
                    linesplit[7] = tempinfo + "".join((
                        ";CSQallele=", dCsq["Allele"],
                        ";Consequence=", c,
                        ";CSQrank=", dSOTermRank[mostSevereCsq],
                        ";ExistingVariation=", dCsq["Existing_variation"],
                        ";VariantClass=", dCsq["VARIANT_CLASS"],
                        ";nbCSQ=", str(CSQcount),  # position of the CSQ entry
                        ";CSQfreq=%s" % (freqCSQ_REF_ALT[0]),
                        ";REFfreq=%s" % (freqCSQ_REF_ALT[1]),
                        ";ALTfreq=%s" % (freqCSQ_REF_ALT[2]),
                        ";MAF=%s" % (freqCSQ_REF_ALT[3]),
                    ))
                    filemyres.write("\t".join(linesplit) + "\n")
def main():
    """Annotate a VEP VCF with the allele frequencies of every CSQ entry.

    Command line:
        -f  gzip-compressed, VEP-annotated VCF to read.
        -o  output VCF (plain text).
        -e  error file; the option is required but nothing is written to it
            by this function.

    Meta lines (``##``) are copied unchanged, and four extra ``##INFO``
    declarations are inserted just before the ``#CHROM`` line.  Every record
    is written once per CSQ entry, with ``nbCSQ``, ``CSQfreq``, ``REFfreq``
    and ``ALTfreq`` appended to the original INFO column.  Frequencies come
    from ``gp.AnnotateFreqCSQ_REF_ALT`` (defined outside this block) and are
    computed over all sample columns (column 10 onwards).

    Raises:
        KeyError: if a record has no ``CSQ`` entry in its INFO column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", help="path to input file ", type=str,
                        required=True)
    parser.add_argument("-o", help="path to output file ", type=str,
                        required=True)
    parser.add_argument("-e", help="path to error file", type=str,
                        required=True)
    args = parser.parse_args()

    # INFO declarations for the keys appended to every record.
    extraHeaderLines = (
        '##INFO=<ID=CSQfreq,Number=A,Type=Float,Description="Frequency of CSQ allele in set of samples">',
        '##INFO=<ID=REFfreq,Number=A,Type=Float,Description="Frequency of REF allele in set of samples">',
        '##INFO=<ID=ALTfreq,Number=A,Type=Float,Description="Frequency of ALT allele in set of samples">',
        '##INFO=<ID=nbCSQ,Number=A,Type=Integer,Description="1st, 2nd .... CSQ allele">',
    )

    # ---- loop over the VCF lines ---------------------------------------
    # Both files are closed (and the output flushed) even on error.
    with open(args.o, 'w') as filemyres, gzip.open(args.f, 'r') as vcf:
        for line in vcf:
            # gzip yields bytes in python3, so every line has to be decoded.
            decodedLine = line.decode()

            if decodedLine.startswith('##'):
                if "ID=CSQ" in decodedLine:
                    # field names follow "Format:" in the description
                    csqHeader = (decodedLine.rstrip().split(":")[1]
                                 .lstrip().rstrip("\">").split("|"))
                filemyres.write(decodedLine)
                continue

            if decodedLine.startswith('#'):
                for headerLine in extraHeaderLines:
                    filemyres.write(headerLine + "\n")
                filemyres.write(decodedLine)
                continue

            linesplit = decodedLine.rstrip().split()
            myref = linesplit[3]
            myalt = linesplit[4]
            tempinfo = linesplit[7]

            # Fresh dict per record so that no INFO value leaks from the
            # previous one.  Flag entries (no "=") are ignored instead of
            # raising IndexError, and values containing "=" are kept whole.
            dInfo = {}
            for field in tempinfo.split(";"):
                key, sep, value = field.partition("=")
                if sep:
                    dInfo[key] = value

            GTfields = linesplit[9:]
            nbAploidSamples = len(GTfields) * 2

            # several consequences for one record are separated by ","
            multipleCsq = dInfo["CSQ"].split(",")
            for CSQcount, mcsq in enumerate(multipleCsq, start=1):
                dCsq = dict(zip(csqHeader, mcsq.split("|")))
                freqCSQ_REF_ALT = gp.AnnotateFreqCSQ_REF_ALT(
                    dCsq["Allele"], myref, myalt, nbAploidSamples, GTfields)

                # Start again from the original INFO column for every output
                # line and append the annotation to it.
                linesplit[7] = tempinfo + "".join((
                    ";",
                    "nbCSQ=", str(CSQcount),  # position of the CSQ entry
                    ";CSQfreq=%s" % (freqCSQ_REF_ALT[0]),
                    ";REFfreq=%s" % (freqCSQ_REF_ALT[1]),
                    ";ALTfreq=%s" % (freqCSQ_REF_ALT[2]),
                ))
                filemyres.write("\t".join(linesplit) + "\n")
def main():
    """Tabulate consequence frequencies for random subsets of samples.

    Command line:
        -f  gzip-compressed, VEP-annotated VCF to read.
        -v  tab-separated table of VEP consequences with a header row that
            contains the columns ``SO term`` and ``IMPACT``.
        -o  output table.
        -e  error file; the option is required but nothing is written to it
            by this function.
        -m  whitespace-separated metadata file whose header line starts with
            ``sample`` and which has the columns ``sample`` and ``region``.
        -c  number of random cycles (default 1).
        -s  seed of the random number generator.
        -n  number of samples drawn per cycle (default 6).

    In every cycle ``-n`` samples are drawn from the samples whose region is
    ``EUROPE``, the VCF is read again, and one row is written per record, per
    CSQ entry and per consequence term, with frequencies computed by
    ``gp.AnnotateFreqCSQ_REF_ALT`` (defined outside this block) over the
    drawn samples only.  The population column is the constant ``HGDP``.

    Raises:
        KeyError: if a record has no ``CSQ`` entry in its INFO column, or the
            consequence table has an IMPACT value other than HIGH, MODERATE,
            LOW or MODIFIER.
        ValueError: if fewer than ``-n`` EUROPE samples are available, a
            drawn sample is not a column of the VCF, or a consequence term is
            missing from the table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", help="path to input file ", type=str,
                        required=True)
    parser.add_argument("-v", help="path to table of vep consequences ",
                        type=str, required=True)
    parser.add_argument("-o", help="path to output file ", type=str,
                        required=True)
    parser.add_argument("-e", help="path to error file", type=str,
                        required=True)
    parser.add_argument("-m", help="path to metadata file", type=str,
                        required=True)
    parser.add_argument("-c", help="number of random cycle", type=int,
                        required=False, default=1)
    parser.add_argument("-s", help="seed's number", type=int, required=True)
    parser.add_argument("-n", help="number of samples drawn per cycle",
                        type=int, required=False, default=6)
    args = parser.parse_args()

    # ---- read the VEP consequence table --------------------------------
    # Identity mapping: it only serves to reject unknown IMPACT values.
    dRank = {
        "HIGH": "HIGH",
        "LOW": "LOW",
        "MODERATE": "MODERATE",
        "MODIFIER": "MODIFIER"
    }
    dSOTermRank = {}
    lSOTerm = []  # first column of the table, in file order
    csqTitle = None
    with open(args.v, 'r') as csqTable:
        for csqLine in csqTable:
            myRowList = csqLine.rstrip().split('\t')
            if csqTitle is None:
                csqTitle = myRowList  # first line is the column header
                continue
            dCsq = dict(zip(csqTitle, myRowList))
            dSOTermRank[dCsq['SO term']] = dRank[dCsq['IMPACT']]
            lSOTerm.append(myRowList[0])

    # ---- read the metadata ---------------------------------------------
    Region = []
    Sample = []
    with open(args.m, 'r') as metadata:
        for line in metadata:
            if line.startswith('sample'):
                header = line.rstrip().split()
            else:
                dMeta = dict(zip(header, line.rstrip().split()))
                Region.append(dMeta['region'])
                Sample.append(dMeta['sample'])
    dSampleRegion = dict(zip(Sample, Region))
    EUR = [key for (key, value) in dSampleRegion.items()
           if value == 'EUROPE']
    # The list is sorted so that a given seed always draws the same samples.
    EURsorted = sorted(EUR)
    random.seed(args.s)

    # ---- loop over the cycles and the VCF lines ------------------------
    columns = ("Chr", "Pos", "VariantClass", "CSQallele", "CSQrank",
               "Consequence", "CSQfreq", "REFfreq", "ALTfreq", "MAF",
               "Cycle", "Population")
    # Fields are separated by a tab surrounded by single spaces; this is the
    # established layout of the output table and is kept unchanged.
    separator = " \t "

    # The output is written through its own handle so that sys.stdout stays
    # usable and the file is flushed and closed when the function ends.
    with open(args.o, 'w') as out:
        out.write(separator.join(columns) + "\n")

        for cycle in range(1, args.c + 1):
            column2retain = []
            sampleToConsider = random.sample(EURsorted, args.n)

            # The VCF is read from the start in every cycle; the handle is
            # closed at the end of the cycle instead of being left open.
            with gzip.open(args.f, 'r') as myinput:
                for line in myinput:
                    # gzip yields bytes in python3: decode every line.
                    decodedLine = line.decode()

                    if decodedLine.startswith('##'):
                        if "ID=CSQ" in decodedLine:
                            # field names follow "Format:" in the description
                            csqHeader = (
                                decodedLine.rstrip().split(":")[1]
                                .lstrip().rstrip("\">").split("|"))
                        continue

                    if decodedLine.startswith('#'):
                        headerColumns = decodedLine.split()
                        for ind in sampleToConsider:
                            column2retain.append(headerColumns.index(ind))
                        continue

                    linesplit = decodedLine.rstrip().split()
                    myref = linesplit[3]
                    myalt = linesplit[4]

                    # Fresh dict per record so that no INFO value leaks from
                    # the previous one; flag entries (no "=") are ignored and
                    # values containing "=" are kept whole.
                    dInfo = {}
                    for field in linesplit[7].split(";"):
                        key, sep, value = field.partition("=")
                        if sep:
                            dInfo[key] = value

                    # several consequences for one record are separated by ","
                    multipleCsq = dInfo["CSQ"].split(",")

                    # genotype columns of the drawn samples only
                    GTfields = [linesplit[column2retain[col]]
                                for col in range(args.n)]
                    nbAploidSamples = len(GTfields) * 2

                    for mcsq in multipleCsq:
                        dCsq = dict(zip(csqHeader, mcsq.split("|")))
                        freqCSQ_REF_ALT = gp.AnnotateFreqCSQ_REF_ALT(
                            dCsq["Allele"], myref, myalt, nbAploidSamples,
                            GTfields)

                        for cons in dCsq['Consequence'].split("&"):
                            # NOTE(review): every term is ranked on its own,
                            # not by the most severe term of the CSQ entry -
                            # confirm this is the intended meaning of
                            # CSQrank.  index() is kept so a term missing
                            # from the table still raises ValueError.
                            mostSevereCsq = lSOTerm[lSOTerm.index(cons)]
                            row = (linesplit[0], linesplit[1],
                                   dCsq["VARIANT_CLASS"], dCsq["Allele"],
                                   dSOTermRank[mostSevereCsq], cons,
                                   freqCSQ_REF_ALT[0], freqCSQ_REF_ALT[1],
                                   freqCSQ_REF_ALT[2], freqCSQ_REF_ALT[3],
                                   cycle, "HGDP")
                            out.write(separator.join(map(str, row)) + "\n")