def readFiles(files, fileType='beagle', chrom=None):
    """Read the genotype files of several populations in lock-step.

    Parameters:
    - `files` - List of file names, one per population
    - `fileType` - Either 'beagle' or 'tped'; any other value writes an
      error message to stderr and exits with status 1
    - `chrom` - tped only: when not None, rows whose first label column
      differs from this value are skipped

    Returns the tuple (pops, nPops, subjects, nSNPs, snpPos, snpNames):
    - `pops` - List with one array per file; row k holds that file's
      values for the k-th retained SNP
    - `nPops` - Number of columns of each array in `pops` (0 when no SNP
      was retained)
    - `subjects` - beagle: first element of the first record yielded by
      the reader; tped: one array per .tfam file holding '<id>_a' and
      '<id>_b' for the first column of every non-blank line
    - `nSNPs` - Number of retained SNPs
    - `snpPos` - List of integer SNP positions
    - `snpNames` - List of SNP names
    """
    nFiles = len(files)
    if fileType == 'beagle':
        files = fileReader.concurrentFileReader(*files)
        # next() instead of .next() works on Python 2.6+ and Python 3.
        subjects = next(files)[0]
    elif fileType == 'tped':
        subjects = []
        for fileName in files:
            tfam = fileReader.openfile(fileName.replace('.tped', '.tfam'))
            try:
                ids = [line.split(None, 1)[0] for line in tfam
                       if line.strip()]
            finally:
                # Close every .tfam handle, even when parsing fails.
                tfam.close()
            subjects.append(np.asarray(
                [sid + suffix for sid in ids for suffix in ('_a', '_b')]))
        files = fileReader.concurrentFileReader(*files, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    else:
        sys.stderr.write('ERROR: Filetype has to be either beagle or tped\n')
        # Non-zero status so that calling shells/scripts see the failure.
        sys.exit(1)

    snpNames = []
    snpPos = []
    pops = [[] for _ in range(nFiles)]
    for snpInfo, values in files:
        if fileType == 'tped':
            if chrom is not None and chrom != snpInfo[0]:
                continue
            # tped label columns: name in column 1, position in column 3.
            name, pos = snpInfo[1], snpInfo[3]
        else:
            name, pos = snpInfo[0], snpInfo[1]
        snpNames.append(name)
        snpPos.append(int(pos))
        for i in range(nFiles):
            pops[i].append(values[i])

    nSNPs = len(snpNames)
    # A real list (not a lazy map object) so it survives the shape lookup
    # below and can be indexed by the caller.
    pops = [np.asarray(pop) for pop in pops]
    nPops = [pop.shape[1] if pop.ndim > 1 else 0 for pop in pops]
    return pops, nPops, subjects, nSNPs, snpPos, snpNames
def readFiles(fileNames, isBeagle=True):
    """Read the genotypes of all populations into one haplotype matrix.

    Parameters:
    - `fileNames` - List of beagle or tped file names, one per population
    - `isBeagle` - True for beagle input; False for tped input, in which
      case the .tfam file matching every .tped file is read as well

    Returns the tuple (subjects, labels, snpNames, snpLocations, vals):
    - `subjects` - beagle: first element of the first record yielded by
      the reader; tped: one array per .tfam file holding '<id>_a' and
      '<id>_b' for the second column of every non-blank line
    - `labels` - Array with the index of each entry of `subjects`
      repeated once per element of that entry
    - `snpNames` - List of SNP names
    - `snpLocations` - Array of SNP locations as floats
    - `vals` - Transposed array of the per-SNP results of
      fileReader.nucleotides2Haplotypes (one column per SNP)
    """
    if isBeagle:
        files = fileReader.concurrentFileReader(*fileNames, key=0)
        # next() instead of .next() works on Python 2.6+ and Python 3.
        subjects = next(files)[0]
    else:
        subjects = []
        for fileName in fileNames:
            tfam = fileReader.openfile(fileName.replace('.tped', '.tfam'))
            try:
                ids = [line.split(None, 2)[1] for line in tfam
                       if line.strip()]
            finally:
                # Close every .tfam handle, even when parsing fails.
                tfam.close()
            subjects.append(np.asarray(
                [sid + suffix for sid in ids for suffix in ('_a', '_b')]))
        files = fileReader.concurrentFileReader(*fileNames, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    labels = np.asarray([i for i, sub in enumerate(subjects)
                         for _ in range(len(sub))])

    # Columns of the label record holding the SNP name and its location.
    if isBeagle:
        nameCol, locCol = 0, 1
    else:
        nameCol, locCol = 1, 3

    snpNames = []
    snpLocations = []  # physical location read from the files
    vals = []  # genotype values, one entry per SNP
    for snpInfo, snps in files:
        snpLocations.append(float(snpInfo[locCol]))
        snpNames.append(snpInfo[nameCol])
        # Concatenate the per-file allele lists in linear time.
        alleles = [allele for fileSnps in snps for allele in fileSnps]
        vals.append(fileReader.nucleotides2Haplotypes(alleles))
    vals = np.asarray(vals).T
    snpLocations = np.asarray(snpLocations)
    return subjects, labels, snpNames, snpLocations, vals
def readFiles(fileNames, isBeagle=True):
    """Read the genotypes of all populations into one haplotype matrix.

    Parameters:
    - `fileNames` - List of beagle or tped file names, one per population
    - `isBeagle` - True for beagle input; False for tped input, in which
      case the .tfam file matching every .tped file is read as well

    Returns the tuple (subjects, labels, snpNames, snpLocations, vals):
    - `subjects` - beagle: first element of the first record yielded by
      the reader; tped: one array per .tfam file holding '<id>_a' and
      '<id>_b' for the second column of every non-blank line
    - `labels` - Array with the index of each entry of `subjects`
      repeated once per element of that entry
    - `snpNames` - List of SNP names
    - `snpLocations` - Array of SNP locations as floats
    - `vals` - Transposed array of the per-SNP results of
      fileReader.nucleotides2Haplotypes (one column per SNP)
    """
    if isBeagle:
        files = fileReader.concurrentFileReader(*fileNames, key=0)
        # next() instead of .next() works on Python 2.6+ and Python 3.
        subjects = next(files)[0]
    else:
        subjects = []
        for fileName in fileNames:
            tfam = fileReader.openfile(fileName.replace('.tped', '.tfam'))
            try:
                ids = [line.split(None, 2)[1] for line in tfam
                       if line.strip()]
            finally:
                # Close every .tfam handle, even when parsing fails.
                tfam.close()
            subjects.append(np.asarray(
                [sid + suffix for sid in ids for suffix in ('_a', '_b')]))
        files = fileReader.concurrentFileReader(*fileNames, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    labels = np.asarray([i for i, sub in enumerate(subjects)
                         for _ in range(len(sub))])

    # Columns of the label record holding the SNP name and its location.
    if isBeagle:
        nameCol, locCol = 0, 1
    else:
        nameCol, locCol = 1, 3

    snpNames = []
    snpLocations = []  # physical location read from the files
    vals = []  # genotype values, one entry per SNP
    for snpInfo, snps in files:
        snpLocations.append(float(snpInfo[locCol]))
        snpNames.append(snpInfo[nameCol])
        # Concatenate the per-file allele lists in linear time.
        alleles = [allele for fileSnps in snps for allele in fileSnps]
        vals.append(fileReader.nucleotides2Haplotypes(alleles))
    vals = np.asarray(vals).T
    snpLocations = np.asarray(snpLocations)
    return subjects, labels, snpNames, snpLocations, vals
def success(originFile, admixedClassPre, admixedClass, winSize=WINSIZE):
    """Score two window classifications against the known origins.

    Parameters:
    - `originFile` - Whitespace-delimited file of true origins; the first
      line and the first two columns of every other line are skipped and
      the remainder is parsed as floats
    - `admixedClassPre` - Per-window classes scored as the "svm" result
    - `admixedClass` - Per-window classes scored as the "hmm" result
    - `winSize` - Number of times every window row is repeated before it
      is compared with the rows of `originFile`

    Returns the tuple (hmm mean, hmm std, svm mean, svm std) of the
    per-column percentage of rows equal to the true origin.
    """
    fp = fileReader.openfile(originFile)
    try:
        rows = [line.split()[2:] for line in fp.readlines()[1:]]
    finally:
        # Do not leak the handle, even when reading fails.
        fp.close()
    # Builtin float: the np.float alias no longer exists in current NumPy.
    correct = np.array(rows, float)
    nRows = len(correct)

    def percentCorrect(windowClass):
        # Expand windows to one row per origin row, then compare per column.
        expanded = np.repeat(windowClass, winSize, 0)[:nRows, :]
        return 100 * (expanded == correct).sum(0) / float(nRows)

    svmSuccess = percentCorrect(admixedClassPre)
    hmmSuccess = percentCorrect(admixedClass)
    return (np.mean(hmmSuccess), np.std(hmmSuccess),
            np.mean(svmSuccess), np.std(svmSuccess))
def readFst():
    """Read pairwise Fst values from FILEFST + '.gz'.

    Every non-blank line must hold three tab-separated fields:
    population 1, population 2 and the value.

    Returns a nested dict where fst[pop1][pop2] and fst[pop2][pop1] both
    hold the value as a float.
    """
    fst = {}
    fp = fileReader.openfile(FILEFST + '.gz')
    try:
        for line in fp:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            pop1, pop2, val = line.split('\t')
            val = float(val)
            fst.setdefault(pop1, {})[pop2] = val
            fst.setdefault(pop2, {})[pop1] = val
    finally:
        # Close the file even when a malformed line raises.
        fp.close()
    return fst
def readFst():
    """Read pairwise Fst values from FILEFST + '.gz'.

    Every non-blank line must hold three tab-separated fields:
    population 1, population 2 and the value.

    Returns a nested dict where fst[pop1][pop2] and fst[pop2][pop1] both
    hold the value as a float.
    """
    fst = {}
    fp = fileReader.openfile(FILEFST + '.gz')
    try:
        for line in fp:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            pop1, pop2, val = line.split('\t')
            val = float(val)
            fst.setdefault(pop1, {})[pop2] = val
            fst.setdefault(pop2, {})[pop1] = val
    finally:
        # Close the file even when a malformed line raises.
        fp.close()
    return fst
def success(originFile, admixedClassPre, admixedClass, winSize=WINSIZE):
    """Score two window classifications against the known origins.

    Parameters:
    - `originFile` - Whitespace-delimited file of true origins; the first
      line and the first two columns of every other line are skipped and
      the remainder is parsed as floats
    - `admixedClassPre` - Per-window classes scored as the "svm" result
    - `admixedClass` - Per-window classes scored as the "hmm" result
    - `winSize` - Number of times every window row is repeated before it
      is compared with the rows of `originFile`

    Returns the tuple (hmm mean, hmm std, svm mean, svm std) of the
    per-column percentage of rows equal to the true origin.
    """
    fp = fileReader.openfile(originFile)
    try:
        rows = [line.split()[2:] for line in fp.readlines()[1:]]
    finally:
        # Do not leak the handle, even when reading fails.
        fp.close()
    # Builtin float: the np.float alias no longer exists in current NumPy.
    correct = np.array(rows, float)
    nRows = len(correct)

    def percentCorrect(windowClass):
        # Expand windows to one row per origin row, then compare per column.
        expanded = np.repeat(windowClass, winSize, 0)[:nRows, :]
        return 100 * (expanded == correct).sum(0) / float(nRows)

    svmSuccess = percentCorrect(admixedClassPre)
    hmmSuccess = percentCorrect(admixedClass)
    return (np.mean(hmmSuccess), np.std(hmmSuccess),
            np.mean(svmSuccess), np.std(svmSuccess))
def determineAllChromosomes(fileNames):
    """Finds all unique chromosome names in the first column of a tped file.

    Only the first file in `fileNames` is read.

    Parameters:
    - `fileNames` - List of tped file names

    Returns the list of chromosome names in order of first appearance.
    """
    fp = fileReader.openfile(fileNames[0])
    seen = set()
    chroms = []
    try:
        for line in fp:
            fields = line.split(None, 1)
            if not fields:
                # Blank line: there is no first column to read.
                continue
            chrom = fields[0]
            if chrom in seen:
                continue
            seen.add(chrom)
            chroms.append(chrom)
    finally:
        # Close the file even when reading fails.
        fp.close()
    return chroms
def determineAllChromosomes(fileNames):
    """Finds all unique chromosome names in the first column of a tped file.

    Only the first file in `fileNames` is read.

    Parameters:
    - `fileNames` - List of tped file names

    Returns the list of chromosome names in order of first appearance.
    """
    fp = fileReader.openfile(fileNames[0])
    seen = set()
    chroms = []
    try:
        for line in fp:
            fields = line.split(None, 1)
            if not fields:
                # Blank line: there is no first column to read.
                continue
            chrom = fields[0]
            if chrom in seen:
                continue
            seen.add(chrom)
            chroms.append(chrom)
    finally:
        # Close the file even when reading fails.
        fp.close()
    return chroms
def readFiles(files, fileType='beagle', chrom=None):
    """Read the genotype files of several populations in lock-step.

    Parameters:
    - `files` - List of file names, one per population
    - `fileType` - Either 'beagle' or 'tped'; any other value writes an
      error message to stderr and exits with status 1
    - `chrom` - tped only: when not None, rows whose first label column
      differs from this value are skipped

    Returns the tuple (pops, nPops, subjects, nSNPs, snpPos, snpNames):
    - `pops` - List with one array per file; row k holds that file's
      values for the k-th retained SNP
    - `nPops` - Number of columns of each array in `pops` (0 when no SNP
      was retained)
    - `subjects` - beagle: first element of the first record yielded by
      the reader; tped: one array per .tfam file holding '<id>_a' and
      '<id>_b' for the first column of every non-blank line
    - `nSNPs` - Number of retained SNPs
    - `snpPos` - List of integer SNP positions
    - `snpNames` - List of SNP names
    """
    nFiles = len(files)
    if fileType == 'beagle':
        files = fileReader.concurrentFileReader(*files)
        # next() instead of .next() works on Python 2.6+ and Python 3.
        subjects = next(files)[0]
    elif fileType == 'tped':
        subjects = []
        for fileName in files:
            tfam = fileReader.openfile(fileName.replace('.tped', '.tfam'))
            try:
                ids = [line.split(None, 1)[0] for line in tfam
                       if line.strip()]
            finally:
                # Close every .tfam handle, even when parsing fails.
                tfam.close()
            subjects.append(np.asarray(
                [sid + suffix for sid in ids for suffix in ('_a', '_b')]))
        files = fileReader.concurrentFileReader(*files, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    else:
        sys.stderr.write('ERROR: Filetype has to be either beagle or tped\n')
        # Non-zero status so that calling shells/scripts see the failure.
        sys.exit(1)

    snpNames = []
    snpPos = []
    pops = [[] for _ in range(nFiles)]
    for snpInfo, values in files:
        if fileType == 'tped':
            if chrom is not None and chrom != snpInfo[0]:
                continue
            # tped label columns: name in column 1, position in column 3.
            name, pos = snpInfo[1], snpInfo[3]
        else:
            name, pos = snpInfo[0], snpInfo[1]
        snpNames.append(name)
        snpPos.append(int(pos))
        for i in range(nFiles):
            pops[i].append(values[i])

    nSNPs = len(snpNames)
    # A real list (not a lazy map object) so it survives the shape lookup
    # below and can be indexed by the caller.
    pops = [np.asarray(pop) for pop in pops]
    nPops = [pop.shape[1] if pop.ndim > 1 else 0 for pop in pops]
    return pops, nPops, subjects, nSNPs, snpPos, snpNames
# Script fragment: saves the current figure as 'fig2.<FILETYPE>' and then
# prepares the data for "Figure 3 - Simulated Qatari". It loads the
# population labels and the admixedClass/posterior tables from data/, looks
# up a POPCOLORS entry for every inferred and every true ancestry code
# (stored as (subject, window, 4) arrays divided by 255), replaces the true
# origin codes 0/1/2 by the positions of 'yoruba'/'bedouin'/'brahui' in the
# population array, and prints mean and std of the per-column percentage of
# rows where the inferred class (every row repeated 200 times) equals the
# true class.
# NOTE(review): the statements below have lost their line breaks and the
# trailing `for tmpPop in ...:` loop has no visible body, so the code is
# left untouched here.
# NOTE(review): the origin codes are remapped in place one after another;
# if 'yoruba' or 'bedouin' sits at index 1 or 2 of the population array,
# entries remapped by an earlier statement are remapped again -- verify.
# NOTE(review): np.int, the print statement and pylab.find are not
# available in current NumPy / Python 3 / matplotlib -- confirm the
# versions this script targets.
pylab.savefig('fig2.'+FILETYPE,format=FILETYPE) ################################ # Figure 3 - Simulated Qatari ################################ simQatarPops=np.load('data/simulatedQatar.populations.npy') colors=[POPCOLORS[label] for label in simQatarPops] simQatarAncestry=np.loadtxt('data/simulatedQatar.admixedClass.csv') simQatarAncestryP=np.loadtxt('data/simulatedQatar.posterior.csv') nsimQatarWins, nsimQatarSubs=simQatarAncestry.shape simQatarColors=np.zeros((nsimQatarSubs,nsimQatarWins,4)) for i in range(nsimQatarWins): for j in range(nsimQatarSubs): simQatarColors[j,i,:]=colors[int(simQatarAncestry[i,j])] simQatarColors=simQatarColors/255. simQatarCorrect=np.asarray([l.strip().split('\t')[2:] for l in fileReader.openfile('data/hgdp3/admixed_hgdp_origin_yoruba_bedouin_brahui.chr1.csv.gz').readlines()[1:]], np.int) simQatarCorrect[simQatarCorrect==0]=np.nonzero(simQatarPops==['yoruba'])[0][0] simQatarCorrect[simQatarCorrect==1]=np.nonzero(simQatarPops==['bedouin'])[0][0] simQatarCorrect[simQatarCorrect==2]=np.nonzero(simQatarPops==['brahui'])[0][0] nsimQatarWins, nsimQatarSubs=simQatarCorrect.shape simQatarCorrectColors=np.zeros((nsimQatarSubs,nsimQatarWins,4)) for i in range(nsimQatarWins): for j in range(nsimQatarSubs): simQatarCorrectColors[j,i,:]=colors[int(simQatarCorrect[i,j])] simQatarCorrectColors=simQatarCorrectColors/255. comparison=np.repeat(simQatarAncestry, 200, 0)[:nsimQatarWins,:] success=100*(comparison==simQatarCorrect).sum(0)/float(nsimQatarWins) print 'Correct %0.3g +/- %0.2g' %(np.mean(success), np.std(success)) #Convert to fuzzy correct yoruba=pylab.find(simQatarPops==('yoruba'))[0] for tmpPop in ['mandenka', 'bantu_n.e.', 'biaka_pygmies', 'mbuti_pygmies']:
################################
# Figure 3 - Simulated Qatari
################################
# Population labels, one colour per label, and the inferred ancestry and
# posterior tables of the simulated Qatari samples.
simQatarPops = np.load('data/simulatedQatar.populations.npy')
colors = [POPCOLORS[label] for label in simQatarPops]
simQatarAncestry = np.loadtxt('data/simulatedQatar.admixedClass.csv')
simQatarAncestryP = np.loadtxt('data/simulatedQatar.posterior.csv')

# Colour image of the inferred ancestry, indexed [subject, window, channel].
nsimQatarWins, nsimQatarSubs = simQatarAncestry.shape
simQatarColors = np.zeros((nsimQatarSubs, nsimQatarWins, 4))
for i in range(nsimQatarWins):
    for j in range(nsimQatarSubs):
        simQatarColors[j, i, :] = colors[int(simQatarAncestry[i, j])]
simQatarColors = simQatarColors / 255.

# True origins: skip the header line and the first two columns. The file
# is closed explicitly instead of being left to the garbage collector.
originFp = fileReader.openfile(
    'data/hgdp3/admixed_hgdp_origin_yoruba_bedouin_brahui.chr1.csv.gz')
try:
    originRows = [l.strip().split('\t')[2:]
                  for l in originFp.readlines()[1:]]
finally:
    originFp.close()
# Builtin int: the np.int alias no longer exists in current NumPy.
simQatarCorrect = np.asarray(originRows, int)

# Translate the origin codes 0/1/2 into indices into simQatarPops. The
# masks are taken from an untouched copy so that a freshly written index
# (which may itself be 1 or 2) is never remapped a second time.
originCodes = simQatarCorrect.copy()
for originCode, popName in enumerate(['yoruba', 'bedouin', 'brahui']):
    simQatarCorrect[originCodes == originCode] = np.nonzero(
        simQatarPops == [popName])[0][0]

# Colour image of the true ancestry, same layout as simQatarColors.
nsimQatarWins, nsimQatarSubs = simQatarCorrect.shape
simQatarCorrectColors = np.zeros((nsimQatarSubs, nsimQatarWins, 4))
for i in range(nsimQatarWins):
    for j in range(nsimQatarSubs):
        simQatarCorrectColors[j, i, :] = colors[int(simQatarCorrect[i, j])]
simQatarCorrectColors = simQatarCorrectColors / 255.

# Repeat every inferred row 200 times and cut to the number of true rows.
# NOTE(review): 200 is presumably the window size in SNPs -- confirm.
comparison = np.repeat(simQatarAncestry, 200, 0)[:nsimQatarWins, :]
# pylab.plot(np.vstack(allAdmixedClass).mean(0), range(nPygmy)); pylab.ylim(nPygmy, 0); pylab.yticks([]); pylab.title('Percent Pygmy'); pylab.xlim(0,1); pylab.xticks([0,.5,1], fontsize=8)
# pylab.plot(alphas, range(nPygmy)); pylab.ylim(nPygmy, 0); pylab.yticks([]); pylab.title('Percent Pygmy', fontsize=8); pylab.xlim(0,1); pylab.xticks([0,.5,1])
#Colorbar
# Invisible helper axes that only host the colorbar.
pylab.axes([0.88, .08, .06, .9])
pylab.xlim(-1, 1)
pylab.ylim(-1, 1)
pylab.axis('off')
cbar = pylab.colorbar(fraction=.1, ticks=[-1, -0.5, 0, 0.5, 1])
cbar.ax.set_yticklabels(['1.0 Bantu', '0.5', '0', '0.5', '1.0 Pygmy'],
                        fontsize=6)
pylab.suptitle('Chromosomes')
# If simulated data calculate Success Rate
if locals().get('correctFile'):
    # True classes: skip the header line and the first four columns. The
    # file is closed explicitly instead of being leaked.
    correctFp = fileReader.openfile(correctFile)
    try:
        correctRows = [l.split()[4:] for l in correctFp.readlines()[1:]]
    finally:
        correctFp.close()
    # Builtin float: the np.float alias no longer exists in current NumPy.
    correct = np.array(correctRows, float)
    # Expand every window row to winSize rows before comparing.
    svmClass = np.repeat(admixedClassPre, winSize, 0)
    hmmClass = np.repeat(admixedClass, winSize, 0)
    svmSuccess = 100 - sum(
        abs(svmClass[:len(correct), :] - correct)) / len(correct) * 100
    hmmSuccess = 100 - sum(
        abs(hmmClass[:len(correct), :] - correct)) / len(correct) * 100
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print('Correct %0.3g +/- %0.2g (%0.3g +/- %0.2g)' % (
        np.mean(hmmSuccess), np.std(hmmSuccess),
        np.mean(svmSuccess), np.std(svmSuccess)))
# pylab.figure();
# pylab.subplot(2,1,1);
# pylab.imshow(((admixedClass*2-1)*p).T, interpolation='nearest', cmap=pylab.cm.copper, vmin=0, vmax=2)
# pylab.ylabel('Sample '); pylab.yticks([]); pylab.xticks([]); pylab.axis('tight'); pylab.title('Estimat')
# pylab.subplot(2,1,2);
# Script fragment: redraws the figure and advances xStart by xWidth + 0.002,
# adds a colorbar on hidden axes with tick labels running from '1.0 Bantu'
# to '1.0 Pygmy', titles the figure 'Chromosomes' and, when a truthy
# `correctFile` is found in locals(), reads it (first line and first four
# columns skipped) and prints mean +/- std of hmmSuccess and svmSuccess,
# each computed as 100 - sum(abs(class - correct)) / len(correct) * 100
# after repeating every class row winSize times.
# NOTE(review): line breaks and indentation were lost on the line below;
# the leading pylab.draw() / xStart update presumably closes a loop that
# starts above this fragment, so the code is left byte-for-byte unchanged
# -- confirm against the original script.
# NOTE(review): np.float and the print statement are not available in
# current NumPy / Python 3 -- confirm the versions this script targets.
pylab.draw() xStart+=(xWidth+0.002) # #Plot average Pygmy Ancestry per sample # pylab.axes([0.92, 0.29, .07, .52]) # pylab.plot(np.vstack(allAdmixedClass).mean(0), range(nPygmy)); pylab.ylim(nPygmy, 0); pylab.yticks([]); pylab.title('Percent Pygmy'); pylab.xlim(0,1); pylab.xticks([0,.5,1], fontsize=8) # pylab.plot(alphas, range(nPygmy)); pylab.ylim(nPygmy, 0); pylab.yticks([]); pylab.title('Percent Pygmy', fontsize=8); pylab.xlim(0,1); pylab.xticks([0,.5,1]) #Colorbar pylab.axes([0.88, .08, .06, .9]); pylab.xlim(-1, 1); pylab.ylim(-1, 1); pylab.axis('off') cbar=pylab.colorbar(fraction=.1, ticks=[-1, -0.5, 0, 0.5, 1]) cbar.ax.set_yticklabels(['1.0 Bantu', '0.5', '0', '0.5', '1.0 Pygmy'], fontsize=6) pylab.suptitle('Chromosomes') # If simulated data calculate Success Rate if locals().get('correctFile'): correct=np.array([l.split()[4:] for l in fileReader.openfile(correctFile).readlines()[1:]], np.float) svmClass=np.repeat(admixedClassPre, winSize, 0) hmmClass=np.repeat(admixedClass, winSize, 0) svmSuccess=100-sum(abs(svmClass[:len(correct),:]-correct))/len(correct)*100 hmmSuccess=100-sum(abs(hmmClass[:len(correct),:]-correct))/len(correct)*100 print 'Correct %0.3g +/- %0.2g (%0.3g +/- %0.2g)' %(np.mean(hmmSuccess), np.std(hmmSuccess), np.mean(svmSuccess), np.std(svmSuccess)) # pylab.figure(); # pylab.subplot(2,1,1); # pylab.imshow(((admixedClass*2-1)*p).T, interpolation='nearest', cmap=pylab.cm.copper, vmin=0, vmax=2) # pylab.ylabel('Sample '); pylab.yticks([]); pylab.xticks([]); pylab.axis('tight'); pylab.title('Estimat') # pylab.subplot(2,1,2); # pylab.imshow(correct[:, :].T, interpolation='nearest', cmap=pylab.cm.copper, vmin=0, vmax=2) # pylab.ylabel('Sample');pylab.yticks([]); pylab.xticks([]) # pylab.xlabel('Position along %s' %CHROM); pylab.axis('tight'); pylab.title('True ancestry')