Ejemplo n.º 1
0
def readFiles(files, fileType='beagle', chrom=None):
    """Reads genotypes for several populations from parallel files.

    Parameters:
    - `files` - List of file names, one per population
    - `fileType` - Either 'beagle' or 'tped'; anything else writes an
      error to stderr and exits with status 1
    - `chrom` - tped only: when not None, rows whose first label column
      differs from this value are skipped

    Returns the tuple (pops, nPops, subjects, nSNPs, snpPos, snpNames):
    - `pops` - List with one numpy array per input file, one row per
      SNP that was kept
    - `nPops` - Second dimension (shape[1]) of each array in pops
    - `subjects` - beagle: first element of the first record produced by
      the reader; tped: one array per .tfam file holding '<id>_a' and
      '<id>_b' for every line, where <id> is the first column
    - `nSNPs` - Number of SNPs kept
    - `snpPos` - Integer positions of the kept SNPs
    - `snpNames` - Names of the kept SNPs
    """
    nFiles = len(files)
    if fileType == 'beagle':
        reader = fileReader.concurrentFileReader(*files)
        # next() works on Python 2.6+ and Python 3; .next() is Python 2 only
        subjects = next(reader)[0]
    elif fileType == 'tped':
        subjects = []
        for name in files:
            fp = fileReader.openfile(name.replace('.tped', '.tfam'))
            try:
                ids = [line.split(None, 1)[0] for line in fp]
            finally:
                # The handle is only needed for the ids; do not leak it
                fp.close()
            # Two entries per line of the .tfam file, suffixed _a and _b
            subjects.append(np.asarray(
                [sid + suffix for sid in ids for suffix in ('_a', '_b')]))
        reader = fileReader.concurrentFileReader(*files, nHeaders=0,
                                                 key=[0, 1], nLabels=4)
    else:
        sys.stderr.write('ERROR: Filetype has to be either beagle or tped')
        # Non-zero status so calling scripts can detect the failure
        sys.exit(1)

    snpNames = []
    snpPos = []
    pops = [[] for i in range(nFiles)]
    for labels, genotypes in reader:
        if fileType == 'tped':
            if chrom is not None and chrom != labels[0]:
                continue
            # tped label columns: name is column 1, position is column 3
            name, pos = labels[1], labels[3]
        else:
            name, pos = labels[0], labels[1]
        snpNames.append(name)
        snpPos.append(int(pos))
        for i in range(nFiles):
            pops[i].append(genotypes[i])
    nSNPs = len(snpNames)
    # A real list (not map()) so the result survives being iterated twice
    pops = [np.asarray(pop) for pop in pops]
    nPops = [pop.shape[1] for pop in pops]
    return pops, nPops, subjects, nSNPs, snpPos, snpNames
Ejemplo n.º 2
0
def readFiles(fileNames, isBeagle=True):
    """Reads SNP names, positions and haplotype values from parallel files.

    Parameters:
    - `fileNames` - List of file names; for tped input the matching
      .tfam file names are derived by replacing '.tped' with '.tfam'
    - `isBeagle` - True for beagle input, False for tped input

    Returns the tuple (subjects, labels, snpNames, snpLocations, vals):
    - `subjects` - beagle: first element of the first record produced by
      the reader; tped: one array per .tfam file holding '<id>_a' and
      '<id>_b' for every line, where <id> is the second column
    - `labels` - Array with the index of the originating entry of
      subjects, repeated once per element of that entry
    - `snpNames` - List of SNP names
    - `snpLocations` - Array of SNP positions as floats
    - `vals` - Transposed array of the values returned by
      fileReader.nucleotides2Haplotypes for every SNP
    """
    snpNames = []
    snpLocations = []  # physical location taken from the files
    vals = []          # genotype values, one entry per SNP

    if isBeagle:
        files = fileReader.concurrentFileReader(*fileNames, key=0)
        # next() works on Python 2.6+ and Python 3; .next() is Python 2 only
        subjects = next(files)[0]
    else:
        subjects = []
        for name in fileNames:
            fp = fileReader.openfile(name.replace('.tped', '.tfam'))
            try:
                ids = [line.split(None, 2)[1] for line in fp]
            finally:
                # The handle is only needed for the ids; do not leak it
                fp.close()
            subjects.append(np.asarray(
                [sid + suffix for sid in ids for suffix in ('_a', '_b')]))
        files = fileReader.concurrentFileReader(*fileNames, nHeaders=0,
                                                key=[0, 1], nLabels=4)
    labels = np.asarray(
        sum([[i] * len(sub) for i, sub in enumerate(subjects)], []))

    # Label columns holding the SNP name and position differ per format
    if isBeagle:
        nameCol, posCol = 0, 1
    else:
        nameCol, posCol = 1, 3
    for snpInfo, snps in files:
        snpLocations.append(float(snpInfo[posCol]))
        snpNames.append(snpInfo[nameCol])
        # Concatenate the per-file lists before converting to haplotypes
        vals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))
    vals = np.asarray(vals).T
    snpLocations = np.asarray(snpLocations)
    return subjects, labels, snpNames, snpLocations, vals
Ejemplo n.º 3
0
def readFiles(fileNames, isBeagle=True):
    """Reads SNP names, positions and haplotype values from parallel files.

    Parameters:
    - `fileNames` - List of file names; for tped input the matching
      .tfam file names are derived by replacing '.tped' with '.tfam'
    - `isBeagle` - True for beagle input, False for tped input

    Returns the tuple (subjects, labels, snpNames, snpLocations, vals):
    - `subjects` - beagle: first element of the first record produced by
      the reader; tped: one array per .tfam file holding '<id>_a' and
      '<id>_b' for every line, where <id> is the second column
    - `labels` - Array with the index of the originating entry of
      subjects, repeated once per element of that entry
    - `snpNames` - List of SNP names
    - `snpLocations` - Array of SNP positions as floats
    - `vals` - Transposed array of the values returned by
      fileReader.nucleotides2Haplotypes for every SNP
    """
    snpNames=[]
    snpLocations=[]  #stores physical location from files
    vals=[]          #Stores Values of genotypes

    if isBeagle:
        files=fileReader.concurrentFileReader(*fileNames, key=0)
        # next() works on Python 2.6+ and Python 3; .next() is Python 2 only
        subjects=next(files)[0]
    else:
        subjects=[]
        for name in fileNames:
            fp=fileReader.openfile(name.replace('.tped', '.tfam'))
            try:
                ids=[line.split(None, 2)[1] for line in fp]
            finally:
                # The handle is only needed for the ids; do not leak it
                fp.close()
            subjects.append(np.asarray([sid+suffix for sid in ids for suffix in ('_a', '_b')]))
        files=fileReader.concurrentFileReader(*fileNames, nHeaders=0, key=[0,1], nLabels=4)
    labels=np.asarray(sum([[i]*len(sub) for i, sub in enumerate(subjects)], []))

    # Label columns holding the SNP name and position differ per format
    if isBeagle:
        nameCol, posCol=0, 1
    else:
        nameCol, posCol=1, 3
    for snpInfo, snps in files:
        snpLocations.append(float(snpInfo[posCol]))
        snpNames.append(snpInfo[nameCol])
        # Concatenate the per-file lists before converting to haplotypes
        vals.append(fileReader.nucleotides2Haplotypes(sum(snps, [])))
    vals=np.asarray(vals).T
    snpLocations=np.asarray(snpLocations)
    return subjects, labels, snpNames, snpLocations, vals
Ejemplo n.º 4
0
def success(originFile, admixedClassPre, admixedClass, winSize=WINSIZE):
    """Compares window-level classifications against the true origins.

    Parameters:
    - `originFile` - File with the true values; its first line and the
      first two whitespace-separated columns of every line are skipped
    - `admixedClassPre` - Array of classes before smoothing, one row per
      window (named svm in the code)
    - `admixedClass` - Array of classes after smoothing, one row per
      window (named hmm in the code)
    - `winSize` - Number of rows of the origin file covered by a window

    Returns (mean hmm, std hmm, mean svm, std svm) where each success
    value is the percentage of rows, per column, that equal the truth.
    """
    fp=fileReader.openfile(originFile)
    try:
        rows=[line.split()[2:] for line in fp.readlines()[1:]]
    finally:
        # The original left this handle open
        fp.close()
    # Builtin float: np.float was only an alias and is gone in NumPy >= 1.24
    correct=np.array(rows, float)
    nRows=len(correct)
    #Compare and find successRate
    # Repeat every window row winSize times, then trim to the truth length
    svmClass=np.repeat(admixedClassPre, winSize, 0)[:nRows,:]
    hmmClass=np.repeat(admixedClass, winSize, 0)[:nRows,:]
    svmSuccess=100*(svmClass==correct).sum(0)/float(nRows)
    hmmSuccess=100*(hmmClass==correct).sum(0)/float(nRows)
    return np.mean(hmmSuccess), np.std(hmmSuccess), np.mean(svmSuccess), np.std(svmSuccess)
Ejemplo n.º 5
0
def readFst():
    """Reads pairwise values from the file FILEFST+'.gz' into a nested dict.

    Every non-blank line must hold three tab-separated fields: two
    population names and a numeric value.

    Returns a dict of dicts that is symmetric in its two keys:
    fst[pop1][pop2] == fst[pop2][pop1] == float(value).
    """
    fst={}
    fp=fileReader.openfile(FILEFST+'.gz')
    try:
        for line in fp:
            line=line.strip()
            if not line:
                # A blank (e.g. trailing) line would break the unpacking
                continue
            pop1, pop2, val=line.split('\t')
            val=float(val)
            fst.setdefault(pop1, {})[pop2]=val
            fst.setdefault(pop2, {})[pop1]=val
    finally:
        # Close the handle even when a malformed line raises
        fp.close()
    return fst
Ejemplo n.º 6
0
def readFst():
    """Reads pairwise values from the file FILEFST + '.gz' into a nested dict.

    Every non-blank line must hold three tab-separated fields: two
    population names and a numeric value.

    Returns a dict of dicts that is symmetric in its two keys:
    fst[pop1][pop2] == fst[pop2][pop1] == float(value).
    """
    fst = {}
    fp = fileReader.openfile(FILEFST + '.gz')
    try:
        for line in fp:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would break the unpacking
                continue
            pop1, pop2, val = line.split('\t')
            val = float(val)
            fst.setdefault(pop1, {})[pop2] = val
            fst.setdefault(pop2, {})[pop1] = val
    finally:
        # Close the handle even when a malformed line raises
        fp.close()
    return fst
Ejemplo n.º 7
0
def success(originFile, admixedClassPre, admixedClass, winSize=WINSIZE):
    """Compares window-level classifications against the true origins.

    Parameters:
    - `originFile` - File with the true values; its first line and the
      first two whitespace-separated columns of every line are skipped
    - `admixedClassPre` - Array of classes before smoothing, one row per
      window (named svm in the code)
    - `admixedClass` - Array of classes after smoothing, one row per
      window (named hmm in the code)
    - `winSize` - Number of rows of the origin file covered by a window

    Returns (mean hmm, std hmm, mean svm, std svm) where each success
    value is the percentage of rows, per column, that equal the truth.
    """
    fp = fileReader.openfile(originFile)
    try:
        rows = [line.split()[2:] for line in fp.readlines()[1:]]
    finally:
        # The original left this handle open
        fp.close()
    # Builtin float: np.float was only an alias and is gone in NumPy >= 1.24
    correct = np.array(rows, float)
    nRows = len(correct)
    #Compare and find successRate
    # Repeat every window row winSize times, then trim to the truth length
    svmClass = np.repeat(admixedClassPre, winSize, 0)[:nRows, :]
    hmmClass = np.repeat(admixedClass, winSize, 0)[:nRows, :]
    svmSuccess = 100 * (svmClass == correct).sum(0) / float(nRows)
    hmmSuccess = 100 * (hmmClass == correct).sum(0) / float(nRows)
    return (np.mean(hmmSuccess), np.std(hmmSuccess),
            np.mean(svmSuccess), np.std(svmSuccess))
Ejemplo n.º 8
0
def determineAllChromosomes(fileNames):
    """Finds all unique chromosome names in first column of tped file.

    Only the first entry of `fileNames` is scanned. Blank lines are
    ignored.

    Parameters:
    - `fileNames` - List of fileNames

    Returns the list of distinct first-column values in order of first
    appearance.
    """
    fp = fileReader.openfile(fileNames[0])
    seen = set()
    vals = []
    try:
        for line in fp:
            fields = line.split(None, 1)
            if not fields:
                # Blank line: there is no first column to read
                continue
            chrom = fields[0]
            if chrom in seen:
                continue
            seen.add(chrom)
            vals.append(chrom)
    finally:
        # The original never closed the file
        fp.close()
    return vals
Ejemplo n.º 9
0
def determineAllChromosomes(fileNames):
    """Finds all unique chromosome names in first column of tped file.

    Only the first entry of `fileNames` is scanned. Blank lines are
    ignored.

    Parameters:
    - `fileNames` - List of fileNames

    Returns the list of distinct first-column values in order of first
    appearance.
    """
    fp=fileReader.openfile(fileNames[0])
    seen=set()
    vals=[]
    try:
        for line in fp:
            fields=line.split(None, 1)
            if not fields:
                # Blank line: there is no first column to read
                continue
            chrom=fields[0]
            if chrom in seen:
                continue
            seen.add(chrom)
            vals.append(chrom)
    finally:
        # The original never closed the file
        fp.close()
    return vals
Ejemplo n.º 10
0
def readFiles(files, fileType='beagle', chrom=None):
    """Reads genotypes for several populations from parallel files.

    Parameters:
    - `files` - List of file names, one per population
    - `fileType` - Either 'beagle' or 'tped'; anything else writes an
      error to stderr and exits with status 1
    - `chrom` - tped only: when not None, rows whose first label column
      differs from this value are skipped

    Returns the tuple (pops, nPops, subjects, nSNPs, snpPos, snpNames):
    - `pops` - List with one numpy array per input file, one row per
      SNP that was kept
    - `nPops` - Second dimension (shape[1]) of each array in pops
    - `subjects` - beagle: first element of the first record produced by
      the reader; tped: one array per .tfam file holding '<id>_a' and
      '<id>_b' for every line, where <id> is the first column
    - `nSNPs` - Number of SNPs kept
    - `snpPos` - Integer positions of the kept SNPs
    - `snpNames` - Names of the kept SNPs
    """
    nFiles = len(files)
    if fileType == 'beagle':
        reader = fileReader.concurrentFileReader(*files)
        # next() works on Python 2.6+ and Python 3; .next() is Python 2 only
        subjects = next(reader)[0]
    elif fileType == 'tped':
        subjects = []
        for name in files:
            fp = fileReader.openfile(name.replace('.tped', '.tfam'))
            try:
                ids = [line.split(None, 1)[0] for line in fp]
            finally:
                # The handle is only needed for the ids; do not leak it
                fp.close()
            # Two entries per line of the .tfam file, suffixed _a and _b
            subjects.append(np.asarray(
                [sid + suffix for sid in ids for suffix in ('_a', '_b')]))
        reader = fileReader.concurrentFileReader(*files,
                                                 nHeaders=0,
                                                 key=[0, 1],
                                                 nLabels=4)
    else:
        sys.stderr.write('ERROR: Filetype has to be either beagle or tped')
        # Non-zero status so calling scripts can detect the failure
        sys.exit(1)

    snpNames = []
    snpPos = []
    pops = [[] for i in range(nFiles)]
    for labels, genotypes in reader:
        if fileType == 'tped':
            if chrom is not None and chrom != labels[0]:
                continue
            # tped label columns: name is column 1, position is column 3
            name, pos = labels[1], labels[3]
        else:
            name, pos = labels[0], labels[1]
        snpNames.append(name)
        snpPos.append(int(pos))
        for i in range(nFiles):
            pops[i].append(genotypes[i])
    nSNPs = len(snpNames)
    # A real list (not map()) so the result survives being iterated twice
    pops = [np.asarray(pop) for pop in pops]
    nPops = [pop.shape[1] for pop in pops]
    return pops, nPops, subjects, nSNPs, snpPos, snpNames
Ejemplo n.º 11
0
 pylab.savefig('fig2.'+FILETYPE,format=FILETYPE) 
 
 ################################
 # Figure 3 - Simulated Qatari
 ################################
 # Load the population labels and look up one POPCOLORS entry per label.
 simQatarPops=np.load('data/simulatedQatar.populations.npy')
 colors=[POPCOLORS[label] for label in simQatarPops]
 # Estimated classes and posteriors; the first array is unpacked below as
 # (windows, subjects).
 simQatarAncestry=np.loadtxt('data/simulatedQatar.admixedClass.csv')
 simQatarAncestryP=np.loadtxt('data/simulatedQatar.posterior.csv')
 nsimQatarWins, nsimQatarSubs=simQatarAncestry.shape
 # Colour image with axes swapped to (subjects, windows, 4): each cell gets
 # the colour of the class estimated for that window and subject.
 simQatarColors=np.zeros((nsimQatarSubs,nsimQatarWins,4))
 for i in range(nsimQatarWins):
     for j in range(nsimQatarSubs):
         simQatarColors[j,i,:]=colors[int(simQatarAncestry[i,j])]
 # presumably POPCOLORS holds 0-255 channel values - TODO confirm
 simQatarColors=simQatarColors/255.
 # True origins: skip the first line and the first two tab-separated columns.
 # NOTE(review): np.int is only an alias of int and was removed in NumPy 1.24.
 simQatarCorrect=np.asarray([l.strip().split('\t')[2:] for l in fileReader.openfile('data/hgdp3/admixed_hgdp_origin_yoruba_bedouin_brahui.chr1.csv.gz').readlines()[1:]], np.int)
 # Replace the codes 0, 1, 2 with the index of the matching population label.
 # NOTE(review): the replacement is sequential and in place, so if an earlier
 # target index equals a later code (1 or 2) those cells are rewritten again.
 simQatarCorrect[simQatarCorrect==0]=np.nonzero(simQatarPops==['yoruba'])[0][0]
 simQatarCorrect[simQatarCorrect==1]=np.nonzero(simQatarPops==['bedouin'])[0][0]
 simQatarCorrect[simQatarCorrect==2]=np.nonzero(simQatarPops==['brahui'])[0][0]
 # From here on the two size variables describe the truth array instead.
 nsimQatarWins, nsimQatarSubs=simQatarCorrect.shape
 simQatarCorrectColors=np.zeros((nsimQatarSubs,nsimQatarWins,4))
 for i in range(nsimQatarWins):
     for j in range(nsimQatarSubs):
         simQatarCorrectColors[j,i,:]=colors[int(simQatarCorrect[i,j])]
 simQatarCorrectColors=simQatarCorrectColors/255.
 # Repeat every estimated row 200 times and trim to the number of truth rows
 # (200 is presumably the window size - TODO confirm).
 comparison=np.repeat(simQatarAncestry, 200, 0)[:nsimQatarWins,:]
 # Percentage of rows, per column, where the estimate equals the truth.
 success=100*(comparison==simQatarCorrect).sum(0)/float(nsimQatarWins)
 print 'Correct %0.3g +/- %0.2g' %(np.mean(success), np.std(success))
 #Convert to fuzzy correct
 yoruba=pylab.find(simQatarPops==('yoruba'))[0]
 for tmpPop in ['mandenka', 'bantu_n.e.', 'biaka_pygmies', 'mbuti_pygmies']:
Ejemplo n.º 12
0
 ################################
 # Figure 3 - Simulated Qatari
 ################################
 # Load the population labels and look up one POPCOLORS entry per label.
 simQatarPops = np.load('data/simulatedQatar.populations.npy')
 colors = [POPCOLORS[label] for label in simQatarPops]
 # Estimated classes and posteriors; the first array is unpacked below as
 # (windows, subjects).
 simQatarAncestry = np.loadtxt('data/simulatedQatar.admixedClass.csv')
 simQatarAncestryP = np.loadtxt('data/simulatedQatar.posterior.csv')
 nsimQatarWins, nsimQatarSubs = simQatarAncestry.shape
 # Colour image with axes swapped to (subjects, windows, 4): each cell gets
 # the colour of the class estimated for that window and subject.
 simQatarColors = np.zeros((nsimQatarSubs, nsimQatarWins, 4))
 for i in range(nsimQatarWins):
     for j in range(nsimQatarSubs):
         simQatarColors[j, i, :] = colors[int(simQatarAncestry[i, j])]
 # presumably POPCOLORS holds 0-255 channel values - TODO confirm
 simQatarColors = simQatarColors / 255.
 # True origins: skip the first line and the first two tab-separated columns.
 # NOTE(review): np.int is only an alias of int and was removed in NumPy 1.24.
 simQatarCorrect = np.asarray([
     l.strip().split('\t')[2:] for l in fileReader.openfile(
         'data/hgdp3/admixed_hgdp_origin_yoruba_bedouin_brahui.chr1.csv.gz'
     ).readlines()[1:]
 ], np.int)
 # Replace the codes 0, 1, 2 with the index of the matching population label.
 # NOTE(review): the replacement is sequential and in place, so if an earlier
 # target index equals a later code (1 or 2) those cells are rewritten again.
 simQatarCorrect[simQatarCorrect == 0] = np.nonzero(
     simQatarPops == ['yoruba'])[0][0]
 simQatarCorrect[simQatarCorrect == 1] = np.nonzero(
     simQatarPops == ['bedouin'])[0][0]
 simQatarCorrect[simQatarCorrect == 2] = np.nonzero(
     simQatarPops == ['brahui'])[0][0]
 # From here on the two size variables describe the truth array instead.
 nsimQatarWins, nsimQatarSubs = simQatarCorrect.shape
 simQatarCorrectColors = np.zeros((nsimQatarSubs, nsimQatarWins, 4))
 for i in range(nsimQatarWins):
     for j in range(nsimQatarSubs):
         simQatarCorrectColors[j, i, :] = colors[int(simQatarCorrect[i, j])]
 simQatarCorrectColors = simQatarCorrectColors / 255.
 # Repeat every estimated row 200 times and trim to the number of truth rows
 # (200 is presumably the window size - TODO confirm).
 comparison = np.repeat(simQatarAncestry, 200, 0)[:nsimQatarWins, :]
Ejemplo n.º 13
0
    # pylab.plot(np.vstack(allAdmixedClass).mean(0), range(nPygmy)); pylab.ylim(nPygmy, 0); pylab.yticks([]); pylab.title('Percent Pygmy'); pylab.xlim(0,1); pylab.xticks([0,.5,1], fontsize=8)
    # pylab.plot(alphas, range(nPygmy)); pylab.ylim(nPygmy, 0); pylab.yticks([]); pylab.title('Percent Pygmy', fontsize=8); pylab.xlim(0,1); pylab.xticks([0,.5,1])
    #Colorbar
    pylab.axes([0.88, .08, .06, .9])
    pylab.xlim(-1, 1)
    pylab.ylim(-1, 1)
    pylab.axis('off')
    cbar = pylab.colorbar(fraction=.1, ticks=[-1, -0.5, 0, 0.5, 1])
    cbar.ax.set_yticklabels(['1.0 Bantu', '0.5', '0', '0.5', '1.0 Pygmy'],
                            fontsize=6)
    pylab.suptitle('Chromosomes')

# If simulated data calculate Success Rate
if locals().get('correctFile'):
    # Truth table: the first line and the first four whitespace-separated
    # columns of every following line are skipped.
    correctFp = fileReader.openfile(correctFile)
    try:
        # Builtin float: np.float was only an alias, removed in NumPy 1.24
        correct = np.array([
            l.split()[4:] for l in correctFp.readlines()[1:]
        ], float)
    finally:
        # The original never closed this handle
        correctFp.close()
    # Repeat every window-level row winSize times so it lines up with correct
    svmClass = np.repeat(admixedClassPre, winSize, 0)
    hmmClass = np.repeat(admixedClass, winSize, 0)
    # 100 minus the summed absolute difference to the truth, as a percentage
    # of the number of truth rows.
    # NOTE(review): this is an accuracy only if classes are coded 0/1, and it
    # is per column only if `sum` is the builtin - confirm both.
    svmSuccess = 100 - sum(
        abs(svmClass[:len(correct), :] - correct)) / len(correct) * 100
    hmmSuccess = 100 - sum(
        abs(hmmClass[:len(correct), :] - correct)) / len(correct) * 100
    # Single parenthesised argument: prints the same on Python 2 and 3
    print('Correct %0.3g +/- %0.2g (%0.3g +/- %0.2g)' % (
        np.mean(hmmSuccess), np.std(hmmSuccess), np.mean(svmSuccess),
        np.std(svmSuccess)))
    # pylab.figure();
    # pylab.subplot(2,1,1);
    # pylab.imshow(((admixedClass*2-1)*p).T, interpolation='nearest', cmap=pylab.cm.copper, vmin=0, vmax=2)
    # pylab.ylabel('Sample '); pylab.yticks([]); pylab.xticks([]); pylab.axis('tight'); pylab.title('Estimat')
    # pylab.subplot(2,1,2);
Ejemplo n.º 14
0
        pylab.draw()
        xStart+=(xWidth+0.002)
    # #Plot average Pygmy Ancestry per sample
    # pylab.axes([0.92, 0.29, .07, .52]) 
    # pylab.plot(np.vstack(allAdmixedClass).mean(0), range(nPygmy)); pylab.ylim(nPygmy, 0); pylab.yticks([]); pylab.title('Percent Pygmy'); pylab.xlim(0,1); pylab.xticks([0,.5,1], fontsize=8)
    # pylab.plot(alphas, range(nPygmy)); pylab.ylim(nPygmy, 0); pylab.yticks([]); pylab.title('Percent Pygmy', fontsize=8); pylab.xlim(0,1); pylab.xticks([0,.5,1])
    #Colorbar
    pylab.axes([0.88, .08, .06, .9]);  pylab.xlim(-1, 1); pylab.ylim(-1, 1); pylab.axis('off')
    cbar=pylab.colorbar(fraction=.1, ticks=[-1, -0.5,  0, 0.5, 1])
    cbar.ax.set_yticklabels(['1.0 Bantu', '0.5', '0', '0.5', '1.0 Pygmy'], fontsize=6)
    pylab.suptitle('Chromosomes')


# If simulated data calculate Success Rate
if locals().get('correctFile'):
    # Truth table: the first line and the first four whitespace-separated
    # columns of every following line are skipped.
    correctFp=fileReader.openfile(correctFile)
    try:
        # Builtin float: np.float was only an alias, removed in NumPy 1.24
        correct=np.array([l.split()[4:] for l in correctFp.readlines()[1:]], float)
    finally:
        # The original never closed this handle
        correctFp.close()
    # Repeat every window-level row winSize times so it lines up with correct
    svmClass=np.repeat(admixedClassPre, winSize, 0)
    hmmClass=np.repeat(admixedClass, winSize, 0)
    # 100 minus the summed absolute difference to the truth, as a percentage
    # of the number of truth rows.
    # NOTE(review): this is an accuracy only if classes are coded 0/1, and it
    # is per column only if `sum` is the builtin - confirm both.
    svmSuccess=100-sum(abs(svmClass[:len(correct),:]-correct))/len(correct)*100
    hmmSuccess=100-sum(abs(hmmClass[:len(correct),:]-correct))/len(correct)*100
    # Single parenthesised argument: prints the same on Python 2 and 3
    print('Correct %0.3g +/- %0.2g (%0.3g +/- %0.2g)' %(np.mean(hmmSuccess), np.std(hmmSuccess), np.mean(svmSuccess), np.std(svmSuccess)))
    # pylab.figure();
    # pylab.subplot(2,1,1);
    # pylab.imshow(((admixedClass*2-1)*p).T, interpolation='nearest', cmap=pylab.cm.copper, vmin=0, vmax=2)
    # pylab.ylabel('Sample '); pylab.yticks([]); pylab.xticks([]); pylab.axis('tight'); pylab.title('Estimat')
    # pylab.subplot(2,1,2);
    # pylab.imshow(correct[:, :].T, interpolation='nearest', cmap=pylab.cm.copper, vmin=0, vmax=2)
    # pylab.ylabel('Sample');pylab.yticks([]); pylab.xticks([])
    # pylab.xlabel('Position along %s' %CHROM);  pylab.axis('tight'); pylab.title('True ancestry')