Example #1
0
    def importFile(self):
        """Load a comma-separated genotype file into ``self.ped`` and ``self.mark``.

        The first line of ``self.fgeno`` is the header: leading/trailing commas
        are stripped, the rest is split on ',' and handed to ``libMark.Mark``.
        Every following non-blank line is ``animal,call,call,...``. Each call
        contributes two alleles: a one-character call is doubled, otherwise the
        first two characters are used (any further characters are ignored).

        Each animal is registered through ``self.addGenotype`` and added to the
        pedigree with parents '0'/'0', family 'F0' and sex code '3';
        ``self.ped.updateSex()`` is called once after the whole file is read.

        An empty call writes an error naming the animal to stderr and exits
        with status 1.
        """
        self.ped = libPed.Ped()
        firstline = True
        for line in self.fgeno:
            l = line.strip()
            if firstline:
                self.mark = libMark.Mark(l.strip(',').split(','))
                firstline = False
                continue
            if not l:
                # A blank line would otherwise be registered as an animal
                # named '' with an empty genotype list.
                continue
            fields = l.split(',')
            animal, genos = fields[0], []
            for el in fields[1:]:
                if len(el) == 0:
                    sys.stderr.write('Error in importFile: %s\n' % (animal))
                    sys.exit(1)
                if len(el) == 1:
                    # Single character: both alleles are the same.
                    el = el + el
                genos.append(el[0])
                genos.append(el[1])
            self.addGenotype(animal, genos)
            self.ped.addAnimal(animal, '0', '0', 'F0', '3')
        self.ped.updateSex()
Example #2
0
    def importFile(self):
        """Load a whitespace-separated genotype file into ``self.ped``/``self.mark``.

        Lines starting with '#' are comments; the first of them is the header
        whose tokens are passed to ``libMark.Mark``. A data line is either
        ``animal calls...`` or ``animal <col2> <col3> calls...`` (columns 2 and
        3 are read as sire and dam), with one combined call or two separate
        alleles per marker. Lines with any other field count are reported on
        stdout and skipped; blank lines are skipped silently.

        When there is exactly one call per marker, each call is split into two
        alleles: single characters are doubled, two characters are split, and
        'DEL'/'INS' become 'D'/'I'. Single-character alleles go through the
        module-level ``trans`` helper (defined elsewhere). A call that cannot
        be split writes an error to stderr and exits with status 1.
        """
        indelPairs = {
            'DELDEL': ['D', 'D'],
            'DELINS': ['D', 'I'],
            'INSDEL': ['I', 'D'],
            'INSINS': ['I', 'I'],
        }

        def splitCall(call):
            """Return the two allele codes encoded in one combined call."""
            if len(call) == 1:
                return [trans(call), trans(call)]
            if len(call) == 2:
                return [trans(call[0]), trans(call[1])]
            if len(call) == 4:  # 'DEL' or 'INS' next to a one-character allele
                if call[:3] == 'DEL': return ['D', trans(call[3])]
                if call[1:] == 'DEL': return [trans(call[0]), 'D']
                if call[:3] == 'INS': return ['I', trans(call[3])]
                if call[1:] == 'INS': return [trans(call[0]), 'I']
            elif call in indelPairs:
                return list(indelPairs[call])
            # Any other shape used to be dropped silently, which shifted every
            # following allele onto the wrong marker; fail loudly instead.
            sys.stderr.write('Error in importing, unknown allele: %s\n' % call)
            sys.exit(1)

        self.ped = libPed.Ped()
        firstline = True
        for line in self.fgeno:
            stripped = line.strip()
            if stripped.startswith('#'):
                if firstline:
                    # lstrip on the stripped text so an indented header still
                    # loses its '#'.
                    self.mark = libMark.Mark(stripped.lstrip('#').split())
                    firstline = False
                continue
            l = stripped.split()
            if not l:
                # Blank line: nothing to import. Falling through here would
                # re-add the previous animal (or fail on undefined names).
                continue
            nmark = len(self.mark)
            if len(l) in (nmark + 1, nmark * 2 + 1):
                animal, sire, dam, geno = l[0], '0', '0', l[1:]
            elif len(l) in (nmark + 3, nmark * 2 + 3):
                animal, sire, dam, geno = l[0], l[1], l[2], l[3:]
            else:
                sys.stdout.write(
                    'Found %d genotypes for %s, expected %d markers\n' %
                    (len(l) - 3, l[0], len(self.mark)))
                continue
            # One call per marker means the two alleles were combined.
            if len(geno) == nmark:
                newgeno = []
                for a in geno:
                    newgeno.extend(splitCall(a))
                geno = newgeno
            self.ped.addAnimal(animal, dam, sire, 'F0', '3')
            self.addGenotype(animal, geno)
            # NOTE(review): updateSex() runs once per imported animal here;
            # if it only needs the finished pedigree it could move after the
            # loop - confirm against libPed before changing.
            self.ped.updateSex()
Example #3
0
 def importFile(self):
     """Abort: importing is not implemented for this input format.

     Writes a one-line notice to stdout and terminates the process with
     exit status 1; this method never returns.
     """
     # The statements that used to follow sys.exit() were unreachable and
     # referenced names that were never defined, so they have been removed.
     sys.stdout.write("This option is not available at this point in time\n")
     sys.exit(1)
Example #4
0
 def importFile(self):
     """Load a long-format, comma-separated genotype report.

     Each data line of ``self.fgeno`` must have six comma-separated fields;
     fields 1-3 are read as animal, marker and allele call, and field 5 is
     stored per animal/marker in ``self.gcdict``. Lines starting with '#',
     lines with fewer than four fields and lines whose first field is
     'Sample' are skipped. Any other field count is a fatal error.

     Allele calls are converted to two alleles: 'NA' -> '0','0'; a single
     character is doubled; 'DEL'/'INS' -> 'D'/'I' (doubled); 'x.y' is split
     on the dot with DEL/INS abbreviated; two characters are split. Anything
     else writes 'Unknown allele' to stderr and exits with status 1.

     After reading, every marker seen is added to ``self.mark`` with the
     value '99', and every animal is added to ``self.ped`` with parents
     '0'/'0' and family 'F0', with its alleles registered through
     ``self.addGenotype``. A marker an animal has no line for is filled with
     the missing code '0','0'.
     """
     self.ped = libPed.Ped()
     self.mark = libMark.Mark()
     marks = {}
     genos = {}
     self.gcdict = {}
     for line in self.fgeno:
         if line.startswith('#'): continue
         l = line.strip().split(',')
         if len(l) < 4:
             continue
         if len(l) != 6:
             sys.stderr.write('Unexpected number of elements: %s \n' %
                              line.strip())
             sys.exit(1)
         animal, marker, a, gc = l[0], l[1], l[2], l[4]
         if animal == 'Sample': continue  # column-header row
         if a == 'NA': a1, a2 = '0', '0'
         elif len(a) == 1: a1, a2 = a, a
         elif a == 'DEL': a1, a2 = 'D', 'D'
         elif a == 'INS': a1, a2 = 'I', 'I'
         elif '.' in a:
             try:
                 a1, a2 = a.split('.')
             except ValueError:
                 # More than one dot: not a two-allele call.
                 sys.stderr.write('Unknown allele %s\n' % a)
                 sys.exit(1)
             if a2 == 'DEL': a2 = 'D'
             if a1 == 'DEL': a1 = 'D'
             if a2 == 'INS': a2 = 'I'
             if a1 == 'INS': a1 = 'I'
         elif len(a) == 2: a1, a2 = a[0], a[1]
         else:
             sys.stderr.write('Unknown allele %s\n' % a)
             sys.exit(1)
         if animal not in genos: genos[animal], self.gcdict[animal] = {}, {}
         if marker not in marks: marks[marker] = 1
         genos[animal][marker] = [a1, a2]
         self.gcdict[animal][marker] = gc
     for mark in marks:
         self.mark.addMarker(mark, '99')
     for animal in genos:
         gen = []
         for mark in marks:
             # Not every animal has a line for every marker; use the same
             # missing code as an 'NA' call instead of raising KeyError.
             gen.extend(genos[animal].get(mark, ['0', '0']))
         self.ped.addAnimal(animal, '0', '0', 'F0')
         self.addGenotype(animal, gen)
Example #5
0
def _pedcheckLoadGenotypes(fin, marklist, markers, rowindex, gen, seen=None):
    """Fill rows of ``gen`` from the genotype lines remaining in ``fin``.

    Each data line is ``animal <col2> <col3> a1 a2 a1 a2 ...`` with one allele
    pair per entry of ``marklist``. Comment ('#') and blank lines are skipped.
    When ``seen`` is a dict, every animal read is recorded in it.
    """
    for line in fin:
        if line.startswith('#'): continue
        l = line.strip().split()
        if not l: continue
        irow = rowindex[l[0]]
        if seen is not None:
            seen[l[0]] = 1
        for icolumn, marker in enumerate(marklist):
            a1, a2 = l[2 * icolumn + 3:2 * icolumn + 5]
            try:
                m1, m2 = markers.getMAlleles(marker)
            except KeyError:
                if marker not in markers.getMarkers():
                    print("WARNING! Incomplete markerlist")
                    # No alleles known for this marker: give it the '0'
                    # alleles that the check below treats as uninformative
                    # (previously m1/m2 were left unbound or stale here).
                    m1, m2 = '0', '0'
                else:
                    print("ERROR! Failed to assign marker alleles %s" % marker)
                    sys.exit(1)
            if m1 == m2 or '0' in m1 + m2:
                # Monomorphic or unknown marker alleles: code 1 can never be
                # two steps away from another genotype, so no discord.
                gen[irow, icolumn] = 1
            else:
                gen[irow, icolumn] = tbase012(a1, a2, tbasenum(m1),
                                              tbasenum(m2))


def _pedcheckCompare(gen, rowindex, anim, other):
    """Return (discordant sites, informative sites) between two animals.

    A site is discordant when the two coded genotypes differ by exactly 2,
    and informative when the difference is not NaN.
    """
    res = gen[rowindex[other], :] - gen[rowindex[anim], :]
    wrong = int(np.count_nonzero(np.abs(res) == 2))
    info = int(np.count_nonzero(~np.isnan(res)))
    return wrong, info


def _pedcheckParent(anim, parent, code, gen, rowindex, limit, hits, outlines,
                    fout):
    """Check one recorded parent; return True if discord% exceeds ``limit``.

    Appends one line to ``outlines`` for the pair, stores the counts in
    ``hits`` and, when the limit is exceeded and ``fout`` is open, writes a
    'W' + code record to it.
    """
    wrong, info = _pedcheckCompare(gen, rowindex, anim, parent)
    if info == 0:
        percent = '-1'  # nothing to compare on
    else:
        percent = str(wrong * 100.0 / info)
    outlines.append('\t'.join(
        [anim, parent, str(wrong), str(info), percent, code]) + '\n')
    mismatch = info != 0 and wrong * 100.0 / info > limit
    if mismatch and fout:
        fout.write('%s\t%s\t%d\t%d\t%.3f\t%s\n' %
                   (anim, parent, wrong, info, wrong * 100.0 / info,
                    'W' + code))
    hits[anim, parent] = wrong, info
    return mismatch


def pedcheck(options):
    """Check recorded parents against genotypes and suggest alternatives.

    Reads ``options.pedigree`` / ``options.markers`` when given, otherwise
    extracts both from ``options.genofile``. ``options.pedlims`` is either one
    number or 'old,new': a recorded parent whose discord percentage is above
    the first limit is flagged, and any other animal at or below the second
    limit is proposed as a candidate.

    Genotypes come from ``options.genofile`` (its first line is the marker
    header) and optionally ``options.genofile2``, which is read with the same
    marker order. Only animals present in ``options.genofile`` are checked.

    Output:
      * ``options.reportfile`` (if set): 'W1'/'W0' rows for flagged
        sires/dams and 'N' + sex rows for candidate parents.
      * ``options.reportped`` (if set and anything was compared): one row per
        animal/recorded-parent pair; discord% is -1 when there were no
        informative sites.

    The genotype matrix starts out as NaN, so pedigree animals without any
    genotype line have no informative sites instead of being compared as if
    every genotype were 0. It has one column per marker in the genotype-file
    header, which is what the column index is derived from.
    """
    #***** Check requirements and read data *****
    if options.pedigree:
        pedigree = libPed.Ped(options.pedigree)
    else:
        print("Gathering pedigree from data")
        pedigree = libGeno.extractPedigree(options.genofile)
    if options.markers:
        markers = libMark.Mark(options.markers)
    else:
        print("Gathering markers from data")
        markers = libGeno.extractBglMark(options.genofile)
    checkAll = False  # hard-wired switch: True would skip the parent checks
    if ',' in options.pedlims: lim = options.pedlims.split(',')
    else: lim = (options.pedlims, options.pedlims)
    oldLim, newLim = float(lim[0]), float(lim[1])
    # Row index of every pedigree animal in the genotype matrix.
    r = dict((anim, i) for i, anim in enumerate(pedigree.getAnimals()))
    hits = {}
    outlines = []
    fout = None
    if options.reportfile: fout = open(options.reportfile, 'w')
    try:
        #****** Reading data, converting if needed
        newanims = {}
        with open(options.genofile, 'r') as fin:
            marklist = next(fin).strip('#').strip().split()
            gen = np.empty((len(pedigree), len(marklist)))
            gen.fill(np.nan)
            _pedcheckLoadGenotypes(fin, marklist, markers, r, gen, newanims)
        if options.genofile2:
            with open(options.genofile2, 'r') as fin:
                _pedcheckLoadGenotypes(fin, marklist, markers, r, gen)
        if fout:
            fout.write(
                '#ID\tparent\tdiscords\tinfo_sites\tdiscord%\tcategory_sex\n')
        for anim in pedigree.getAnimals():
            if anim not in newanims: continue
            mismatch = False
            sire, dam = pedigree.getSire(anim), pedigree.getDam(anim)
            # Sires are tagged '1' and dams '0' in both reports.
            for parent, code in ((sire, '1'), (dam, '0')):
                if parent == '0' or parent not in r or checkAll:
                    continue
                if _pedcheckParent(anim, parent, code, gen, r, oldLim, hits,
                                   outlines, fout):
                    mismatch = True
            if sire == '0' and dam == '0' and len(
                    pedigree.getOffspring(anim)) == 0:
                # Animal with no recorded relatives: look for candidates too.
                mismatch = True
            if not mismatch:
                continue
            # Search for better matches
            for anim2 in pedigree.getAnimals():
                if anim2 == anim: continue
                sex = pedigree.getSex(anim2)
                if (anim, anim2) in hits or (anim2, anim) in hits:
                    # Pair already evaluated (and reported if needed).
                    continue
                wrong, info = _pedcheckCompare(gen, r, anim, anim2)
                hits[anim, anim2] = wrong, info
                if info != 0 and wrong * 100.0 / info <= newLim and fout:
                    fout.write('%s\t%s\t%d\t%d\t%.3f\t%s\n' %
                               (anim, anim2, wrong, info,
                                wrong * 100.0 / info, 'N' + sex))
    finally:
        if fout: fout.close()
    if outlines and options.reportped:
        with open(options.reportped, 'w') as fped:
            fped.write(
                '#ID\tparent\tdiscords\tinfo_sites\tdiscord%\tparent_sex\n' +
                ''.join(outlines))