def importFile(self):
    """Load a comma-separated genotype file from ``self.fgeno``.

    The first non-blank line is the header: it is split on ',' (after
    stripping surrounding commas) and handed to ``libMark.Mark``.  Every
    following line is ``animal,geno,geno,...``.  Each genotype field
    contributes two alleles: its first two characters, or the same
    character twice when the field is one character long.

    For every data line the alleles are passed to ``self.addGenotype`` and
    the animal is added to ``self.ped`` with the fixed arguments
    ``'0', '0', 'F0', '3'``.  ``self.ped.updateSex()`` is called once the
    whole file has been read.

    Writes a message to stderr and exits with status 1 when a genotype
    field is empty.
    """
    self.ped = libPed.Ped()
    header_pending = True
    for line in self.fgeno:
        record = line.strip()
        if not record:
            # A blank line would otherwise be stored as an animal named ''
            # with no genotypes.
            continue
        if header_pending:
            self.mark = libMark.Mark(record.strip(',').split(','))
            header_pending = False
            continue
        fields = record.split(',')
        animal = fields[0]
        genos = []
        for el in fields[1:]:
            if not el:
                sys.stderr.write('Error in importFile: %s\n' % (animal))
                sys.exit(1)
            if len(el) == 1:
                # A single character stands for a homozygous genotype.
                el = el + el
            genos.extend((el[0], el[1]))
        self.addGenotype(animal, genos)
        self.ped.addAnimal(animal, '0', '0', 'F0', '3')
    self.ped.updateSex()
def importFile(self):
    """Load a whitespace-separated genotype file from ``self.fgeno``.

    The first line starting with '#' is the header; its tokens are handed
    to ``libMark.Mark``.  Later '#' lines and blank lines are skipped.

    With ``n = len(self.mark)``, a data line must have one of these sizes:

    * ``n + 1`` or ``2n + 1`` fields: animal followed by genotypes; sire
      and dam are set to '0'.
    * ``n + 3`` or ``2n + 3`` fields: animal, sire, dam, then genotypes.

    Lines of any other size are reported on stdout and skipped.

    When a line carries exactly ``n`` genotype fields, each field is a
    combined code that is expanded into two alleles: one character (used
    twice), two characters, or 'DEL'/'INS' combined with one character or
    with another 'DEL'/'INS'.  'DEL' becomes 'D', 'INS' becomes 'I', and
    every other character goes through ``trans``.

    Each animal is added to ``self.ped`` and its alleles passed to
    ``self.addGenotype``; ``self.ped.updateSex()`` runs after the last line.

    Writes a message to stderr and exits with status 1 when a combined
    code cannot be expanded.
    """
    indel = {'DEL': 'D', 'INS': 'I'}

    def split_combined(code):
        """Return the two alleles of a combined code, or None if unknown."""
        size = len(code)
        if size == 1:
            return [trans(code), trans(code)]
        if size == 2:
            return [trans(code[0]), trans(code[1])]
        if size == 4:
            # One indel keyword plus one ordinary allele, in either order.
            if code[:3] in indel:
                return [indel[code[:3]], trans(code[3])]
            if code[1:] in indel:
                return [trans(code[0]), indel[code[1:]]]
        elif size == 6:
            if code[:3] in indel and code[3:] in indel:
                return [indel[code[:3]], indel[code[3:]]]
        return None

    self.ped = libPed.Ped()
    header_pending = True
    for line in self.fgeno:
        stripped = line.strip()
        if stripped.startswith('#'):
            if header_pending:
                self.mark = libMark.Mark(stripped.lstrip('#').split())
                header_pending = False
            continue
        fields = stripped.split()
        if not fields:
            # Skip blank lines instead of re-adding the previous animal.
            continue
        nmark = len(self.mark)
        if len(fields) in (nmark + 1, 2 * nmark + 1):
            animal, sire, dam, geno = fields[0], '0', '0', fields[1:]
        elif len(fields) in (nmark + 3, 2 * nmark + 3):
            animal, sire, dam, geno = fields[0], fields[1], fields[2], fields[3:]
        else:
            sys.stdout.write(
                'Found %d genotypes for %s, expected %d markers\n' %
                (len(fields) - 3, fields[0], nmark))
            continue
        # Fix files where the 2 alleles have been combined into one field.
        if len(geno) == nmark:
            expanded = []
            for code in geno:
                pair = split_combined(code)
                if pair is None:
                    # Dropping the code would shift every later marker.
                    sys.stderr.write(
                        'Error in importing, unknown allele: %s\n' % code)
                    sys.exit(1)
                expanded += pair
            geno = expanded
        self.ped.addAnimal(animal, dam, sire, 'F0', '3')
        self.addGenotype(animal, geno)
    self.ped.updateSex()
def importFile(self):
    """Refuse to import: this input format is not implemented.

    Writes a notice to standard output and terminates the program with
    exit status 1.  Nothing is read and no attribute of ``self`` is set.
    """
    sys.stdout.write("This option is not available at this point in time\n")
    sys.exit(1)
def importFile(self):
    """Load a long-format, comma-separated genotype file from ``self.fgeno``.

    Each data line has six fields; the first, second, third and fifth are
    used as animal, marker, genotype and the value stored in
    ``self.gcdict[animal][marker]``.  Lines starting with '#', lines with
    fewer than four fields and lines whose first field is 'Sample' are
    skipped.

    Genotype decoding:

    * 'NA' gives '0', '0'
    * one character gives that character twice
    * 'DEL' / 'INS' give 'D', 'D' / 'I', 'I'
    * 'x.y' gives x, y, with 'DEL'/'INS' shortened to 'D'/'I'
    * two characters give one allele each

    A later line for the same animal and marker overwrites the earlier
    one.  After reading, every marker is registered with
    ``self.mark.addMarker(marker, '99')`` and every animal is added to
    ``self.ped`` and ``self.addGenotype`` with its alleles in marker order.
    Markers and animals keep the order in which they first appear; a
    marker an animal has no line for is filled with '0', '0'.

    Writes a message to stderr and exits with status 1 on a line with
    4, 5 or more than 6 fields, or on a genotype that cannot be decoded.
    """
    self.ped = libPed.Ped()
    self.mark = libMark.Mark()
    self.gcdict = {}
    indel = {'DEL': 'D', 'INS': 'I'}
    genos = {}
    marker_order, known_markers = [], set()
    animal_order = []
    for line in self.fgeno:
        if line.startswith('#'):
            continue
        fields = line.strip().split(',')
        if len(fields) < 4:
            continue
        if len(fields) != 6:
            sys.stderr.write('Unexpected number of elements: %s \n' %
                             line.strip())
            sys.exit(1)
        animal, marker, a, gc = fields[0], fields[1], fields[2], fields[4]
        if animal == 'Sample':
            # Column-title row.
            continue
        if a == 'NA':
            a1, a2 = '0', '0'
        elif len(a) == 1:
            a1, a2 = a, a
        elif a in indel:
            a1 = a2 = indel[a]
        elif '.' in a:
            try:
                a1, a2 = a.split('.')
            except ValueError:
                sys.stderr.write('Unknown allele %s\n' % a)
                sys.exit(1)
            a1, a2 = indel.get(a1, a1), indel.get(a2, a2)
        elif len(a) == 2:
            a1, a2 = a[0], a[1]
        else:
            sys.stderr.write('Unknown allele %s\n' % a)
            sys.exit(1)
        if animal not in genos:
            genos[animal], self.gcdict[animal] = {}, {}
            animal_order.append(animal)
        if marker not in known_markers:
            known_markers.add(marker)
            marker_order.append(marker)
        genos[animal][marker] = [a1, a2]
        self.gcdict[animal][marker] = gc
    for marker in marker_order:
        self.mark.addMarker(marker, '99')
    missing = ['0', '0']
    for animal in animal_order:
        gen = []
        for marker in marker_order:
            # An animal without a line for this marker is coded as missing
            # rather than raising KeyError.
            gen += genos[animal].get(marker, missing)
        self.ped.addAnimal(animal, '0', '0', 'F0')
        self.addGenotype(animal, gen)
def pedcheck(options):
    """Compare genotypes of recorded parent-offspring pairs and report discords.

    Attributes read from ``options``:

    * ``pedigree``   - pedigree file; when empty the pedigree is taken from
      ``genofile`` via ``libGeno.extractPedigree``.
    * ``markers``    - marker file; when empty the markers are taken from
      ``genofile`` via ``libGeno.extractBglMark``.
    * ``genofile``   - genotype file: a header line with the marker names,
      then one line per animal with three leading fields followed by two
      alleles per marker.
    * ``genofile2``  - optional second genotype file, read with the marker
      order of the first file; its '#' lines are skipped.
    * ``pedlims``    - "old,new" or a single number used for both limits
      (discord percentages).
    * ``reportfile`` - optional output: a 'W1'/'W0' line for each sire/dam
      whose discord percentage exceeds the old limit, and an 'N<sex>' line
      for every other animal whose discord percentage against a flagged
      animal is at or below the new limit.
    * ``reportped``  - optional output: one line per evaluated
      parent-offspring pair, with '-1' as percentage when no marker was
      informative.

    A discord is a marker where the two coded genotypes differ by 2.  Only
    animals present in ``genofile`` are checked.  Animals with no recorded
    parents and no offspring are always sent to the candidate search.

    Cells of the genotype matrix that are never filled (animals without
    genotypes, markers missing from the marker list) hold NaN and are
    therefore not counted as informative.

    Exits with status 1 when ``markers.getMAlleles`` raises KeyError for a
    marker that ``markers.getMarkers()`` does contain.
    """
    if options.pedigree:
        pedigree = libPed.Ped(options.pedigree)
    else:
        sys.stdout.write("Gathering pedigree from data\n")
        pedigree = libGeno.extractPedigree(options.genofile)
    if options.markers:
        markers = libMark.Mark(options.markers)
    else:
        sys.stdout.write("Gathering markers from data\n")
        markers = libGeno.extractBglMark(options.genofile)

    if ',' in options.pedlims:
        lim = options.pedlims.split(',')
    else:
        lim = (options.pedlims, options.pedlims)
    oldLim, newLim = float(lim[0]), float(lim[1])

    animals = list(pedigree.getAnimals())
    row_of = {anim: idx for idx, anim in enumerate(animals)}
    hits = {}       # (animal, other) -> (discords, informative sites)
    newanims = {}   # animals that have a line in options.genofile
    ped_rows = []   # lines for options.reportped
    sep = '\t'
    report_fmt = '%s\t%s\t%d\t%d\t%.3f\t%s\n'

    fout = open(options.reportfile, 'w') if options.reportfile else None
    try:
        with open(options.genofile, 'r') as fin:
            marklist = next(fin).strip('#').strip().split()
            # Columns follow the header of the genotype file, so the matrix
            # is sized by it; NaN marks "no genotype".
            gen = np.full((len(pedigree), len(marklist)), np.nan)
            _fill_genotype_matrix(fin, marklist, markers, row_of, gen,
                                  newanims)
        if options.genofile2:
            with open(options.genofile2, 'r') as fin:
                _fill_genotype_matrix(fin, marklist, markers, row_of, gen)

        if fout:
            fout.write(
                '#ID\tparent\tdiscords\tinfo_sites\tdiscord%\tcategory_sex\n')
        for anim in animals:
            if anim not in newanims:
                continue
            mismatch = False
            sire, dam = pedigree.getSire(anim), pedigree.getDam(anim)
            for parent, sexcode in ((sire, '1'), (dam, '0')):
                if parent == '0' or parent not in row_of:
                    continue
                wrong, info = _count_discords(gen, row_of[parent],
                                              row_of[anim])
                if info == 0:
                    pct_text = '-1'
                else:
                    pct = wrong * 100.0 / info
                    pct_text = str(pct)
                    if pct > oldLim:
                        mismatch = True
                        if fout:
                            fout.write(report_fmt % (anim, parent, wrong,
                                                     info, pct,
                                                     'W' + sexcode))
                ped_rows.append(
                    sep.join((anim, parent, str(wrong), str(info), pct_text,
                              sexcode)) + '\n')
                # NOTE(review): the original nesting of this statement could
                # not be recovered from SOURCE; it is recorded for every
                # evaluated pair so the search below skips known parents.
                # Confirm it was not meant to run only on a mismatch.
                hits[anim, parent] = wrong, info
            if (sire == '0' and dam == '0'
                    and len(pedigree.getOffspring(anim)) == 0):
                mismatch = True
            # The search only produces report lines, so it is skipped when
            # no report file was requested.
            if mismatch and fout:
                # Search for better matches
                for anim2 in animals:
                    if anim2 == anim:
                        continue
                    if (anim, anim2) in hits or (anim2, anim) in hits:
                        # Pair already evaluated; do not report it again.
                        continue
                    wrong, info = _count_discords(gen, row_of[anim2],
                                                  row_of[anim])
                    hits[anim, anim2] = wrong, info
                    if info and wrong * 100.0 / info <= newLim:
                        fout.write(report_fmt %
                                   (anim, anim2, wrong, info,
                                    wrong * 100.0 / info,
                                    'N' + pedigree.getSex(anim2)))
    finally:
        if fout:
            fout.close()

    if ped_rows and options.reportped:
        with open(options.reportped, 'w') as fped:
            fped.write(
                '#ID\tparent\tdiscords\tinfo_sites\tdiscord%\tparent_sex\n' +
                ''.join(ped_rows))


def _fill_genotype_matrix(lines, marklist, markers, row_of, gen, seen=None):
    """Code the genotype lines in ``lines`` into the rows of ``gen``.

    ``marklist`` gives the marker of each column, ``row_of`` the row of each
    animal.  When ``seen`` is a dict, every animal read is recorded in it.
    Blank lines and lines starting with '#' are skipped.
    """
    for line in lines:
        fields = line.strip().split()
        if not fields or fields[0].startswith('#'):
            continue
        animal = fields[0]
        irow = row_of[animal]
        if seen is not None:
            seen[animal] = 1
        for icolumn, marker in enumerate(marklist):
            # Alleles start after the three leading fields, two per marker.
            a1, a2 = fields[2 * icolumn + 3:2 * icolumn + 5]
            try:
                m1, m2 = markers.getMAlleles(marker)
            except KeyError:
                if marker in markers.getMarkers():
                    sys.stdout.write(
                        "ERROR! Failed to assign marker alleles %s\n" % marker)
                    sys.exit(1)
                sys.stdout.write("WARNING! Incomplete markerlist\n")
                # No alleles are known for this marker: leave the genotype
                # missing instead of reusing the previous marker's alleles.
                gen[irow, icolumn] = np.nan
                continue
            if m1 == m2 or '0' in m1 + m2:
                # Equal or missing marker alleles: every animal gets the
                # same code, so the marker can never produce a discord.
                gen[irow, icolumn] = 1
            else:
                gen[irow, icolumn] = tbase012(a1, a2, tbasenum(m1),
                                              tbasenum(m2))


def _count_discords(gen, row_a, row_b):
    """Return (discords, informative sites) for two rows of ``gen``.

    A discord is a column where the rows differ by exactly 2; a column is
    informative when the difference is not NaN.
    """
    res = gen[row_a, :] - gen[row_b, :]
    wrong = int(np.count_nonzero(np.abs(res) == 2))
    info = int(np.count_nonzero(~np.isnan(res)))
    return wrong, info