def buildTreeFromNewick(self):
    '''
    Reads in a Newick-formatted tree, strips out bootstraps,
    tokenizes it, and initiates tree building.

    Only the first line of self.config.primaryTreeFN is read.
    After the tree has been built, it is written to
    self.config.alignedPrimaryTreeFN with aligned tips.

    Returns a node instance: the root.
    '''

    utils.checkFileExistence(self.config.primaryTreeFN, 'Primary tree')
    with open(self.config.primaryTreeFN, 'r') as treeFile:
        treeString = treeFile.readline().strip()
    self.errAndLog('\n%sRead primary tree topology:\n %s\n\n' %
                   (utils.DASHES, self.config.primaryTreeFN))

    # Tokenization:
    #   a. strip out bootstraps: text within brackets
    #   b. split on any semantic token: [%s]
    #   c. but group to retain the tokens themselves: ()
    #   d. then drop empty tokens from splitting adjacent semantic tokens
    treeString = re.sub(r'\[.*?\]', '', treeString)
    treeList = re.split('([%s])' % self.config.newickSemanticTokenString,
                        treeString)

    # Compare by value, not identity: "is not ''" only works when the
    # interpreter happens to intern the empty string.
    treeDeque = deque(token for token in treeList if token)

    hasLengths = ':' in treeDeque  # determine whether tree has lengths
    # First argument is None -- presumably "no parent" for the root; confirm
    # against addChildSubtreeFromNewickDeque.
    root = self.addChildSubtreeFromNewickDeque(None, treeDeque, hasLengths)
    root.writeNewick(self.config.alignedPrimaryTreeFN, alignTips=True)

    return root
def generateResidList():
    '''
    Builds the list of research IDs to process.

    4 possibilities, checked in this order:
        residList specified at config instantiation
        -a -s RESID            -> a single research ID has been specified
        -i FILENAME.resid.txt  -> read research IDs from file
        -a                     -> return empty list to indicate no subsetting

    When reading from file, the first whitespace-delimited token of each
    line is taken as the ID and blank lines are skipped.

    Returns a list of research IDs (possibly empty).
    '''

    residList = list()

    if Sample.config.residList:
        residList = Sample.config.residList
        Sample.errAndLog('Research ID list supplied.\n' +
                         ' %8d resids (%d unique)\n\n' %
                         (len(residList), len(set(residList))))

    elif Sample.args.singleSampleID:
        resid = Customer.generateResid(Sample.args.singleSampleID)
        residList = [resid]
        Sample.errAndLog('Will call haplogroup for:\n %d\n\n' % resid)

    elif Sample.args.dataFN:
        utils.checkFileExistence(Sample.args.dataFN, 'Research IDs')
        Sample.errAndLog('Reading research IDs:\n %s\n' % Sample.args.dataFN)
        with open(Sample.args.dataFN, 'r') as residFile:
            for line in residFile:
                tokenList = line.split()
                if not tokenList:
                    # blank line (e.g. trailing newline at end of file):
                    # indexing [0] would raise IndexError
                    continue
                residList.append(Customer.generateResid(tokenList[0]))

        Sample.errAndLog(' %8d read\n' % len(residList))
        Sample.errAndLog(' %8d unique\n\n' % len(set(residList)))

    return residList
def parseIsoggTable(self):
    '''
    parses ISOGG table

    Reads the tab-delimited file self.config.isoggFN, skipping its header
    row. Each row is stripped field by field and must contain six fields
    (an empty second field, when present, is removed first); other rows
    are counted under 'badLines'. Corrections from self.isoggCorrectionDict
    are applied by SNP name, and self.checkIsoggRecord decides whether a
    record is dropped or retained. Retained records are passed to
    self.constructSNP; dropped markers that may still represent a node are
    handed to self.addDroppedMarkersToNodes once all rows have been read.

    Unless self.config.suppressOutputAndLog is set, retained and dropped
    records are also written to self.config.cleanedIsoggFN and
    self.config.droppedIsoggFN, respectively.

    All opened files are closed even if processing raises.
    '''

    utils.checkFileExistence(self.config.isoggFN, 'Isogg')

    isoggInFile = None
    isoggOutFile = None
    isoggDropOutFile = None
    droppedMarkerList = list()

    try:
        # input reader
        isoggInFile = open(self.config.isoggFN, 'r')
        isoggReader = csv.reader(isoggInFile, delimiter='\t')
        next(isoggReader)  # ignore header

        # output file handles: left as None when output is suppressed
        if not self.config.suppressOutputAndLog:
            isoggOutFile = open(self.config.cleanedIsoggFN, 'w')
            isoggDropOutFile = open(self.config.droppedIsoggFN, 'w')

        for lineList in isoggReader:
            self.isoggCountsDict['read'] += 1

            # clean up data row and extract values
            lineList = [element.strip() for element in lineList]
            # when present, remove extra tab after snp name; the length
            # guard keeps empty or one-field rows from raising IndexError
            # so that they are counted as bad lines below
            if len(lineList) > 1 and lineList[1] == '':
                del lineList[1]
            if len(lineList) != 6:
                self.isoggCountsDict['badLines'] += 1
                continue
            name, haplogroup, _, _, position, mutation = lineList

            # apply corrections
            if name in self.isoggCorrectionDict:
                haplogroup, position, mutation = self.isoggCorrectionDict[name]
                self.numSNPsCorrected += 1

            # identify markers to drop
            recordIsBad, markerIsOkToRepresentNode = \
                self.checkIsoggRecord(name, haplogroup, position, mutation)
            if recordIsBad:
                self.isoggCountsDict['dropped'] += 1
                if isoggDropOutFile:
                    isoggDropOutFile.write('%-10s %-25s %8s %s\n' %
                                           (name, haplogroup, position, mutation))
                if markerIsOkToRepresentNode:
                    droppedMarkerList.append(DroppedMarker(name, haplogroup))
                continue

            # process retained SNPs
            self.isoggCountsDict['retained'] += 1
            position = int(position)
            if isoggOutFile:
                isoggOutFile.write('%-10s %-25s %8d %s\n' %
                                   (name, haplogroup, position, mutation))
            self.constructSNP(name, haplogroup, position, mutation)

        self.addDroppedMarkersToNodes(droppedMarkerList)
    finally:
        # the list may contain None entries, as it already could when
        # output was suppressed
        utils.closeFiles([isoggInFile, isoggOutFile, isoggDropOutFile])
def readPreferredSNPnameSet(self):
    '''
    Loads the file of widely known SNP names, one name per line, into
    self.preferredSNPnameSet. Presence on this list is the primary
    selection criterion for SNP labels.
    '''

    namesFN = self.config.preferredSNPnamesFN
    utils.checkFileExistence(namesFN, 'Preferred SNP names')

    with open(namesFN, 'r') as namesFile:
        for rawLine in namesFile:
            snpName = rawLine.strip()
            self.preferredSNPnameSet.add(snpName)

    # log a header line followed by the size of the set and its source file
    headerText = '%sRead preferred SNP names\n' % utils.DASHES
    countText = '%6d SNP names: %s\n\n' % (len(self.preferredSNPnameSet), namesFN)
    self.errAndLog(headerText + countText)
def importPrevCalledHaplogroups():
    '''
    Reads the file of previously called haplogroups
    (Sample.config.prevCalledHgFN) into Sample.prevCalledHaplogroupDict,
    assuming first col = ID & last col = haplogroup.

    Lines are split on whitespace; blank lines are skipped.
    '''

    utils.checkFileExistence(Sample.config.prevCalledHgFN,
                             'Previously called haplogroups')

    with open(Sample.config.prevCalledHgFN, 'r') as prevCalledHgFile:
        for line in prevCalledHgFile:
            lineList = line.split()
            if not lineList:
                # blank line: indexing would raise IndexError
                continue
            ID, prevCalledHaplogroup = lineList[0], lineList[-1]
            Sample.prevCalledHaplogroupDict[ID] = prevCalledHaplogroup

    Sample.errAndLog('%sRead previously called haplogroups:\n %s\n\n' %
                     (utils.DASHES, Sample.config.prevCalledHgFN))
def buildCustomerTupleListFromFile():
    '''
    Reads Sample.args.dataFN and returns one CustomerTuple per line.

    The file has two whitespace-separated columns:
        column 1: ID
        column 2: comma-separated list of platforms for this individual

    example: Sample314159 1,2,5

    Exits the program if any line does not have exactly two columns.
    '''

    dataFN = Sample.args.dataFN
    utils.checkFileExistence(dataFN, 'Sample IDs')
    Sample.errAndLog('Reading sample IDs:\n %s\n' % dataFN)

    customerTupleList = []
    uniqueIDs = set()

    with open(Sample.args.dataFN, 'r') as idFile:
        for row in idFile:
            fields = row.strip().split()
            if len(fields) != 2:
                sys.exit(
                    'ERROR. When specifying non-default ablock dataset,\n' +
                    'ID file must have 2 columns: ID, comma-separated list of integers\n' +
                    'indicating platform versions.\n')

            ID, platformVersions = fields
            uniqueIDs.add(ID)

            tupleKwargsDict = {
                'resid': ID,
                # previous call; not needed
                'y_haplogroup': Sample.config.missingHaplogroup,
            }

            # one boolean flag per platform version in
            # [1, maxPlatformVersionPlusOne)
            versionsPresent = set(int(v) for v in platformVersions.split(','))
            tupleKwargsDict.update(
                ('is_v%d' % version, version in versionsPresent)
                for version in xrange(1, Sample.config.maxPlatformVersionPlusOne))

            customerTupleList.append(
                Sample.config.CustomerTuple(**tupleKwargsDict))

    Sample.errAndLog(' %8d read\n' % len(customerTupleList))
    Sample.errAndLog(' %8d unique\n\n' % len(uniqueIDs))

    return customerTupleList
def buildPageDict():
    '''
    Populates Node.pageList and Node.pageDict with 23andMe content pages
    read from Node.config.pagesFN.

    The first line of the file is a header. Every other line must hold
    exactly two whitespace-separated values: yccOld and snpName. Each line
    yields a Page that is appended to Node.pageList. The page is also
    stored in Node.pageDict, keyed by the root haplogroup when yccOld
    equals Node.config.rootHaplogroup, and otherwise by snpName unless
    snpName is '.'.

    pagesFN comes from these two gdocs:
    - https://docs.google.com/spreadsheets/d/1mf86slweZEKUd5hzG2GmKGTGIpHuDipJz2u221y2zVE/edit?ts=568eb997#gid=0
    - https://docs.google.com/spreadsheets/d/1oo0sRmYFNeWikuOxcb_1obOoO35wQccmOzyGRmqDMtc/edit?ts=578578d0#gid=362797346
    '''

    utils.checkFileExistence(Node.config.pagesFN, 'Content pages')

    with open(Node.config.pagesFN, 'r') as pagesFile:
        pagesFile.readline()  # header
        for row in pagesFile:
            yccOld, snpName = row.strip().split()
            page = Page(yccOld, snpName)
            Node.pageList.append(page)

            # choose the dictionary key, if any, for this page
            if yccOld == Node.config.rootHaplogroup:
                key = Node.config.rootHaplogroup
            elif snpName != '.':
                key = snpName
            else:
                continue
            Node.pageDict[key] = page
def readRepresentativeSNPnameSet(self):
    '''
    Reads the names of SNPs deemed representative for their respective
    lineages and stores their union in self.representativeSNPnameSet.

    Two files are read; in both, the second whitespace-delimited column
    is used:
        self.config.isoggRepSNPfn: '.' means no representative SNP;
            otherwise a comma-separated list of SNPs, each of which may
            be a slash-separated list of aliases
        self.config.otherRepSNPfn: a single SNP name

    A summary of the counts is logged.
    '''

    isoggRepSNPfn = self.config.isoggRepSNPfn
    otherRepSNPfn = self.config.otherRepSNPfn

    numLines = 0
    numHaplogroups = 0
    numSNPs = 0
    isoggNameSet = set()

    utils.checkFileExistence(isoggRepSNPfn, 'First representative SNPs')
    with open(isoggRepSNPfn, 'r') as isoggRepSNPfile:
        for row in isoggRepSNPfile:
            numLines += 1
            snpAliasesString = row.strip().split()[1]
            if snpAliasesString == '.':
                continue

            numHaplogroups += 1
            aliasGroupList = snpAliasesString.split(',')
            numSNPs += len(aliasGroupList)
            for aliasGroup in aliasGroupList:
                isoggNameSet.update(aliasGroup.split('/'))

    utils.checkFileExistence(otherRepSNPfn, 'Second representative SNPs')
    with open(otherRepSNPfn, 'r') as otherRepSNPfile:
        otherNameSet = set(row.strip().split()[1] for row in otherRepSNPfile)

    self.representativeSNPnameSet = isoggNameSet | otherNameSet

    messageParts = [
        'Read representative SNPs\n',
        '%6d haplogroups in: %s\n' % (numLines, isoggRepSNPfn),
        '%6d haplogroups with at least one ISOGG-designated representative SNP\n'
        % numHaplogroups,
        '%6d SNPs, as some haplogroups have more than one representative\n'
        % numSNPs,
        '%6d SNP names, including aliases\n' % len(isoggNameSet),
        '%6d additional representative SNPs read from: %s\n'
        % (len(otherNameSet), otherRepSNPfn),
        '%6d total SNP names\n\n' % len(self.representativeSNPnameSet),
    ]
    self.errAndLog(''.join(messageParts))