Exemple #1
0
    def buildTreeFromNewick(self):
        '''
        Reads in a Newick-formatted tree, strips out bootstraps,
        tokenizes it, and initiates tree building.

        Only the first line of self.config.primaryTreeFN is read.
        After the tree is built, root.writeNewick is called on
        self.config.alignedPrimaryTreeFN with alignTips=True.

        Returns a node instance: the root.
        '''

        utils.checkFileExistence(self.config.primaryTreeFN, 'Primary tree')
        with open(self.config.primaryTreeFN, 'r') as treeFile:
            treeString = treeFile.readline().strip()
        self.errAndLog('\n%sRead primary tree topology:\n    %s\n\n' % \
                       (utils.DASHES, self.config.primaryTreeFN))

        # Tokenization:
        #     a. strip out bootstraps: text within brackets
        #     b. split on any semantic token: [%s]
        #     c. but group to retain the tokens themselves: ()
        #     d. then drop empty tokens from splitting adjacent semantic tokens
        treeString = re.sub(r'\[.*?\]', '', treeString)
        treeList = re.split('([%s])' % self.config.newickSemanticTokenString, treeString)
        # Compare by value, not identity: "is not ''" only works when the
        # interpreter happens to intern the empty string.
        treeList = [token for token in treeList if token != '']
        treeDeque = deque(treeList)

        hasLengths = ':' in treeDeque  # determine whether tree has lengths
        root = self.addChildSubtreeFromNewickDeque(None, treeDeque, hasLengths)
        root.writeNewick(self.config.alignedPrimaryTreeFN, alignTips=True)

        return root
Exemple #2
0
    def generateResidList():
        '''
        Builds the list of research IDs to process.

        4 possibilities:
            residList specified at config instantiation
            -a -s RESID           -> a single research ID has been specified
            -i FILENAME.resid.txt -> read research IDs from file
            -a                    -> return empty list to indicate no subsetting

        When reading from file, the first whitespace-delimited token of
        each line is passed to Customer.generateResid. Blank lines are
        skipped rather than raising IndexError.

        Returns the list of research IDs (possibly empty).
        '''

        residList = list()
        if Sample.config.residList:
            residList = Sample.config.residList
            Sample.errAndLog('Research ID list supplied.\n' + \
                '    %8d resids (%d unique)\n\n' % (len(residList), len(set(residList))))
        elif Sample.args.singleSampleID:
            resid = Customer.generateResid(Sample.args.singleSampleID)
            residList = [resid]
            Sample.errAndLog('Will call haplogroup for:\n    %d\n\n' % resid)
        elif Sample.args.dataFN:
            utils.checkFileExistence(Sample.args.dataFN, 'Research IDs')
            Sample.errAndLog('Reading research IDs:\n    %s\n' % Sample.args.dataFN)
            with open(Sample.args.dataFN, 'r') as residFile:
                for line in residFile:
                    tokenList = line.split()
                    if not tokenList:
                        continue  # blank line (e.g. trailing newline at EOF)
                    residList.append(Customer.generateResid(tokenList[0]))

            Sample.errAndLog('    %8d read\n'     % len(residList))
            Sample.errAndLog('    %8d unique\n\n' % len(set(residList)))

        return residList
Exemple #3
0
    def parseIsoggTable(self):
        '''
        Parses the ISOGG table.

        Reads the tab-delimited file self.config.isoggFN, ignoring its
        first row as a header. For each data row:
            - fields are whitespace-stripped, and an empty second field
              (extra tab after the SNP name) is removed when present
            - rows that do not end up with exactly 6 fields are counted
              under isoggCountsDict['badLines'] and skipped
            - values from self.isoggCorrectionDict override the row's
              haplogroup, position, and mutation when the name is listed
            - self.checkIsoggRecord decides whether the record is dropped
              (counted, logged to the dropped file, and optionally kept
              as a DroppedMarker) or retained (counted, logged to the
              cleaned file, and passed to self.constructSNP)

        Dropped markers collected along the way are handed to
        self.addDroppedMarkersToNodes at the end. No output files are
        written when self.config.suppressOutputAndLog is set.
        '''

        # input file; output handles stay None unless output is enabled
        utils.checkFileExistence(self.config.isoggFN, 'Isogg')
        isoggInFile = open(self.config.isoggFN, 'r')
        isoggOutFile = None
        isoggDropOutFile = None

        # try/finally guarantees the handles are closed even if a row
        # raises partway through parsing
        try:
            isoggReader = csv.reader(isoggInFile, delimiter='\t')
            # ignore header; the default avoids StopIteration on an empty file
            next(isoggReader, None)

            if not self.config.suppressOutputAndLog:
                isoggOutFile = open(self.config.cleanedIsoggFN, 'w')
                isoggDropOutFile = open(self.config.droppedIsoggFN, 'w')

            droppedMarkerList = list()

            for lineList in isoggReader:
                self.isoggCountsDict['read'] += 1

                # clean up data row and extract values
                lineList = [element.strip() for element in lineList]
                # when present, remove extra tab after snp name; the length
                # guard lets empty/short rows fall through to the badLines check
                if len(lineList) > 1 and lineList[1] == '':
                    del lineList[1]
                if len(lineList) != 6:
                    self.isoggCountsDict['badLines'] += 1
                    continue
                name, haplogroup, _, _, position, mutation = lineList

                # apply corrections
                if name in self.isoggCorrectionDict:
                    haplogroup, position, mutation = self.isoggCorrectionDict[name]
                    self.numSNPsCorrected += 1

                # identify markers to drop
                recordIsBad, markerIsOkToRepresentNode = \
                    self.checkIsoggRecord(name, haplogroup, position, mutation)
                if recordIsBad:
                    self.isoggCountsDict['dropped'] += 1
                    if isoggDropOutFile is not None:
                        isoggDropOutFile.write('%-10s %-25s %8s %s\n' % \
                            (name, haplogroup, position, mutation))
                    if markerIsOkToRepresentNode:
                        droppedMarkerList.append(DroppedMarker(name, haplogroup))
                    continue

                # process retained SNPs
                self.isoggCountsDict['retained'] += 1
                position = int(position)
                if isoggOutFile is not None:
                    isoggOutFile.write('%-10s %-25s %8d %s\n' % \
                                       (name, haplogroup, position, mutation))
                self.constructSNP(name, haplogroup, position, mutation)

            self.addDroppedMarkersToNodes(droppedMarkerList)
        finally:
            utils.closeFiles([isoggInFile, isoggOutFile, isoggDropOutFile])
Exemple #4
0
 def readPreferredSNPnameSet(self):
     '''
     Loads the file self.config.preferredSNPnamesFN into
     self.preferredSNPnameSet, adding one whitespace-stripped entry
     per line, then logs the size of the set and the file name.

     This is the set of widely known SNP names; presence on this list
     is the primary selection criterion for SNP labels.
     '''

     namesFN = self.config.preferredSNPnamesFN
     utils.checkFileExistence(namesFN, 'Preferred SNP names')

     with open(namesFN, 'r') as namesFile:
         for rawLine in namesFile:
             self.preferredSNPnameSet.add(rawLine.strip())

     # assemble the two-part log message, then emit it in one call
     headerText = '%sRead preferred SNP names\n' % utils.DASHES
     countText = '%6d SNP names: %s\n\n' % \
         (len(self.preferredSNPnameSet), namesFN)
     self.errAndLog(headerText + countText)
Exemple #5
0
    def importPrevCalledHaplogroups():
        '''
        Reads file with previously called haplogroups,
        assuming first col = ID & last col = haplogroup.

        Each non-blank line of Sample.config.prevCalledHgFN is split on
        whitespace and stored as Sample.prevCalledHaplogroupDict[ID] =
        haplogroup. Blank lines are skipped rather than raising
        IndexError. A one-column line maps the ID to itself, since the
        first and last columns coincide.
        '''

        utils.checkFileExistence(Sample.config.prevCalledHgFN, 'Previously called haplogroups')
        with open(Sample.config.prevCalledHgFN, 'r') as prevCalledHgFile:
            for line in prevCalledHgFile:
                lineList = line.split()
                if not lineList:
                    continue  # blank line (e.g. trailing newline at EOF)
                ID, prevCalledHaplogroup = lineList[0], lineList[-1]
                Sample.prevCalledHaplogroupDict[ID] = prevCalledHaplogroup

        Sample.errAndLog('%sRead previously called haplogroups:\n    %s\n\n' % \
                       (utils.DASHES, Sample.config.prevCalledHgFN))
Exemple #6
0
    def buildCustomerTupleListFromFile():
        '''
        Reads Sample.args.dataFN and returns a list with one
        Sample.config.CustomerTuple per line.

        Expected file layout (two whitespace-separated columns):
            column 1: ID
            column 2: comma-separated list of platforms for this individual

        example:  Sample314159 1,2,5

        Each tuple receives the ID as 'resid',
        Sample.config.missingHaplogroup as 'y_haplogroup', and one
        boolean 'is_v<i>' field for every i from 1 up to (but excluding)
        Sample.config.maxPlatformVersionPlusOne.

        Exits via sys.exit if any line does not have exactly two columns.
        The number of lines read and of unique IDs are logged.
        '''

        utils.checkFileExistence(Sample.args.dataFN, 'Sample IDs')
        Sample.errAndLog('Reading sample IDs:\n    %s\n' % Sample.args.dataFN)

        formatErrorMessage = (
            'ERROR. When specifying non-default ablock dataset,\n'
            'ID file must have 2 columns: ID, comma-separated list of integers\n'
            'indicating platform versions.\n')

        uniqueIDs = set()
        resultList = []
        with open(Sample.args.dataFN, 'r') as idFile:
            for row in idFile:
                fields = row.strip().split()
                if len(fields) != 2:
                    sys.exit(formatErrorMessage)

                sampleID, versionString = fields
                uniqueIDs.add(sampleID)

                # y_haplogroup is the previous call; not needed here
                fieldValues = {
                    'resid': sampleID,
                    'y_haplogroup': Sample.config.missingHaplogroup,
                }

                # flag each known platform version as present/absent
                versionsPresent = set(
                    int(version) for version in versionString.split(','))
                for version in xrange(1, Sample.config.maxPlatformVersionPlusOne):
                    fieldValues['is_v%d' % version] = version in versionsPresent

                resultList.append(Sample.config.CustomerTuple(**fieldValues))

        Sample.errAndLog('    %8d read\n' % len(resultList))
        Sample.errAndLog('    %8d unique\n\n' % len(uniqueIDs))

        return resultList
Exemple #7
0
    def buildPageDict():
        '''
        Populates Node.pageList and Node.pageDict from the content pages
        file, Node.config.pagesFN. The file comes from these two gdocs:
        - https://docs.google.com/spreadsheets/d/1mf86slweZEKUd5hzG2GmKGTGIpHuDipJz2u221y2zVE/edit?ts=568eb997#gid=0
        - https://docs.google.com/spreadsheets/d/1oo0sRmYFNeWikuOxcb_1obOoO35wQccmOzyGRmqDMtc/edit?ts=578578d0#gid=362797346

        The first line is treated as a header. Every following line must
        hold exactly two whitespace-separated columns: yccOld and snpName.
        A Page is created for each line and appended to Node.pageList.
        It is also stored in Node.pageDict, keyed by the root haplogroup
        when yccOld equals Node.config.rootHaplogroup, otherwise keyed by
        snpName unless snpName is '.'.
        '''

        utils.checkFileExistence(Node.config.pagesFN, 'Content pages')
        with open(Node.config.pagesFN, 'r') as pagesFile:
            pagesFile.readline()  # discard header line
            for row in pagesFile:
                yccOld, snpName = row.strip().split()
                newPage = Page(yccOld, snpName)
                Node.pageList.append(newPage)

                # choose the dictionary key; pages with neither a root
                # haplogroup nor a real SNP name are not indexed
                rootHaplogroup = Node.config.rootHaplogroup
                if yccOld == rootHaplogroup:
                    pageKey = rootHaplogroup
                elif snpName != '.':
                    pageKey = snpName
                else:
                    continue
                Node.pageDict[pageKey] = newPage
Exemple #8
0
 def readRepresentativeSNPnameSet(self):
     '''
     Reads the names of SNPs deemed representative for their
     respective lineages and stores them in
     self.representativeSNPnameSet.

     Two files are read:
         self.config.isoggRepSNPfn: the second column of each line is
             either '.' (no representative SNP) or a comma-separated
             list of SNPs, each of which may list '/'-separated aliases
         self.config.otherRepSNPfn: the second column of each line is
             taken as a SNP name

     The stored set is the union of all names from both files.
     A summary of the counts is logged.
     '''

     isoggFN = self.config.isoggRepSNPfn
     otherFN = self.config.otherRepSNPfn
     tallies = defaultdict(int)

     # first file: haplogroup lines with optional alias lists
     isoggNames = set()
     utils.checkFileExistence(isoggFN, 'First representative SNPs')
     with open(isoggFN, 'r') as isoggFile:
         for row in isoggFile:
             tallies['lines'] += 1
             aliasField = row.strip().split()[1]
             if aliasField == '.':
                 continue

             tallies['haplogroups'] += 1
             aliasGroups = aliasField.split(',')
             tallies['snps'] += len(aliasGroups)
             for aliasGroup in aliasGroups:
                 isoggNames.update(aliasGroup.split('/'))

     # second file: one SNP name per line, in the second column
     utils.checkFileExistence(otherFN, 'Second representative SNPs')
     with open(otherFN, 'r') as otherFile:
         otherNames = set(row.strip().split()[1] for row in otherFile)

     self.representativeSNPnameSet = isoggNames | otherNames

     reportParts = [
         'Read representative SNPs\n',
         '%6d haplogroups in: %s\n' % (tallies['lines'], isoggFN),
         '%6d haplogroups with at least one ISOGG-designated representative SNP\n' % \
             tallies['haplogroups'],
         '%6d SNPs, as some haplogroups have more than one representative\n' % \
             tallies['snps'],
         '%6d SNP names, including aliases\n' % len(isoggNames),
         '%6d additional representative SNPs read from: %s\n' % (len(otherNames), otherFN),
         '%6d total SNP names\n\n' % len(self.representativeSNPnameSet),
     ]
     self.errAndLog(''.join(reportParts))