def __init__(self, inputFname=None, openMode='r',
				tableName='association_locus', groupNamePrefix='group', tableNamePrefix='table',
				filters=None, autoRead=True, autoWrite=True,
				locus2PeakTableName='association_locus2peak', locusPadding=0, constructLocusRBDict=True,
				**keywords):
		"""
		Open an association-locus file and bind its locus table and locus2peak table.
		
		YHFile.__init__ is invoked with autoRead/autoWrite switched off; the
		caller's values are stored on self afterwards and the tables are
		fetched or created here.
		
		openMode 'r' or 'a' (with autoRead on):
			both tables are fetched; if constructLocusRBDict is true,
			self.associationLocusRBDict is filled through _readInData().
		openMode 'w':
			both tables are created. This branch does not look at autoWrite.
		
		**keywords is accepted but is not forwarded to YHFile.__init__.
		"""
		# attributes used later by _readInData()
		self.locusPadding = locusPadding
		self.locus2PeakTableName = locus2PeakTableName
		self.constructLocusRBDict = constructLocusRBDict
		self.associationLocusRBDict = None
		
		YHFile.__init__(
			self, inputFname=inputFname, openMode=openMode,
			tableName=tableName, groupNamePrefix=groupNamePrefix,
			tableNamePrefix=tableNamePrefix, rowDefinition=None,
			filters=filters, debug=0, report=0,
			autoRead=False, autoWrite=False)
		
		# replace the False values that YHFile.__init__ was just given
		self.autoRead = autoRead
		self.autoWrite = autoWrite
		
		if self.autoRead and self.openMode in ('r', 'a'):
			self.associationLocusTable = self.getTableObject(tableName=self.tableName)
			self.associationLocus2PeakTable = self.getTableObject(tableName=self.locus2PeakTableName)
			if self.constructLocusRBDict:
				self.associationLocusRBDict = self._readInData(
					tableName=self.tableName, tableObject=self.associationLocusTable)
		elif openMode == 'w':
			self.associationLocusTable = self.createNewTable(
				tableName=self.tableName, rowDefinition=AssociationLocusTable,
				expectedrows=50000)
			self.associationLocus2PeakTable = self.createNewTable(
				tableName=self.locus2PeakTableName,
				rowDefinition=AssociationLocus2PeakTable, expectedrows=500000)
    def __init__(self, inputFname=None, openMode='r',
                 tableName='association_landscape', groupNamePrefix='group',
                 tableNamePrefix='table', filters=None, autoRead=True,
                 autoWrite=True, min_MAF=0.1,
                 associationTableName='association', **keywords):
        """
        Open an association-landscape file and bind its landscape and
        association tables.

        YHFile.__init__ is called with autoRead/autoWrite off; the caller's
        values are stored on self afterwards and acted upon here:

        - autoRead and openMode 'r'/'a': fetch both tables, then call
          _readInData() on the landscape table.
        - autoWrite and openMode 'w': create both tables.

        **keywords is accepted but not forwarded to YHFile.__init__.
        """
        self.min_MAF = min_MAF
        self.associationTableName = associationTableName

        # start empty; nothing in this constructor fills them
        self.locusLandscapeNeighborGraph = None
        self.bridge_ls = None

        YHFile.__init__(self, inputFname=inputFname, openMode=openMode,
                        tableName=tableName, groupNamePrefix=groupNamePrefix,
                        tableNamePrefix=tableNamePrefix, rowDefinition=None,
                        filters=filters, debug=0, report=0,
                        autoRead=False, autoWrite=False)

        # replace the False values that YHFile.__init__ was just given
        self.autoRead = autoRead
        self.autoWrite = autoWrite

        if self.autoRead and self.openMode in ('r', 'a'):
            self.associationLandscapeTable = self.getTableObject(tableName=self.tableName)
            self.associationTable = self.getTableObject(tableName=self.associationTableName)
            self._readInData(tableName=self.tableName,
                             tableObject=self.associationLandscapeTable)
        if self.autoWrite and self.openMode == 'w':
            self.associationLandscapeTable = self.createNewTable(
                tableName=self.tableName,
                rowDefinition=AssociationLandscapeTable,
                expectedrows=50000)
            self.associationTable = self.createNewTable(
                tableName=self.associationTableName,
                rowDefinition=AssociationTable,
                expectedrows=300000)
	def _readInData(self, tableName=None, tableObject=None, bugfixType=None):
		"""
		Build an RBDict of association loci from the table and store it in
		self.associationLocusRBDict.
		
		tableName: defaults to self.tableName.
		tableObject: fetched via getTableObject(tableName) when None.
		bugfixType: when 1, the 'stop' and 'no_of_peaks' values of every
			processed row are exchanged and written back with
			rowPointer.update() (2013.1.28, repairs files written with the two
			columns swapped).
		
		Returns the RBDict, or None when self.constructLocusRBDict is false.
		Each key is a CNVSegmentBinarySearchTreeKey spanning
		[max(1, start-locusPadding), stop+locusPadding]; each value is the list
		of rows that map to that key. Rows with an empty chromosome are skipped.
		The table's attributes are copied onto the RBDict and their names are
		listed in its HDF5AttributeNameLs.
		
		History: 2013.1.26 phenotype_id_set added to the key; 2012.11.25 created.
		"""
		if tableName is None:
			tableName = self.tableName
		YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
		if not self.constructLocusRBDict:
			return
		
		locusPadding = self.locusPadding
		sys.stderr.write("Constructing association-locus RBDict (locusPadding=%s) ..."%(locusPadding))
		if tableObject is None:
			tableObject = self.getTableObject(tableName=tableName)
		
		locusRBDict = RBDict()
		locusRBDict.locusPadding = locusPadding
		locusRBDict.HDF5AttributeNameLs = []
		for attributeName, value in tableObject.getAttributes().iteritems():
			locusRBDict.HDF5AttributeNameLs.append(attributeName)
			setattr(locusRBDict, attributeName, value)
		
		noOfLoci = 0
		for rowPointer in tableObject:
			row = castPyTablesRowIntoPassingData(rowPointer)
			if not row.chromosome:
				# empty chromosome: the default null locus of a file without any valid locus
				continue
			noOfLoci += 1
			phenotype_id_set = set(int(phenotype_id) for phenotype_id in row.phenotype_id_ls_in_str.split(','))
			if bugfixType==1:
				# row is a detached copy, so it still holds the pre-swap values
				rowPointer['stop'] = row.no_of_peaks
				rowPointer['no_of_peaks'] = row.stop
				rowPointer.update()
				row.no_of_peaks = rowPointer['no_of_peaks']
				row.stop = rowPointer['stop']
			paddedStart = max(1, row.start - locusPadding)
			paddedStop = row.stop + locusPadding
			# overlapping keys count as separate entries unless they are identical (2010-8-17)
			segmentKey = CNVSegmentBinarySearchTreeKey(
				chromosome=row.chromosome, span_ls=[paddedStart, paddedStop],
				min_reciprocal_overlap=1, no_of_peaks=row.no_of_peaks,
				no_of_results=row.no_of_results, connectivity=row.connectivity,
				phenotype_id_set=phenotype_id_set, locus_id=row.id)
			if segmentKey not in locusRBDict:
				locusRBDict[segmentKey] = []
			locusRBDict[segmentKey].append(row)
		sys.stderr.write("%s peaks in %s spans.\n"%(noOfLoci, len(locusRBDict)))
		self.associationLocusRBDict = locusRBDict
		return locusRBDict
    def __init__(self, inputFname=None, openMode='r',
                 tableName='association_peak', groupNamePrefix='group',
                 tableNamePrefix='table', filters=None, peakPadding=0,
                 expectedrows=50000, autoRead=True, autoWrite=True,
                 **keywords):
        """
        Open an association-peak file backed by a single AssociationPeakTable.

        peakPadding is stored for later use by _readInData(). autoRead,
        autoWrite and expectedrows are handed straight to YHFile.__init__;
        afterwards the table named self.tableName is fetched into
        self.associationPeakTable.

        **keywords is accepted but not forwarded to YHFile.__init__.
        """
        self.associationPeakRBDict = None
        self.peakPadding = peakPadding

        YHFile.__init__(self, inputFname=inputFname, openMode=openMode,
                        tableName=tableName, groupNamePrefix=groupNamePrefix,
                        tableNamePrefix=tableNamePrefix,
                        rowDefinition=AssociationPeakTable, filters=filters,
                        expectedrows=expectedrows, autoRead=autoRead,
                        autoWrite=autoWrite, debug=0, report=0)

        self.associationPeakTable = self.getTableObject(tableName=self.tableName)
	def _writeHeader(self, header=None, pdata=None, rowDefinition=None):
		"""
		Output the header, or create the output writer, once per run.
		
		Nothing but the final flag assignment happens when
		self.invariantPData.headerOutputted is already true.
		
		outputFileFormat 1: header is written through the existing
			self.invariantPData.writer (only if both writer and header are set).
		outputFileFormat 2: a YHFile writer is created on self.outputFname.
		any other format: an HDF5MatrixFile writer is created.
		
		A writer is only created when neither self.writer nor
		self.invariantPData.writer exists; otherwise a note goes to stderr.
		When rowDefinition is empty and header is given, rowDefinition is
		derived from header (one string-typed column per header entry).
		
		pdata is accepted but not used here.
		2013.07.31 original note: called by processHeader() and others
			(in GenomeMovingAverageStatistics.py).
		"""
		if not self.invariantPData.headerOutputted:
			if self.outputFileFormat==1:
				if self.invariantPData.writer and header:
					self.invariantPData.writer.writerow(header)
			elif getattr(self, 'writer', None) is not None or getattr(self.invariantPData, 'writer', None) is not None:
				# a writer is already present somewhere: leave it alone and just report
				sys.stderr.write("\t Either self.writer %s, or self.invariantPData.writer %s already exists.\n"%\
								(getattr(self, 'writer', None), getattr(self.invariantPData, 'writer', None)))
				sys.stderr.write("\t no writer created in processHeader().\n")
			elif self.outputFileFormat==2:
				if not rowDefinition and header:
					# no explicit definition: every header column becomes an 's2000' column
					rowDefinition = [(colID, 's2000') for colID in header]
				self.invariantPData.writer = YHFile(self.outputFname, openMode='w', rowDefinition=rowDefinition)
			else:
				# HDF5MatrixFile output
				if not rowDefinition and header:
					rowDefinition = [(colID, HDF5MatrixFile.varLenStrType) for colID in header]
				self.invariantPData.writer = HDF5MatrixFile(self.outputFname, openMode='w', rowDefinition=rowDefinition)
		self.invariantPData.headerOutputted = True
    def _readInData(self, tableName=None, tableObject=None):
        """
        Build an RBDict of association peaks from the table and store it in
        self.associationPeakRBDict.

        tableObject is fetched via getTableObject(tableName) when None.
        Each key is a CNVSegmentBinarySearchTreeKey spanning
        [max(1, start-peakPadding), stop+peakPadding]; each value is a list of
        rows (converted with castPyTablesRowIntoPassingData). Rows with an empty
        chromosome are skipped. A warning is written to stderr when a key is
        already present. The attributes returned by self.getAttributes() are
        copied onto the RBDict and listed in its HDF5AttributeNameLs.

        Returns the RBDict.

        2012.11.12 original note: similar to
        Stock_250kDB.constructRBDictFromResultPeak(), but from an
        HDF5MatrixFile-like file.
        """
        YHFile._readInData(self, tableName=tableName, tableObject=tableObject)

        from pymodule.algorithm.RBTree import RBDict
        from pymodule.yhio.CNV import CNVCompare, CNVSegmentBinarySearchTreeKey, get_overlap_ratio
        if tableObject is None:
            tableObject = self.getTableObject(tableName=tableName)
        sys.stderr.write(
            "Constructing association-peak RBDict from HDF5 file %s, (peakPadding=%s) ..."
            % (self.inputFname, self.peakPadding))

        peakRBDict = RBDict()
        peakRBDict.result_id = None  #2012.6.22
        peakRBDict.peakPadding = self.peakPadding
        peakRBDict.HDF5AttributeNameLs = []
        for attributeName, value in self.getAttributes().iteritems():
            peakRBDict.HDF5AttributeNameLs.append(attributeName)
            setattr(peakRBDict, attributeName, value)

        noOfPeaks = 0
        for row in tableObject:
            if not row['chromosome']:
                # empty chromosome: the default null peak of a file without valid peaks
                continue
            noOfPeaks += 1
            paddedStart = max(1, row['start'] - self.peakPadding)
            paddedStop = row['stop'] + self.peakPadding
            # overlapping keys count as separate entries unless identical (2010-8-17)
            segmentKey = CNVSegmentBinarySearchTreeKey(
                chromosome=row['chromosome'],
                span_ls=[paddedStart, paddedStop],
                min_reciprocal_overlap=1, result_peak_id=None)
            if segmentKey in peakRBDict:
                sys.stderr.write("Warning: segmentKey of %s already in associationPeakRBDict with this row: %s.\n"%\
                    (row, peakRBDict[segmentKey][0]))
            else:
                peakRBDict[segmentKey] = []
            # row is only a pointer to the current row, so store a converted copy
            peakRBDict[segmentKey].append(castPyTablesRowIntoPassingData(row))
        sys.stderr.write("%s peaks in %s spans.\n" %
                         (noOfPeaks, len(peakRBDict)))

        self.associationPeakRBDict = peakRBDict
        return self.associationPeakRBDict
Beispiel #7
0
    def __init__(self, inputFname=None, openMode='r',
                 tableName='locus_map', groupNamePrefix='group',
                 tableNamePrefix='table', filters=None, expectedrows=500000,
                 autoRead=True, autoWrite=True, **keywords):
        """
        Open a locus-map file backed by a single LocusMapTable.

        autoRead, autoWrite, expectedrows and any extra **keywords are handed
        to YHFile.__init__. Afterwards the table named self.tableName is
        fetched into self.locusMapTable, whatever the open mode.

        self.locus_id2chr_pos starts as None.
        """
        self.locus_id2chr_pos = None

        YHFile.__init__(self, inputFname=inputFname, openMode=openMode,
                        tableName=tableName, groupNamePrefix=groupNamePrefix,
                        tableNamePrefix=tableNamePrefix,
                        rowDefinition=LocusMapTable, filters=filters,
                        expectedrows=expectedrows, autoRead=autoRead,
                        autoWrite=autoWrite, debug=0, report=0, **keywords)

        # an older, mode-specific get/create of the table used to sit here;
        # the table is now fetched unconditionally
        self.locusMapTable = self.getTableObject(tableName=self.tableName)
Beispiel #8
0
    def _readInData(self,
                    tableName=None,
                    tableObject=None,
                    do_log10_transformation=None):
        """
        Load the table into a genome-wide-result object, store it in
        self.genome_wide_result and return it.

        tableName defaults to self.tableName (after the YHFile._readInData
        call, which receives the argument as given).
        do_log10_transformation defaults to self.do_log10_transformation, or
        False if that attribute does not exist.
        self.min_MAF is passed along inside a PassingData object.
        """
        YHFile._readInData(self, tableName=tableName, tableObject=tableObject)

        tableName = self.tableName if tableName is None else tableName
        if do_log10_transformation is None:
            do_log10_transformation = getattr(self, 'do_log10_transformation',
                                              False)

        genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(
            reader=self,
            tableName=tableName,
            tableObject=tableObject,
            min_value_cutoff=None,
            do_log10_transformation=do_log10_transformation,
            pdata=PassingData(min_MAF=self.min_MAF),
            construct_chr_pos2index=False,
            construct_data_obj_id2index=False,
            construct_locus_db_id2index=True,
            report=True)
        self.genome_wide_result = genome_wide_result
        return self.genome_wide_result
	def openOneInputFile(self, inputFname=None):
		"""
		Return a reader for inputFname, chosen by self.inputFileFormat:
			2: YHFile (table self.h5TableName)
			3: HDF5MatrixFile
			4: VCFFile
			anything else: MatrixFile
		
		2013.09.05 split out of fileWalker(), added VCFFile.
		"""
		fileFormat = self.inputFileFormat
		if fileFormat==2:	#2012.12.20
			return YHFile(inputFname, openMode='r', tableName=self.h5TableName)
		if fileFormat==3:	#2012.11.22
			return HDF5MatrixFile(inputFname, openMode='r')
		if fileFormat==4:
			return VCFFile(inputFname=inputFname)
		return MatrixFile(inputFname)
Beispiel #10
0
    def __init__(self, inputFname=None, openMode='r',
                 tableName='association', groupNamePrefix='group',
                 tableNamePrefix='table', filters=None, expectedrows=300000,
                 autoRead=True, autoWrite=True, min_MAF=None,
                 do_log10_transformation=False, **keywords):
        """
        Open an association file backed by a single AssociationTable.

        min_MAF and do_log10_transformation are stored on self.
        autoRead, autoWrite and expectedrows are handed to YHFile.__init__;
        afterwards the table named self.tableName is fetched into
        self.associationTable.

        **keywords is accepted but not forwarded to YHFile.__init__.
        """
        self.do_log10_transformation = do_log10_transformation
        self.min_MAF = min_MAF
        self.genome_wide_result = None
        self.associationTable = None

        YHFile.__init__(self, inputFname=inputFname, openMode=openMode,
                        tableName=tableName, groupNamePrefix=groupNamePrefix,
                        tableNamePrefix=tableNamePrefix,
                        rowDefinition=AssociationTable, filters=filters,
                        expectedrows=expectedrows, autoRead=autoRead,
                        autoWrite=autoWrite, debug=0, report=0)

        # an older, mode-specific get/create of the table used to sit here;
        # the table is now fetched unconditionally
        self.associationTable = self.getTableObject(tableName=self.tableName)
Beispiel #11
0
    def _readInData(self, tableName=None, tableObject=None):
        """
        Read the locus map into self.locus_id2chr_pos and return it.

        The result maps locus_id -> (chromosome, start, stop). Rows with an
        empty chromosome are skipped. When a locus_id occurs more than once,
        the first entry is kept and a warning showing that entry is written to
        stderr. tableObject is fetched via getTableObject(tableName) when None.

        2012.1.9
        """
        YHFile._readInData(self, tableName=tableName, tableObject=tableObject)

        if tableObject is None:
            tableObject = self.getTableObject(tableName=tableName)

        sys.stderr.write("Reading the locus map from HDF5 file %s ..." %
                         (self.inputFname))

        noOfRows = 0
        noOfUniqueLoci = 0
        locus_id2chr_pos = self.locus_id2chr_pos = {}
        for row in tableObject:
            if not row['chromosome']:
                # empty chromosome: the default null entry of a file without valid rows
                continue
            noOfRows += 1
            locus_id = row['locus_id']
            if locus_id in locus_id2chr_pos:
                previous = locus_id2chr_pos[locus_id]
                sys.stderr.write("Warning: locus_id %s is already in locus_id2chr_pos with chr,start,stop(%s, %s, %s).\n"%\
                    (locus_id, previous[0], previous[1], previous[2]))
                continue
            locus_id2chr_pos[locus_id] = (row['chromosome'], row['start'],
                                          row['stop'])
            noOfUniqueLoci += 1
        sys.stderr.write("%s loci (%s total) with unique locus_id.\n" %
                         (noOfUniqueLoci, noOfRows))
        return self.locus_id2chr_pos
Beispiel #12
0
    def _readInData(self, tableName=None, tableObject=None, bugfixType=None):
        """
        Build self.snpData from the individual, locus and polymorphism tables.

        tableName defaults to self.tableName; it and tableObject are only
        handed to YHFile._readInData. The data itself is read from
        self.individualTable, self.locusTable and self.polymorphismTable.
        bugfixType is accepted but not used here.

        Returns the SNPData object, or None when self.constructSNPData is
        false.

        The data matrix has shape (individuals, loci, self.ploidy), dtype
        int16. Alleles are numbered from 1 in order of first appearance;
        0 means missing. The two allele maps are attached to the SNPData
        object as allele_sequence2allele_number and
        allele_number2allele_sequence.

        Polymorphism rows whose individual_id or locus_id is absent from the
        individual/locus tables are skipped and counted in a stderr warning.
        (Previously the failed lookup produced None, which numpy treats as
        newaxis, so the assignment overwrote the whole matrix.)

        2013.3.6
        """
        if tableName is None:
            tableName = self.tableName
        YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
        if not self.constructSNPData:
            return

        sys.stderr.write("Reading everything into a SNPData structure ...")
        row_id_list = []
        row_id_number2row_index = {}
        col_id_list = []
        col_id_number2col_index = {}
        for row in self.individualTable:
            row_id_list.append(row.name)
            row_id_number2row_index[row.id] = len(row_id_list) - 1
        for row in self.locusTable:
            # columns are identified by position, not by the locus id
            col_id_list.append((row.chromosome_id, row.start, row.stop))
            col_id_number2col_index[row.id] = len(col_id_list) - 1

        allele_sequence2allele_number = {}
        allele_number2allele_sequence = {}

        #each cell in data_matrix is an array of alleles for one individual at one locus, but different chromosomes
        # alleles are encoded in numbers starting from 1. 0 is missing.
        data_matrix = numpy.zeros(
            [len(row_id_list), len(col_id_list), self.ploidy],
            dtype=numpy.int16)

        if self.ploidy > 1:
            #chromosome_copy_matrix counts the alleles seen so far for each individual & locus
            chromosome_copy_matrix = numpy.zeros(
                [len(row_id_list), len(col_id_list)], dtype=numpy.int8)
        else:
            chromosome_copy_matrix = None

        no_of_skipped_rows = 0
        for row in self.polymorphismTable:
            row_index = row_id_number2row_index.get(row.individual_id)
            col_index = col_id_number2col_index.get(row.locus_id)
            if row_index is None or col_index is None:
                # indexing a numpy array with None adds an axis instead of
                # failing, so an unknown id must not reach the matrices
                no_of_skipped_rows += 1
                continue

            #figure out which chromosome to hold this allele
            if self.ploidy > 1:
                chromosome_copy_matrix[row_index][col_index] = \
                    chromosome_copy_matrix[row_index][col_index] + 1
                if row.chromosome_copy == 0:  #unphased genotype
                    # fill chromosome slots in order of arrival
                    chromosome_copy_index = \
                        chromosome_copy_matrix[row_index][col_index] - 1
                else:
                    chromosome_copy_index = row.chromosome_copy - 1
            else:
                chromosome_copy_index = 0
                if row.chromosome_copy > 1:
                    sys.stderr.write("Warning: ploidy=%s, but encounter chromosome_copy (%s) >1.\n"%\
                        (self.ploidy, row.chromosome_copy))

            #allele_number starts from 1. 0 is reserved for missing.
            allele_number = allele_sequence2allele_number.get(
                row.allele_sequence)
            if allele_number is None:
                allele_number = len(allele_sequence2allele_number) + 1
                allele_sequence2allele_number[
                    row.allele_sequence] = allele_number
                allele_number2allele_sequence[
                    allele_number] = row.allele_sequence

            data_matrix[row_index][col_index][
                chromosome_copy_index] = allele_number

        # NOTE(review): the constructor keywords are *_list while the report
        # below reads *_ls -- confirm against SNPData.
        self.snpData = SNPData(row_id_list=row_id_list,
                               col_id_list=col_id_list,
                               data_matrix=data_matrix)

        self.snpData.allele_sequence2allele_number = allele_sequence2allele_number
        self.snpData.allele_number2allele_sequence = allele_number2allele_sequence
        sys.stderr.write(" %s individuals, %s loci, ploidy=%s, isPhased=%s.\n"%(len(self.snpData.row_id_ls),\
                       len(self.snpData.col_id_ls), \
                       self.ploidy, self.isPhased))
        if no_of_skipped_rows > 0:
            sys.stderr.write("Warning: %s polymorphism rows skipped because individual_id or locus_id is not in the individual/locus table.\n"%\
                (no_of_skipped_rows))

        return self.snpData
Beispiel #13
0
    def __init__(self, inputFname=None, openMode='r', \
       tableName='polymorphism', groupNamePrefix='group', tableNamePrefix='table',\
       filters=None, autoRead=True, autoWrite=True, \
       isPhased=None, ploidy=None, constructSNPData=True, **keywords):
        """
        Open a polymorphism file and bind its seven tables (species,
        population, individual, chromosome, locus, recombination and the
        polymorphism table named by tableName).

        YHFile.__init__ is called with autoRead/autoWrite off; the caller's
        values are stored on self afterwards and acted upon here:

        - autoRead and openMode 'r'/'a': fetch all tables, take isPhased
          (default 0) and ploidy (default 2) from the polymorphism table's
          attributes -- overriding the arguments -- then call _readInData().
        - autoWrite and openMode 'w': create all tables and store the
          isPhased/ploidy arguments as attributes of the polymorphism table.

        **keywords is accepted but not forwarded to YHFile.__init__.
        """
        # NOTE(review): these two attributes are not used anywhere in this
        # constructor; kept because other code may read them.
        self.bridge_ls = None
        self.locusLandscapeNeighborGraph = None

        YHFile.__init__(self, inputFname=inputFname, openMode=openMode, \
          tableName=tableName, groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,\
          rowDefinition=None, filters=filters, \
          debug=0, report=0, autoRead=False, autoWrite=False)

        self.speciesTableName = 'species'
        self.populationTableName = 'population'
        self.individualTableName = "individual"
        self.chromosomeTableName = 'chromosome'
        self.locusTableName = 'locus'
        self.recombinationTableName = 'recombination'

        self.isPhased = isPhased
        self.ploidy = ploidy
        self.constructSNPData = constructSNPData

        #to overwrite self.autoRead that is set by YHFile.__init__
        self.autoRead = autoRead
        self.autoWrite = autoWrite

        self.snpData = None  #the SNPData structure that holds all polymorphism, locus, individual info

        if self.autoRead and (self.openMode == 'r' or self.openMode == 'a'):
            self.speciesTable = self.getTableObject(
                tableName=self.speciesTableName)
            self.populationTable = self.getTableObject(
                tableName=self.populationTableName)
            self.individualTable = self.getTableObject(
                tableName=self.individualTableName)
            self.chromosomeTable = self.getTableObject(
                tableName=self.chromosomeTableName)
            self.locusTable = self.getTableObject(
                tableName=self.locusTableName)
            self.recombinationTable = self.getTableObject(
                tableName=self.recombinationTableName)
            self.polymorphismTable = self.getTableObject(
                tableName=self.tableName)

            #read the isPhased, ploidy from pytables attributes, overwrites the arguments
            self.isPhased = self.polymorphismTable.getAttribute(
                name='isPhased', defaultValue=0)
            self.ploidy = self.polymorphismTable.getAttribute(name='ploidy',
                                                              defaultValue=2)

            # pass the table that belongs to tableName; this constructor never
            # sets an associationLandscapeTable attribute
            self._readInData(tableName=self.tableName,
                             tableObject=self.polymorphismTable)
        if self.autoWrite and self.openMode == 'w':
            self.speciesTable = self.createNewTable(tableName=self.speciesTableName, rowDefinition=SpeciesTable,\
                      expectedrows=500)
            self.populationTable = self.createNewTable(tableName=self.populationTableName, rowDefinition=PopulationTable,\
                      expectedrows=500)
            self.individualTable = self.createNewTable(tableName=self.individualTableName, rowDefinition=IndividualTable,\
                      expectedrows=30000)
            self.chromosomeTable = self.createNewTable(tableName=self.chromosomeTableName, rowDefinition=ChromosomeTable,\
                      expectedrows=500)
            self.locusTable = self.createNewTable(tableName=self.locusTableName, rowDefinition=LocusTable,\
                     expectedrows=300000)
            self.recombinationTable = self.createNewTable(tableName=self.recombinationTableName, rowDefinition=RecombinationTable,\
                     expectedrows=300000)
            self.polymorphismTable = self.createNewTable(tableName=self.tableName, rowDefinition=PolymorphismTable,\
                     expectedrows=500000)
            #set the attributes of isPhased, ploidy
            self.polymorphismTable.addAttribute(name='isPhased',
                                                value=self.isPhased,
                                                overwrite=True)
            self.polymorphismTable.addAttribute(name='ploidy',
                                                value=self.ploidy,
                                                overwrite=True)

        #2013.3.8 these dictionaries are for outputting purposes
        self._individualName2ID = {}
        self._locus_index2id = {}

        #2013.3.8 helper structures
        self._locusStartPositionList = None
        self._locusChrStartStopList = None