def _writeHeader(self, header=None, pdata=None, rowDefinition=None):
	"""
	2013.07.31 called by processHeader() and others (in GenomeMovingAverageStatistics.py)

	Write the header once per run. Guarded by self.invariantPData.headerOutputted so
	repeated calls are no-ops after the first successful pass.
	  - outputFileFormat==1: plain csv-style; delegate to the existing writer's writerow().
	  - outputFileFormat==2: create a YHFile writer on self.outputFname (only if no writer
	    exists yet on self or self.invariantPData).
	  - otherwise: create an HDF5MatrixFile writer on self.outputFname.
	For formats 2/3, a rowDefinition is synthesized from the header column IDs when none
	was given ('s2000' fixed-width strings for YHFile, variable-length strings for HDF5).
	NOTE(review): headerOutputted is set True even when no writer was created (warning
	branch) — presumably intentional so the warning prints only once; confirm.
	"""
	if not self.invariantPData.headerOutputted:
		if self.outputFileFormat==1:
			# format 1: only write if a csv-like writer already exists and a header was supplied
			if self.invariantPData.writer and header:
				self.invariantPData.writer.writerow(header)
		elif getattr(self, 'writer', None) is None and getattr(self.invariantPData, 'writer', None) is None:
			# formats 2/3: create the writer here, but only when neither self.writer
			# nor self.invariantPData.writer exists yet
			if self.outputFileFormat==2:
				if not rowDefinition and header:
					#generate a rowDefinition based on header
					rowDefinition = []
					for colID in header:
						rowDefinition.append((colID, 's2000'))
				writer = YHFile(self.outputFname, openMode='w', rowDefinition=rowDefinition)
				self.invariantPData.writer = writer
			else:	#for HDF5MatrixFile
				if not rowDefinition and header:
					#generate a rowDefinition based on header
					rowDefinition = []
					for colID in header:
						rowDefinition.append((colID, HDF5MatrixFile.varLenStrType))
				#rowDefinition = [('locus_id','i8'),('chromosome', HDF5MatrixFile.varLenStrType), ('start','i8'), ('stop', 'i8'), \
				#				('score', 'f8'), ('MAC', 'i8'), ('MAF', 'f8')]
				writer = HDF5MatrixFile(self.outputFname, openMode='w', rowDefinition=rowDefinition)
				self.invariantPData.writer = writer
		else:
			# a writer already exists somewhere: report and skip creation
			sys.stderr.write("\t Either self.writer %s, or self.invariantPData.writer %s already exists.\n"%\
							(getattr(self, 'writer', None), getattr(self.invariantPData, 'writer', None)))
			sys.stderr.write("\t no writer created in processHeader().\n")
		self.invariantPData.headerOutputted = True
def outputAssociationLociInHDF5(associationLocusList=None, filename=None, writer=None, tableName='association_locus', \
							closeFile=True,\
							attributeDict=None):
	"""
	Dump associationLocusList into an HDF5MatrixFile table named tableName.

	A fresh writer is opened on filename when writer is None; otherwise a new table
	is appended to the given writer. attributeDict entries (e.g. neighbor_distance,
	max_neighbor_distance, min_MAF, min_score, ground_score) are attached to the
	table as attributes. The list is sorted in place before dumping. Returns the
	writer (closed if closeFile is True).

	2012.12.10 for each locus, output the association peaks that fall into the locus.
		for each association peak, include
			* result-id * phenotype id * chromosome * start * stop
			* start_locus * stop_locus * no_of_loci * peak_locus * peak-score
	2012.11.20
	"""
	sys.stderr.write("Dumping association loci into %s (HDF5 format) ..." % (filename))
	#each number below is counting bytes, not bits
	rowDefinition = [('chromosome', HDF5MatrixFile.varLenStrType), \
					('start','i8'), ('stop', 'i8'), \
					('no_of_peaks', 'i8'), ('connectivity', 'f8'), ('no_of_results', 'i8')]
	if writer is None and filename:
		#no writer given: open a brand-new file
		writer = HDF5MatrixFile(filename, openMode='w', rowDefinition=rowDefinition, tableName=tableName)
		tableObject = writer.getTableObject(tableName=tableName)
	elif writer:
		#append a new table to the caller's writer
		tableObject = writer.createNewTable(tableName=tableName, rowDefinition=rowDefinition)
	else:
		sys.stderr.write("Error: no writer(%s) or filename(%s) to dump.\n" % (writer, filename))
		sys.exit(3)
	#attach run parameters as table attributes
	addAttributeDictToYHTableInHDF5Group(tableObject=tableObject, attributeDict=attributeDict)
	#2012.11.28 sort it (in place, as before)
	associationLocusList.sort()
	rowList = [(locus.chromosome, locus.start, locus.stop, locus.no_of_peaks, \
				locus.connectivity, locus.no_of_results) \
				for locus in associationLocusList]
	if tableObject is None:
		sys.stderr.write(
			"Error: tableObject (name=%s) is None. could not write.\n" %
			(tableName))
		sys.exit(3)
	tableObject.writeCellList(rowList)
	if closeFile:
		writer.close()
	sys.stderr.write("%s objects.\n" % (len(rowList)))
	return writer
def getAssociationLandscapeDataFromHDF5File(inputFname=None, associationTableName='association', \
						landscapeTableName='landscape', min_MAF=0.1):
	"""
	2012.11.20
		input is in HDF5MatrixFile format (which is output of variation/src/association_peak/DefineAssociationLandscape.py)
		contains two hdf5 groups. one is by associationTableName. the other is by landscapeTableName.

	Returns a PassingData with:
	  - genome_wide_result: loaded from the association table
	  - bridge_ls: list of [start_obj, stop_obj, no_of_loci, deltaX] per landscape row
	  - locusLandscapeNeighborGraph: nx.Graph whose nodes are data-object indices and
	    whose edges carry 'no_of_loci' and 'deltaX' attributes
	  - HDF5AttributeNameLs plus one attribute per landscape-table HDF5 attribute
	"""
	pdata = PassingData(min_MAF=min_MAF)
	genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(inputFname=inputFname, \
					min_value_cutoff=None, do_log10_transformation=False, pdata=pdata,\
					construct_chr_pos2index=False, construct_data_obj_id2index=False, \
					construct_locus_db_id2index=True,\
					report=True, tableName=associationTableName)
	returnData = PassingData(genome_wide_result=genome_wide_result)
	sys.stderr.write("Reading landscape from %s ..." % (inputFname))
	current_obj = None
	bridge_ls = []
	locusLandscapeNeighborGraph = nx.Graph()
	reader = HDF5MatrixFile(inputFname, openMode='r')
	landscapeTableObject = reader.getTableObject(tableName=landscapeTableName)
	# copy every HDF5 attribute of the landscape table onto returnData and record its name
	returnData.HDF5AttributeNameLs = []
	for attributeName, value in landscapeTableObject.getAttributes().iteritems(
	):
		returnData.HDF5AttributeNameLs.append(attributeName)
		setattr(returnData, attributeName, value)
	for row in landscapeTableObject:
		if row.start_locus_id == 0:
			#empty data. happens when inputFname contains no valid landscape, but one default null data point.
			continue
		start_locus_id = row.start_locus_id
		stop_locus_id = row.stop_locus_id
		no_of_loci = row.no_of_loci
		deltaX = row.deltaX
		# resolve locus DB ids back to the data objects loaded in genome_wide_result
		start_obj = genome_wide_result.get_data_obj_by_locus_db_id(
			start_locus_id)
		stop_obj = genome_wide_result.get_data_obj_by_locus_db_id(
			stop_locus_id)
		bridge_ls.append([start_obj, stop_obj, no_of_loci, deltaX])
		source_index = start_obj.index	#genome_wide_result.get_data_obj_index_by_locus_db_id(start_locus_id)
		target_index = stop_obj.index
		# one graph edge per bridge, annotated with its span statistics
		locusLandscapeNeighborGraph.add_edge(source_index, target_index, \
							weight=None)
		locusLandscapeNeighborGraph[source_index][target_index][
			'no_of_loci'] = no_of_loci
		locusLandscapeNeighborGraph[source_index][target_index][
			'deltaX'] = deltaX
	del reader	# drop the reader reference; presumably triggers file close via its destructor — TODO confirm
	sys.stderr.write("%s bridges.\n" % (len(bridge_ls)))
	returnData.bridge_ls = bridge_ls
	returnData.locusLandscapeNeighborGraph = locusLandscapeNeighborGraph
	return returnData
def openOneInputFile(self, inputFname=None):
	"""
	2013.09.05 split out of fileWalker() , added VCFFile

	Open inputFname with the reader class selected by self.inputFileFormat:
	2 -> YHFile (table self.h5TableName), 3 -> HDF5MatrixFile, 4 -> VCFFile,
	anything else -> plain MatrixFile. Returns the reader object.
	"""
	inputFileFormat = self.inputFileFormat
	if inputFileFormat == 2:	#2012.12.20
		return YHFile(inputFname, openMode='r', tableName=self.h5TableName)
	if inputFileFormat == 3:	#2012.11.22
		return HDF5MatrixFile(inputFname, openMode='r')
	if inputFileFormat == 4:
		return VCFFile(inputFname=inputFname)
	return MatrixFile(inputFname)
def outputAssociationPeakInHDF5(association_peak_ls=None, filename=None, writer=None, tableName='association_peak', closeFile=True,\
							attributeDict=None,):
	"""
	Dump association_peak_ls into an HDF5MatrixFile table named tableName.

	A fresh writer is opened on filename when writer is None; otherwise a new
	table is appended to the given writer. attributeDict entries are attached to
	the table as attributes. The peak list is sorted in place before dumping.
	Returns the writer (closed if closeFile is True).

	2012.11.20
	"""
	sys.stderr.write("Dumping association peaks into %s (HDF5 format) ..." % (filename))
	#each number below is counting bytes, not bits
	rowDefinition = [('chromosome', HDF5MatrixFile.varLenStrType), ('start','i8'), ('stop', 'i8'), \
					('start_locus_id','i8'), ('stop_locus_id','i8'), \
					('no_of_loci', 'i8'), ('peak_locus_id', 'i8'), ('peak_score', 'f8')]
	if writer is None and filename:
		#no writer given: open a brand-new file
		writer = HDF5MatrixFile(filename, openMode='w', rowDefinition=rowDefinition, tableName=tableName)
		tableObject = writer.getTableObject(tableName=tableName)
	elif writer:
		#append a new table to the caller's writer
		tableObject = writer.createNewTable(tableName=tableName, rowDefinition=rowDefinition)
	else:
		sys.stderr.write("Error: no writer(%s) or filename(%s) to dump.\n" % (writer, filename))
		sys.exit(3)
	#add neighbor_distance, max_neighbor_distance, min_MAF, min_score, ground_score as attributes
	addAttributeDictToYHTableInHDF5Group(tableObject=tableObject, attributeDict=attributeDict)
	#2012.11.28 sort it (in place, as before)
	association_peak_ls.sort()
	rowList = [(peak.chromosome, peak.start, peak.stop, \
				peak.start_locus_id, peak.stop_locus_id, \
				peak.no_of_loci, \
				peak.peak_locus_id, peak.peak_score) \
				for peak in association_peak_ls]
	if tableObject is None:
		sys.stderr.write(
			"Error: tableObject (name=%s) is None. could not write.\n" %
			(tableName))
		sys.exit(3)
	tableObject.writeCellList(rowList)
	if closeFile:
		writer.close()
	sys.stderr.write("%s objects.\n" % (len(rowList)))
	return writer
def constructAssociationPeakRBDictFromHDF5File(inputFname=None, peakPadding=10000, tableName='association_peak'):
	"""
	2012.11.12
		similar to Stock_250kDB.constructRBDictFromResultPeak(), but from HDF5MatrixFile-like file

	Build an RBDict of association peaks read from table tableName of inputFname.
	Each key is a CNVSegmentBinarySearchTreeKey spanning
	[max(1, start - peakPadding), stop + peakPadding]; the value is the list of
	rows mapped to that key (a warning is printed on a duplicate key). HDF5 table
	attributes are copied onto the returned RBDict and their names recorded in
	.HDF5AttributeNameLs. Rows with an empty chromosome (the default null peak)
	are skipped.

	Cleanup: removed unused imports (CNVCompare, get_overlap_ratio) and the
	unused counter real_counter.
	"""
	from pymodule.algorithm.RBTree import RBDict
	from pymodule.yhio.CNV import CNVSegmentBinarySearchTreeKey
	sys.stderr.write(
		"Constructing association-peak RBDict from HDF5 file %s, (peakPadding=%s) ..."
		% (inputFname, peakPadding))
	reader = HDF5MatrixFile(inputFname, openMode='r')
	associationPeakRBDict = RBDict()
	associationPeakRBDict.result_id = None	#2012.6.22
	associationPeakRBDict.peakPadding = peakPadding
	associationPeakRBDict.HDF5AttributeNameLs = []
	tableObject = reader.getTableObject(tableName=tableName)
	#copy every HDF5 attribute of the table onto the RBDict and record its name
	for attributeName, value in tableObject.getAttributes().iteritems():
		associationPeakRBDict.HDF5AttributeNameLs.append(attributeName)
		setattr(associationPeakRBDict, attributeName, value)
	counter = 0
	for row in tableObject:
		if not row.chromosome:	#empty chromosome, which happens when inputFname contains no valid peaks, but the default null peak (only one).
			continue
		counter += 1
		segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome, \
					span_ls=[max(1, row.start - peakPadding), row.stop + peakPadding], \
					min_reciprocal_overlap=1, result_peak_id=None)
					#2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
		if segmentKey not in associationPeakRBDict:
			associationPeakRBDict[segmentKey] = []
		else:
			sys.stderr.write("Warning: segmentKey of %s already in associationPeakRBDict with this row: %s.\n"%\
							(row, associationPeakRBDict[segmentKey][0]))
		associationPeakRBDict[segmentKey].append(row)
	sys.stderr.write("%s peaks in %s spans.\n" %
					(counter, len(associationPeakRBDict)))
	return associationPeakRBDict
def outputAssociationLandscapeInHDF5(bridge_ls=None, outputFname=None, writer=None, closeFile=False, tableName='landscape',\
								attributeDict=None,):
	"""
	2012.11.18

	Output the landscape bridges in bridge_ls (each bridge is
	[start_obj, stop_obj, no_of_loci, deltaX, ...]) into an HDF5MatrixFile table.
	A new table is appended when writer is given; otherwise a fresh file is
	opened on outputFname. attributeDict entries are attached to the table as
	attributes. Returns the writer (closed if closeFile is True).

	Bug fix: the error branch formerly referenced the undefined name `filename`
	(the parameter is `outputFname`), raising NameError instead of printing the
	intended message. Also removed the unused local previous_locus_id.
	"""
	sys.stderr.write("Outputting the %s bridges from the landscape ..." % (len(bridge_ls)))
	#output the data_object.id in bridge_ls to outputFname
	#each number below is counting bytes, not bits
	rowDefinition = [('start_locus_id', 'i8'), ('stop_locus_id', 'i8'), ('no_of_loci', 'i8'), ('deltaX', 'i8')]
	if writer:
		tableObject = writer.createNewTable(tableName=tableName, rowDefinition=rowDefinition)
	elif outputFname:
		writer = HDF5MatrixFile(outputFname, openMode='w', rowDefinition=rowDefinition, tableName=tableName)
		tableObject = writer.getTableObject(tableName=tableName)
	else:
		#was: % (writer, filename) -- NameError, as no `filename` exists in this scope
		sys.stderr.write("Error: no writer(%s) or filename(%s) to dump.\n" % (writer, outputFname))
		sys.exit(3)
	addAttributeDictToYHTableInHDF5Group(tableObject=tableObject, attributeDict=attributeDict)
	cellList = []
	for bridge in bridge_ls:
		current_obj = bridge[0]
		obj_with_fastest_score_increase = bridge[1]
		no_of_loci, deltaX = bridge[2:4]
		dataTuple = (current_obj.db_id, obj_with_fastest_score_increase.db_id, no_of_loci, deltaX)
		cellList.append(dataTuple)
	tableObject.writeCellList(cellList)
	if closeFile:
		writer.close()
	sys.stderr.write("%s objects.\n" % (len(cellList)))
	return writer
def constructAssociationLocusRBDictFromHDF5File(inputFname=None, locusPadding=0, tableName='association_locus'):
	"""
	2012.11.25
		similar to constructAssociationPeakRBDictFromHDF5File

	Build an RBDict of association loci read from table tableName of inputFname.
	Each key is a CNVSegmentBinarySearchTreeKey spanning
	[max(1, start - locusPadding), stop + locusPadding], carrying the row's
	no_of_peaks / no_of_results / connectivity; the value is the list of rows
	mapped to that key. HDF5 table attributes are copied onto the returned
	RBDict and their names recorded in .HDF5AttributeNameLs. Rows with an empty
	chromosome (the default null locus) are skipped.

	Cleanup: removed unused imports (CNVCompare, get_overlap_ratio) and the
	unused counter real_counter.
	"""
	from pymodule.algorithm.RBTree import RBDict
	from pymodule.yhio.CNV import CNVSegmentBinarySearchTreeKey
	sys.stderr.write(
		"Constructing association-locus RBDict from HDF5 file %s, (locusPadding=%s) ..."
		% (inputFname, locusPadding))
	reader = HDF5MatrixFile(inputFname, openMode='r')
	associationLocusRBDict = RBDict()
	associationLocusRBDict.locusPadding = locusPadding
	associationLocusRBDict.HDF5AttributeNameLs = []
	tableObject = reader.getTableObject(tableName=tableName)
	#copy every HDF5 attribute of the table onto the RBDict and record its name
	for attributeName, value in tableObject.getAttributes().iteritems():
		associationLocusRBDict.HDF5AttributeNameLs.append(attributeName)
		setattr(associationLocusRBDict, attributeName, value)
	counter = 0
	for row in tableObject:
		if not row.chromosome:	#empty chromosome, which happens when inputFname contains no valid locus, but the default null locus (only one).
			continue
		counter += 1
		segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome, \
					span_ls=[max(1, row.start - locusPadding), row.stop + locusPadding], \
					min_reciprocal_overlap=1, no_of_peaks=row.no_of_peaks, \
					no_of_results=row.no_of_results, connectivity=row.connectivity)
					#2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
		if segmentKey not in associationLocusRBDict:
			associationLocusRBDict[segmentKey] = []
		associationLocusRBDict[segmentKey].append(row)
	sys.stderr.write("%s peaks in %s spans.\n" %
					(counter, len(associationLocusRBDict)))
	return associationLocusRBDict