def checkIfEdgeIdsExist(genome, trackName, allowOverlaps):
        """
        Check that every id used in the 'edges' column also occurs as an 'id'.

        Does nothing unless the track format reports itself as linked. The
        'id' and 'edges' arrays of all preprocessed chromosomes are collected,
        empty strings are ignored, and an InvalidFormatError listing the
        offending ids (sorted) is raised if any edge id has no matching
        element id.
        """
        metaData = PreProcMetaDataCollector(genome, trackName)
        if not metaData.getTrackFormat().isLinked():
            return

        knownIds = numpy.array([], dtype='S')
        referencedIds = numpy.array([], dtype='S')

        for chrom in metaData.getPreProcessedChrs(allowOverlaps):
            chrData = TrackSource().getTrackData(trackName, genome, chrom,
                                                 allowOverlaps)

            # Keep the accumulators de-duplicated after every chromosome
            idsSoFar = numpy.concatenate((knownIds, chrData['id'][:]))
            knownIds = numpy.unique(idsSoFar)

            edgeIdsSoFar = numpy.concatenate(
                (referencedIds, chrData['edges'][:].flatten()))
            referencedIds = numpy.unique(edgeIdsSoFar)

        # Empty strings are not treated as ids
        knownIds = knownIds[knownIds != '']
        referencedIds = referencedIds[referencedIds != '']

        danglingIds = set(referencedIds) - set(knownIds)
        if danglingIds:
            message = "Error: the following ids specified in the 'edges' column do not exist in the dataset: " \
                      + ', '.join(sorted(danglingIds))
            raise InvalidFormatError(message)
Beispiel #2
0
 def checkUndirectedEdges(genome, trackName, allowOverlaps):
     """
     Check that all edges of an undirected, linked track are matched.

     Does nothing unless the track format is linked and the collector
     reports undirected edges. For every element of every preprocessed
     chromosome, the non-empty edges (and the corresponding weights, if a
     'weights' array exists) are passed to
     PreProcessUtils._adjustComplementaryEdgeWeightDict. Whatever is left in
     the dictionary afterwards is reported as unmatched edges through an
     InvalidFormatError.
     """
     metaData = PreProcMetaDataCollector(genome, trackName)
     if not metaData.getTrackFormat().isLinked() or not metaData.hasUndirectedEdges():
         return

     pendingEdges = {}

     for chrom in metaData.getPreProcessedChrs(allowOverlaps):
         chrData = TrackSource().getTrackData(trackName, genome, chrom, allowOverlaps)

         idCol = chrData['id']
         edgeCol = chrData['edges']
         weightCol = chrData.get('weights')

         for rowIdx, elementId in enumerate(idCol):
             # Empty strings in the edges row are skipped
             rowEdges = edgeCol[rowIdx][edgeCol[rowIdx] != '']
             if weightCol is not None:
                 rowWeights = weightCol[rowIdx][edgeCol[rowIdx] != '']
             else:
                 rowWeights = None
             PreProcessUtils._adjustComplementaryEdgeWeightDict(
                 pendingEdges, elementId, rowEdges, rowWeights)

     if not pendingEdges:
         return

     # Entries left over are edges without a matching opposite edge
     descriptions = []
     for toId in pendingEdges:
         for fromId in pendingEdges[toId]:
             weight = pendingEdges[toId][fromId]
             description = "from '%s' to '%s'" % (fromId, toId)
             if weight != '':
                 description = description + " with weight '%s'" % weight
             descriptions.append(description)

     raise InvalidFormatError(
         "Error: All edges are not undirected. The following edges specifications "
         "are not matched by an opposite edge with equal weight:" + os.linesep +
         os.linesep.join(descriptions))
    def removeOutdatedPreProcessedFiles(cls, genome, trackName, allowOverlaps,
                                        mode):
        collector = PreProcMetaDataCollector(genome, trackName)
        if cls.preProcFilesExist(genome, trackName, allowOverlaps) and not \
            collector.hasRemovedPreProcFiles(allowOverlaps):
            dirPath = createDirPath(trackName,
                                    genome,
                                    allowOverlaps=allowOverlaps)

            assert (dirPath.startswith(PROCESSED_DATA_PATH))
            if mode == 'Real':
                print 'Removing outdated preprocessed data: ', dirPath
                for fn in os.listdir(dirPath):
                    fullFn = os.path.join(dirPath, fn)
                    if os.path.isfile(fullFn):
                        os.unlink(fullFn)
                    if os.path.isdir(fullFn):
                        if cls._isOldTypeChromDirectory(fullFn, genome):
                            shutil.rmtree(fullFn)
            else:
                print 'Would now have removed outdated preprocessed data if real run: ', dirPath

            collector.updateRemovedPreProcFilesFlag(allowOverlaps, True)

        if mode == 'Real':
            ti = TrackInfo(genome, trackName)
            ti.resetTimeOfPreProcessing()
    def merge(genome, trackName, allowOverlaps):
        """
        Merge the per-chromosome arrays of a track into combined memmap files.

        The chromosomes flagged as preprocessed are looked up (sorted unless
        the track format is dense) and filtered through
        ChrMemmapFolderMerger._existingChrIter. For every array of the first
        chromosome, the arrays of the remaining chromosomes are merged in with
        ChrMemmapFolderMerger.mergeArrays, and the result is written to a new
        memmap file in the track directory. The element and dtype dimensions
        in the file name are the maxima over all chromosomes.

        Raises:
            EmptyGESourceError: if no chromosome remains after filtering.
        """
        targetDir = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)

        metaData = PreProcMetaDataCollector(genome, trackName)
        chrList = metaData.getPreProcessedChrs(allowOverlaps)
        if not metaData.getTrackFormat().reprIsDense():
            chrList = sorted(chrList)

        chrsOnDisk = list(
            ChrMemmapFolderMerger._existingChrIter(targetDir, chrList))
        if not chrsOnDisk:
            raise EmptyGESourceError(
                'No data lines has been read from source file (probably because it is empty).'
            )

        firstChr = chrsOnDisk[0]
        remainingChrs = chrsOnDisk[1:]

        firstData = TrackSource().getTrackData(trackName,
                                               genome,
                                               firstChr,
                                               allowOverlaps,
                                               forceChrFolders=True)
        arrayNames = firstData.keys()
        for arrayName in arrayNames:
            combined = firstData[arrayName][:]
            firstFn = firstData[arrayName].filename
            elementDim, dtypeDim = parseMemmapFileFn(firstFn)[1:3]
            # Drop the entry as soon as its content has been read
            del firstData[arrayName]

            for chrom in remainingChrs:
                chrData = TrackSource().getTrackData(trackName,
                                                     genome,
                                                     chrom,
                                                     allowOverlaps,
                                                     forceChrFolders=True)

                combined = ChrMemmapFolderMerger.mergeArrays(
                    combined, np.array(chrData[arrayName][:]))

                chrFn = chrData[arrayName].filename
                chrElementDim, chrDtypeDim = parseMemmapFileFn(chrFn)[1:3]
                elementDim = max(elementDim, chrElementDim)
                dtypeDim = max(dtypeDim, chrDtypeDim)

                del chrData[arrayName]

            combinedFn = createMemmapFileFn(targetDir, arrayName, elementDim,
                                            dtypeDim, str(combined.dtype))

            outFile = np.memmap(combinedFn,
                                dtype=combined.dtype,
                                mode='w+',
                                shape=combined.shape)
            outFile[:] = combined
            outFile.flush()
            del outFile
            del combined
 def preProcFilesExist(cls, genome, trackName, allowOverlaps):
     """
     Return whether preprocessed files exist for the track.

     The flag stored in the collector is returned if it is not None.
     Otherwise the answer is worked out from
     cls.mergedPreProcFilesExist, falling back to
     cls.oldTypePreProcFilesExist, and stored in the collector together with
     the "merged" result before being returned.
     """
     metaData = PreProcMetaDataCollector(genome, trackName)
     storedFlag = metaData.preProcFilesExist(allowOverlaps)
     if storedFlag is not None:
         return storedFlag

     merged = cls.mergedPreProcFilesExist(genome, trackName, allowOverlaps)
     # Old-type files are only looked for when no merged files were found
     filesExist = True if merged else cls.oldTypePreProcFilesExist(
         genome, trackName, allowOverlaps)

     metaData.updatePreProcFilesExistFlag(allowOverlaps, filesExist, merged)
     return filesExist
Beispiel #6
0
 def preProcFilesExist(genome, trackName, allowOverlaps):
     """
     Return whether preprocessed files exist for the track.

     The flag stored in the collector is returned if it is not None.
     Otherwise the files are considered to exist if the bounding region
     shelve file exists, or if the track directory exists and
     PreProcessUtils._hasOldTypeChromSubDirs accepts it. The result is
     stored in the collector before being returned.
     """
     metaData = PreProcMetaDataCollector(genome, trackName)
     filesExist = metaData.preProcFilesExist(allowOverlaps)
     if filesExist is not None:
         return filesExist

     dirPath = createDirPath(trackName, genome, allowOverlaps=allowOverlaps)
     if BoundingRegionShelve(genome, trackName, allowOverlaps).fileExists():
         filesExist = True
     elif os.path.exists(dirPath):
         filesExist = PreProcessUtils._hasOldTypeChromSubDirs(dirPath, genome)
     else:
         filesExist = False

     metaData.updatePreProcFilesExistFlag(allowOverlaps, filesExist)
     return filesExist
Beispiel #7
0
 def removeChrMemmapFolders(genome, trackName, allowOverlaps):
     """
     Recursively delete the directory of every preprocessed chromosome.

     Each directory path is asserted to exist and to be a directory before
     it is removed.
     """
     metaData = PreProcMetaDataCollector(genome, trackName)
     for chrom in metaData.getPreProcessedChrs(allowOverlaps):
         chrDir = createDirPath(trackName, genome, chrom, allowOverlaps)
         assert os.path.exists(chrDir), 'Path does not exist: ' + chrDir
         assert os.path.isdir(chrDir), 'Path is not a directory: ' + chrDir
         shutil.rmtree(chrDir)
Beispiel #8
0
    def process(self):
        """
        Store the metadata of the geSource and write its elements, if any.

        The collector is first updated with the metadata needed for
        finalization. If the geSource manager reports at least one element,
        the geSource is iterated: in mode 'Real' every element (or raw
        slice, for slice sources) is written through an OutputManager, in
        other modes the elements are only iterated over. In modes
        'UpdateMeta' and 'Real' the job is marked as dirty and all
        chromosomes of the manager are flagged as preprocessed.
        """
        manager = self._geSourceManager
        geSource = manager.getGESource()
        genome = geSource.genome

        collector = PreProcMetaDataCollector(genome, self._trackName)

        from gold.origdata.PreProcessTracksJob import PreProcessTracksJob
        collector.updateMetaDataForFinalization(
            geSource.getFileSuffix(),
            geSource.getPrefixList(),
            geSource.getValDataType(),
            geSource.getValDim(),
            geSource.getEdgeWeightDataType(),
            geSource.getEdgeWeightDim(),
            geSource.hasUndirectedEdges(),
            geSource.getVersion(),
            PreProcessTracksJob.VERSION,
            PreProcessUtils.constructId(geSource),
            manager.getNumElements(),
            manager.getBoundingRegionTuples(),
            manager.getValCategories(),
            manager.getEdgeWeightCategories(),
            self._allowOverlaps)

        if manager.getNumElements() > 0:
            if self._mode == 'Real':
                output = OutputManager(genome, self._trackName, self._allowOverlaps, manager)
                if geSource.isSliceSource():
                    writeFunc = output.writeRawSlice
                else:
                    writeFunc = output.writeElement
                for element in geSource:
                    writeFunc(element)
                output.close()
            else:
                # Iterate without writing anything
                for element in geSource:
                    pass

        if self._mode in ('UpdateMeta', 'Real'):
            self._dirty = True
            collector.flagChrsAsPreProcessed(self._allowOverlaps, manager.getAllChrs())
Beispiel #9
0
 def createBoundingRegionShelve(genome, trackName, allowOverlaps):
     """
     Store the bounding regions of the track in a BoundingRegionShelve.

     The bounding region tuples are read from the collector (sorted unless
     the track format is dense) and stored together with the list of
     preprocessed chromosomes. Afterwards the total element count of the
     shelve is compared to the element count of the collector.

     Raises:
         ShouldNotOccurError: if the two element counts differ.
     """
     metaData = PreProcMetaDataCollector(genome, trackName)
     brTuples = metaData.getBoundingRegionTuples(allowOverlaps)
     if not metaData.getTrackFormat().reprIsDense():
         brTuples = sorted(brTuples)

     preProcessedChrs = metaData.getPreProcessedChrs(allowOverlaps)
     shelve = BoundingRegionShelve(genome, trackName, allowOverlaps)
     notDense = not metaData.getTrackFormat().reprIsDense()
     shelve.storeBoundingRegions(brTuples, preProcessedChrs, notDense)

     # Sanity check
     if shelve.getTotalElementCount() == metaData.getNumElements(allowOverlaps):
         return

     counts = (shelve.getTotalElementCount(), metaData.getNumElements(allowOverlaps))
     raise ShouldNotOccurError(
         "Error: The total element count for all bounding regions is not equal to the total number of genome elements. %s != %s" % counts)
    def _allGESourceManagers(self, trackName, allowOverlaps):
        """
        Yield the GESourceManagers to preprocess for the given overlap rule.

        If allowOverlaps is False and the collector reports that the
        allowOverlaps=True rule has been finalized, a single manager built
        from the track itself is yielded. Otherwise the geSources of the
        track are iterated, and a manager is yielded for each one that
        PreProcessUtils.shouldPreProcessGESource accepts. With allowOverlaps
        True, the generator stops at the first geSource that is dense or has
        no overlapping elements.

        self._status is updated along the way to describe the current step.
        """
        trackNameStr = ':'.join(trackName)
        self._status = "Trying to create GESourceManager " \
                       "(trackName: {}, allowOverlaps: {})".format(trackNameStr, allowOverlaps)
        collector = PreProcMetaDataCollector(self._genome, trackName)

        statusPrefix = 'Trying to prepare preprocessing for track "%s"' % trackNameStr
        overlapsInfo = ' (allowOverlaps: %s)' % allowOverlaps

        if allowOverlaps == False and collector.overlapRuleHasBeenFinalized(True):
            self._status = statusPrefix + overlapsInfo
            yield self._getGESourceManagerFromTrack(trackName)
            return

        for geSource in self._allGESources(trackName):
            if allowOverlaps == True:
                tf = TrackFormat.createInstanceFromGeSource(geSource)
                if tf.isDense() or geSource.hasNoOverlappingElements():
                    return

            if geSource.hasOrigFile():
                fileInfo = ' (filename: "%s")' % geSource.getFileName()
            else:
                fileInfo = ''
            self._status = statusPrefix + fileInfo + overlapsInfo

            if PreProcessUtils.shouldPreProcessGESource(trackName, geSource,
                                                        allowOverlaps):
                yield self._getGESourceManagerFromGESource(geSource)
Beispiel #11
0
 def _findTrackInfoBasedMetaData(self):
     """
     Fill in file suffix, geSource version, id and undirected-edges flag.

     The values are read from the PreProcMetaDataCollector if it has an
     entry for the genome and track name, otherwise from TrackInfo. Nothing
     is done if self._foundTrackInfoBasedMetaData is already true.
     """
     # NOTE(review): this method never sets _foundTrackInfoBasedMetaData
     # itself - presumably it is set elsewhere; confirm.
     if self._foundTrackInfoBasedMetaData:
         return

     if PreProcMetaDataCollector.hasKey(self._genome, self._trackName):
         collector = PreProcMetaDataCollector(self._genome, self._trackName)
         self._fileSuffix = collector.getFileSuffix()
         self._geSourceVersion = collector.getGeSourceVersion()
         self._id = collector.getId()
         self._undirectedEdges = bool(collector.hasUndirectedEdges())
     else:
         trackInfo = TrackInfo(self._genome, self._trackName)
         self._fileSuffix = trackInfo.fileType
         self._geSourceVersion = trackInfo.geSourceVersion
         self._id = trackInfo.id
         self._undirectedEdges = bool(trackInfo.undirectedEdges)
Beispiel #12
0
    def _createPreProcFiles(self):
        """
        Store the metadata of the geSource and write its elements, if any.

        The collector is first updated with the metadata needed for
        finalization. Nothing more happens if the geSource manager reports
        zero elements. In modes other than 'Real' the geSource is only
        iterated over. In mode 'Real' every element (or raw slice, for slice
        sources) is written through an OutputManager, the chromosomes of the
        manager are flagged as preprocessed, and the output is closed.
        """
        manager = self._geSourceManager
        geSource = manager.getGESource()
        genome = geSource.genome

        collector = PreProcMetaDataCollector(genome, self._trackName)

        collector.updateMetaDataForFinalization(
            geSource.getFileSuffix(),
            geSource.getPrefixList(),
            geSource.getValDataType(),
            geSource.getValDim(),
            geSource.getEdgeWeightDataType(),
            geSource.getEdgeWeightDim(),
            geSource.hasUndirectedEdges(),
            geSource.getVersion(),
            PreProcessUtils.constructId(geSource),
            manager.getNumElements(),
            manager.getBoundingRegionTuples(),
            manager.getValCategories(),
            manager.getEdgeWeightCategories(),
            self._allowOverlaps)

        if manager.getNumElements() == 0:
            return

        if self._mode == 'Real':
            output = OutputManager(genome, self._trackName,
                                   self._allowOverlaps, manager)

            if geSource.isSliceSource():
                writeFunc = output.writeRawSlice
            else:
                writeFunc = output.writeElement

            for element in geSource:
                writeFunc(element)

            # Chromosomes are flagged before the output is closed
            collector.flagChrsAsPreProcessed(self._allowOverlaps,
                                             manager.getAllChrs())
            output.close()
        else:
            # Iterate without writing anything
            for element in geSource:
                pass
    def process(self):
        """
        Preprocess and finalize all tracks of the job.

        For every track name, and for each overlap rule (allowOverlaps True,
        then False):

        1. Each GESourceManager from self._allGESourceManagers is
           preprocessed with a PreProcessGeSourceJob if
           self._shouldPreProcess() says so. Outdated preprocessed files are
           removed first, and the dirty status of the collector is updated
           afterwards.
        2. If any manager was found, finalization is requested and the
           collector is dirty, the overlap rule is finalized. In mode 'Real'
           with merging enabled, the bounding region shelve is created, the
           chromosome folders are merged and then removed. The edge checks
           are then run and the overlap rule is marked as finalized.

        Finally the track itself is finalized if the collector is dirty,
        otherwise the collector entry is removed.

        Exceptions raised while handling a track cause the collector entry
        to be removed. They are then either re-raised as PreprocessWarning
        (for NotSupportedError) or PreprocessError (for anything else) when
        DebugConfig.PASS_ON_PREPROCESS_EXCEPTIONS is set, or printed through
        self._printExceptionMsg.
        """
        assert self._genome is not None, 'Error: genome must be specified when preprocessing tracks.'

        atLeastOneFinalized = False
        for trackName in self._allTrackNames():
            assert trackName != ['']
            overlapRulesProcessedForTrackName = []
            # NOTE(review): the collector is created with the track name as
            # it is before _renameTrackNameIfIllegal() is applied below -
            # confirm that this is intended.
            collector = PreProcMetaDataCollector(self._genome, trackName)

            try:
                trackName = self._renameTrackNameIfIllegal(trackName)

                for allowOverlaps in [True, False]:
                    anyGeSourceManagers = False
                    for geSourceManager in self._allGESourceManagers(
                            trackName, allowOverlaps):
                        anyGeSourceManagers = True

                        # PreProcess if needed
                        if self._shouldPreProcess():
                            PreProcessUtils.removeOutdatedPreProcessedFiles(
                                self._genome, trackName, allowOverlaps,
                                self._mode)

                            # Print the message once per overlap rule
                            if self._shouldPrintProcessMessages(
                            ) and allowOverlaps not in overlapRulesProcessedForTrackName:
                                self._printProcessTrackMessage(
                                    trackName, allowOverlaps)
                                overlapRulesProcessedForTrackName.append(
                                    allowOverlaps)

                            self._status = 'Trying to preprocess geSource...'
                            geSourceJob = PreProcessGeSourceJob(
                                trackName, geSourceManager, allowOverlaps,
                                self._mode)
                            anyWarnings = geSourceJob.process()

                            if self._raiseIfAnyWarnings and anyWarnings and trackName not in self._warningTrackNames:
                                self._warningTrackNames.append(trackName)

                            collector.updatePreProcDirtyStatus(
                                geSourceJob.hasModifiedData())

                    # Finalize overlapRule output if needed
                    if anyGeSourceManagers and self._shouldFinalize(
                    ) and collector.preProcIsDirty():
                        if self._mode == 'Real' and self._shouldMergeChrFolders(
                        ):
                            self._status = 'Trying to combine chromosome vectors into combined vectors.'
                            PreProcessUtils.createBoundingRegionShelve(
                                self._genome, trackName, allowOverlaps)
                            ChrMemmapFolderMerger.merge(
                                self._genome, trackName, allowOverlaps)

                            self._status = 'Trying to remove chromosome folders'
                            PreProcessUtils.removeChrMemmapFolders(
                                self._genome, trackName, allowOverlaps)

                            collector.updatePreProcFilesExistFlag(
                                allowOverlaps,
                                preProcFilesExist=True,
                                merged=True)

                        self._status = 'Trying to check whether 3D data is correct'
                        PreProcessUtils.checkIfEdgeIdsExist(
                            self._genome, trackName, allowOverlaps)
                        # The undirected-edge check is run once; it was
                        # previously called twice with the same arguments.
                        PreProcessUtils.checkUndirectedEdges(
                            self._genome, trackName, allowOverlaps)
                        collector.markOverlapRuleAsFinalized(allowOverlaps)

                # Finalize track if needed
                if self._shouldFinalize():
                    if collector.preProcIsDirty():
                        self._status = 'Trying to finalize.'
                        collector.finalize(self._username,
                                           self._shouldPrintProcessMessages())
                        atLeastOneFinalized = True
                    else:
                        collector.removeEntry()

            except NotSupportedError as e:
                collector.removeEntry()
                if DebugConfig.PASS_ON_PREPROCESS_EXCEPTIONS:
                    raise_from(
                        PreprocessWarning(
                            self._addContextToExceptionMsg(e, trackName)), e)
                else:
                    self._printExceptionMsg(e, trackName, Error=False)
            except Exception as e:
                collector.removeEntry()
                if DebugConfig.PASS_ON_PREPROCESS_EXCEPTIONS:
                    raise_from(
                        PreprocessError(
                            self._addContextToExceptionMsg(e, trackName)), e)
                else:
                    self._printExceptionMsg(e, trackName, Error=True)
 def _getGESourceManagerFromTrack(self, trackName):
     """
     Return an OverlapClusteringGESourceManager for the track, built from
     the bounding region tuples stored for allowOverlaps=True.
     """
     collector = PreProcMetaDataCollector(self._genome, trackName)
     brTuplesWithOverlaps = collector.getBoundingRegionTuples(allowOverlaps=True)
     return OverlapClusteringGESourceManager(self._genome, trackName,
                                             brTuplesWithOverlaps)