Beispiel #1
0
def get_svd_learn_clusters(accu_path, data=None, sing_threshold=2.0, assign_clstr=0.1, vis=False):
    """Run a two-pass randomized SVD on ``data`` and plot the kept components.

    The first pass requests the maximum number of components,
    ``min(rows, cols)``, only to count how many singular values are strictly
    greater than ``sing_threshold``.  The second pass recomputes the
    decomposition with exactly that many components; its result is plotted
    and returned.

    Args:
        accu_path: directory passed to the ``utils`` plotting helpers; when
            ``vis`` is true, ``graphlets.p`` is also unpickled from it.
        data: 2-D matrix to decompose (must expose ``.shape``).  Required,
            despite the ``None`` default kept for signature compatibility.
        sing_threshold: cut-off applied to the first-pass singular values.
        assign_clstr: kept for interface compatibility.  It previously only
            fed a per-row label list that was never used or returned.
        vis: forwarded to ``utils.screeplot``; when true, every entry of a
            right singular vector above 0.1 is printed with its graphlet.

    Returns:
        The ``(U, Sigma, VT)`` triple of the second decomposition.

    Raises:
        ValueError: if ``data`` is None.
    """
    if data is None:
        raise ValueError("get_svd_learn_clusters() needs a data matrix, got None")

    n_rows, n_cols = data.shape
    all_components = min(n_rows, n_cols)
    _, full_sigma, _ = randomized_svd(data, n_components=all_components, n_iter=5, random_state=None)

    # Keep only the components whose singular value clears the threshold.
    best_components = sum(full_sigma > sing_threshold)
    U, Sigma, VT = randomized_svd(data, n_components=best_components, n_iter=5, random_state=None)

    utils.screeplot(accu_path, Sigma, all_components, vis)

    # Shared value range for all component plots.  The range always contains
    # 0 at the top end and never starts above 100, as before.
    max_, min_ = 0, 100
    for component in VT:
        max_ = max(max_, max(component))
        min_ = min(min_, min(component))

    graphlets = None
    if vis:
        # Pickle streams are binary data, so the file must be opened in 'rb'.
        # NOTE: unpickling executes code from the file; only load trusted data.
        with open(accu_path + "/graphlets.p", 'rb') as graphlet_file:
            graphlets = pickle.load(graphlet_file)

    # Plot one graph per right singular vector (row of VT).
    for i, vocabulary in enumerate(VT):
        title = 'Latent Concept %s' % i
        utils.genome(accu_path, vocabulary, [min_, max_], title)
        if vis:
            for c, v in enumerate(vocabulary):
                if v > 0.1:
                    # Single formatted string: same output under the py2
                    # print statement and the py3 print function.
                    print("\n%s %s" % (c, graphlets[c]))
    return U, Sigma, VT
Beispiel #2
0
def loadhic(filename,
            genome='hg19',
            resolution=100000,
            usechr=['#', 'X'],
            verbose=False):
    """Build a symmetric contact matrix from a .hic file read through straw.

    Every ordered chromosome pair (number of chr1 <= number of chr2) of the
    genome is queried with ``straw.straw`` and the returned records are
    written into both ``[x, y]`` and ``[y, x]`` of the matrix.

    Args:
        filename: path handed to ``straw.straw``.
        genome: genome name passed to ``alabutils.genome`` and the matrix.
        resolution: bin size; positions are divided by it to get bin offsets.
        usechr: forwarded to ``contactmatrix`` only.
        verbose: when true, print each chromosome pair before it is queried.

    Returns:
        The filled ``contactmatrix`` instance.
    """
    from . import straw

    tgenome = alabutils.genome(genome)
    # NOTE(review): usechr is not passed to alabutils.genome here although it
    # is passed to contactmatrix below -- confirm both agree for non-default
    # usechr values.
    bininfo = tgenome.bininfo(resolution)

    m = contactmatrix(len(bininfo.chromList),
                      genome=genome,
                      resolution=resolution,
                      usechr=usechr)

    chrom_names = tgenome.info['chrom']
    for chr1 in chrom_names:
        i = tgenome.getchrnum(chr1)
        for chr2 in chrom_names:
            j = tgenome.getchrnum(chr2)
            # Each unordered pair is handled once; the mirror cell is filled
            # explicitly below.
            if i > j:
                continue
            if verbose:
                print("%s %s" % (chr1, chr2))

            # The first three characters of each name are dropped
            # (presumably the "chr" prefix -- verify against straw's naming).
            result = straw.straw("NONE", filename, chr1[3:], chr2[3:], 'BP',
                                 resolution)
            for t, pos1 in enumerate(result[0]):
                x = int(pos1 / resolution) + bininfo.binStart[i]
                y = int(result[1][t] / resolution) + bininfo.binStart[j]
                value = result[2][t]
                m.matrix[x, y] = value
                m.matrix[y, x] = value

    return m
Beispiel #3
0
def get_svd_learn_clusters(accu_path,
                           data=None,
                           sing_threshold=2.0,
                           assign_clstr=0.1,
                           vis=False):
    """Decompose ``data`` with randomized SVD, keeping the strong components.

    A first decomposition with ``min(rows, cols)`` components is used only to
    count the singular values strictly greater than ``sing_threshold``.  The
    decomposition is then rerun with that many components, and the second
    result is plotted and returned.

    Args:
        accu_path: directory passed to ``utils.screeplot`` and
            ``utils.genome``; ``graphlets.p`` is read from it when ``vis``
            is true.
        data: 2-D matrix to decompose (must expose ``.shape``).  Required;
            the ``None`` default is kept only for signature compatibility.
        sing_threshold: cut-off for the first-pass singular values.
        assign_clstr: retained for backward compatibility; the label list it
            used to feed was never used or returned.
        vis: forwarded to ``utils.screeplot``; when true, entries of each
            right singular vector above 0.1 are printed with their graphlet.

    Returns:
        ``(U, Sigma, VT)`` from the second decomposition.

    Raises:
        ValueError: if ``data`` is None.
    """
    if data is None:
        raise ValueError(
            "get_svd_learn_clusters() needs a data matrix, got None")

    n_rows, n_cols = data.shape
    all_components = min(n_rows, n_cols)
    _, full_sigma, _ = randomized_svd(data,
                                      n_components=all_components,
                                      n_iter=5,
                                      random_state=None)

    # Number of singular values above the threshold.
    best_components = sum(full_sigma > sing_threshold)
    U, Sigma, VT = randomized_svd(data,
                                  n_components=best_components,
                                  n_iter=5,
                                  random_state=None)

    utils.screeplot(accu_path, Sigma, all_components, vis)

    # Common value range for every plot; the starting bounds (0 for the
    # maximum, 100 for the minimum) are kept from the original code.
    max_, min_ = 0, 100
    for row in VT:
        max_ = max(max_, max(row))
        min_ = min(min_, min(row))

    graphlets = None
    if vis:
        # Pickle data is binary: open with 'rb', not text mode.
        # NOTE: unpickling executes code from the file; only load trusted data.
        with open(accu_path + "/graphlets.p", 'rb') as graphlet_file:
            graphlets = pickle.load(graphlet_file)

    # One plot per right singular vector (row of VT).
    for i, vocabulary in enumerate(VT):
        title = 'Latent Concept %s' % i
        utils.genome(accu_path, vocabulary, [min_, max_], title)
        if vis:
            for c, v in enumerate(vocabulary):
                if v > 0.1:
                    # One pre-formatted argument prints identically with the
                    # py2 statement and the py3 function.
                    print("\n%s %s" % (c, graphlets[c]))
    return U, Sigma, VT
def dump_lda_output(path, doc_topic, topic_word):
    """Pickle the LDA result matrices into ``path`` and plot every topic.

    Args:
        path: output directory; receives ``doc_topic.p`` and
            ``topic_word.p`` and is passed to ``utils.genome``.
        doc_topic: object pickled to ``doc_topic.p``.
        topic_word: iterable of per-topic word distributions; pickled to
            ``topic_word.p`` and plotted one row at a time.
    """
    # Pickle streams are binary, so the files are opened with "wb"; the
    # ``with`` block closes them even if pickling fails.
    with open(os.path.join(path, "doc_topic.p"), "wb") as f:
        pickle.dump(doc_topic, f)

    with open(os.path.join(path, "topic_word.p"), "wb") as f:
        pickle.dump(topic_word, f)

    # Shared value range for all topic plots.  The starting bounds (0 for the
    # maximum, 100 for the minimum) are kept from the original code.
    max_, min_ = 0, 100
    for row in topic_word:
        max_ = max(max_, max(row))
        min_ = min(min_, min(row))

    # Plot a graph for each topic word distribution (vocabulary).
    for i, vocabulary in enumerate(topic_word):
        title = 'Topic %s' % i
        utils.genome(path, vocabulary, [min_, max_], title)
def dump_lda_output(path, doc_topic, topic_word):
    """Save the LDA matrices as pickles in ``path`` and plot each topic.

    Args:
        path: output directory for ``doc_topic.p`` / ``topic_word.p``; also
            handed to ``utils.genome`` for the plots.
        doc_topic: object written to ``doc_topic.p``.
        topic_word: iterable of per-topic word distributions, written to
            ``topic_word.p`` and plotted row by row.
    """
    outputs = (("doc_topic.p", doc_topic), ("topic_word.p", topic_word))
    for file_name, payload in outputs:
        # "wb": pickle writes bytes.  The context manager guarantees the
        # handle is closed even when pickling raises.
        with open(os.path.join(path, file_name), "wb") as f:
            pickle.dump(payload, f)

    # Value range shared by all plots, starting from the original bounds
    # (maximum at least 0, minimum at most 100).
    max_, min_ = 0, 100
    for row in topic_word:
        max_ = max(max_, max(row))
        min_ = min(min_, min(row))

    # Plot a graph for each topic word distribution (vocabulary).
    for i, vocabulary in enumerate(topic_word):
        title = 'Topic %s' % i
        utils.genome(path, vocabulary, [min_, max_], title)
Beispiel #6
0
    def __init__(self,filename,genome=None,resolution=None,usechr=['#','X']):
        """Create a contact matrix from a size, an HDF5 file or a text file.

        Args:
            filename: an ``int`` creates a zero-filled square float32 matrix
                of that size; a ``str`` is a path.  Paths ending in ``.hdf5``
                or ``.hmat`` are read with h5py (datasets ``matrix`` and
                ``idx``, plus optional pickled ``applyedMethods``, ``genome``
                and ``resolution``).  Any other path is read as text through
                ``alabio.loadstream``: the first three columns of each row
                are index fields, the rest are the matrix row.
            genome: genome name; used with ``resolution`` to build the index.
            resolution: bin size (``int``); used together with ``genome``.
            usechr: forwarded to ``alabutils.genome``.  The default list is
                never mutated here.

        Raises:
            IOError: if ``filename`` is a string that is not an existing file.
            RuntimeError: if ``filename`` is neither ``int`` nor ``str``, or
                if genome/resolution are given although both were already
                loaded from the file.
        """
        self._applyedMethods = {}
        if isinstance(filename, int):
            self.matrix = np.zeros((filename, filename), dtype=np.float32)
        elif isinstance(filename, str):
            if not os.path.isfile(filename):
                raise IOError("File %s doesn't exist!\n" % (filename))
            if os.path.splitext(filename)[1] in ('.hdf5', '.hmat'):
                h5f = h5py.File(filename, 'r')
                try:
                    self.matrix = h5f['matrix'][:]
                    self.idx    = h5f['idx'][:]
                    stored = h5f.keys()
                    if 'applyedMethods' in stored:
                        self._applyedMethods = cPickle.loads(h5f['applyedMethods'].value)

                    if 'genome' in stored and 'resolution' in stored:
                        self.genome     = cPickle.loads(h5f['genome'].value)
                        self.resolution = cPickle.loads(h5f['resolution'].value)
                finally:
                    # Release the file handle even if a dataset is missing.
                    h5f.close()
            else:
                from alabio import loadstream

                def _fields(text):
                    # Raw-string form of the original pattern (same matches).
                    return re.split(r'\t+|\s+', text.rstrip())

                f = loadstream(filename)
                try:
                    # The first row fixes the matrix size: 3 index columns,
                    # then n values.
                    line = _fields(next(f))
                    n    = len(line) - 3
                    idx  = [line[0:3] + ['']]
                    self.matrix = np.zeros((n, n), dtype=np.float32)
                    self.matrix[0] = line[3:]
                    for i, s in enumerate(f, 1):
                        line = _fields(s)
                        idx.append(line[0:3] + [''])
                        self.matrix[i] = line[3:]
                finally:
                    f.close()
                self.idx = np.core.records.fromarrays(np.array(idx).transpose(), dtype=self._idxdtype)
        else:
            raise RuntimeError("Undefined input filename type!\n")
        #----------------end filename

        if isinstance(genome, str) and isinstance(resolution, int):
            if hasattr(self, "genome") and hasattr(self, "resolution"):
                raise RuntimeError("Genome and resolution has already been specified.")
            genomedb    = alabutils.genome(genome, usechr=usechr)
            bininfo     = genomedb.bininfo(resolution)
            flaglist    = ['' for _ in range(len(bininfo.chromList))]
            self.genome = genome
            self.resolution = resolution
            self._buildindex(bininfo.chromList, bininfo.startList, bininfo.endList, flaglist)
Beispiel #7
0
 def __init__(self,probfile,nucleusRadius=5000.0,contactRange=1,level=None,record=-1):
     """Load the probability matrix and derive bead and chromosome geometry.

     Args:
         probfile: passed to ``matrix.contactmatrix``.
         nucleusRadius: nucleus radius (nm per the original comment).
         contactRange: surface-to-surface distance scale of (r1+r2) within
             which two beads count as in contact.
         level: one of 'debug', 'info', 'warning', 'error', 'critical';
             anything else maps to ``logging.NOTSET``.
         record: stored as ``self._record_step``; ``self.record`` is created
             only when it is at least 100.
     """
     self.probmat = matrix.contactmatrix(probfile)
     self.nbead   = len(self.probmat)

     # Logging: the root logger gets an extra handler that writes into an
     # in-memory buffer owned by this instance.
     level_by_name = {
         'debug': logging.DEBUG,
         'info': logging.INFO,
         'warning': logging.WARNING,
         'error': logging.ERROR,
         'critical': logging.CRITICAL,
     }
     loglevel = level_by_name.get(level, logging.NOTSET)
     self.logger = logging.getLogger()
     self.logger.setLevel(loglevel)
     self._log_capture_string = StringIO()
     capture_handler = logging.StreamHandler(self._log_capture_string)
     capture_handler.setLevel(loglevel)
     self.logger.addHandler(capture_handler)

     # Recording
     self._record_step = record
     if record >= 100:
         self.record = []

     # Constants
     radius_scale       = 1.38                      # 20% occupancy
     bp_per_nm          = 107.45                    # bp/nm assuming 197 bp/nucleosomes and 6 nucleosome/11 nm
     packing_scale      = (0.75*15**2)**(1.0/3.0)   # 3/4*r**2 where r=15nm
     self.nucleusRadius = nucleusRadius             # nm
     self.contactRange  = contactRange

     # Radius of each bead, from the span of its index entry.
     bead_scale = radius_scale * packing_scale
     self.beadRadius = [
         bead_scale * ((entry['end'] - entry['start'])/bp_per_nm) ** (1.0/3.0)
         for entry in self.probmat.idx
     ]

     # Total DNA volume (diploid, hence * 2) and nucleus volume, both with
     # the same sphere coefficient.
     sphere_coeff = 4. * 3.1415/3.
     dna_volume     = sum(sphere_coeff * np.array(self.beadRadius)**3) * 2
     nucleus_volume = sphere_coeff*self.nucleusRadius**3
     occupancy = dna_volume / nucleus_volume
     message = 'occupancy: %.2f with Rnuc %d'%(occupancy,self.nucleusRadius)
     self.logger.debug(message)

     # Diploid: the haploid radius list repeated twice.
     self.beadRadius = self.beadRadius * 2

     # Chromosome territories: each chromosome gets a share of half the
     # nucleus volume in proportion to its length.
     self.genome = utils.genome(self.probmat.genome)
     territory_scale = 1.0
     chrom_lengths = self.genome.info['length']
     territory_volume = nucleus_volume * chrom_lengths/sum(self.genome.info['length'])/2
     self.chromRadius = territory_scale*((territory_volume/4*3/3.1415)**(1./3.))

     # IMP model objects
     self.model      = IMP.Model()
     self.chain      = IMP.container.ListSingletonContainer(self.model)
     self.restraints = IMP.RestraintSet(self.model)
     #IMP.set_check_level(IMP.USAGE)
     IMP.set_check_level(IMP.NONE)
     IMP.set_log_level(IMP.SILENT)
     # Nucleus envelope centre
     self.center = IMP.algebra.Vector3D(0,0,0)
Beispiel #8
0
    def __init__(self,
                 filename,
                 genome=None,
                 resolution=None,
                 usechr=['#', 'X']):
        """Create a contact matrix from a size, an HDF5 file or a text file.

        Args:
            filename: an ``int`` creates a zero-filled square float32 matrix
                of that size; a ``str`` is a path.  Paths ending in ``.hdf5``
                or ``.hmat`` are read with h5py (datasets ``matrix`` and
                ``idx``, plus optional pickled ``applyedMethods``, ``genome``
                and ``resolution``).  Any other path is read as text through
                ``alabio.loadstream``: the first three columns of each row
                are index fields, the rest are the matrix row.
            genome: genome name; used with ``resolution`` to build the index
                and, for text input, to validate the number of bins.
            resolution: bin size (``int``); used together with ``genome``.
            usechr: forwarded to ``alabutils.genome``.  The default list is
                never mutated here.

        Raises:
            IOError: if ``filename`` is a string that is not an existing file.
            RuntimeError: if ``filename`` is neither ``int`` nor ``str``, if
                a text file's width differs from the expected bin count, or
                if genome/resolution are given although both were already
                loaded from the file.
        """
        self._applyedMethods = {}
        has_genome_spec = isinstance(genome, str) and isinstance(resolution, int)
        # Filled at most once and shared by the dimension check and the
        # index construction below.
        bininfo = None

        if isinstance(filename, int):
            self.matrix = np.zeros((filename, filename), dtype=np.float32)
        elif isinstance(filename, str):
            if not os.path.isfile(filename):
                raise IOError("File %s doesn't exist!\n" % (filename))
            if os.path.splitext(filename)[1] in ('.hdf5', '.hmat'):
                h5f = h5py.File(filename, 'r')
                try:
                    self.matrix = h5f['matrix'][:]
                    self.idx = h5f['idx'][:]
                    stored = h5f.keys()
                    if 'applyedMethods' in stored:
                        self._applyedMethods = cPickle.loads(
                            h5f['applyedMethods'].value)

                    if 'genome' in stored and 'resolution' in stored:
                        self.genome = cPickle.loads(h5f['genome'].value)
                        self.resolution = cPickle.loads(
                            h5f['resolution'].value)
                finally:
                    # Release the file handle even if a dataset is missing.
                    h5f.close()
            else:
                from alabio import loadstream

                def _fields(text):
                    # Raw-string form of the original pattern (same matches).
                    return re.split(r'\t+|\s+', text.rstrip())

                f = loadstream(filename)
                try:
                    # The first row fixes the matrix size: 3 index columns,
                    # then n values.
                    line = _fields(next(f))
                    n = len(line) - 3
                    expectn = n
                    if has_genome_spec:
                        genomedb = alabutils.genome(genome, usechr=usechr)
                        bininfo = genomedb.bininfo(resolution)
                        expectn = len(bininfo.chromList)
                    if expectn != n:
                        raise RuntimeError(
                            "Dimension don't match, expected %s bins , get %s bins. Please check the input."
                            % (expectn, n))
                    idx = [line[0:3] + ['']]
                    self.matrix = np.zeros((n, n), dtype=np.float32)
                    self.matrix[0] = line[3:]
                    for i, s in enumerate(f, 1):
                        line = _fields(s)
                        idx.append(line[0:3] + [''])
                        self.matrix[i] = line[3:]
                finally:
                    # Also closes the stream when the dimension check fails.
                    f.close()
                self.idx = np.core.records.fromarrays(
                    np.array(idx).transpose(), dtype=self._idxdtype)
        else:
            raise RuntimeError("Undefined input filename type!\n")
        #----------------end filename

        if has_genome_spec:
            if hasattr(self, "genome") and hasattr(self, "resolution"):
                raise RuntimeError(
                    "Genome and resolution has already been specified.")
            if bininfo is None:
                genomedb = alabutils.genome(genome, usechr=usechr)
                bininfo = genomedb.bininfo(resolution)
            flaglist = ['' for _ in range(len(bininfo.chromList))]
            self.genome = genome
            self.resolution = resolution
            self._buildindex(bininfo.chromList, bininfo.startList,
                             bininfo.endList, flaglist)
Beispiel #9
0
    def __init__(self,
                 probfile,
                 nucleusRadius=5000.0,
                 chromosomeOccupancy=0.2,
                 contactRange=1,
                 level=None):
        """Load the probability matrix and derive bead and chromosome geometry.

        Args:
            probfile: passed to ``alabmatrix.contactmatrix``.
            nucleusRadius: nucleus radius (nm per the inline comment).
            chromosomeOccupancy: target ratio of diploid domain volume to
                nuclear volume; stored as ``self.occupancy``.
            contactRange: surface-to-surface distance scale of (r1+r2) within
                which two beads count as in contact.
            level: one of 'debug', 'info', 'warning', 'error', 'critical';
                anything else maps to ``logging.NOTSET``.
        """
        self.probmat = alabmatrix.contactmatrix(probfile)
        self.nbead = len(self.probmat)
        # Set up logging: getLogger() without a name returns the root logger;
        # a handler writing into an in-memory buffer is attached to it.
        LEVELS = {
            'debug': logging.DEBUG,
            'info': logging.INFO,
            'warning': logging.WARNING,
            'error': logging.ERROR,
            'critical': logging.CRITICAL
        }
        loglevel = LEVELS.get(level, logging.NOTSET)
        self.logger = logging.getLogger()
        self.logger.setLevel(loglevel)
        self._log_capture_string = io.StringIO()
        chhandler = logging.StreamHandler(self._log_capture_string)
        chhandler.setLevel(loglevel)
        self.logger.addHandler(chhandler)
        self.logger.setLevel(loglevel)
        #CONST
        #rscale               = 1.38                  # 20% occupancy
        self.occupancy = chromosomeOccupancy  #chromosome occupancy in nucleus, defined as diploid_domain_total_volume/nuclear_volume

        self.nucleusRadius = nucleusRadius  # nm
        #cdensity             = 107.45                # bp/nm assuming 197 bp/nucleosomes and 6 nucleosome/11 nm
        #kscale               = (0.75*15**2)**(1.0/3.0) # 3/4*r**2 where r=15nm
        self.contactRange = contactRange  # surface to surface distance scale of (r1+r2)
        # for which 2 beads are considered as contact
        self.genome = alabutils.genome(self.probmat.genome)
        # rho converts a span (end - start) into a cubed radius.  It is
        # chosen so that twice the summed bead volume over the nucleus volume
        # equals the requested occupancy when the index spans add up to the
        # total genome length.
        rho = self.occupancy * self.nucleusRadius**3 / (
            2 * sum(self.genome.info['length']))
        # Radius of each bead, from the span of its index entry
        self.beadRadius = [(rho * (index['end'] - index['start']))**(1.0 / 3.0)
                           for index in self.probmat.idx]
        #self.beadRadius = [rscale * kscale * ((index['end'] - index['start'])/cdensity) ** (1.0/3.0) for index in self.probmat.idx]
        # Total volume of DNA (diploid, hence * 2) and of the nucleus
        dnavol = sum(4. * 3.1415 / 3. * np.array(self.beadRadius)**3) * 2
        nucvol = (4 * 3.1415 / 3) * self.nucleusRadius**3
        # Resulting occupancy; only logged at debug level
        dnaocc = dnavol / nucvol
        self.logger.debug(u'Occupancy: %.2f with Rnuc %d' %
                          (dnaocc, self.nucleusRadius))
        # Diploid bead radii: the haploid list repeated twice
        self.beadRadius = self.beadRadius + self.beadRadius
        # Chromosome territories: each chromosome gets a share of half the
        # nucleus volume in proportion to its length; chromRadius is the
        # radius of a sphere of that volume.
        #self.genome = alabutils.genome(self.probmat.genome)
        cscale = 1.0
        chrvol = nucvol * self.genome.info['length'] / sum(
            self.genome.info['length']) / 2
        self.chromRadius = cscale * ((chrvol / 4 * 3 / 3.1415)**(1. / 3.))

        # Create the IMP model, bead container and restraint set
        self.model = IMP.Model()
        self.chain = IMP.container.ListSingletonContainer(self.model)
        self.restraints = IMP.RestraintSet(self.model)
        #IMP.set_check_level(IMP.USAGE)
        IMP.set_check_level(IMP.NONE)
        IMP.set_log_level(IMP.SILENT)
        # Centre of the nucleus envelope
        self.center = IMP.algebra.Vector3D(0, 0, 0)