Example #1
# Assuming Python 3: urlopen and URLError live in urllib.request / urllib.error.
from os.path import join
from urllib.request import urlopen
from urllib.error import URLError

def processItem(self, item):
    # The item's hash names its slot in the on-disk cache.
    itemHash = item.hash()  # renamed from "hash" to avoid shadowing the built-in
    from cache import cacheFolder, cached
    cacheFile = join(cacheFolder, itemHash)
    try:
        data = urlopen(item.url).read()
        # Use a context manager so the cache file is closed promptly.
        with open(cacheFile, "wb") as f:
            f.write(data)
        cached(item)
    except URLError:
        item.failed = True
        item.save()
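Example #1 treats cached(item) as a registration hook: the payload is written
to join(cacheFolder, item.hash()) and the item is then recorded as cached. The
cache module itself is not part of the snippet; the following is a minimal
sketch of what it might provide, where the folder location and the set-based
registry are assumptions:

import os

cacheFolder = os.path.expanduser("~/.itemcache")  # assumed location
os.makedirs(cacheFolder, exist_ok=True)

_cachedHashes = set()

def cached(item):
    # Record that this item's payload now lives under cacheFolder.
    _cachedHashes.add(item.hash())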
Example #2
    # Fragment of a PCA class: assumes numpy (dot, sqrt, linalg) is in scope,
    # plus the memoizing cached(func, *args, **kwargs) helper sketched below.
    def __init__(self, xx):
        '''
        Inspired by PCA in matplotlib.mlab

        Computes the principal components of a dataset and stores the
        mean, sigma, and the SVD of sigma for the data. Use toPC and
        fromPC to project the data onto a reduced set of dimensions
        and back. This version takes the SVD of the covariance matrix.

        Inputs:

          *xx*: a numobservations x numdims array

        Attrs:

          *nn*, *mm*: the dimensions of xx

          *mu* : a numdims array of means of xx

          *sigma* : the covariance matrix

          *var* : the average amount of variance of each of the principal components

          *std* : sqrt of var

          *fracVar* : the fractional amount of variance from each principal component

          *fracStd* : sqrt of fracVar
        '''

        self.nn, self.mm = xx.shape
        if self.nn < self.mm:
            raise RuntimeError('we assume data in xx is organized with numrows > numcols')

        self.mu          = xx.mean(axis=0)
        centeredXX       = self.center(xx)
        #self.sigma       = dot(centeredXX.T, centeredXX) / self.nn
        self.sigma       = cached(dot, centeredXX.T, centeredXX) / self.nn

        # Columns of UU are the eigenvectors of self.sigma, i.e. the
        # principal components. UU and VV are transposes of each other;
        # we don't use VV. ss is the diagonal of the true S matrix.
        #self.UU, self.ss, self.VV = linalg.svd(self.sigma, full_matrices = False)
        self.UU, self.ss, self.VV = cached(linalg.svd, self.sigma, full_matrices = False)

        self.var = self.ss / float(self.nn)
        self.std = sqrt(self.var)
        self.fracVar = self.var / self.var.sum()
        self.fracStd = self.std / self.std.sum()
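This example calls cached(func, *args, **kwargs) directly (as do the notebook
examples below via cache.cached), so the helper is expected to run the
function once, persist the result, and replay it on later calls with the same
arguments. The helper itself is not shown in any of these snippets; here is a
minimal on-disk sketch of that call style, where the .cache directory and the
pickle-based key scheme are assumptions:

import hashlib
import os
import pickle

CACHE_DIR = ".cache"  # assumed cache location

def cached(func, *args, **kwargs):
    # Key the entry on the function name plus the pickled arguments.
    key = hashlib.sha1(
        pickle.dumps((func.__name__, args, sorted(kwargs.items())))
    ).hexdigest()
    path = os.path.join(CACHE_DIR, key)
    if os.path.exists(path):
        with open(path, "rb") as f:
            return pickle.load(f)
    result = func(*args, **kwargs)
    os.makedirs(CACHE_DIR, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(result, f)
    return result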
Example #3
def diff(self):
    # differ wraps Diff in a plain local function so it can be handed to
    # cached() below; the memoized call is then differ(self, prev).
    def differ(a, b):
        return Diff(a, b)
    prev = (Revision.q.filter(Revision.svn_id < self.svn_id)
            .order_by(Revision.svn_id.desc()).first())
    return cached(differ)(self, prev)
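Here cached(differ) returns a wrapped callable that is then invoked with
(self, prev), so this cache module's helper behaves like a decorator rather
than taking the call arguments directly. A minimal in-memory sketch of that
style, assuming positional, hashable arguments:

import functools

def cached(func):
    memo = {}

    @functools.wraps(func)
    def wrapper(*args):
        # Memoize on the positional arguments (assumed hashable).
        if args not in memo:
            memo[args] = func(*args)
        return memo[args]

    return wrapper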
Example #4

# In[1]:

# Imports required by the cells below (the notebook's first cell was not
# captured in the snippet).
import pandas as pd
import seaborn as sns

import cache
import data

# In[2]:

sns.set(context='talk')

# In[3]:

model_name = 'model5'
by = 'SubSet'
sample_n = 1000

# ## load files for all cell types

# In[4]:

df = cache.cached(data.prep_annotated_data)

# In[5]:

assert all(pd.notnull(df['log1p_tpm_rescaled']))

# In[6]:

print(df.columns)

#apply(lambda x: x.startswith('C'))

# ## sample genes for analysis

# In[7]:
Example #5
# Standard library modules used below.
import os
import re
import socket
import threading

# Python 2 idiom: prefer the faster cStringIO when it is available.
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

import simplejson

from lanshark.config import config

import logging
logger = logging.getLogger('lanshark')

from lanshark import icons
from lanshark import network
from lanshark import sendfile

from cache import cached
socket.getaddrinfo = cached(config.CACHE_TIMEOUT, stats=config.debug)(
        socket.getaddrinfo)


iconpath = os.path.join(config.DATA_PATH, "icons", "32x32")
iconfactory = icons.URLIconFactory(iconpath, "/__data__/icons/128x128/", ".png")
hidden_files = [re.compile(pattern) for pattern in config.HIDDEN_FILES]

def hidden(filename):
    return any(pattern.match(filename) for pattern in hidden_files)

class FileIndex(threading.Thread):
    """
    The fileindex offers fast searching over
    a periodically updated file index
    """
    def __init__(self, path):
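The cached() used in this example is different again: cached(config.CACHE_TIMEOUT,
stats=config.debug) is called first and the result is applied to
socket.getaddrinfo, so here the helper is a decorator factory taking a timeout
in seconds plus a stats flag. A minimal in-memory sketch of that shape; the
expiry bookkeeping is an assumption, and stats is accepted but ignored:

import functools
import time

def cached(timeout, stats=False):
    def decorator(func):
        memo = {}  # maps args -> (result, expiry timestamp)

        @functools.wraps(func)
        def wrapper(*args):
            now = time.time()
            hit = memo.get(args)
            if hit is not None and hit[1] > now:
                return hit[0]  # entry still fresh
            result = func(*args)
            memo[args] = (result, now + timeout)
            return result

        return wrapper
    return decorator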
Example #6

# In[1]:

import data
import models
import cache
import seaborn as sns

# In[2]:

model_name = 'model5.2'
by = 'cell_type'
sample_n = 500

# In[3]:

sample_df = cache.cached(models.prep_sample_df, sample_n=sample_n)
(training_df, test_df) = models.split_sample_df(sample_df=sample_df,
                                                test_sample_n=1)

# In[4]:

model_file = models.get_model_file(model_name=model_name)
#print(cache._read_file(model_file))

# In[5]:

stan_data = models.prep_stan_data(sample_df=training_df,
                                  test_df=test_df,
                                  by=by)

# In[3]:

model_name = 'model5.3'
by = 'SubSet'
sample_n = 100


# ## get data, as we did in earlier examples

# This will help in case we want to compare estimates for particular genes or samples

# In[4]:

sample_df = cache.cached(models.prep_sample_df, sample_n=sample_n)


# In[5]:

stan_data1 = models.prep_stan_data(sample_df, by=by, nu=1)
stan_data2 = models.prep_stan_data(sample_df, by=by, nu=2)
stan_data3 = models.prep_stan_data(sample_df, by=by, nu=3)
stan_data4 = models.prep_stan_data(sample_df, by=by, nu=4)
stan_data5 = models.prep_stan_data(sample_df, by=by, nu=5)
stan_data6 = models.prep_stan_data(sample_df, by=by, nu=6)
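The six near-identical calls above differ only in nu; an equivalent loop form
keeps them in one place, collecting the prepared datasets keyed by nu:

stan_data = {nu: models.prep_stan_data(sample_df, by=by, nu=nu)
             for nu in range(1, 7)}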


# In[6]:

model = models.get_model_file(model_name=model_name)