Beispiel #1
0
def load_data(xfile, maxlen=5000, hard_stop=1e60, n_samples=208):
    """Parse a gzipped ms-style simulation file into per-replicate matrices.

    Every line whose first token starts with ``//`` marks one replicate.
    Relative to that marker line, the line two below supplies the site
    positions (every token after the first) and the ``n_samples`` lines after
    that supply the haplotypes, one string of single-digit alleles per line.
    The haplotype rows are reordered with ``sort_min_diff`` and the result is
    transposed, so the first axis of each kept matrix indexes sites.

    Progress and summary counts are printed to stdout.

    Args:
        xfile: Path of the gzip-compressed input file.
        maxlen: Replicates with more than this many sites are dropped.
        hard_stop: Stop as soon as this many replicates have been kept.
        n_samples: Number of haplotype lines read per replicate. The default
            (208) matches the previously hard-coded window.

    Returns:
        A tuple ``(f, all_pos)`` of two parallel lists: the kept int8
        matrices and the float32 position arrays of the same replicates.
    """
    rows = []
    # Use a context manager so the gzip handle is closed deterministically
    # instead of being left for the garbage collector.
    with gz(xfile) as handle:
        for line in handle:
            # GzipFile yields bytes on Python 3. On Python 2 bytes is str,
            # so the line is left untouched there.
            if not isinstance(line, str):
                line = line.decode('latin-1')
            rows.append(line.strip().split())

    starts = [
        n for n, row in enumerate(rows)
        if len(row) > 0 and row[0].startswith('//')
    ]
    f = []
    lens = []
    all_pos = []
    for idx, start in enumerate(starts):
        hap_rows = rows[start + 3:start + 3 + n_samples]
        pos = np.array(
            [float(tok) for tok in rows[start + 2][1:]], dtype="float32")
        q = [
            np.array([int(ch) for ch in row[0]], dtype='int8')
            for row in hap_rows
        ]
        q = sort_min_diff(np.array(q)).T
        # After the transpose the first axis is the number of sites. Record
        # that value (not the sample count) so the summary below really
        # counts the replicates that exceeded maxlen.
        n_sites = q.shape[0]
        if n_sites <= maxlen:
            f.append(q)
            all_pos.append(pos)
        # Progress line whenever the number of kept replicates is a
        # non-zero multiple of ten.
        if len(f) > 9 and not len(f) % 10:
            print('%d %d %d' % (idx, len(f), len(all_pos)))
        lens.append(n_sites)
        if len(f) >= hard_stop:
            break
    n_too_long = sum(1 for n in lens if n > maxlen)
    print('%d %d' % (n_too_long, len(lens)))
    print('********* %d %d' % (len(f), len(all_pos)))
    return f, all_pos
 def _gunzip(self, fileobjin, fileobjout):
     """Returns NamedTemporaryFile with unzipped content of fileobj"""
     
     source = gz(fileobj=fileobjin, mode='rb')
     
     target = fileobjout
     
     try:
         while 1:
             data=source.read(65536)
             if data and len(data):
                 target.write(data)
             else:
                 target.flush()
                 break
     except Exception:
         target.close()
         raise
     else:
         return target
Beispiel #3
0
 def _gunzip(self, fileobjin, fileobjout):
     """
     Returns NamedTemporaryFile with unzipped content of fileobj.
     @type fileobjin: File
     @param fileobjin: file containing the archive
     @type fileobjout: File
     @param fileobjout: file where to put the unziped file
     """
     source = gz(fileobj=fileobjin, mode='rb')
     target = fileobjout
     try:
         while 1:
             data=source.read(65536)
             if data and len(data):
                 target.write(data)
             else:
                 target.flush()
                 break
     except Exception:
         target.close()
         raise
     else:
         return target
Beispiel #4
0
from common import *
from gzip import GzipFile as gz
from sklearn.neighbors import NearestNeighbors


def sort_min_diff(amat):
    """Reorder the rows of a SNP matrix so similar individuals sit together.

    ``amat`` is a numpy array with one individual per row. Finding the best
    ordering is NP-hard, so a nearest-neighbour approximation is used: every
    row is ranked against all rows by Manhattan distance, the row whose
    summed distance to all rows is smallest is picked as the anchor, and the
    rows are returned in the anchor's neighbour order. It is not perfect,
    but it is fast and generally performs well.

    Returns:
        A numpy array with the same rows as ``amat`` in the new order.
    """
    # Pass n_neighbors by keyword: recent scikit-learn releases make the
    # constructor arguments keyword-only and reject the positional form.
    mb = NearestNeighbors(n_neighbors=len(amat), metric='manhattan').fit(amat)
    distances, neighbor_order = mb.kneighbors(amat)
    smallest = np.argmin(distances.sum(axis=1))
    return amat[neighbor_order[smallest]]


# Lazily read the gzipped simulation output line by line. Splitting on a
# marker string (presumably chosen because it never occurs in the data)
# wraps each stripped line in a list, so the loop below reads the text as
# i[0].
a = (i.strip().split('asdfasdfd') for i in gz('gap.LD.sims.txt.gz'))

# Module-level parser state.
idx = 0
xvals, yvals = {}, {}
# true_idx -> integer taken from the last token of a 'segsites' line
seg_sites = {}
# true_idx -> list of floats taken from a 'positions' line
pos = {}
n = []
ctr = 0
true_idx = 0

for i in a:
    # The last whitespace-separated token of a 'segsites' line, as an int.
    if 'segsites' in i[0]:
        seg_sites[true_idx] = int(i[0].split()[-1])
    # Every token after the first on a 'positions' line, as floats.
    if 'positions' in i[0]:
        pos[true_idx] = [float(jj) for jj in i[0].split()[1:]]
    # NOTE(review): the body of this branch is missing from this excerpt.
    if './ms' in i[0]:
    '''this function takes in a SNP matrix with indv on rows and returns the same matrix with indvs sorted by genetic similarity.
    this problem is NP-hard, so here we use a nearest neighbors approx.  it's not perfect, but it's fast and generally performs ok.
    assumes your input matrix is a numpy array'''
    mb = NearestNeighbors(len(amat), metric='manhattan').fit(amat)
    v = mb.kneighbors(amat)
    smallest = np.argmin(v[0].sum(axis=1))
    return amat[v[1][smallest]]


def convert_01_to_neg1_1(amat):
    """Map a binary 0/1 ms SNP matrix onto -1/1 coding (0 -> -1, 1 -> 1).

    Weights and biases are drawn from a zero-mean distribution, so feeding
    the network a zero-centred -1/1 encoding tends to help training. The
    input is expected to be a numpy array.
    """
    scaled = amat * -2
    shifted = scaled + 1
    return shifted * -1


# Lazily read the gzipped simulation output line by line. Splitting on a
# marker string (presumably chosen because it never occurs in the data)
# wraps each stripped line in a list, so the loop below reads the text as
# i[0].
a = (i.strip().split('asdfasdfd') for i in gz('all.auto.tet.LD.sims.txt.gz'))

# NOTE(review): what the 48 indices stand for is not visible in this excerpt.
k = range(48)
idx = 0
xvals, yvals = {}, {}
# true_idx -> integer taken from the last token of a 'segsites' line
seg_sites = {}
# true_idx -> list of floats taken from a 'positions' line
pos = {}
n = []
ctr = 0
true_idx = 0
for i in a:
    # The last whitespace-separated token of a 'segsites' line, as an int.
    if 'segsites' in i[0]:
        seg_sites[true_idx] = int(i[0].split()[-1])
    # Every token after the first on a 'positions' line, as floats.
    if 'positions' in i[0]:
        pos[true_idx] = [float(jj) for jj in i[0].split()[1:]]
    # NOTE(review): the body of this branch is missing from this excerpt.
    if './ms' in i[0]:
def convert_01_to_neg1_1(amat):
    """Recode a standard binary 0/1 ms SNP matrix as -1/1.

    Zero becomes -1 and one stays 1. Because network weights and biases are
    drawn from a distribution with mean 0, a -1/1 input (also centred on 0)
    tends to train better. Assumes amat is a numpy array.
    """
    # amat * -2 + 1 sends 0 -> 1 and 1 -> -1; the sign flip gives -1 / 1.
    inverted = amat * -2 + 1
    return inverted * -1


def rsquare(x, y):
    """Return the squared Pearson correlation coefficient (r-squared) of x and y."""
    corr_matrix = np.corrcoef(x, y)
    # The off-diagonal entry of the 2x2 correlation matrix is r between x and y.
    r = corr_matrix[0][1]
    return r**2


def rmse(x, y):
    """Return the root-mean-square error between x and y."""
    residual = x - y
    mean_squared = np.mean(residual**2)
    return np.sqrt(mean_squared)


# Lazily read the gzipped simulation output line by line. Splitting on a
# marker string (presumably chosen because it never occurs in the data)
# wraps each stripped line in a list, so the loop below reads the text as
# i[0].
a = (i.strip().split('asdfasdfd')
     for i in gz('autotet.test.data.LD.sims.txt.gz'))

# NOTE(review): what the 48 indices stand for is not visible in this excerpt.
k = range(48)
idx = 0
xvals, yvals = {}, {}
# true_idx -> integer taken from the last token of a 'segsites' line
seg_sites = {}
pos = {}
n = []
ctr = 0
true_idx = 0
# Largest 'segsites' value seen so far.
maxL = 0

# NOTE(review): this loop may continue beyond the end of this excerpt.
for i in a:
    # The last whitespace-separated token of a 'segsites' line, as an int.
    if 'segsites' in i[0]:
        seg_sites[true_idx] = int(i[0].split()[-1])
        # Keep the running maximum up to date.
        if seg_sites[true_idx] > maxL: maxL = seg_sites[true_idx]