Ejemplo n.º 1
0
        def add_covariance_for_range(r):
            """Compute the banded covariance matrix for the SNP range ``r``.

            ``r`` is indexed as ``(r[0], r[1])`` and treated as a half-open
            interval of SNP indices. The result, a dense square array with
            ``r[1] - r[0]`` rows, is stored in ``ranges_to_arrays[r]``;
            nothing is returned.

            Relies on names from the enclosing scope: ``d``, ``indivs``,
            ``bandwidth``, ``band_units``, ``positions_to_flip`` and
            ``ranges_to_arrays``.
            """
            print(r)
            range_size = r[1] - r[0]
            cov = np.zeros((range_size, range_size))
            # Columns are indexed by SNP offset within the range (see the
            # slicing below); rows are presumably individuals -- TODO confirm.
            range_genotypes = d.get_standardized_genotypes(r, indivs=indivs)

            def compute_cov_for_snp(m):
                """Fill row ``m - r[0]`` of ``cov`` from the diagonal to the window end."""
                # Only the upper end of the buffer around SNP m is used: the
                # lower half of the band is recovered by symmetrization.
                end = d.buffer_around_snp(m, bandwidth, start=r[0], end=r[1],
                        units=band_units)[1]

                window_start = m - r[0]
                window_end = end - r[0]
                window = range_genotypes[:, window_start:window_end]

                cov_to_snps_in_window = \
                        range_genotypes[:, window_start].T.dot(window) / range_genotypes.shape[0]
                # Halve the diagonal entry, since cov + cov.T below would
                # otherwise count it twice.
                cov_to_snps_in_window[0] /= 2

                cov[window_start, window_start:window_end] = cov_to_snps_in_window

            # Explicit loop instead of map(): map() is lazy on Python 3, so
            # using it purely for side effects would leave cov all zeros.
            for m in it.show_progress(range(r[0], r[1])):
                compute_cov_for_snp(m)

            # symmetrization
            ranges_to_arrays[r] = cov + cov.T

            # make coding of snps consistent with other dataset
            flip = np.array(IntRangeSet(positions_to_flip) & IntRangeSet((r[0],r[1])),
                    dtype=int) - r[0] # dtype required so we can use empty array as index
            # Negate the flipped rows and then the flipped columns; entries
            # where both SNPs are flipped are negated twice and so unchanged.
            ranges_to_arrays[r][flip] *= -1
            ranges_to_arrays[r][:,flip] *= -1
Ejemplo n.º 2
0
        def compute_cov_for_slice(s):
            """Fill the rows of ``lil_cov`` for the SNPs of ``snpset_irs`` in slice ``s``.

            ``s`` is indexed as ``(s[0], s[1])``. SNPs within
            ``int(bandwidth/2)`` of a slice edge are skipped unless that edge
            is the start (0) or end (``d.M``) of the dataset. Only entries on
            or above the diagonal are written. Nothing is returned.

            Relies on names from the enclosing scope: ``d``, ``indivs``,
            ``bandwidth``, ``snpset_irs`` and ``lil_cov``.
            """
            half_bandwidth = int(bandwidth/2)
            indices = IntRangeSet((s[0] if s[0] == 0 else s[0] + half_bandwidth,
                s[1] if s[1] == d.M else s[1] - half_bandwidth))
            indices = indices & snpset_irs

            if indices.isempty: # if there are no indices to analyze then we can move on
                return
            print(s)
            slice_genotypes = d.get_standardized_genotypes(s, indivs=indivs)
            # The SNP set re-expressed as column offsets into slice_genotypes.
            snpset_relative_to_slice = IntRangeSet([
                (x-s[0],y-s[0]) for x,y in snpset_irs.ranges()])

            def compute_cov_for_snp(m):
                """Write the row for slice-relative SNP ``m`` into ``lil_cov``."""
                # we just compute the numbers needed for the top triangular half
                # of the LD matrix, then we symmetrize the matrix.
                start = m
                end = min(slice_genotypes.shape[1], m + half_bandwidth)

                window_indices = IntRangeSet((start, end)) & snpset_relative_to_slice
                window = slice_genotypes[:, window_indices]

                cov_to_snps_in_window = slice_genotypes[:,m].T.dot(window) / len(indivs)
                cov_to_snps_in_window[0] /= 2 # since we're going to symmetrize later

                # Same window as above, shifted back to absolute SNP indices.
                target_indices = IntRangeSet((s[0] + start, s[0] + end)) & snpset_irs
                lil_cov[s[0] + m, target_indices] = cov_to_snps_in_window

            # Explicit loop instead of map(): map() is lazy on Python 3, so
            # using it purely for side effects would never write to lil_cov.
            for m in it.show_progress([x - s[0] for x in indices]):
                compute_cov_for_snp(m)
Ejemplo n.º 3
0
def main(args):
    """Annotate each seed SNP with a shared-segment length and an IBD flag.

    Reads a headerless one-column list of seed SNP ids (``args.seed_snps``),
    a headerless one-column list of typed SNP ids (``args.typed_snps``) and
    the genotypes in ``args.bfile``. For every seed SNP it records the pair
    returned by ``analyze_snp`` in the ``ibs_length`` and ``ibd`` columns,
    prints the first 100 rows, and writes the table to ``args.outfile`` as
    tab-separated text.

    Raises:
        ValueError: if a seed SNP does not have exactly two rows of the
            genotype matrix equal to 1 (see ``analyze_snp``).
    """
    print('reading seeed snps')
    seed_snps = pd.read_csv(args.seed_snps, header=None, names=['SNP'], index_col='SNP')
    seed_snps['ibs_length'] = 0
    seed_snps['ibd'] = 0

    print('reading typed snps')
    typed_snps = pd.read_csv(args.typed_snps, header=None, names=['SNP'])

    print('reading genotypes')
    data = Bed(args.bfile)
    X = data.read().val
    typed_snps_indices = np.sort(data.sid_to_index(typed_snps.SNP))
    # Column 2 of col_property is used as the base-pair position and column 1
    # (below) as the genetic position in cM -- going by the variable names;
    # TODO confirm against the Bed reader's documentation.
    typed_snps_bp = data.col_property[typed_snps_indices,2]

    print(len(seed_snps), 'snps in list')
    print(data.iid_count, data.sid_count, 'are dimensions of X')

    def analyze_snp(i):
        """Return ``(cM, ibd)`` for the SNP in column ``i`` of ``X``.

        ``cM`` is the difference in col_property column 1 between the two
        boundary typed SNPs; ``ibd`` is 1 when more than 99% of the columns
        between them are equal for the two rows, else 0. Returns ``(0, 0)``
        when the two rows differ by 2 at the anchoring typed SNP.
        """
        # find first typed snp after query snp
        snp_bp = data.col_property[i,2]
        v = np.where(typed_snps_bp > snp_bp)[0]
        if len(v) > 0:
            typed_i = v[0]
        else:
            # No typed SNP lies after the query: fall back to the last one.
            typed_i = len(typed_snps_indices)-1

        # The two rows whose value at this SNP is 1. Unpacking raises
        # ValueError unless there are exactly two such rows.
        n1, n2 = np.where(X[:,i] == 1)[0]
        # A squared difference of 4 means the two values differ by exactly 2
        # (presumably opposite homozygotes), so no segment is shared here.
        if (X[n1,typed_snps_indices[typed_i]] - X[n2, typed_snps_indices[typed_i]])**2 == 4:
            return 0, 0

        # NOTE(review): find_boundaries presumably returns left/right indices
        # (into the typed-SNP arrays) of the compatible stretch around
        # typed_i -- confirm against fis.
        typed_il, typed_ir = fis.find_boundaries(
                X[n1,typed_snps_indices],
                X[n2,typed_snps_indices],
                typed_i)
        typed_ir -= 1

        # Map typed-SNP positions back to column indices of X.
        il = typed_snps_indices[typed_il]
        ir = typed_snps_indices[typed_ir]
        cM = data.col_property[ir, 1] - \
                data.col_property[il, 1]
        ibd = (np.mean(X[n1,il:ir] == X[n2,il:ir]) > 0.99)
        return cM, int(ibd)

    # Builtin zip replaces itertools.izip (absent on Python 3), and .loc
    # replaces DataFrame.ix (removed in pandas 1.0); both lookups here are
    # purely label-based, so the behaviour is the same.
    for (i, snp) in iter.show_progress(
            zip(data.sid_to_index(seed_snps.index), seed_snps.index),
            total=len(seed_snps)):
        seed_snps.loc[snp, ['ibs_length', 'ibd']] = analyze_snp(i)

    print(seed_snps.iloc[:100])
    seed_snps.to_csv(args.outfile, sep='\t')
Ejemplo n.º 4
0
        def compute_cov_for_slice(s):
            """Fill the rows of ``lil_cov`` for the SNPs of ``snpset_irs`` in slice ``s``.

            ``s`` is indexed as ``(s[0], s[1])``. SNPs within
            ``int(bandwidth / 2)`` of a slice edge are skipped unless that
            edge is the start (0) or end (``d.M``) of the dataset. Only
            entries on or above the diagonal are written. Nothing is returned.

            Relies on names from the enclosing scope: ``d``, ``indivs``,
            ``bandwidth``, ``snpset_irs`` and ``lil_cov``.
            """
            half_bandwidth = int(bandwidth / 2)
            indices = IntRangeSet(
                (s[0] if s[0] == 0 else s[0] + half_bandwidth,
                 s[1] if s[1] == d.M else s[1] - half_bandwidth))
            indices = indices & snpset_irs

            if indices.isempty:  # if there are no indices to analyze then we can move on
                return
            print(s)
            slice_genotypes = d.get_standardized_genotypes(s, indivs=indivs)
            # The SNP set re-expressed as column offsets into slice_genotypes.
            snpset_relative_to_slice = IntRangeSet([
                (x - s[0], y - s[0]) for x, y in snpset_irs.ranges()
            ])

            def compute_cov_for_snp(m):
                """Write the row for slice-relative SNP ``m`` into ``lil_cov``."""
                # we just compute the numbers needed for the top triangular half
                # of the LD matrix, then we symmetrize the matrix.
                start = m
                end = min(slice_genotypes.shape[1], m + half_bandwidth)

                window_indices = IntRangeSet(
                    (start, end)) & snpset_relative_to_slice
                window = slice_genotypes[:, window_indices]

                cov_to_snps_in_window = slice_genotypes[:, m].T.dot(
                    window) / len(indivs)
                cov_to_snps_in_window[
                    0] /= 2  # since we're going to symmetrize later

                # Same window as above, shifted back to absolute SNP indices.
                target_indices = IntRangeSet(
                    (s[0] + start, s[0] + end)) & snpset_irs
                lil_cov[s[0] + m, target_indices] = cov_to_snps_in_window

            # Explicit loop instead of map(): map() is lazy on Python 3, so
            # using it purely for side effects would never write to lil_cov.
            for m in it.show_progress([x - s[0] for x in indices]):
                compute_cov_for_snp(m)