Example #1
    def print_subsets(cls, outfilename, snpsubsets, names, add_other=False):
        def snp_info_df(d):
            bfile = d.genotypes_bedfile.filename
            return pd.read_csv(bfile + '.bim',
                    delim_whitespace=True,
                    usecols=[0,1,2,3],
                    names=['CHR','SNP','CM','BP'])

        # check that all snpsubsets have the same data set
        if len(set([ss.dataset for ss in snpsubsets])) > 1:
            print('error: all subsets must have the same underlying dataset')
            return
        if not outfilename.endswith('.gz'):
            print('outfilename must end with ".gz". I only write zipped files')
            return

        # get snp info for this dataset
        d = snpsubsets[0].dataset
        df = snp_info_df(d)

        # add the 'other' annotation if necessary
        if add_other:
            union = IntRangeSet()
            for ss in snpsubsets:
                union += ss.irs  # in-place union
            snpsubsets.append(SnpSubset(d, irs=d.all_snps() - union))
            names.append('OTHER')

        # create the pandas dataframe and output it
        for name, ss in zip(names, snpsubsets):
            df[name] = 0
            df.loc[list(ss.irs), name] = 1  # .loc replaces the removed .ix indexer
        df = df[['CHR','BP','SNP','CM'] + names]
        with gzip.open(outfilename, 'wt') as write_file:
            df.to_csv(write_file, index=False, sep='\t')
Example #2
        def add_covariance_for_range(r):
            print(r)
            range_size = r[1] - r[0]
            cov = np.zeros((range_size, range_size))
            range_genotypes = d.get_standardized_genotypes(r, indivs=indivs)

            def compute_cov_for_snp(m):
                end = d.buffer_around_snp(m, bandwidth, start=r[0], end=r[1],
                        units=band_units)[1]

                window_start = m - r[0]
                window_end = end - r[0]
                window = range_genotypes[:, window_start:window_end]

                cov_to_snps_in_window = \
                        range_genotypes[:,m-r[0]].T.dot(window) / range_genotypes.shape[0]
                cov_to_snps_in_window[0] /= 2 # since we're going to symmetrize later

                cov[m-r[0], window_start:window_end] = cov_to_snps_in_window
            for m in it.show_progress(range(r[0], r[1])):  # map() is lazy in Python 3
                compute_cov_for_snp(m)

            # symmetrization
            ranges_to_arrays[r] = cov + cov.T

            # make coding of snps consistent with other dataset
            flip = np.array(IntRangeSet(positions_to_flip) & IntRangeSet((r[0],r[1])),
                    dtype=int) - r[0] # dtype required so we can use empty array as index
            ranges_to_arrays[r][flip] *= -1
            ranges_to_arrays[r][:,flip] *= -1
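
A minimal numpy sketch of the symmetrization trick used above: only the upper band of the covariance is filled in, each diagonal entry is halved, and cov + cov.T then restores the full symmetric matrix.

import numpy as np

upper = np.triu(np.array([[1.0, 0.3, 0.0],
                          [0.0, 1.0, 0.5],
                          [0.0, 0.0, 1.0]]))
upper[np.diag_indices(3)] /= 2  # matches the cov_to_snps_in_window[0] /= 2 above
full = upper + upper.T          # symmetric, with the unit diagonal restored
assert np.allclose(full, full.T) and np.allclose(np.diag(full), 1.0)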
Example #3
def get_high_ld_snps(subset, matrix):
    result = IntRangeSet()
    for i in subset:
        snps = IntRangeSet(np.flatnonzero(matrix[i]**2 > args.R2_threshold))
        snps -= subset
        result += snps
    return result
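
A toy sanity check for get_high_ld_snps, with a hypothetical stand-in for the parsed command-line args it reads its threshold from: SNP 1 is pulled in because its squared correlation with SNP 0 exceeds the threshold.

import numpy as np
from types import SimpleNamespace
from pysnptools.util import IntRangeSet

args = SimpleNamespace(R2_threshold=0.8)  # hypothetical; normally set by argparse
corr = np.array([[1.0, 0.9, 0.1],
                 [0.9, 1.0, 0.2],
                 [0.1, 0.2, 1.0]])
print(get_high_ld_snps(IntRangeSet('0:1'), corr))  # contains only SNP 1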
Example #4
        def add_covariance_for_range(r):
            print(int(time()-t0), ':', r)
            range_genotypes = d.get_standardized_genotypes(r, indivs=indivs)
            ranges_to_arrays[r] = \
                range_genotypes.T.dot(range_genotypes) / range_genotypes.shape[0]

            # make coding of snps consistent with other dataset
            flip = np.array(IntRangeSet(positions_to_flip) & IntRangeSet((r[0],r[1])),
                    dtype=int) - r[0] # dtype required so we can use empty array as index
            ranges_to_arrays[r][flip] *= -1
            ranges_to_arrays[r][:,flip] *= -1
Example #5
 def __init__(self, dataset, bedtool=None, irs=None):
     # use bedtools to build an indicator vector marking each SNP's membership in the subset
     self.dataset = dataset
     if bedtool:
         indicator = dataset.snp_coords().intersect(bedtool, c=True)
         self.irs = IntRangeSet(np.flatnonzero(
             np.array([int(snp.name) for snp in indicator])))
     elif irs:
         self.irs = irs
     else:
         self.irs = IntRangeSet()
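
A brief usage sketch, assuming the full SnpSubset class from Example #14 and a toy stand-in for the dataset object: each constructor path ends with an IntRangeSet in self.irs.

from pysnptools.util import IntRangeSet

class _ToyDataset(object):  # hypothetical stand-in with the one attribute used here
    M = 10

d = _ToyDataset()
ss_explicit = SnpSubset(d, irs=IntRangeSet((0, d.M)))  # explicit index set: all 10 SNPs
ss_empty = SnpSubset(d)                                # neither bedtool nor irs: empty subset
print(len(ss_explicit.irs), len(ss_empty.irs))         # 10 0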
Example #6
 def __zero_block_outside_irs(self, r, other_intrangeset):
     my_intrangeset = IntRangeSet(r)
     intersection_intrangeset = my_intrangeset & other_intrangeset
     if intersection_intrangeset.isempty:
         del self.ranges_to_arrays[r]
     else:
         mask = np.zeros(len(my_intrangeset), dtype=bool)
         for s in intersection_intrangeset.ranges():
             start = my_intrangeset.index(s[0])
             end = start + s[1] - s[0]
             mask[start:end] = True
         self.ranges_to_arrays[r][~mask] = 0
         self.ranges_to_arrays[r].T[~mask] = 0  # zero the columns too; .T is a no-op view for 1-D arrays
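
A toy numpy illustration of the masking step above: A[~mask] zeroes whole rows, and A.T[~mask] zeroes the matching columns (since .T is a view into A); for a 1-D array, .T is a no-op, so the same two lines also work on vectors.

import numpy as np

A = np.arange(16, dtype=float).reshape(4, 4)
mask = np.array([True, False, True, False])
A[~mask] = 0
A.T[~mask] = 0
print(A)  # nonzero entries survive only at rows/columns 0 and 2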
Example #7
    def plot(self, irs_to_mark, filename=None):
        import matplotlib.pyplot as plt
        rows = int(math.ceil(math.sqrt(len(self.ranges()))))
        # take the max with 2 so plt.subplots always returns a 2-D array of axes
        cols = max(int(math.ceil(len(self.ranges()) / rows)), 2)
        fig, axes = plt.subplots(nrows=rows, ncols=cols)
        for ax,(r, A) in zip(fig.axes, self.ranges_to_arrays.items()):
            width = r[1] - r[0]
            ax.matshow(A, vmin=-1, vmax=1)
            my_intrangeset = IntRangeSet(r)
            intersection = my_intrangeset & irs_to_mark
            def draw_line(xs, ys):
                ax.plot(xs, ys, transform=ax.transAxes, lw=0.2, color='k')

            for s in intersection.ranges():
                draw_line([(s[0] - r[0])/width, (s[0] - r[0])/width],
                        [0, 1])
                draw_line([(s[1] - r[0])/width, (s[1] - r[0])/width],
                        [0, 1])
                draw_line([0,1],
                        [(r[1] - s[0])/width, (r[1] - s[0])/width])
                draw_line([0,1],
                        [(r[1] - s[1])/width, (r[1] - s[1])/width])
            ax.set_xticks([0,width]); ax.set_yticks([0,width])
            ax.set_xlim(0,width); ax.set_ylim(width, 0)
            ax.set_title(str(r))

        fig.set_size_inches(axes.shape[0] * 3, axes.shape[1] * 4)
        if filename:
            fig.savefig(filename, dpi=400)
        else:
            fig.show()
Example #8
    def preprocess(self, use_filesystem=True):
        if not self.covariance_preprocessing_in_progress() or not use_filesystem:
            print('creating covariance matrix...')
            if use_filesystem:
                self.declare_covariance_preprocessing_in_progress()
            self.R = self.compute_covariance()
            if use_filesystem:
                pickle.dump(self.R, self.R_file(mode='wb'), 2)
        else:
            print('loading covariance matrix')
            self.R = pickle.load(self.R_file())

        if not self.invcovariance_preprocessing_in_progress() or not use_filesystem:
            print('creating inverse covariance matrix')
            if use_filesystem:
                self.declare_invcovariance_preprocessing_in_progress()
            self.Rri = self.compute_invcovariance()
            if use_filesystem:
                pickle.dump(self.Rri, self.Rri_file(mode='wb'), 2)
        else:
            print('loading inverse covariance matrix')
            self.Rri = pickle.load(self.Rri_file())

        t0 = time.time()
        print(time.time() - t0, ': creating and saving RA')
        self.A = SnpSubset(self.refpanel,
                           GenomicSubset(self.params.region).bedtool)
        self.RA = self.R.copy()
        self.RA.zero_outside_irs(self.A.irs)
        if use_filesystem:
            pickle.dump(self.RA, self.RA_file(mode='wb'), 2)

        print(time.time() - t0, ': computing and saving scaling')
        self.Z = self.Rri.dot(self.RA.dot(self.Rri))
        self.Q = self.R.dot(self.Z).dot(self.R)
        QA = self.Q.copy()
        QA.zero_outside_irs(self.A.irs)
        self.scalings = {
            r:
            len(self.A.irs & IntRangeSet(r)) / np.trace(QA.ranges_to_arrays[r])
            for r in QA.ranges()
        }
        print(time.time() - t0, ': scalings are', self.scalings)
        if use_filesystem:
            self.set_scalings(self.scalings)

        print(time.time() - t0, ': computing and saving bias matrix')
        self.ZR = self.RA.dot(self.Rri).dot(self.R).dot(self.Rri)
        if use_filesystem:
            pickle.dump(self.ZR, self.biasmatrix_file(mode='wb'), 2)

        print(time.time() - t0, ': variance matrices')
        self.QZ = self.Q.dot(self.Z)
        self.QZR = self.QZ.dot(self.R)
        if use_filesystem:
            self.save_variance_matrices(self.Q, self.Z, self.QZ, self.QZR)
        print(time.time() - t0, ': done')
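
A compact sketch of the cache-or-compute pattern preprocess follows for each matrix, written as a hypothetical standalone helper (not part of the class above):

import os
import pickle

def cached(path, compute, use_filesystem=True):
    # load a pickled result if present; otherwise compute and save it
    if use_filesystem and os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    result = compute()
    if use_filesystem:
        with open(path, 'wb') as f:
            pickle.dump(result, f, 2)
    return result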
Example #9
 def ranges(self):
     ranges = list(zip(np.concatenate([[0], self.last_snps]), self.last_snps))  # zip is lazy in Python 3
     if not self.remove_mhc:
         return ranges
     else:
         if self.mhc is None:
             self.mhc = SnpSubset(self.dataset, self.dataset.mhc_bedtool())
         return [r for r in ranges if (IntRangeSet(r) & self.mhc.irs).isempty]
Example #10
            def compute_cov_for_snp(m):
                # we only compute the entries needed for the upper triangular
                # half of the LD matrix, then symmetrize. (The commented-out
                # line below is the old behavior, which used the full window.)
                # start = max(0, m - int(bandwidth/2))
                start = m
                end = min(slice_genotypes.shape[1], m + int(bandwidth / 2))

                window_indices = IntRangeSet(
                    (start, end)) & snpset_relative_to_slice
                window = slice_genotypes[:, window_indices]

                cov_to_snps_in_window = slice_genotypes[:, m].T.dot(
                    window) / len(indivs)
                cov_to_snps_in_window[
                    0] /= 2  # since we're going to symmetrize later

                target_indices = IntRangeSet(
                    (s[0] + start, s[0] + end)) & snpset_irs
                lil_cov[s[0] + m, target_indices] = cov_to_snps_in_window
Example #11
    def __init__(self, d, indivs, bandwidth, snpset_irs=None, output=False):
        if snpset_irs is None:
            snpset_irs = IntRangeSet((0, d.M))
        bandwidth = bandwidth + 1
        self.bandwidth = 2 * int(bandwidth / 2) + 1
        self.indivs = indivs
        lil_cov = sps.lil_matrix((d.M, d.M))

        def compute_cov_for_slice(s):
            indices = IntRangeSet(
                (s[0] if s[0] == 0 else s[0] + int(bandwidth / 2),
                 s[1] if s[1] == d.M else s[1] - int(bandwidth / 2)))
            indices = indices & snpset_irs

            if indices.isempty:  # if there are no indices to analyze then we can move on
                return
            print(s)
            slice_genotypes = d.get_standardized_genotypes(s, indivs=indivs)
            snpset_relative_to_slice = IntRangeSet([
                (x - s[0], y - s[0]) for x, y in snpset_irs.ranges()
            ])

            def compute_cov_for_snp(m):
                # we only compute the entries needed for the upper triangular
                # half of the LD matrix, then symmetrize. (The commented-out
                # line below is the old behavior, which used the full window.)
                # start = max(0, m - int(bandwidth/2))
                start = m
                end = min(slice_genotypes.shape[1], m + int(bandwidth / 2))

                window_indices = IntRangeSet(
                    (start, end)) & snpset_relative_to_slice
                window = slice_genotypes[:, window_indices]

                cov_to_snps_in_window = slice_genotypes[:, m].T.dot(
                    window) / len(indivs)
                cov_to_snps_in_window[
                    0] /= 2  # since we're going to symmetrize later

                target_indices = IntRangeSet(
                    (s[0] + start, s[0] + end)) & snpset_irs
                lil_cov[s[0] + m, target_indices] = cov_to_snps_in_window

            for m in it.show_progress([x - s[0] for x in indices]):  # map() is lazy in Python 3
                compute_cov_for_snp(m)

        for s in d.slices(buffer_size=int(bandwidth / 2)):
            compute_cov_for_slice(s)

        from time import time
        t0 = time()
        if output: print('starting symmetrization and conversion to csr')
        self.covcsr = lil_cov.tocsr()
        self.covcsr = self.covcsr + self.covcsr.T

        if output: print('took time:', time() - t0)
Example #12
        def compute_cov_for_slice(s):
            indices = IntRangeSet(
                (s[0] if s[0] == 0 else s[0] + int(bandwidth / 2),
                 s[1] if s[1] == d.M else s[1] - int(bandwidth / 2)))
            indices = indices & snpset_irs

            if indices.isempty:  # if there are no indices to analyze then we can move on
                return
            print(s)
            slice_genotypes = d.get_standardized_genotypes(s, indivs=indivs)
            snpset_relative_to_slice = IntRangeSet([
                (x - s[0], y - s[0]) for x, y in snpset_irs.ranges()
            ])

            def compute_cov_for_snp(m):
                # we only compute the entries needed for the upper triangular
                # half of the LD matrix, then symmetrize. (The commented-out
                # line below is the old behavior, which used the full window.)
                # start = max(0, m - int(bandwidth/2))
                start = m
                end = min(slice_genotypes.shape[1], m + int(bandwidth / 2))

                window_indices = IntRangeSet(
                    (start, end)) & snpset_relative_to_slice
                window = slice_genotypes[:, window_indices]

                cov_to_snps_in_window = slice_genotypes[:, m].T.dot(
                    window) / len(indivs)
                cov_to_snps_in_window[
                    0] /= 2  # since we're going to symmetrize later

                target_indices = IntRangeSet(
                    (s[0] + start, s[0] + end)) & snpset_irs
                lil_cov[s[0] + m, target_indices] = cov_to_snps_in_window

            for m in it.show_progress([x - s[0] for x in indices]):  # map() is lazy in Python 3
                compute_cov_for_snp(m)
Example #13
import numpy as np

def region_gen(scale, seed=0):  # signature reconstructed from the call site below
    range_count = int(150000 * scale)
    position_count = int(3000000000 * scale)  # 3 billion
    region_max_length = int(2000000 * scale)  # 2 million

    np.random.seed(seed)
    for range_index in range(range_count):
        length = int(np.exp(np.random.random() * np.log(region_max_length)))
        start = randlong(position_count -
                         length)  # does randint really go up to 3 billion?
        stop = start + length
        yield start, stop


from pysnptools.util import IntRangeSet

geneset = IntRangeSet()
for start, stop in region_gen(scale=.1):
    geneset |= (start, stop)
print(geneset)
print(geneset.ranges_len)

print("done")

import os

os.chdir(r"C:\Source\carlk\fastlmm2\tests\datasets\synth")

from pysnptools.snpreader import Bed

# Use "Bed" to access file "all.bed"
snpreader = Bed("all.bed")

# What is snpreader?
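# A plausible continuation, assuming the standard pysnptools Bed API: the
# reader is a lazy handle, and no genotype data is loaded until .read().
print(snpreader)            # describes the on-disk data
print(snpreader.iid_count)  # number of individuals
print(snpreader.sid_count)  # number of SNPs
snpdata = snpreader.read()  # reads the genotype matrix into memory
print(snpdata.val.shape)    # iid_count x sid_count numpy array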
Example #14
class SnpSubset(object):
    def __init__(self, dataset, bedtool=None, irs=None):
        # use bedtools to build an indicator vector marking each SNP's membership in the subset
        self.dataset = dataset
        if bedtool:
            indicator = dataset.snp_coords().intersect(bedtool, c=True)
            self.irs = IntRangeSet(np.flatnonzero(
                np.array([int(snp.name) for snp in indicator])))
        elif irs:
            self.irs = irs
        else:
            self.irs = IntRangeSet()

    def num_snps(self):
        return len(self.irs)

    def expand_by(self, expansion_in_each_direction, units='Morgans'):
        result = IntRangeSet()
        for r in self.irs.ranges():
            result += self.dataset.buffer_around_slice(
                r, expansion_in_each_direction, units=units)
        self.irs = result

    def expanded_by(self, expansion_in_each_direction, units='Morgans'):
        result = copy.copy(self)
        result.expand_by(expansion_in_each_direction, units=units)
        return result

    # prints the subsets in the format expected by LDSC;
    # all subsets must share the same underlying dataset
    @classmethod
    def print_subsets(cls, outfilename, snpsubsets, names, add_other=False):
        def snp_info_df(d):
            bfile = d.genotypes_bedfile.filename
            return pd.read_csv(bfile + '.bim',
                    delim_whitespace=True,
                    usecols=[0,1,2,3],
                    names=['CHR','SNP','CM','BP'])

        # check that all snpsubsets have the same data set
        if len(set([ss.dataset for ss in snpsubsets])) > 1:
            print('error: all subsets must have the same underlying dataset')
            return
        if not outfilename.endswith('.gz'):
            print('outfilename must end with ".gz". I only write zipped files')
            return

        # get snp info for this dataset
        d = snpsubsets[0].dataset
        df = snp_info_df(d)

        # add the 'other' annotation if necessary
        if add_other:
            union = IntRangeSet()
            for ss in snpsubsets:
                union += ss.irs  # in-place union
            snpsubsets.append(SnpSubset(d, irs=d.all_snps() - union))
            names.append('OTHER')

        # create the pandas dataframe and output it
        for name, ss in zip(names, snpsubsets):
            df[name] = 0
            df.loc[list(ss.irs), name] = 1  # .loc replaces the removed .ix indexer
        df = df[['CHR','BP','SNP','CM'] + names]
        with gzip.open(outfilename, 'wt') as write_file:
            df.to_csv(write_file, index=False, sep='\t')
Example #15
                        help='the number of SNPs to use')
    parser.add_argument('-check_dense', action='store_true', default=False)
    args = parser.parse_args()

    d = Dataset('GERA', forced_M=args.M)
    indivs = d.random_indivs(200)

    t0 = time()
    R = LdMatrix(d, indivs, 200)
    R.add_ridge(0.05)
    print('computing R took', time() - t0)
    print('shape of R is:', R.covcsr.shape)

    # tiny = GenomicSubset('tiny')
    # tiny_irs = SnpSubset(d, bedtool=tiny.bedtool).irs
    tiny_irs = IntRangeSet('300:350')
    RA = LdMatrix(d, indivs, 200, snpset_irs=tiny_irs, output=False)
    b = np.random.randn(d.M)

    # check inverse computation
    t0 = time()
    Rinvb = R.solve_banded(b)
    print('R^{-1}b took', time() - t0)
    if args.check_dense:
        Rinvb_dense = np.linalg.solve(R.covcsr.toarray(), b)
        print('R^{-1}b behaves well:', np.allclose(Rinvb, Rinvb_dense))

    t0 = time()
    TrRinvRA = R.trace_of_inverse_times_matrix(RA)
    print('Tr(Rinv*RA) took', time() - t0)
    if args.check_dense:
Example #16
    return result

def interval_from_range(r):
    return Interval(
            'chr'+str(int(d.genotypes_bedfile.pos[r[0]][0])),
            int(d.genotypes_bedfile.pos[r[0]][2]),
            int(d.genotypes_bedfile.pos[r[1]][2])-1)

d = Dataset(args.dataset)
A = SnpSubset(d, GenomicSubset(args.subset).bedtool)
if args.path_to_R is not None:
    R = pickle.load(open(args.path_to_R, 'rb'))  # pickle files must be opened in binary mode
else:
    R = None

newA = IntRangeSet()
for r in A.expanded_by(0.003).irs.ranges():
    S = IntRangeSet([a-r[0] for a in A.irs & IntRangeSet(r)])
    print(r, 'analyzing', len(S), 'snps')
    if R is None:
        X = d.get_standardized_genotypes(r)
        cov = X.T.dot(X) / d.N
    else:
        cov = R.ranges_to_arrays[r]
    while True:
        new = get_high_ld_snps(S, cov)
        if len(new) == 0:
            break
        else:
            print('\tadding', len(new), 'snps')
            print('\t\tbefore', S)
Example #17
 def expand_by(self, expansion_in_each_direction, units='Morgans'):
     result = IntRangeSet()
     for r in self.irs.ranges():
         result += self.dataset.buffer_around_slice(
             r, expansion_in_each_direction, units=units)
     self.irs = result
Example #18
 def all_snps(self):
     return IntRangeSet((0, self.M))
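
A quick check of the half-open range convention all_snps relies on: IntRangeSet((0, M)) covers indices 0 through M-1.

from pysnptools.util import IntRangeSet

s = IntRangeSet((0, 5))
print(list(s))         # [0, 1, 2, 3, 4]
print(4 in s, 5 in s)  # True False
print(len(s))          # 5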
Example #19
 def indices_containing(self, irs):
     ranges = self.ranges()
     return [i for i, r in enumerate(ranges) if not (IntRangeSet(r) & irs).isempty]
Example #20
    def compute_variance(self, alphahat, point_estimate, N, use_beta=False):
        def term1_coeff(r):
            return 4 * (1 / N + self.c(r)) + 4 / N * (1 / N + self.c(r))

        def term2_coeff():
            return 4 / N + 2 / N**2

        def term3_coeff(r):
            return 2 * (1 / N + self.c(r))**2

        # compute the term that doesn't depend on beta
        variance3 = sum([
            self.scalings[r]**2 * np.trace(self.QZ.ranges_to_arrays[r]) * \
                    term3_coeff(r)
            for r in self.Q.ranges()])

        # now compute the other two terms
        if use_beta:
            beta = self.beta
            # term A: beta^T RZRZR beta = beta^T QZR beta
            variance1 = sum([
                self.scalings[r]**2 * \
                    beta[r[0]:r[1]].dot(self.QZR.ranges_to_arrays[r].dot(beta[r[0]:r[1]])) * \
                    term1_coeff(r)
                for r in self.Q.ranges()])
            # term B: (beta^T Q beta)^2
            variance2 = sum([
                self.scalings[r]**2 * \
                    beta[r[0]:r[1]].dot(self.Q.ranges_to_arrays[r].dot(beta[r[0]:r[1]]))**2 * \
                    term2_coeff()
                for r in self.Q.ranges()])
        else:
            # term A
            # alphahatTZR = alphahat.dot(self.ZR)
            # Zalphahat = self.Z.dot(alphahat)
            # termAbiases = {r :
            #     np.einsum('ij,ji',self.ZR.ranges_to_arrays[r],self.ZR.ranges_to_arrays[r]) * \
            #             (1/N + self.c(r))
            #     for r in self.Q.ranges()}
            # variance1 = sum([
            #     (alphahatTZR.ranges_to_arrays[r].dot(Zalphahat.ranges_to_arrays[r]) - \
            #             termAbiases[r]) * \
            #             term1_coeff(r)
            #     for r in self.Q.ranges()])
            betahat = self.Rri.dot(alphahat)
            point_estimates = {
                r: self.scalings[r] *
                (betahat.ranges_to_arrays[r].dot(self.RA.ranges_to_arrays[r]).
                 dot(betahat.ranges_to_arrays[r]) - self.biases[r])
                for r in self.Q.ranges()
            }
            variance1 = sum([
                self.scalings[r]**2 * point_estimates[r] / len(self.A.irs & IntRangeSet(r)) * \
                    np.trace(self.QZR.ranges_to_arrays[r]) * \
                    term1_coeff(r)
                for r in self.Q.ranges()])

            # term B
            variance2 = term2_coeff() * sum([
                self.scalings[r]**2 *
                (point_estimates[r] / self.scalings[r])**2
                for r in self.Q.ranges()
            ])

        variance = variance1 + variance2 + variance3
        print('\nvariance is {} + {} + {} = {}'.format(variance1, variance2,
                                                       variance3, variance))
        return variance