Example #1
    def _upload_big_file(self, big_size, path_list):
        def mapper(path):
            azure_path = "{0}/big{1}.txt".format(path, big_size)
            if not self.container.file_exists(azure_path):
                logging.info("Uploading '{0}'".format(azure_path))
                temp_parent = os.path.join(tempfile.gettempdir(),
                                           format(hash(os.times())))
                file_name = temp_parent + azure_path
                logging.info("Creating {0}".format(file_name))
                pstutil.create_directory_if_necessary(file_name)
                with open(
                        file_name, "wb"
                ) as fp:  #Preallocate the local file to its full size
                    if big_size > 0:
                        fp.seek(big_size)
                        fp.write("\0")
                with _file_transfer_reporter("'{0}'".format(azure_path),
                                             big_size) as updater:
                    self.container.upload(file_name,
                                          azure_path,
                                          updater=updater)
                os.remove(file_name)
            return azure_path

        azure_path_list = map_reduce(path_list,
                                     mapper=mapper,
                                     runner=LocalMultiThread(len(path_list)))
        return azure_path_list
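The mapper above preallocates a local scratch file by seeking near the end and writing a single byte before handing it to container.upload. A standalone sketch of that preallocation idiom (hypothetical file name; note that on Python 3 the written value must be a bytes literal, and seeking to big_size - 1 keeps the file at exactly big_size bytes):

import os

def preallocate(file_name, big_size):
    # Make file_name exactly big_size bytes without writing big_size bytes
    # of data; most filesystems keep such a file sparse.
    with open(file_name, "wb") as fp:
        if big_size > 0:
            fp.seek(big_size - 1)
            fp.write(b"\0")

preallocate("scratch.bin", 1024 * 1024)
assert os.path.getsize("scratch.bin") == 1024 * 1024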
Example #2
def prime_search1(start, stop, runner):
    '''
    Distributed algorithm for finding prime numbers in a range by just testing each number.

    >>> from pysnptools.util.mapreduce1.examples import prime_search1
    >>> from pysnptools.util.mapreduce1.runner import LocalMultiProc
    >>> prime_search1(2,10,LocalMultiProc(4))
    [2, 3, 5, 7]
    '''
    from pysnptools.util.mapreduce1 import map_reduce

    def mapper(i):
        if is_prime(i):
            return i
        else:
            return None

    def reducer(sequence):
        result = []
        for i in sequence:
            if i is not None:
                result.append(i)
        return result

    return map_reduce(
        range(start, stop),
        mapper=mapper,
        reducer=reducer,  #lambda sequence: [i for i in sequence if i is not None], #Filter out the None's
        name="prime_search1",
        runner=runner)
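prime_search1 relies on an is_prime helper that is not shown in this snippet. A minimal trial-division stand-in (my own sketch, not necessarily the library's definition):

def is_prime(n):
    # True when n is prime; trial division up to sqrt(n) is enough here.
    if n < 2:
        return False
    for candidate in range(2, int(n ** 0.5) + 1):
        if n % candidate == 0:
            return False
    return True

assert [i for i in range(2, 10) if is_prime(i)] == [2, 3, 5, 7]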
Example #3
def mmultfile_ata(memmap_lambda,
                  writer,
                  sid,
                  work_count,
                  name,
                  runner,
                  force_python_only=False):
    sid_count = len(sid)
    piece_count = work_count * 2
    log_frequency = 1

    def debatch_closure(piece_index):
        return sid_count * piece_index // piece_count

    def mapper_closure(work_index):
        memmap = memmap_lambda()
        piece_index0 = work_index
        piece_index1 = piece_count - work_index - 1
        gtg_piece0 = mmultfile_ata_piece(memmap.filename,
                                         memmap.offset,
                                         piece_index0,
                                         piece_count,
                                         log_frequency=log_frequency,
                                         force_python_only=force_python_only)
        gtg_piece1 = mmultfile_ata_piece(memmap.filename,
                                         memmap.offset,
                                         piece_index1,
                                         piece_count,
                                         log_frequency=log_frequency,
                                         force_python_only=force_python_only)
        return [[piece_index0, gtg_piece0], [piece_index1, gtg_piece1]]

    def reducer_closure(result_result_sequence):
        logging.info("starting ata reducer")
        iid = [[value, value] for value in sid]
        gtg_data = KernelData(iid=iid, val=np.zeros((sid_count, sid_count)))
        for result_result in result_result_sequence:
            for piece_index, gtg_piece in result_result:
                logging.info("combining ata reducer {0}".format(piece_index))
                start = debatch_closure(piece_index)
                stop = debatch_closure(piece_index + 1)
                gtg_data.val[start:, start:stop] = gtg_piece
                gtg_data.val[start:stop,
                             start + gtg_piece.shape[1]:] = gtg_piece[
                                 gtg_piece.shape[1]:, :].T
        result = writer(gtg_data)

        return result

    gtg_npz_lambda = map_reduce(xrange(work_count),
                                mapper=mapper_closure,
                                reducer=reducer_closure,
                                runner=runner,
                                name=name,
                                input_files=[],
                                output_files=[])
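Each work item above handles two pieces: piece work_index from the front and its mirror piece_count - work_index - 1 from the back, presumably so that the cheaper and costlier pieces of the triangular A'A computation pair up. A toy illustration of the pairing:

work_count = 3
piece_count = work_count * 2
pairs = [(w, piece_count - w - 1) for w in range(work_count)]
print(pairs)  # [(0, 5), (1, 4), (2, 3)] -- every piece appears exactly once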
Example #4
        def holder1(n, runner):
            def mapper1(x):
                return x * x

            def reducer1(sequence):
                return sum(sequence)

            return map_reduce(range(n),
                              mapper=mapper1,
                              reducer=reducer1,
                              runner=runner)
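Example #4 is the minimal map_reduce pattern: square every element, then sum. A hypothetical standalone run of the same pattern (imports as used in the other examples; 285 is the sum of the squares of 0 through 9):

from pysnptools.util.mapreduce1 import map_reduce
from pysnptools.util.mapreduce1.runner import LocalMultiThread

def mapper1(x):
    return x * x

def reducer1(sequence):
    return sum(sequence)

print(map_reduce(range(10), mapper=mapper1, reducer=reducer1,
                 runner=LocalMultiThread(4)))  # 285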
Example #5
        def holder1(n, runner):
            def mapper1(x):
                return int(os.environ['TEST_ENVIRON'])

            def reducer1(sequence):
                return sum(sequence) + int(os.environ['TEST_ENVIRON'])

            return map_reduce(range(n),
                              mapper=mapper1,
                              reducer=reducer1,
                              runner=runner)
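Example #5 differs from #4 only in that both the mappers and the reducer read TEST_ENVIRON from the environment; the test appears to check that the runner's workers see the caller's environment variables. A hypothetical invocation under that assumption (a thread-based runner trivially shares the environment):

import os
from pysnptools.util.mapreduce1 import map_reduce
from pysnptools.util.mapreduce1.runner import LocalMultiThread

os.environ['TEST_ENVIRON'] = '1'

def mapper1(x):
    return int(os.environ['TEST_ENVIRON'])

def reducer1(sequence):
    return sum(sequence) + int(os.environ['TEST_ENVIRON'])

# Ten mapper calls contribute 1 each and the reducer adds 1 more: 11.
print(map_reduce(range(10), mapper=mapper1, reducer=reducer1,
                 runner=LocalMultiThread(4)))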
Example #6
    def upload(self, local_path, azure_path, do_sync_date=True, updater=None):
        """
        Upload a local file to the container.
        """
        assert os.path.exists(
            local_path), 'Expect local_path to exist: "{0}"'.format(local_path)
        #self._run_once()
        t0 = time.time()
        self.remove(azure_path)

        size = os.path.getsize(local_path)
        piece_count = self._get_piece_count(size)

        with _file_transfer_reporter("upload", size,
                                     updater=updater) as updater2:

            def mapper_closure(piece_index):
                t00 = time.time()
                start = size * piece_index // piece_count
                stop = size * (piece_index + 1) // piece_count
                shard_size = stop - start
                blob_name = "{0}/{1}.{2}".format(azure_path, piece_index,
                                                 piece_count)
                self._create_blob_from_stream(local_path, start, stop,
                                              blob_name)
                updater2(shard_size)
                if piece_index == piece_count - 1:
                    self._create_blob_from_stream(
                        local_path, stop, stop,
                        "{0}/exists.txt".format(azure_path))

            map_reduce(
                range(piece_count),
                mapper=mapper_closure,
                runner=self._get_runner(),
            )

        if do_sync_date:
            self._sync_date(azure_path, local_path)
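upload splits the file into piece_count contiguous byte ranges using integer division, so every byte is covered exactly once and the last piece absorbs any remainder. The boundary arithmetic on toy numbers:

size, piece_count = 10, 3
bounds = [(size * i // piece_count, size * (i + 1) // piece_count)
          for i in range(piece_count)]
print(bounds)  # [(0, 3), (3, 6), (6, 10)]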
Example #7
    def _big_files_fileshare_internal(big_size, n, runner, storage, double_it):
        nn = n * 2 if double_it else n

        def mapper(ii):
            i = ii // 2 if double_it else ii
            if ii % 2 == 0 or not double_it:
                short_name = "big{0}.{1}.txt".format(big_size, i)
                if storage.file_exists(short_name):
                    storage.remove(short_name)
                with storage.open_write(short_name) as file_name:
                    with open(
                            file_name, "wb"
                    ) as fp:  #Preallocate the local file to its full size
                        if big_size > 0:
                            fp.seek(big_size)
                            fp.write("\0")

            if ii % 2 == 1 or not double_it:
                next_name = "big{0}.{1}.txt".format(big_size, (i + 1) % n)
                logging.info("Transferring {0}".format(next_name))

                sleep_time = 5.0
                for j in xrange(50):
                    if storage.file_exists(next_name):
                        break
                    logging.info(
                        "Waiting for '{0}' to exist. Will sleep {1}".format(
                            next_name, sleep_time))
                    time.sleep(sleep_time)
                    sleep_time = min(60.0, sleep_time * 1.1)
                assert storage.file_exists(
                    next_name), "{0} still doesn't exist".format(next_name)

                t2 = time.time()
                with storage.open_read(next_name) as file_name:
                    pass
                mbps2 = _mbps(big_size, time.time() - t2)
                logging.info("transfers Mbps={0}".format(mbps2))

                return mbps2
            return None

        mbps_list = map_reduce(
            xrange(nn),
            mapper=mapper,
            reducer=lambda sequence: [x for x in sequence if x is not None],
            name="big_filename.{0}{1}".format(n, ".x2" if double_it else ""),
            runner=runner)

        return mbps_list
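This example and Example #9 report throughput through an _mbps helper that is not shown. A plausible stand-in (hypothetical; the real helper may, for instance, use 2**20 instead of 10**6):

def _mbps(size_in_bytes, seconds):
    # Megabits per second for size_in_bytes transferred in 'seconds' seconds.
    return size_in_bytes * 8 / 1e6 / seconds

print(_mbps(50 * 10**6, 4.0))  # 100.0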
Example #8
    def _gwas_2k_via_loo_chrom(test_snps, chrom_list, input_files, runner, G,
                               chrom_index_to_best_sid, pheno, covar,
                               force_full_rank, force_low_rank, mixing, h2,
                               output_file_name, GB_goal):
        logging.info("Doing GWAS_2K for each chrom. Work_count={0}".format(
            len(chrom_list)))

        def mapper_single_snp_2K_given_chrom(test_chr):
            logging.info("Working on chr={0}".format(test_chr))
            test_snps_chrom = test_snps[:, test_snps.pos[:, 0] == test_chr]
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader
            chrom_index = chrom_list.index(test_chr)
            best_sid = chrom_index_to_best_sid[chrom_index]

            K1 = G_for_chrom[:, G_for_chrom.sid_to_index(best_sid)]
            result = single_snp(test_snps=test_snps_chrom,
                                K0=G_for_chrom,
                                K1=K1,
                                pheno=pheno,
                                covar=covar,
                                leave_out_one_chrom=False,
                                GB_goal=GB_goal,
                                force_full_rank=force_full_rank,
                                force_low_rank=force_low_rank,
                                mixing=mixing,
                                h2=h2,
                                count_A1=count_A1)
            return result

        def reducer_closure(
                frame_sequence):  #!!!very similar code in single_snp
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(G.iid_count))
            logging.info("SNPCount\t{0}".format(G.sid_count))

            return frame

        frame = map_reduce(chrom_list,
                           mapper=mapper_single_snp_2K_given_chrom,
                           reducer=reducer_closure,
                           input_files=input_files,
                           name="single_snp with two K's for all chroms",
                           runner=runner)
        return frame
Example #9
    def _big_files_slow_down_internal(container, big_size, n, azure_path_list,
                                      runner, storage):
        def mapper(i):
            azure_path = azure_path_list[i % len(azure_path_list)]
            short_name = "big{0}.{1}.txt".format(big_size, i)
            if storage.file_exists(short_name):
                storage.remove(short_name)
            with storage.open_write(short_name, size=big_size) as file_name:
                logging.info("Downloading {0}".format(azure_path))
                with _file_transfer_reporter(
                        "Downloading {0}".format(azure_path),
                        big_size) as updater:
                    t0 = time.time()
                    container.download(azure_path, file_name, updater=updater)
                    mbps0 = _mbps(big_size, time.time() - t0)
            return mbps0

        mbps_list = map_reduce(xrange(n),
                               mapper=mapper,
                               name="big_files_slow_down",
                               runner=runner)

        return mbps_list
Example #10
            def mapper_closure(chrom):
                chrom_reader = snpreader[:, snpreader.pos[:, 0] == chrom]

                def nested_closure(piece_per_chrom_index):
                    start = chrom_reader.sid_count * piece_per_chrom_index // piece_per_chrom_count
                    stop = chrom_reader.sid_count * (
                        piece_per_chrom_index + 1) // piece_per_chrom_count
                    piece_reader = chrom_reader[:, start:stop]
                    _piece_name_list = [
                        "chrom{0}.piece{1}of{2}.{3}".format(
                            int(chrom), piece_per_chrom_index,
                            piece_per_chrom_count, suffix)
                        for suffix in ['bim', 'fam', 'bed']
                    ]
                    exist_list = [
                        storage.file_exists(_piece_name)
                        for _piece_name in _piece_name_list
                    ]
                    if sum(exist_list) < 3:  #If all three of the BIM/FAM/BED files are already there, then skip the upload, otherwise do the upload
                        for i in range(3):  #If one or two of BIM/FAM/BED are there, remove them
                            if exist_list[i]:
                                storage.remove(_piece_name_list[i])
                        _Distributed1Bed.write(_piece_name_list[-1],
                                               storage,
                                               piece_reader.read(),
                                               count_A1=count_A1,
                                               updater=updater2)
                    return _piece_name_list[-1]

                return map_reduce(
                    range(piece_per_chrom_count),
                    mapper=nested_closure,
                )
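Note that the inner map_reduce above is returned without a runner; this matches the nested= usage in the single_snp_select example further down, where only the outermost call names a runner and the inner calls merely describe work to run inside it. A toy sketch of the same shape, assuming (per the runner docstrings in these examples) that omitting the runner on the outer call simply runs the work locally:

from pysnptools.util.mapreduce1 import map_reduce

def nested_sum_of_squares(i):
    # Inner map_reduce: no runner is given here.
    return map_reduce(range(i + 1), mapper=lambda j: j * j, reducer=sum)

print(map_reduce(range(4), nested=nested_sum_of_squares, reducer=list))
# expected: [0, 1, 5, 14]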
Example #11
    part_pair_count = (part_count*part_count+part_count)//2
    part_pair_index = -1
    print("part_pair_count={0:,}".format(part_pair_count))

    K0 = SnpKernel(synbed,standardizer=Unit()).read() #Precompute the similarity

    start_time = datetime.datetime.now()
    for i,part_i in enumerate(part_list):
        def mapper1(j):
            #from fastlmm.association import single_snp
            #from pysnptools.snpreader import Pairs
            #print('Z')
            #part_j = part_list[j]
            #print('A')
            print("Looking at pair {0},{1} which is {2} of {3}".format(i,j,part_pair_index+j+1,part_pair_count))
            #pairs = Pairs(part_i) if i==j else Pairs(part_i,part_j)
            #result_df_ij = single_snp(pairs, K0=K0, pheno=pheno_fn, covar=cov_fn, leave_out_one_chrom=False, count_A1=True)
            #print(result_df_ij[:1])
            #return result_df_ij

        result_df_i = map_reduce(range(i,part_count),
                                 mapper=mapper1,
                                 reducer=lambda result_j_list:pd.concat(result_j_list),
                                 runner=runner,
                                 name='js')
        part_pair_index+=(part_count-i)
        time_so_far = datetime.datetime.now()-start_time
        total_time_estimate = time_so_far*part_pair_count/(part_pair_index+1)
        print(total_time_estimate)
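The bookkeeping here is simple counting: with part_count parts there are (part_count*part_count + part_count)//2 unordered pairs (i, j) with i <= j, and outer iteration i covers part_count - i of them, which is exactly what the part_pair_index increment tracks. On toy numbers:

part_count = 4
print((part_count * part_count + part_count) // 2)      # 10 pairs in total
print(sum(part_count - i for i in range(part_count)))   # 10, accumulated the way the loop does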

Example #12
        def mapper_find_best_given_chrom(test_chr):
            G_for_chrom = _K_per_chrom(G, test_chr, G.iid).snpreader

            def mapper_gather_lots(i_fold_and_pair):
                i_fold, (train_idx, test_idx) = i_fold_and_pair
                logging.info(
                    "Working on GWAS_1K and k search, chrom={0}, i_fold={1}".
                    format(test_chr, i_fold))

                G_train = G_for_chrom[train_idx, :]

                #Precompute whole x whole standardized on train
                from fastlmm.association.single_snp import _internal_determine_block_size, _block_size_from_GB_goal
                min_count = _internal_determine_block_size(
                    G_for_chrom, None, None, force_full_rank, force_low_rank)
                block_size = _block_size_from_GB_goal(GB_goal,
                                                      G_for_chrom.iid_count,
                                                      min_count)
                K_whole_unittrain = _SnpWholeWithTrain(
                    whole=G_for_chrom,
                    train_idx=train_idx,
                    standardizer=Unit(),
                    block_size=block_size).read()

                assert np.array_equal(K_whole_unittrain.iid,
                                      G_for_chrom.iid), "real assert"
                K_train = K_whole_unittrain[train_idx]

                single_snp_result = single_snp(
                    test_snps=G_train,
                    K0=K_train,
                    pheno=pheno,  #iid intersection means we can give the whole covariate and pheno
                    covar=covar,
                    leave_out_one_chrom=False,
                    GB_goal=GB_goal,
                    force_full_rank=force_full_rank,
                    force_low_rank=force_low_rank,
                    mixing=mixing,
                    h2=h2,
                    count_A1=count_A1)

                is_all = (i_fold == n_folds) if n_folds > 1 else True

                k_list_in = [0] + [
                    int(k)
                    for k in k_list if 0 < k and k < len(single_snp_result)
                ]

                if is_all:
                    top_snps = list(single_snp_result.SNP[:max_k])
                else:
                    top_snps = None

                if i_fold == n_folds:
                    k_index_to_nLL = None
                else:
                    k_index_to_nLL = []
                    for k in k_list_in:
                        top_k = G_for_chrom[:,
                                            G_for_chrom.sid_to_index(
                                                single_snp_result.SNP[:k])]
                        logging.info(
                            "Working on chr={0}, i_fold={1}, and K_{2}".format(
                                test_chr, i_fold, k))

                        top_k_train = top_k[train_idx, :] if k > 0 else None
                        fastlmm = FastLMM(force_full_rank=force_full_rank,
                                          force_low_rank=force_low_rank,
                                          GB_goal=GB_goal)
                        fastlmm.fit(
                            K0_train=K_train,
                            K1_train=top_k_train,
                            X=covar,
                            y=pheno,
                            mixing=mixing,
                            h2raw=h2
                        )  #iid intersection means we can give the whole covariate and pheno

                        top_k_test = top_k[test_idx, :] if k > 0 else None
                        K0_whole_test = K_whole_unittrain[:, test_idx]
                        nLL = fastlmm.score(
                            K0_whole_test=K0_whole_test,
                            K1_whole_test=top_k_test,
                            X=covar,
                            y=pheno
                        )  #iid intersection means we can give the whole covariate and pheno
                        k_index_to_nLL.append(nLL)

                if i_fold > 0:
                    k_list_in = None

                return k_list_in, top_snps, k_index_to_nLL

            def reducer_find_best(top_snps_and_k_index_to_nLL_sequence):
                #Starts fold_index+all -> k_index -> nll
                #Need:  k_index -> sum(fold_index -> nll)

                k_index_to_sum_nll = None
                top_snps_all = None
                k_list_in_all = None
                for i_fold, (k_list_in, top_snps, k_index_to_nLL) in enumerate(
                        top_snps_and_k_index_to_nLL_sequence):
                    if k_list_in is not None:
                        assert k_list_in_all is None, "real assert"
                        k_list_in_all = k_list_in
                        k_index_to_sum_nll = np.zeros(len(k_list_in))

                    if top_snps is not None:
                        assert top_snps_all is None, "real assert"
                        top_snps_all = top_snps

                    if k_index_to_nLL is not None:
                        assert i_fold < n_folds or n_folds == 1, "real assert"
                        for k_index, nLL in enumerate(k_index_to_nLL):
                            k_index_to_sum_nll[k_index] += nLL

                #find best # top_snps
                best_k = k_list_in_all[np.argmin(k_index_to_sum_nll)]
                logging.info("For chrom={0}, best_k={1}".format(
                    test_chr, best_k))
                if do_plot: _nll_plot(k_list_in_all, k_index_to_sum_nll)

                #Return the top snps from all
                result = top_snps_all[:best_k]
                return result

            i_fold_index_to_top_snps_and_k_index_to_nLL = map_reduce(
                _kfold(G_for_chrom.iid_count, n_folds, seed,
                       end_with_all=True),
                mapper=mapper_gather_lots,
                reducer=reducer_find_best)
            return i_fold_index_to_top_snps_and_k_index_to_nLL
Example #13
def single_snp_select(
        test_snps,
        pheno,
        G=None,
        covar=None,
        k_list=None,
        n_folds=10,  #1 is special and means test on train
        just_return_selected_snps=False,
        seed=0,
        output_file_name=None,
        GB_goal=None,
        force_full_rank=False,
        force_low_rank=False,
        h2=None,
        runner=None,
        count_A1=None):
    """
    Function performing single SNP GWAS based on covariates (often PCs) and a similarity matrix constructed of the top *k* SNPs where
    SNPs are ordered via the PValue from :meth:`.single_snp_linreg` and *k* is determined via out-of-sample prediction. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param G: SNPs from which to create a similarity matrix of the top *k* SNPs. If not given, will use test_snps.
           Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
    :type G: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param k_list: Values of *k* (in addition to 0) to test. Default to [1,2,4,8,...8192].
    :type k_list: list of numbers

    :param n_folds: Number of folds of cross validation to use for out-of-sample evaluation of various values of *k*. Default to 10.
    :type n_folds: number
    
    :param just_return_selected_snps: Instead of returning the results of GWAS, return the top *k* SNPs selected.
    :type just_return_selected_snps: bool

    :param seed: (optional) Random seed used to generate the cross-validation folds.
    :type seed: number

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param h2: A parameter to LMM learning that tells how much weight to give the K's vs. the identity matrix, optional
            If not given will search for best value.
    :type h2: number

    :param runner: a `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool


    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_select
    >>> from pysnptools.snpreader import Bed
    >>> from fastlmm.util import example_file # Download and return local file name
    >>> from fastlmm.util import compute_auto_pcs
    >>> bed_fn = example_file("tests/datasets/synth/all.bed")
    >>> phen_fn = example_file("tests/datasets/synth/pheno_10_causals.txt")
    >>> covar = compute_auto_pcs(bed_fn,count_A1=False)
    >>> results_dataframe = single_snp_select(test_snps=bed_fn, G=bed_fn, pheno=phen_fn, covar=covar, GB_goal=2, count_A1=False)
    >>> print(results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe))
    snp495_m0_.01m1_.04 0.0 5000

    """
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:
        #!!!code similar to single_snp and feature_selection
        if force_full_rank and force_low_rank:
            raise Exception("Can't force both full rank and low rank")

        assert test_snps is not None, "test_snps must be given as input"

        if k_list is None:
            k_list = np.logspace(start=0, stop=13, num=14, base=2)

        test_snps, G, pheno, covar = _fixup(test_snps,
                                            G,
                                            pheno,
                                            covar,
                                            count_A1=count_A1)
        common_input_files = [test_snps, G, pheno, covar]

        k_list_in = [0] + [int(k) for k in k_list if 0 < k <= G.sid_count]

        def top_snps_for_each_fold_nested(kfold_item):
            fold_index, (train_idx, test_idx) = kfold_item
            _, G_in, pheno_in, covar_in = _fixup(test_snps,
                                                 G,
                                                 pheno,
                                                 covar,
                                                 count_A1=count_A1)
            nested = single_snp_linreg(G_in[train_idx, :],
                                       pheno_in[train_idx, :],
                                       covar_in[train_idx, :],
                                       GB_goal=GB_goal,
                                       max_output_len=max(k_list_in),
                                       count_A1=count_A1)
            return nested

        def top_snps_for_each_fold_reducer(dataframe_list):
            result = [list(dataframe['SNP']) for dataframe in dataframe_list]
            return result

        #Find top snps for each fold
        fold_index_to_top_snps = map_reduce(
            _kfold(G.iid_count,
                   n_folds,
                   seed,
                   end_with_all=True,
                   iid_to_index=G.iid_to_index),
            nested=top_snps_for_each_fold_nested,
            reducer=top_snps_for_each_fold_reducer,
            name="top_snps_for_each_fold",
            input_files=common_input_files,
            runner=runner)

        #=================================================
        # Start of definition of inner functions
        #=================================================
        def k_index_to_nLL_mapper(k):
            _, G_in, pheno_in, covar_in = _fixup(test_snps,
                                                 G,
                                                 pheno,
                                                 covar,
                                                 count_A1=count_A1)
            nll_sum = 0
            mse_sum = 0
            n_folds_in = 0
            for fold_index, (train_idx,
                             test_idx) in _kfold(G.iid_count,
                                                 n_folds,
                                                 seed,
                                                 end_with_all=False,
                                                 iid_to_index=G.iid_to_index):
                assert set(train_idx).isdisjoint(set(test_idx)), "real assert"
                top_snps_in_fold = fold_index_to_top_snps[fold_index][:k]
                sid_idx_in_fold = G_in.sid_to_index(top_snps_in_fold)
                G_train = G_in[train_idx, sid_idx_in_fold] if k > 0 else None
                fastlmm = FastLMM(force_full_rank=force_full_rank,
                                  force_low_rank=force_low_rank,
                                  GB_goal=GB_goal)
                fastlmm.fit(
                    K0_train=G_train,
                    X=covar_in[train_idx, :],
                    y=pheno_in[train_idx, :],
                    h2raw=h2
                )  #iid intersection means we can give the whole covariate and pheno
                G_test = G_in[
                    test_idx, sid_idx_in_fold] if k > 0 else KernelIdentity(
                        G_in.iid, G_in.iid[test_idx]
                    )  #!!! instead of this, which blows up when # of iids is large, should switch to linear regression model with k is 0
                nll, mse = fastlmm.score(
                    K0_whole_test=G_test,
                    X=covar_in[test_idx, :],
                    y=pheno_in[test_idx, :],
                    return_mse_too=True
                )  #iid intersection means we can give the whole covariate and pheno
                nll_sum += nll
                mse_sum += mse
                n_folds_in += 1
            logging.info("k={0},nLL={1},average mse={2}".format(
                k, nll_sum, mse_sum / n_folds_in))
            return nll_sum

        #=================================================
        # End of definition of inner functions
        #=================================================

        #find best # of top SNPs
        k_index_to_nLL = map_reduce(k_list_in,
                                    mapper=k_index_to_nLL_mapper,
                                    input_files=common_input_files,
                                    name="k_index_to_nLL",
                                    runner=runner)
        best_k = k_list_in[np.argmin(k_index_to_nLL)]

        top_snps = fold_index_to_top_snps[-1][:best_k]
        if just_return_selected_snps:
            return top_snps

        sid_idx = G.sid_to_index(top_snps)
        G_top = G[:, sid_idx]

        # Run GWAS with leave-one-chrom out
        single_snp_result = single_snp(test_snps=test_snps,
                                       K0=G_top,
                                       pheno=pheno,
                                       covar=covar,
                                       leave_out_one_chrom=True,
                                       GB_goal=GB_goal,
                                       force_full_rank=force_full_rank,
                                       force_low_rank=force_low_rank,
                                       h2=h2,
                                       output_file_name=output_file_name,
                                       runner=runner,
                                       count_A1=count_A1)

        return single_snp_result
Example #14
def single_snp_linreg(test_snps,
                      pheno,
                      covar=None,
                      max_output_len=None,
                      output_file_name=None,
                      GB_goal=None,
                      runner=None,
                      count_A1=None):
    """
    Function performing single SNP GWAS using linear regression. Will reorder and intersect IIDs as needed.

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__. If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`__ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`__.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`__ or a string


    :param max_output_len: Maximum number of Pvalues to return. Default to None, which means 'Return all'.
    :type max_output_len: number
    
    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text.
    :type output_file_name: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks of size iid_count,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`__

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool

    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"


    :Example:

    >>> import logging
    >>> import numpy as np
    >>> from fastlmm.association import single_snp_linreg
    >>> from pysnptools.snpreader import Bed
    >>> from fastlmm.util import example_file # Download and return local file name
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = example_file("fastlmm/feature_selection/examples/toydata.phe")
    >>> test_snps = example_file("fastlmm/feature_selection/examples/toydata.5chrom.*","*.bed")
    >>> results_dataframe = single_snp_linreg(test_snps=test_snps, pheno=pheno_fn, count_A1=False)
    >>> print(results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe))
    null_576 1e-07 10000


    """
    with patch.dict('os.environ', {'ARRAY_MODULE': 'numpy'}) as _:

        assert test_snps is not None, "test_snps must be given as input"
        test_snps = _snps_fixup(test_snps, count_A1=count_A1)
        pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
        assert pheno.sid_count == 1, "Expect pheno to be just one variable"
        pheno = pheno[(pheno.val == pheno.val)[:, 0], :]
        covar = _pheno_fixup(covar, iid_if_none=pheno.iid)
        test_snps, pheno, covar = pstutil.intersect_apply(
            [test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(test_snps.iid_count))

        if GB_goal is not None:
            bytes_per_sid = test_snps.iid_count * 8
            sid_per_GB_goal = 1024.0**3 * GB_goal / bytes_per_sid
            block_size = max(1, int(sid_per_GB_goal + .5))
            block_count = test_snps.sid_count / block_size
        else:
            block_count = 1
            block_size = test_snps.sid_count
        logging.debug("block_count={0}, block_size={1}".format(
            block_count, block_size))

        #!!!what about missing data in covar, in test_snps, in y
        covar = np.c_[
            covar.read(view_ok=True, order='A').val,
            np.ones((test_snps.iid_count,
                     1))]  #view_ok because np.c_ will allocate new memory
        y = pheno.read(
            view_ok=True, order='A'
        ).val  #view_ok because this code already did a fresh read to look for any missing values

        def mapper(start):
            logging.info(
                "single_snp_linereg reading start={0},block_size={1}".format(
                    start, block_size))
            snp_index = np.arange(start,
                                  min(start + block_size, test_snps.sid_count))
            x = test_snps[:, start:start + block_size].read().standardize().val
            logging.info("single_snp_linereg linreg")
            _, pval_in = lin_reg.f_regression_cov_alt(x, y, covar)
            logging.info("single_snp_linereg done")
            pval_in = pval_in.reshape(-1)

            if max_output_len is None:
                return pval_in, snp_index
            else:  #We only need to return the top max_output_len results
                sort_index = np.argsort(pval_in)[:max_output_len]
                return pval_in[sort_index], snp_index[sort_index]

        def reducer(pval_and_snp_index_sequence):
            pval_list = []
            snp_index_list = []
            for pval, snp_index in pval_and_snp_index_sequence:
                pval_list.append(pval)
                snp_index_list.append(snp_index)
            pval = np.concatenate(pval_list)
            snp_index = np.concatenate(snp_index_list)
            sort_index = np.argsort(pval)
            if max_output_len is not None:
                sort_index = sort_index[:max_output_len]
            index = snp_index[sort_index]

            dataframe = pd.DataFrame(index=np.arange(len(index)),
                                     columns=('sid_index', 'SNP', 'Chr',
                                              'GenDist', 'ChrPos', 'PValue'))
            #!!Is this the only way to set types in a dataframe?
            dataframe['sid_index'] = dataframe['sid_index'].astype(np.float)
            dataframe['Chr'] = dataframe['Chr'].astype(np.float)
            dataframe['GenDist'] = dataframe['GenDist'].astype(np.float)
            dataframe['ChrPos'] = dataframe['ChrPos'].astype(np.float)
            dataframe['PValue'] = dataframe['PValue'].astype(np.float)

            dataframe['sid_index'] = index
            dataframe['SNP'] = np.array(
                test_snps.sid[index], dtype='str'
            )  #This will be ascii on Python2 and unicode on Python3
            dataframe['Chr'] = test_snps.pos[index, 0]
            dataframe['GenDist'] = test_snps.pos[index, 1]
            dataframe['ChrPos'] = test_snps.pos[index, 2]
            dataframe['PValue'] = pval[sort_index]

            if output_file_name is not None:
                dataframe.to_csv(output_file_name, sep="\t", index=False)

            return dataframe

        dataframe = map_reduce(range(0, test_snps.sid_count, block_size),
                               mapper=mapper,
                               reducer=reducer,
                               input_files=[test_snps, pheno, covar],
                               output_files=[output_file_name],
                               name="single_snp_linreg",
                               runner=runner)
        return dataframe
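The GB_goal arithmetic above turns a memory budget into a SNP block size: one SNP column of float64 values needs iid_count * 8 bytes, so roughly 1024**3 * GB_goal / (iid_count * 8) SNPs fit in a block. On toy numbers:

iid_count, GB_goal = 500000, 2
bytes_per_sid = iid_count * 8
sid_per_GB_goal = 1024.0**3 * GB_goal / bytes_per_sid
print(max(1, int(sid_per_GB_goal + .5)))  # 537 SNPs per block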
Example #15
    def download(
        self,
        azure_path,
        local_path,
        do_sync_date=True,
        as_needed=True,
        updater=None
    ):  #!!!perhaps should download to a tmp file and then rename after everything works.
        """
        Download a file from the container.

                _file_transfer_reporter : a Python context manager that is initialized with a size and that yields an updater method that can be called with a byte count as the download progresses.

        """
        #self._run_once()

        if as_needed and not self._download_needed_and_ready(
                local_path, azure_path):
            return

        t0 = time.time()
        blob_list = self._find_blobs_and_check(azure_path)

        piece_count = len(blob_list)
        start_stop_pairs = []
        start = 0
        for _, _, blob in blob_list:
            stop = start + blob.properties.content_length
            start_stop_pairs.append((start, stop))
            start = stop
        size = start_stop_pairs[-1][1]  # The size is the last stop value

        pstutil.create_directory_if_necessary(local_path, isfile=True)
        local_path_temp = local_path + ".temp"  #!!! give it a unique name to ensure that it can't collide with a user's name.
        with open(local_path_temp,
                  "wb") as fp:  #Preallocate the local file to its full size
            if size > 0:
                fp.seek(size - 1)
                fp.write("\0")

        with _file_transfer_reporter("download", size,
                                     updater=updater) as updater:

            def mapper_closure(piece_index):
                blobetc = blob_list[piece_index]
                start, stop = start_stop_pairs[piece_index]
                logging.debug("\tDownloading {0}/{4} {1}-{2} in '{3}'".format(
                    piece_index, start, stop, local_path, piece_count))
                self._get_blobetc_to_stream(blobetc, local_path_temp, start,
                                            stop)
                updater(stop - start)

            name = "download." + os.path.basename(
                local_path) + datetime.datetime.utcnow().strftime(
                    "%Y%m%d-%H%M%S") + str(random.random())
            map_reduce(
                range(piece_count),
                mapper=mapper_closure,
                name=name,
                runner=self._get_runner(),
            )

        if do_sync_date:
            self._sync_date(azure_path, local_path_temp)
        self._rename_no_matter_what(local_path_temp, local_path)
Example #16
def single_snp(test_snps, pheno, K0=None,
                 K1=None, mixing=None,
                 covar=None, covar_by_chrom=None, leave_out_one_chrom=True, output_file_name=None, h2=None, log_delta=None,
                 cache_file = None, GB_goal=None, interact_with_snp=None, force_full_rank=False, force_low_rank=False, G0=None, G1=None, runner=None,
                 count_A1=None):
    """
    Function performing single SNP GWAS using cross validation over the chromosomes and REML. Will reorder and intersect IIDs as needed.
    (For backwards compatibility, you may use 'leave_out_one_chrom=False' to skip cross validation, but that is not recommended.)

    :param test_snps: SNPs to test. Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type test_snps: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param pheno: A single phenotype: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_, for example,
           `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           Any IIDs with missing values will be removed.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type pheno: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param K0: SNPs from which to create a similarity matrix. If not given, will use test_snps.
           Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_
           or a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formatted file name.)
    :type K0: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param K1: SNPs from which to create a second similarity matrix, optional. (Also, see 'mixing').
           Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_.
           If you give a string, it should be the base name of a set of PLINK Bed-formatted files.
           (When leave_out_one_chrom is False, can be a `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_
           or a `KernelNpz <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelnpz>`_-formatted file name.)
    :type K1: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string
           (or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param mixing: Weight between 0.0 (inclusive, default) and 1.0 (inclusive) given to K1 relative to K0.
            If you give no mixing number and a K1 is given, the best weight will be learned.
    :type mixing: number

    :param covar: covariate information, optional: Can be any `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_, for example, `Pheno <http://fastlmm.github.io/PySnpTools/#snpreader-pheno>`_ or `SnpData <http://fastlmm.github.io/PySnpTools/#snpreader-snpdata>`_.
           If you give a string, it should be the file name of a PLINK phenotype-formatted file.
           (For backwards compatibility can also be dictionary with keys 'vals', 'iid', 'header')
    :type covar: a `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string

    :param leave_out_one_chrom: Perform single SNP GWAS via cross validation over the chromosomes. Default to True.
           (Warning: setting False can cause proximal contamination.)
    :type leave_out_one_chrom: boolean    

    :param output_file_name: Name of file to write results to, optional. If not given, no output file will be created. The output format is tab-delimited text.
    :type output_file_name: file name

    :param h2: A parameter to LMM learning, optional
            If not given will search for best value.
            If mixing is unspecified, then h2 must also be unspecified.
    :type h2: number

    :param log_delta: a re-parameterization of h2 provided for backwards compatibility. h2 is 1./(exp(log_delta)+1)
    :type log_delta: number

    :param cache_file: Name of  file to read or write cached precomputation values to, optional.
                If not given, no cache file will be used.
                If given and file does not exist, will write precomputation values to file.
                If given and file does exist, will read precomputation values from file.
                The file contains the U and S matrix from the decomposition of the training matrix. It is in Python's np.savez (\*.npz) format.
                Calls using the same cache file should have the same 'K0' and 'K1'
                If given and the file does exist then K0 and K1 need not be given.
    :type cache_file: file name

    :param GB_goal: gigabytes of memory the run should use, optional. If not given, will read the test_snps in blocks the same size as the kernel,
        which is memory efficient with little overhead on computation time.
    :type GB_goal: number

    :param interact_with_snp: index of a covariate to perform an interaction test with. 
            Allows for interaction testing (interact_with_snp x snp will be tested)
            default: None

    :param force_full_rank: Even if kernels are defined with fewer SNPs than IIDs, create an explicit iid_count x iid_count kernel. Cannot be True if force_low_rank is True.
    :type force_full_rank: Boolean

    :param force_low_rank: Even if kernels are defined with fewer IIDs than SNPs, create a low-rank iid_count x sid_count kernel. Cannot be True if force_full_rank is True.
    :type force_low_rank: Boolean

    :param G0: Same as K0. Provided for backwards compatibility. Cannot be given if K0 is given.
    :type G0: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param G1: Same as K1. Provided for backwards compatibility. Cannot be given if K1 is given.
    :type G1: `SnpReader <http://fastlmm.github.io/PySnpTools/#snpreader-snpreader>`_ or a string (or `KernelReader <http://fastlmm.github.io/PySnpTools/#kernelreader-kernelreader>`_)

    :param runner: a `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_, optional: Tells how to run locally, multi-processor, or on a cluster.
        If not given, the function is run locally.
    :type runner: `Runner <http://fastlmm.github.io/PySnpTools/#util-mapreduce1-runner-runner>`_

    :param count_A1: If it needs to read SNP data from a BED-formatted file, tells if it should count the number of A1
         alleles (the PLINK standard) or the number of A2 alleles. False is the current default, but in the future the default will change to True.
    :type count_A1: bool


    :rtype: Pandas dataframe with one row per test SNP. Columns include "PValue"



    :Example:

    >>> import logging
    >>> from fastlmm.association import single_snp
    >>> from pysnptools.snpreader import Bed
    >>> logging.basicConfig(level=logging.INFO)
    >>> pheno_fn = "../feature_selection/examples/toydata.phe"
    >>> results_dataframe = single_snp(test_snps="../feature_selection/examples/toydata.5chrom", pheno=pheno_fn, count_A1=False)
    >>> print results_dataframe.iloc[0].SNP,round(results_dataframe.iloc[0].PValue,7),len(results_dataframe)
    null_576 1e-07 10000


    """
    t0 = time.time()
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert test_snps is not None, "test_snps must be given as input"
    test_snps = _snps_fixup(test_snps, count_A1=count_A1)
    pheno = _pheno_fixup(pheno, count_A1=count_A1).read()
    assert pheno.sid_count == 1, "Expect pheno to be just one variable"
    pheno = pheno[(pheno.val==pheno.val)[:,0],:]
    covar = _pheno_fixup(covar, iid_if_none=pheno.iid, count_A1=count_A1)

    if not leave_out_one_chrom:
        assert covar_by_chrom is None, "When 'leave_out_one_chrom' is False, 'covar_by_chrom' must be None"
        K0 = _kernel_fixup(K0 or G0 or test_snps, iid_if_none=test_snps.iid, standardizer=Unit(),count_A1=count_A1)
        K1 = _kernel_fixup(K1 or G1, iid_if_none=test_snps.iid, standardizer=Unit(),count_A1=count_A1)
        K0, K1, test_snps, pheno, covar  = pstutil.intersect_apply([K0, K1, test_snps, pheno, covar])
        logging.debug("# of iids now {0}".format(K0.iid_count))
        K0, K1, block_size = _set_block_size(K0, K1, mixing, GB_goal, force_full_rank, force_low_rank)

        frame =  _internal_single(K0=K0, test_snps=test_snps, pheno=pheno,
                                    covar=covar, K1=K1,
                                    mixing=mixing, h2=h2, log_delta=log_delta,
                                    cache_file = cache_file, force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                    output_file_name=output_file_name,block_size=block_size, interact_with_snp=interact_with_snp,
                                    runner=runner)
        sid_index_range = IntRangeSet(frame['sid_index'])
        assert sid_index_range == (0,test_snps.sid_count), "Some SNP rows are missing from the output"
    else: 
        chrom_list = list(set(test_snps.pos[:,0])) # find the set of all chroms mentioned in test_snps, the main testing data
        assert not np.isnan(chrom_list).any(), "chrom list should not contain NaN"
        input_files = [test_snps, pheno, K0, G0, K1, G1, covar] + ([] if covar_by_chrom is None else covar_by_chrom.values())

        def nested_closure(chrom):
            test_snps_chrom = test_snps[:,test_snps.pos[:,0]==chrom]
            covar_chrom = _create_covar_chrom(covar, covar_by_chrom, chrom)
            cache_file_chrom = None if cache_file is None else cache_file + ".{0}".format(chrom)

            K0_chrom = _K_per_chrom(K0 or G0 or test_snps, chrom, test_snps.iid)
            K1_chrom = _K_per_chrom(K1 or G1, chrom, test_snps.iid)

            K0_chrom, K1_chrom, test_snps_chrom, pheno_chrom, covar_chrom  = pstutil.intersect_apply([K0_chrom, K1_chrom, test_snps_chrom, pheno, covar_chrom])
            logging.debug("# of iids now {0}".format(K0_chrom.iid_count))
            K0_chrom, K1_chrom, block_size = _set_block_size(K0_chrom, K1_chrom, mixing, GB_goal, force_full_rank, force_low_rank)

            distributable = _internal_single(K0=K0_chrom, test_snps=test_snps_chrom, pheno=pheno_chrom,
                                        covar=covar_chrom, K1=K1_chrom,
                                        mixing=mixing, h2=h2, log_delta=log_delta, cache_file=cache_file_chrom,
                                        force_full_rank=force_full_rank,force_low_rank=force_low_rank,
                                        output_file_name=None, block_size=block_size, interact_with_snp=interact_with_snp,
                                        runner=Local())
            
            return distributable

        def reducer_closure(frame_sequence):
            frame = pd.concat(frame_sequence)
            frame.sort_values(by="PValue", inplace=True)
            frame.index = np.arange(len(frame))
            if output_file_name is not None:
                frame.to_csv(output_file_name, sep="\t", index=False)
            logging.info("PhenotypeName\t{0}".format(pheno.sid[0]))
            logging.info("SampleSize\t{0}".format(test_snps.iid_count))
            logging.info("SNPCount\t{0}".format(test_snps.sid_count))
            logging.info("Runtime\t{0}".format(time.time()-t0))

            return frame

        frame = map_reduce(chrom_list,
                   mapper = nested_closure,
                   reducer = reducer_closure,
                   input_files = input_files,
                   output_files = [output_file_name],
                   name = "single_snp (leave_out_one_chrom), out='{0}'".format(output_file_name),
                   runner = runner)

    return frame
Example #17
0
def _internal_single(K0, test_snps, pheno, covar, K1,
                 mixing, h2, log_delta,
                 cache_file, force_full_rank, force_low_rank,
                 output_file_name, block_size, interact_with_snp, runner):
    assert K0 is not None, "real assert"
    assert K1 is not None, "real assert"
    assert block_size is not None, "real assert"
    assert mixing is None or 0.0 <= mixing <= 1.0
    if force_full_rank and force_low_rank:
        raise Exception("Can't force both full rank and low rank")

    assert h2 is None or log_delta is None, "if h2 is specified, log_delta may not be specified"
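    # Assuming the usual LMM parameterization, delta = sigma_e^2 / sigma_g^2 and
    # h2 = sigma_g^2 / (sigma_g^2 + sigma_e^2), so h2 = 1 / (exp(log_delta) + 1).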
    if log_delta is not None:
        h2 = 1.0/(np.exp(log_delta)+1)

    covar = np.c_[covar.read(view_ok=True,order='A').val,np.ones((test_snps.iid_count, 1))]  #view_ok because np.c_ will allocate new memory

    y =  pheno.read(view_ok=True,order='A').val #view_ok because this code already did a fresh read to look for any missing values 

    if cache_file is not None and os.path.exists(cache_file):
        lmm = lmm_cov(X=covar, Y=y, G=None, K=None)
        with np.load(cache_file) as data: #!! similar code in epistasis
            lmm.U = data['arr_0']
            lmm.S = data['arr_1']
            h2 = data['arr_2'][0]
            mixing = data['arr_2'][1]
    else:
        K, h2, mixer = _Mixer.combine_the_best_way(K0, K1, covar, y, mixing, h2, force_full_rank=force_full_rank, force_low_rank=force_low_rank,kernel_standardizer=DiagKtoN())
        mixing = mixer.mixing

        if mixer.do_g:
            lmm = lmm_cov(X=covar, Y=y, K=None, G=K.snpreader.val, inplace=True)
        else:
            #print(covar.sum(),y.sum(),K.val.sum(),covar[0],y[0],K.val[0,0])
            lmm = lmm_cov(X=covar, Y=y, K=K.val, G=None, inplace=True)

        if h2 is None:
            result = lmm.findH2()
            h2 = result['h2']
        logging.info("h2={0}".format(h2))

        if cache_file is not None and not os.path.exists(cache_file):
            pstutil.create_directory_if_necessary(cache_file)
            lmm.getSU()
            np.savez(cache_file, lmm.U,lmm.S,np.array([h2,mixing])) #using np.savez instead of pickle because it seems to be faster to read and write

    if interact_with_snp is not None:
        logging.info("interaction with %i" % interact_with_snp)
        assert 0 <= interact_with_snp and interact_with_snp < covar.shape[1]-1, "interact_with_snp is out of range"
        interact = covar[:,interact_with_snp].copy()
        interact -=interact.mean()
        interact /= interact.std()
    else:
        interact = None

    work_count = -(test_snps.sid_count // -block_size)  # Find the work count based on block size (rounding up)
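    # For example, with 1000 test SNPs and block_size=300, work_count is -(1000 // -300) == 4.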

    # We define three closures, that is, functions defined inside a function so that the inner functions have access to the local variables of the outer function.
    def debatch_closure(work_index):
        return test_snps.sid_count * work_index // work_count
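    # Illustration: with sid_count=1000 and work_count=4, debatch_closure returns block
    # boundaries 0, 250, 500, 750, 1000, so every SNP falls in exactly one block.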

    def mapper_closure(work_index):
        if work_count > 1: logging.info("single_snp: Working on snp block {0} of {1}".format(work_index,work_count))
        do_work_time = time.time()
        start = debatch_closure(work_index)
        end = debatch_closure(work_index+1)

        snps_read = test_snps[:,start:end].read().standardize()
        if interact_with_snp is not None:
            variables_to_test = snps_read.val * interact[:,np.newaxis]
        else:
            variables_to_test = snps_read.val
        res = lmm.nLLeval(h2=h2, dof=None, scale=1.0, penalty=0.0, snps=variables_to_test)

        beta = res['beta']
        
        chi2stats = beta*beta/res['variance_beta']
        #p_values = stats.chi2.sf(chi2stats,1)[:,0]
        assert test_snps.iid_count == lmm.U.shape[0]
        p_values = stats.f.sf(chi2stats, 1, lmm.U.shape[0] - (lmm.linreg.D + 1))[:, 0]  # lmm.U.shape[0] is the number of individuals

        dataframe = _create_dataframe(snps_read.sid_count)
        dataframe['sid_index'] = np.arange(start,end)
        dataframe['SNP'] = snps_read.sid
        dataframe['Chr'] = snps_read.pos[:,0]
        dataframe['GenDist'] = snps_read.pos[:,1]
        dataframe['ChrPos'] = snps_read.pos[:,2] 
        dataframe['PValue'] = p_values
        dataframe['SnpWeight'] = beta[:,0]
        dataframe['SnpWeightSE'] = np.sqrt(res['variance_beta'][:,0])
        dataframe['SnpFractVarExpl'] = np.sqrt(res['fraction_variance_explained_beta'][:,0])
        dataframe['Mixing'] = np.zeros((snps_read.sid_count)) + mixing
        dataframe['Nullh2'] = np.zeros((snps_read.sid_count)) + h2

        logging.info("time={0}".format(time.time()-do_work_time))

        #logging.info(dataframe)
        return dataframe

    def reducer_closure(result_sequence):
        if output_file_name is not None:
            create_directory_if_necessary(output_file_name)

        frame = pd.concat(result_sequence)
        frame.sort_values(by="PValue", inplace=True)
        frame.index = np.arange(len(frame))

        if output_file_name is not None:
            frame.to_csv(output_file_name, sep="\t", index=False)

        return frame

    frame = map_reduce(range(work_count),
                       mapper=mapper_closure,reducer=reducer_closure,
                       input_files=[test_snps],output_files=[output_file_name],
                       name="single_snp(output_file={0})".format(output_file_name),
                       runner=runner)
    return frame
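The batching pattern above (a ceiling-division work_count plus a debatch closure feeding map_reduce) can be tried in isolation. Below is a minimal sketch, assuming pysnptools' map_reduce and Local runner behave as in the earlier examples; sum_in_blocks and its arguments are hypothetical names used only for illustration.

from pysnptools.util.mapreduce1 import map_reduce
from pysnptools.util.mapreduce1.runner import Local

def sum_in_blocks(values, block_size):
    item_count = len(values)
    work_count = -(item_count // -block_size)  # ceiling division, as in _internal_single

    def debatch_closure(work_index):
        return item_count * work_index // work_count

    def mapper_closure(work_index):
        start = debatch_closure(work_index)
        end = debatch_closure(work_index + 1)
        return sum(values[start:end])  # process one nearly-equal block

    return map_reduce(range(work_count),
                      mapper=mapper_closure,
                      reducer=sum,  # combine the per-block results
                      runner=Local())

# sum_in_blocks(list(range(10)), block_size=3) == 45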
Example #18
0
    def write(
        storage,
        snpreader,
        piece_per_chrom_count=1,
        updater=None,
        runner=None
    ):  #!!! might want to set pieces_per_chrom such that it is a certain size
        '''
        Uploads data from any :class:`.Bed`-like source to cluster storage for efficient retrieval later.
        If some of the contents already exist in storage, uploading that part is skipped. (To avoid this behavior,
        clear the storage.)

        :param storage: Tells where to store SNP data.
                      A string can be given and will be interpreted as the path of a local directory to use for storage. (The local
                      directory will **not** be automatically erased and so must be user managed.) 
                      A :class:`.FileCache` instance can be given, which provides a
                      method to specify cluster-distributed storage. (:class:`.FileCache`'s will **not** be automatically erased and must be user managed.)
                      If `None`, the storage will be in an automatically-erasing temporary directory. (If the TEMP environment variable is set, Python places the temp directory under it.)
                      
        :type storage: string or :class:`.FileCache` or None.

        :param snpreader: A :class:`.Bed` or other :class:`.SnpReader` with values of 0,1,2, or missing.
            (Note that this differs from most other `write` methods that take a :class:`.SnpData`)
        :type snpreader: :class:`.SnpReader`

        :param piece_per_chrom_count: The number of pieces in which to store the data from each chromosome. Data is split across
            SNPs. For example, if `piece_per_chrom_count` is set to 100 and 22 chromosomes are uploaded, then data will be stored in 2200 pieces. Later, when data is requested
            only the pieces necessary for the request will be downloaded to local storage.
        :type piece_per_chrom_count: A number

        :param updater: A single-argument function to write logging messages to, for example, the function created by :func:`.log_in_place`.
        :type updater: A function or lambda

        :param runner: a :class:`.Runner`, optional: Tells how to run.
            (Note that :class:`.Local` and :class:`.LocalMultiProc` are good options.)
            If not given, the function is run locally.
        :type runner: :class:`.Runner`

        :rtype: DistributedBed

        >>> from pysnptools.snpreader import DistributedBed, Bed
        >>> import shutil
        >>> from pysnptools.util import example_file # Download and return local file name
        >>> directory = 'tempdir/toydataSkip10.distributedbed'
        >>> if os.path.exists(directory):
        ...     shutil.rmtree(directory)
        >>> bedfile = example_file("pysnptools/examples/toydata.5chrom.*","*.bed")
        >>> snpreader = Bed(bedfile,count_A1=False)[:,::10]  # Read every 10 snps from Bed format
        >>> DistributedBed.write(directory,snpreader,piece_per_chrom_count=5)  # Write data in DistributedBed format
        DistributedBed(LocalCache('tempdir/toydataSkip10.distributedbed'))


        '''
        from pysnptools.util import _file_transfer_reporter
        from pysnptools.util.filecache import FileCache

        count_A1 = True  # Use the same value for reading and writing so that nothing will change.
        snpreader = _snps_fixup(snpreader, count_A1=count_A1)

        storage = FileCache._fixup(storage)

        chrom_set = sorted(set(snpreader.pos[:, 0]))
        for chrom in chrom_set:
            # chrom == chrom is False when chrom is NaN
            assert chrom == chrom and chrom == int(chrom), \
                "DistributedBed.write expects all chromosomes to be integers (not '{0}')".format(chrom)
        with _file_transfer_reporter("DistributedBed.write",
                                     size=0,
                                     updater=updater) as updater2:

            def mapper_closure(chrom):
                chrom_reader = snpreader[:, snpreader.pos[:, 0] == chrom]

                def nested_closure(piece_per_chrom_index):
                    start = chrom_reader.sid_count * piece_per_chrom_index // piece_per_chrom_count
                    stop = chrom_reader.sid_count * (piece_per_chrom_index + 1) // piece_per_chrom_count
                    piece_reader = chrom_reader[:, start:stop]
                    _piece_name_list = [
                        "chrom{0}.piece{1}of{2}.{3}".format(
                            int(chrom), piece_per_chrom_index,
                            piece_per_chrom_count, suffix)
                        for suffix in ['bim', 'fam', 'bed']
                    ]
                    exist_list = [
                        storage.file_exists(_piece_name)
                        for _piece_name in _piece_name_list
                    ]
                    # If all three BIM/FAM/BED pieces already exist, skip the upload; otherwise do the upload.
                    if sum(exist_list) < 3:
                        # If only one or two of BIM/FAM/BED exist, remove them before re-uploading.
                        for i in range(3):
                            if exist_list[i]:
                                storage.remove(_piece_name_list[i])
                        _Distributed1Bed.write(_piece_name_list[-1],
                                               storage,
                                               piece_reader.read(),
                                               count_A1=count_A1,
                                               updater=updater2)
                    return _piece_name_list[-1]

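                # The inner map_reduce below is created without a runner; it is returned
                # to the outer map_reduce (which receives mapper_closure via 'nested='),
                # so the outer runner schedules this per-chromosome work (an assumption
                # about the 'nested=' behavior, based on how it is used here).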
                return map_reduce(
                    range(piece_per_chrom_count),
                    mapper=nested_closure,
                )

            list_list_pair = map_reduce(
                chrom_set,
                nested=mapper_closure,
                runner=runner,
            )

        reader_name_list = []
        reader_list = []
        for chrom_result in list_list_pair:
            for _piece_name in chrom_result:
                reader_name_list.append(_piece_name)
                reader_list.append(_Distributed1Bed(_piece_name, storage))

        _metadatanpz = "metadata.npz"
        with storage.open_write(_metadatanpz) as local_metadatanpz:
            _reader_name_listnpz = "reader_name_list.npz"
            with storage.open_write(
                    _reader_name_listnpz) as local_reader_name_listnpz:
                reader_name_list_ascii = np.array(reader_name_list, dtype='S')
                np.savez(local_reader_name_listnpz,
                         reader_name_list=reader_name_list_ascii)
                if os.path.exists(local_metadatanpz):
                    os.remove(local_metadatanpz)
                _MergeSIDs(reader_list,
                           cache_file=local_metadatanpz,
                           skip_check=True)

        return DistributedBed(storage)