def dataframe_to_keytable(self, df, keys=None):
    """Convert Spark SQL DataFrame to KeyTable.

    Spark SQL data types are converted to Hail types in the obvious way as follows:

    .. code-block:: text

      BooleanType => Boolean
      IntegerType => Int
      LongType => Long
      FloatType => Float
      DoubleType => Double
      StringType => String
      BinaryType => Binary
      ArrayType => Array
      StructType => Struct

    Unlisted Spark SQL data types are currently unsupported.

    :param df: Spark SQL DataFrame to convert; its ``_jdf`` handle is passed
        to the JVM.

    :param keys: List of key column names. ``None`` (the default) is treated
        as an empty list, i.e. no key columns.
    :type keys: list of string or None

    :return: The DataFrame as a KeyTable.
    :rtype: :class:`.KeyTable`
    """

    # A ``None`` sentinel replaces the former ``keys=[]`` default so that no
    # list object is shared between calls.
    jkeys = jarray(self._jvm.java.lang.String, [] if keys is None else keys)
    return KeyTable(self, self._hail.keytable.KeyTable.fromDF(df._jdf, jkeys))
def _run_command(self, vds, pargs):
    """Look up a top-level Hail command from ``pargs`` and run it.

    :param vds: Dataset whose ``_jvds`` handle is used to build the command
        state, or ``None`` to build the state from ``None``.

    :param pargs: Command arguments; converted to a Java ``String`` array and
        handed to ``ToplevelCommands.lookup``.

    :return: A :class:`.VariantDataset` wrapping ``result.vds()`` of the
        executed command.
    """

    jargs = jarray(self._jvm.java.lang.String, pargs)

    # ``lookup`` returns a pair: the command object and its own arguments.
    t = self._hail.driver.ToplevelCommands.lookup(jargs)
    cmd = t._1()
    cmd_args = t._2()

    # Identity test: ``!= None`` would dispatch to any user-defined ``__ne__``.
    jstate = self._jstate(vds._jvds if vds is not None else None)

    try:
        result = cmd.run(jstate, cmd_args)
    except Py4JJavaError as e:
        # NOTE(review): raise_py4j_exception is presumably expected to raise;
        # if it ever returned, ``result`` below would be unbound.
        raise_py4j_exception(e)

    return VariantDataset(self, result.vds())
def import_keytable(self, path, key_names, npartitions=None, config=None):
    """Import delimited text file (text table) as KeyTable.

    :param path: File or files to import. A single string is treated as one
        path; any other value is iterated to collect the paths.
    :type path: str or list of str

    :param key_names: The name(s) of fields to be considered keys. A
        non-string value is joined with commas before being passed on.
    :type key_names: str or list of str

    :param npartitions: Number of partitions. A falsy value selects
        ``self.sc.defaultMinPartitions``.
    :type npartitions: int or None

    :param config: Configuration options for importing text files. A falsy
        value selects a default-constructed :class:`.TextTableConfig`.
    :type config: :class:`.TextTableConfig` or None

    :return: The imported text table as a KeyTable.
    :rtype: :class:`.KeyTable`
    """

    # Normalise each argument to the form handed to the JVM importer.
    paths = [path] if isinstance(path, str) else list(path)
    joined_keys = key_names if isinstance(key_names, str) else ','.join(key_names)
    partition_count = npartitions if npartitions else self.sc.defaultMinPartitions
    table_config = config if config else TextTableConfig()

    import_text_table = self._hail.keytable.KeyTable.importTextTable
    jkt = import_text_table(
        self._jsc,
        jarray(self._jvm.java.lang.String, paths),
        joined_keys,
        partition_count,
        table_config._to_java())

    return KeyTable(self, jkt)
def balding_nichols_model(self, populations, samples, variants, partitions=None,
                          pop_dist=None,
                          fst=None,
                          af_dist=UniformDist(0.1, 0.9),
                          seed=0):
    r"""Generate a VariantDataset using the Balding-Nichols model.

    **Examples**

    To generate a VDS with 3 populations, 100 samples in total, and 1000 variants:

    >>> vds = hc.balding_nichols_model(3, 100, 1000)

    To generate a VDS with 4 populations, 40 samples, 150 variants, 10 partitions,
    population distribution [0.1, 0.2, 0.3, 0.4], :math:`F_{st}` values [.02, .06, .04, .12],
    ancestral allele frequencies drawn from a truncated beta distribution with
    a = .01 and b = 2.0 over the interval [0.05, 1], and random seed 1:

    >>> vds = hc.balding_nichols_model(4, 40, 150, 10,
    ...                                pop_dist=[0.1, 0.2, 0.3, 0.4],
    ...                                fst=[.02, .06, .04, .12],
    ...                                af_dist=TruncatedBetaDist(a=0.01, b=2.0, minVal=0.05, maxVal=1.0),
    ...                                seed=1)

    **Notes**

    Hail is able to randomly generate a VDS using the Balding-Nichols model.

    - :math:`K` populations are labeled by integers 0, 1, ..., K - 1
    - :math:`N` samples are named by strings 0, 1, ..., N - 1
    - :math:`M` variants are defined as ``1:1:A:C``, ``1:2:A:C``, ..., ``1:M:A:C``
    - The default ancestral frequency distribution :math:`P_0` is uniform on [0.1, 0.9].
      Options are UniformDist(minVal, maxVal), BetaDist(a, b), and
      TruncatedBetaDist(a, b, minVal, maxVal)
    - The population distribution :math:`\pi` defaults to uniform
    - The :math:`F_{st}` values default to 0.1
    - The number of partitions defaults to one partition per million genotypes
      (i.e., samples * variants / 10^6) or 8, whichever is larger

    The Balding-Nichols model models genotypes of individuals from a structured
    population comprising :math:`K` homogeneous subpopulations that have each diverged
    from a single ancestral population (a `star phylogeny`). We take :math:`N` samples
    and :math:`M` bi-allelic variants in perfect linkage equilibrium. The relative sizes
    of the subpopulations are given by a probability vector :math:`\pi`; the ancestral
    allele frequencies are drawn independently from a frequency spectrum :math:`P_0`;
    the subpopulations have diverged with possibly different :math:`F_{ST}` parameters
    :math:`F_k` (here and below, lowercase indices run over a range bounded by the
    corresponding uppercase parameter, e.g. :math:`k = 1, \ldots, K`). For each variant,
    the subpopulation allele frequencies are drawn from a
    `beta distribution <https://en.wikipedia.org/wiki/Beta_distribution>`_, a useful
    continuous approximation of the effect of genetic drift. We denote the individual
    subpopulation memberships by :math:`k_n`, the ancestral allele frequencies by
    :math:`p_{0, m}`, the subpopulation allele frequencies by :math:`p_{k, m}`, and the
    genotypes by :math:`g_{n, m}`. The generative model is then given by:

    .. math::
        k_n \,&\sim\, \pi

        p_{0,m}\,&\sim\, P_0

        p_{k,m}\mid p_{0,m}\,&\sim\, \mathrm{Beta}(\mu = p_{0,m},\, \sigma^2 = F_k p_{0,m}(1 - p_{0,m}))

        g_{n,m}\mid k_n, p_{k, m} \,&\sim\, \mathrm{Binomial}(2, p_{k_n, m})

    We have parametrized the beta distribution by its mean and variance; the usual
    parameters are :math:`a = p(1 - F)/F,\; b = (1 - p)(1 - F)/F` with
    :math:`F = F_k,\; p = p_{0,m}`.

    **Annotations**

    :py:meth:`~hail.HailContext.balding_nichols_model` adds the following global,
    sample, and variant annotations:

     - **global.nPops** (*Int*) -- Number of populations
     - **global.nSamples** (*Int*) -- Number of samples
     - **global.nVariants** (*Int*) -- Number of variants
     - **global.popDist** (*Array[Double]*) -- Normalized population distribution indexed by population
     - **global.Fst** (*Array[Double]*) -- F_st values indexed by population
     - **global.seed** (*Int*) -- Random seed
     - **global.ancestralAFDist** (*Struct*) -- Information about ancestral allele frequency distribution
     - **sa.pop** (*Int*) -- Population of sample
     - **va.ancestralAF** (*Double*) -- Ancestral allele frequency
     - **va.AF** (*Array[Double]*) -- Allele frequency indexed by population

    :param int populations: Number of populations.

    :param int samples: Number of samples.

    :param int variants: Number of variants.

    :param int partitions: Number of partitions.

    :param pop_dist: Unnormalized population distribution
    :type pop_dist: array of float or None

    :param fst: F_st values
    :type fst: array of float or None

    :param af_dist: Ancestral allele frequency distribution
    :type af_dist: :class:`.UniformDist` or :class:`.BetaDist` or :class:`.TruncatedBetaDist`

    :param int seed: Random seed.

    :rtype: :class:`.VariantDataset`
    :return: A VariantDataset generated by the Balding-Nichols model.
    """

    # ``None`` is wrapped by joption as-is; a provided sequence is first
    # converted to a JVM double array and then wrapped.
    jvm_pop_dist_opt = joption(
        None if pop_dist is None else jarray(self._jvm.double, pop_dist))
    jvm_fst_opt = joption(
        None if fst is None else jarray(self._jvm.double, fst))

    return VariantDataset(
        self,
        self._hail.stats.BaldingNicholsModel.apply(self._jsc,
                                                   populations,
                                                   samples,
                                                   variants,
                                                   jvm_pop_dist_opt,
                                                   jvm_fst_opt,
                                                   seed,
                                                   joption(partitions),
                                                   af_dist._jrep()))