Ejemplo n.º 1
0
    def dataframe_to_keytable(self, df, keys=None):
        """Convert Spark SQL DataFrame to key table.

        Spark SQL data types are converted to Hail types in the obvious way as follows:

        .. code-block:: text

          BooleanType => Boolean
          IntegerType => Int
          LongType => Long
          FloatType => Float
          DoubleType => Double
          StringType => String
          BinaryType => Binary
          ArrayType => Array
          StructType => Struct

        Unlisted Spark SQL data types are currently unsupported.

        :param df: Spark SQL DataFrame to convert; its ``_jdf`` handle is
            passed to the JVM.

        :param keys: List of key column names. ``None`` (the default) is
            treated as an empty list.
        :type keys: list of string or None

        :return: Key table constructed from the Spark SQL DataFrame.
        :rtype: :class:`.KeyTable`
        """

        # ``None`` stands in for "no keys" so that no list object is shared
        # between calls through the default argument.
        if keys is None:
            keys = []

        jkeys = jarray(self._jvm.java.lang.String, keys)
        jkt = self._hail.keytable.KeyTable.fromDF(self._jhc, df._jdf, jkeys)
        return KeyTable(self, jkt)
Ejemplo n.º 2
0
    def import_keytable(self,
                        path,
                        npartitions=None,
                        config=None):
        """Import delimited text file (text table) as key table.

        The resulting key table will have no key columns; use
        :py:meth:`.KeyTable.key_by` to specify keys.

        :param path: files to import.
        :type path: str or list of str

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param config: Configuration options for importing text files. If
            omitted (or falsy), a fresh :class:`.TextTableConfig` is used.
        :type config: :class:`.TextTableConfig` or None

        :return: Key table constructed from text table.
        :rtype: :class:`.KeyTable`
        """

        # Build the default configuration per call instead of once at
        # definition time, so calls never share a single config instance.
        if not config:
            config = TextTableConfig()

        jkt = self._jhc.importKeyTable(jindexed_seq_args(path),
                                       joption(npartitions), config._to_java())
        return KeyTable(self, jkt)
Ejemplo n.º 3
0
    def import_keytable(self,
                        path,
                        key_names=None,
                        npartitions=None,
                        config=None):
        """Import delimited text file (text table) as KeyTable.

        :param path: files to import.
        :type path: str or list of str

        :param key_names: The name(s) of fields to be considered keys.
            ``None`` (the default) is treated as an empty list.
        :type key_names: str or list of str or None

        :param npartitions: Number of partitions.
        :type npartitions: int or None

        :param config: Configuration options for importing text files. If
            omitted (or falsy), a fresh :class:`.TextTableConfig` is used.
        :type config: :class:`.TextTableConfig` or None

        :rtype: :class:`.KeyTable`
        """

        # ``None`` defaults are resolved here so that neither a list nor a
        # config instance is created once at definition time and shared
        # between calls.
        if key_names is None:
            key_names = []

        if not config:
            config = TextTableConfig()

        jkt = self._jhc.importKeyTable(jindexed_seq_args(path),
                                       jindexed_seq_args(key_names),
                                       joption(npartitions), config._to_java())
        return KeyTable(self, jkt)
Ejemplo n.º 4
0
    def read_table(self, path):
        """Load a key table from a KT file on disk.

        The path is handed to the JVM-side ``readTable`` call and the result
        is wrapped in a Python :class:`.KeyTable`.

        :param str path: KT file to read.

        :return: Key table read from disk.
        :rtype: :class:`.KeyTable`
        """

        return KeyTable(self, self._jhc.readTable(path))
Ejemplo n.º 5
0
    def import_keytable(self, path, key_names, npartitions=None, config=None):
        """Import delimited text file (text table) as KeyTable.

        :param path: files to import.
        :type path: str or list of str

        :param key_names: The name(s) of fields to be considered keys
        :type key_names: str or list of str

        :param npartitions: Number of partitions. When falsy, the Spark
            context's ``defaultMinPartitions`` is used.
        :type npartitions: int or None

        :param config: Configuration options for importing text files. When
            falsy, a new :class:`.TextTableConfig` is used.
        :type config: :class:`.TextTableConfig` or None

        :rtype: :class:`.KeyTable`
        """

        # A lone string is a single path; anything else is iterated into a list.
        path_args = [path] if isinstance(path, str) else list(path)

        # Key names are passed on as one comma-separated string.
        if isinstance(key_names, str):
            joined_keys = key_names
        else:
            joined_keys = ','.join(key_names)

        partitions = npartitions or self.sc.defaultMinPartitions
        table_config = config or TextTableConfig()

        importer = self._hail.keytable.KeyTable.importTextTable
        jkt = importer(self._jsc,
                       jarray(self._jvm.java.lang.String, path_args),
                       joined_keys,
                       partitions,
                       table_config._to_java())
        return KeyTable(self, jkt)
Ejemplo n.º 6
0
    def import_table(self, paths, key=None, min_partitions=None, impute=False, no_header=False,
                     comment=None, delimiter="\t", missing="NA", types=None, quote=None):
        """Import delimited text file (text table) as key table.

        If ``key`` is not given, the resulting key table will have no key columns;
        use :py:meth:`.KeyTable.key_by` to specify keys afterwards.

        **Example**

        Given this file

        .. code-block:: text

            $ cat data/samples1.tsv
            Sample	Height	Status	Age
            PT-1234	154.1	ADHD	24
            PT-1236	160.9	Control	19
            PT-1238	NA	ADHD	89
            PT-1239	170.3	Control	55

        The interesting thing about this table is that column ``Height`` is a floating-point number,
        and column ``Age`` is an integer. We can either have Hail impute these types from
        the file, or pass them ourselves:

        Pass the types ourselves:

        >>> table = hc.import_table('data/samples1.tsv', types={'Height': TFloat64(), 'Age': TInt32()})

        Note that string columns like ``Sample`` and ``Status`` do not need to be typed, because ``String``
        is the default type.

        Use type imputation (a bit easier, but requires reading the file twice):

        >>> table = hc.import_table('data/samples1.tsv', impute=True)

        **Detailed examples**

        Let's import annotations from a CSV file with missing data and special characters:

        .. code-block:: text

            $ cat data/samples2.tsv
            Batch,PT-ID
            1kg,PT-0001
            1kg,PT-0002
            study1,PT-0003
            study3,PT-0003
            .,PT-0004
            1kg,PT-0005
            .,PT-0006
            1kg,PT-0007

        In this case, we should:

        - Pass the non-default delimiter ``,``

        - Pass the non-default missing value ``.``

        >>> table = hc.import_table('data/samples2.tsv', delimiter=',', missing='.')

        Let's import annotations from a file with no header and sample IDs that need to be transformed.
        Suppose the vds sample IDs are of the form ``NA#####``. This file has no header line, and the
        sample ID is hidden in a field with other information.

        .. code-block:: text

            $ cat data/samples3.tsv
            1kg_NA12345   female
            1kg_NA12346   male
            1kg_NA12348   female
            pgc_NA23415   male
            pgc_NA23418   male

        To import:

        >>> annotations = (hc.import_table('data/samples3.tsv', no_header=True)
        ...                   .annotate('sample = f0.split("_")[1]')
        ...                   .key_by('sample'))

        **Notes**

        The ``impute`` option tells Hail to scan the file an extra time to gather
        information about possible field types. While this is a bit slower for large files (the
        file is parsed twice), the convenience is often worth this cost.

        The ``delimiter`` parameter is a field separator regex. This regex follows the
        `Java regex standard <http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html>`_.

        .. note::

            Use ``delimiter='\\s+'`` to specify whitespace delimited files.

        The ``comment`` is an optional parameter which causes Hail to skip any line that starts with the
        given pattern. Passing ``comment='#'`` will skip any line beginning with a pound sign, for example.

        The ``missing`` parameter defines the representation of missing data in the table.

        .. note::

            The ``comment`` and ``missing`` parameters are **NOT** regexes.

        The ``no_header`` option indicates that the file has no header line. If this option is passed,
        then the column names will be ``f0``, ``f1``, ... ``fN`` (0-indexed).

        The ``types`` option allows the user to pass the types of columns in the table. This is a
        dict keyed by ``str``, with :py:class:`~hail.expr.Type` values. See the examples above for
        a standard usage. Additionally, this option can be used to override type imputation. For example,
        if a column in a file refers to chromosome and does not contain any sex chromosomes, it will be
        imputed as an integer, while most Hail methods expect chromosome to be passed as a string. Using
        the ``impute=True`` mode and passing ``types={'Chromosome': TString()}`` will solve this problem.

        The ``min_partitions`` option can be used to increase the number of partitions (level of sharding)
        of an imported table. The default partition size depends on file system and a number of other
        factors (including the ``min_block_size`` of the hail context), but usually is between 32M and 128M.

        :param paths: Files to import.
        :type paths: str or list of str

        :param key: Key column(s). ``None`` (the default) is treated as an empty list.
        :type key: str or list of str or None

        :param min_partitions: Minimum number of partitions.
        :type min_partitions: int or None

        :param bool no_header: File has no header and the N columns are named ``f0``, ``f1``, ... ``fN`` (0-indexed)

        :param bool impute: Impute column types from the file

        :param comment: Skip lines beginning with the given pattern
        :type comment: str or None

        :param str delimiter: Field delimiter regex

        :param str missing: Specify identifier to be treated as missing

        :param types: Define types of fields in annotations files. ``None`` (the default) is
            treated as an empty dict.
        :type types: dict with str keys and :py:class:`.Type` values, or None

        :param quote: Quote character
        :type quote: str or None

        :return: Key table constructed from text table.
        :rtype: :class:`.KeyTable`
        """

        # ``None`` defaults are resolved per call so that no list or dict object
        # is shared between calls through the default arguments.
        key = [] if key is None else wrap_to_list(key)
        paths = wrap_to_list(paths)
        # The JVM side receives each type's ``_jtype`` handle, keyed by column name.
        jtypes = {} if types is None else {k: v._jtype for k, v in types.items()}

        jkt = self._jhc.importTable(paths, key, min_partitions, jtypes, comment, delimiter, missing,
                                    no_header, impute, quote)
        return KeyTable(self, jkt)