Ejemplo n.º 1
0
class ImportanceAnalysis(object):
    """Model for random forest based importance analysis.

    Thin Python wrapper: every method delegates to the wrapped Java
    importance-analysis object.
    """

    def __init__(self, _jia, sql):
        """Store the wrapped Java object and the SQL entry point.

        :param _jia: Java-side importance analysis object the methods delegate to.
        :param sql: object whose ``table(name)`` method is used to look up the
            temporary view registered by :meth:`variable_importance`.
        """
        self._jia = _jia
        self.sql = sql

    @params(self=object, limit=Nullable(int))
    def important_variables(self, limit=10):
        """Get the top ``limit`` important variables.

        :param limit: maximum number of variables requested from the Java side.
        :return: list of ``(name, importance)`` tuples sorted by importance,
            highest first, where:
            - name: string - variable name
            - importance: double - gini importance
        """
        jimpvarmap = self._jia.importantVariablesJavaMap(limit)
        return sorted(jimpvarmap.items(), key=lambda x: x[1], reverse=True)

    def oob_error(self):
        """ OOB (Out of Bag) error estimate for the model

        :rtype: float
        """
        return self._jia.oobError()

    def variable_importance(self):
        """Return a DataFrame with the gini importance of variables.

        The DataFrame has two columns:
        - variable: string - variable name
        - importance: double - gini importance

        The result is exposed through a temporary view named ``df``; calling
        this method again replaces that view instead of failing because the
        view already exists.
        """
        jdf = self._jia.variableImportance()
        # NOTE(review): count() presumably forces evaluation of the Java
        # DataFrame before it is registered -- confirm it is still needed.
        jdf.count()
        # createTempView raises when "df" is already registered (e.g. on a
        # second call); createOrReplaceTempView makes the call repeatable.
        jdf.createOrReplaceTempView("df")
        return self.sql.table("df")
Ejemplo n.º 2
0
class ImportanceAnalysis(object):
    """Random forest based importance analysis model.

    Wraps a Java importance-analysis object together with the context
    (``hc``) needed to build result tables.
    """

    def __init__(self, hc, _jia):
        # Keep both collaborators; every query below delegates to `_jia`.
        self._jia = _jia
        self.hc = hc

    @property
    def oob_error(self):
        """OOB (Out of Bag) error estimate for the model.

        :rtype: float
        """
        estimate = self._jia.oobError()
        return estimate

    @params(self=object, n_limit=Nullable(int))
    def important_variants(self, n_limit=1000):
        """Get the top n most important loci.

        :param int n_limit: the limit of the number of loci to return

        :return: A KeyTable with the variant in the first column and
            importance in the second.
        :rtype: :py:class:`hail.KeyTable`
        """
        # Ask the Java side for the ranked variants, then wrap the result.
        jtable = self._jia.variantImportance(n_limit)
        return KeyTable(self.hc, jtable)
Ejemplo n.º 3
0
class ParameterDoc(object):
    """The documentation data of a parameter or return value for an Eluna method."""

    # The integer ranges that each C++ type is valid for. None means valid for all numbers.
    valid_ranges = {
        'float': None,
        'double': None,
        'int': (
            '-2,147,483,647', '2,147,483,647'
        ),  # This should be -32767..32767, but it's pretty safe to assume 32-bit.
        'int8': ('-127', '127'),
        'uint8': ('0', '255'),
        'int16': ('-32,767', '32,767'),
        'uint16': ('0', '65,535'),
        'int32': ('-2,147,483,647', '2,147,483,647'),
        'uint32': ('0', '4,294,967,295'),
    }

    @params(self=object,
            name=unicode,
            data_type=str,
            description=unicode,
            default_value=Nullable(unicode))
    def __init__(self, name, data_type, description, default_value=None):
        """Build the documentation entry, normalising description and data type.

        If `name` is not provided, the Parameter is a returned value instead of a parameter.

        :param name: name of the parameter or returned value.
        :param data_type: C++ type name; numeric types become ``'number'``,
            ``'bool'`` becomes ``'boolean'`` and 64-bit integers become ``'string'``.
        :param description: free text; when non-empty its first letter is
            capitalised, a period is appended and it is rendered as Markdown.
        :param default_value: default value of the parameter, if any.
        """
        self.name = name
        self.data_type = data_type
        self.default_value = default_value

        if description:
            # Capitalize the first letter, add a period, and parse as Markdown.
            # Plain concatenation is used instead of a byte-string
            # '{}{}'.format(...): under Python 2 the latter raises
            # UnicodeEncodeError for a unicode description with non-ASCII text.
            sentence = description[0].capitalize() + description[1:] + '. '
            self.description = markdown.markdown(sentence)
        else:
            self.description = ''

        # If the data type is a C++ number, convert to Lua number and add range info to description.
        # NOTE(review): 'int' has an entry in `valid_ranges` but is not listed
        # here, so it is left unconverted -- confirm whether that is intended.
        if self.data_type in (
                'float', 'double', 'int8', 'uint8', 'int16', 'uint16', 'int32',
                'uint32'):
            # Named `bounds` so the builtin `range` is not shadowed.
            bounds = ParameterDoc.valid_ranges[self.data_type]
            if bounds:
                self.description += '<p><em>Valid numbers</em>: integers from {0} to {1}.</p>'.format(
                    bounds[0], bounds[1])
            else:
                self.description += '<p><em>Valid numbers</em>: all decimal numbers.</p>'

            self.data_type = 'number'

        elif self.data_type == 'bool':
            self.data_type = 'boolean'

        elif self.data_type == 'uint64' or self.data_type == 'int64':
            self.data_type = 'string'
Ejemplo n.º 4
0
    def test_returns_nullable(self):
        """A function declared ``@returns(Nullable(int))`` may return an int or None only."""
        @returns(Nullable(int))
        def identity(value):
            return value

        # Both an int and None satisfy the declared return type.
        for accepted in (1, None):
            identity(accepted)

        # A string return value must be rejected.
        self.assertRaises(TypeError, identity, 'a')
Ejemplo n.º 5
0
    def test_params_nullable_type(self):
        """A parameter declared ``Nullable(int)`` accepts an int or None only."""
        @params(a=Nullable(int))
        def takes_optional_int(a=None):
            pass

        # Both an int and None satisfy the declared parameter type.
        for accepted in (0, None):
            takes_optional_int(accepted)

        # A string argument must be rejected.
        self.assertRaises(TypeError, takes_optional_int, 'a')
Ejemplo n.º 6
0
class FeatureSource(object):
    """Python-side handle on a Java feature source (``_jfs``).

    Holds the JVM gateway, the variant-spark API object and the SQL contexts
    needed to run an importance analysis on the wrapped features.
    """

    def __init__(self, _jvm, _vs_api, _jsql, sql, _jfs):
        # Java-side collaborators.
        self._jvm = _jvm
        self._vs_api = _vs_api
        self._jsql = _jsql
        self._jfs = _jfs
        # Python-side SQL entry point, handed on to the resulting model.
        self.sql = sql

    @params(self=object, label_source=object, n_trees=Nullable(int), mtry_fraction=Nullable(float),
            oob=Nullable(bool), seed=Nullable(Union(int, long)), batch_size=Nullable(int),
            var_ordinal_levels=Nullable(int), max_depth=int, min_node_size=int)
    def importance_analysis(self, label_source, n_trees=1000, mtry_fraction=None,
                            oob=True, seed=None, batch_size=100, var_ordinal_levels=3,
                            max_depth=java.MAX_INT, min_node_size=1):
        """Build a random forest classifier and wrap it as an importance model.

        :param label_source: The ingested label source
        :param int n_trees: The number of trees to build in the forest.
        :param float mtry_fraction: The fraction of variables to try at each split.
        :param bool oob: Should OOB error be calculated.
        :param long seed: Random seed to use. A random long is drawn as the
            fallback value handed to ``java.jlong_or`` (presumably used when
            ``seed`` is None -- confirm against that helper).
        :param int batch_size: The number of trees to build in one batch.
        :param int var_ordinal_levels: forwarded unchanged to the Java
            ``ImportanceAnalysis`` call.
        :param int max_depth: forwarded unchanged to ``RandomForestParams``.
        :param int min_node_size: forwarded unchanged to ``RandomForestParams``.

        :return: Importance analysis model.
        :rtype: :py:class:`ImportanceAnalysis`
        """
        # Resolve the Java parameter class first, then convert the arguments
        # that need a Java-friendly representation.
        rf_params_class = self._jvm.au.csiro.variantspark.algo.RandomForestParams
        use_oob = bool(oob)
        jmtry_fraction = java.jfloat_or(mtry_fraction)
        fallback_seed = randint(java.MIN_LONG, java.MAX_LONG)
        jseed = java.jlong_or(seed, fallback_seed)

        # The literal arguments are positional constructor parameters whose
        # meaning is defined on the Java side.
        jrf_params = rf_params_class(use_oob, jmtry_fraction, True, java.NAN, True,
                                     jseed, max_depth, min_node_size, False, 0)

        jia = self._vs_api.ImportanceAnalysis(
            self._jsql, self._jfs, label_source,
            jrf_params, n_trees, batch_size, var_ordinal_levels)
        return ImportanceAnalysis(jia, self.sql)
Ejemplo n.º 7
0
class VariantsDatasetFunctions(object):
    """Extension to hail.VariantDataset with variant-spark related functions.
    """

    def __init__(self, *args, **kwargs):
        # Reading the attributes before re-assigning them fails early if the
        # VariantDataset fields this class relies on were never initialized.
        self.hc = self.hc
        self._jvds = self._jvds
        # Build the Java bridge object once and keep it for later calls.
        bridge_package = getattr(self.hc._jvm, 'au.csiro.variantspark.hail')
        self._vshf_cache = bridge_package.VSHailFunctions(self._jvds)

    @params(self=object,
            y_expr=str,
            n_trees=Nullable(int),
            mtry_fraction=Nullable(float),
            oob=Nullable(bool),
            seed=Nullable(Union(int, long)),
            batch_size=Nullable(int))
    def importance_analysis(self,
                            y_expr,
                            n_trees=1000,
                            mtry_fraction=None,
                            oob=True,
                            seed=None,
                            batch_size=100):
        """Build a random forest classifier for the response defined by ``y_expr``.

        :param str y_expr: Response expression.  Must evaluate to Boolean or
                numeric with all values 0 or 1.
        :param int n_trees: The number of trees to build in the forest.
        :param float mtry_fraction: The fraction of variables to try at each split.
        :param bool oob: Should OOB error be calculated.
        :param long seed: Random seed to use.
        :param int batch_size: The number of trees to build in one batch.

        :return: Importance analysis model.
        :rtype: :py:class:`ImportanceAnalysis`
        """
        # Optional values are wrapped with `joption` before crossing to Java;
        # a provided seed is widened to `long` first.
        jmtry_fraction = joption(mtry_fraction)
        if seed is None:
            jseed = joption(None)
        else:
            jseed = joption(long(seed))

        jia = self._vshf_cache.importanceAnalysis(
            y_expr, n_trees, jmtry_fraction, oob, jseed, batch_size)
        return ImportanceAnalysis(self.hc, jia)

    @params(self=object, operation_name=str)
    def pairwise_operation(self, operation_name):
        """Compute a pairwise operation on encoded genotypes.

        Currently implemented operations include:

        - `manhattan` : the Manhattan distance
        - `euclidean` : the Euclidean distance
        - `sharedAltAlleleCount`: count of shared alternative alleles
        - `anySharedAltAlleleCount`: count of variants that share at least one alternative allele

        :param operation_name: name of the operation. One of `manhattan`, `euclidean`,
                `sharedAltAlleleCount`, `anySharedAltAlleleCount`

        :return: A symmetric `no_of_samples x no_of_samples` matrix with the result of
                the pairwise computation.
        :rtype: :py:class:`hail.KinshipMatrix`
        """
        jmatrix = self._vshf_cache.pairwiseOperation(operation_name)
        return KinshipMatrix(jmatrix)
Ejemplo n.º 8
0
class ClassParser(object):
    """Line-driven state machine that collects the documented methods of one class.

    Each line given to `next_line` is tried against the regexes that are allowed
    to follow the previously matched one. Completed methods accumulate in
    `self.methods`, and the class description in `self.class_description`.
    """

    # ---- Regexes for the class-level documentation comment ----
    # The start of class documentation, i.e. /***
    class_start_regex = re.compile(r"\s*/\*\*\*")
    # The "body", i.e. a * and optionally some descriptive text.
    class_body_regex = re.compile(r"\s*\*\s*(.*)")
    # The end of the comment portion, i.e. */
    class_end_regex = re.compile(r"\s*\*/")

    # ---- Regexes for method documentation comments ----
    # The start of documentation, i.e. /**
    start_regex = re.compile(r"\s*/\*\*")
    # The "body", i.e. a * and optionally some descriptive text.
    # An extra optional space (\s?) was thrown in to make the pattern differ from
    # `class_body_regex`; `__init__` asserts the two are distinct objects.
    body_regex = re.compile(r"\s*\s?\*\s*(.*)")

    param_regex = re.compile(
        r"""\s*\*\s@param\s  # The @param tag starts with opt. whitespace followed by "* @param ".
                                 ([&\w]+)\s(\w+)  # The data type, a space, and the name of the param.
                                 (?:\s=\s(\w+))?  # The default value: a = surrounded by spaces, followed by text.
                                 (?:\s:\s(.+))?   # The description: a colon surrounded by spaces, followed by text.
                                 """, re.X)
    # This is the same as the @param tag, minus the default value part.
    return_regex = re.compile(
        r"""\s*\*\s@return\s
                                  ([&\w]+)\s(\w+)
                                  (?:\s:\s(.+))?
                                  """, re.X)

    # The end of the comment portion, i.e. */
    comment_end_regex = re.compile(r"\s*\*/")
    # The end of the documentation, i.e. int MethodName(
    end_regex = re.compile(r"\s*int\s(\w+)\s*\(")

    def __init__(self, class_name):
        # The two body regexes are used as separate dictionary keys below, so
        # they must not be the same object.
        assert ClassParser.class_body_regex is not ClassParser.body_regex
        # Name and accumulated description of the class being parsed.
        self.class_name = class_name
        self.class_description = ''
        # Every method parsed so far.
        self.methods = []
        # Start the state machine from its initial state.
        self.reset()

    def reset(self):
        """Return to the initial state and discard the partially built method."""
        # Pieces of the method currently being assembled.
        self.method_name = None
        self.description = ''
        self.params = []
        self.returned = []

        # The last regex that matched decides which ones may match next.
        self.last_regex = None

    def handle_class_body(self, match):
        """Append one line of text to the class description."""
        self.class_description += match.group(1) + '\n'

    def handle_body(self, match):
        """Append one line of text to the current method description."""
        self.description += match.group(1) + '\n'

    def handle_param(self, match):
        """Record a documented parameter of the current method."""
        data_type, name, default, description = match.groups()
        param_doc = ParameterDoc(name, data_type, description, default)
        self.params.append(param_doc)

    def handle_return(self, match):
        """Record a documented return value of the current method."""
        data_type, name, description = match.groups()
        return_doc = ParameterDoc(name, data_type, description)
        self.returned.append(return_doc)

    def handle_end(self, match):
        """Finish the current method and store it in `self.methods`."""
        self.method_name = match.group(1)
        method_doc = MethodDoc(self.method_name, self.description,
                               self.params, self.returned)
        self.methods.append(method_doc)

    # Which handler (if any) processes a match of each regular expression.
    regex_handlers = {
        class_start_regex: None,
        class_body_regex: handle_class_body,
        class_end_regex: None,
        start_regex: None,
        body_regex: handle_body,
        param_regex: handle_param,
        return_regex: handle_return,
        comment_end_regex: None,
        end_regex: handle_end,
    }

    # Which regular expressions may follow the last handled one, in the order tried.
    # `body_regex` must always come LAST when used, since it also matches
    # param, return, and comment_end lines.
    next_regexes = {
        None: [class_start_regex, start_regex, end_regex],
        class_start_regex: [class_end_regex, class_body_regex],
        class_body_regex: [class_end_regex, class_body_regex],
        class_end_regex: [],
        start_regex: [param_regex, return_regex, comment_end_regex, body_regex],
        body_regex: [param_regex, return_regex, comment_end_regex, body_regex],
        param_regex: [param_regex, return_regex, comment_end_regex],
        return_regex: [return_regex, comment_end_regex],
        comment_end_regex: [end_regex],
        end_regex: [],
    }

    @returns(Nullable(MethodDoc))
    @params(self=object, line=str)
    def next_line(self, line):
        """Feed the next line of the file to the state machine.

        The first allowed regex that matches the line is handled and becomes
        the new state. When none matches, the parser is reset. This method
        itself returns None; finished methods are appended to `self.methods`.
        """
        for regex in self.next_regexes[self.last_regex]:
            match = regex.match(line)
            if not match:
                continue

            # Not every regex has a handler, but the state advances regardless.
            handler = self.regex_handlers[regex]
            if handler:
                handler(self, match)
            self.last_regex = regex
            # Only the first matching regex is used.
            break
        else:
            # Nothing valid matched this line: start over.
            self.reset()

    @returns(MangosClassDoc)
    def to_class_doc(self):
        """Build a `MangosClassDoc` from what has been parsed so far.

        Is called by `parse_file` once parsing is finished.
        """
        return MangosClassDoc(self.class_name, self.class_description,
                              self.methods)

    @staticmethod
    @returns(MangosClassDoc)
    @params(file=FileType)
    def parse_file(file):
        """Parse the file `file` into a documented class."""
        # "ClassMethods.h" -> "Class": drop the trailing "Methods.h".
        suffix_length = len('Methods.h')
        parser = ClassParser(file.name[:-suffix_length])

        # Hand every line to the parser until readline signals end of file.
        while True:
            line = file.readline()
            if not line:
                break
            parser.next_line(line)

        return parser.to_class_doc()
Ejemplo n.º 9
0
class ClassParser(object):
    """Parses a file line-by-line and collects methods once enough information is received to build them.

    Each line given to `next_line` is tried against the regexes allowed after
    the previously matched one. Finished methods accumulate in `self.methods`.
    """

    # Various regular expressions to parse different parts of the doc string.
    # These are used to parse the class's description.
    class_start_regex = re.compile(
        r"\s*/\*\*\*")  # The start of class documentation, i.e. /***
    class_body_regex = re.compile(
        r"\s*\*\s*(.*)"
    )  # The "body", i.e. a * and optionally some descriptive text.
    class_end_regex = re.compile(
        r"\s*\*/")  # The end of the comment portion, i.e. */

    # These are used to parse method documentation.
    start_regex = re.compile(
        r"\s*/\*\*")  # The start of documentation, i.e. /**
    body_regex = re.compile(
        r"\s*\s?\*\s?(.*)"
    )  # The "body", i.e. a * and optionally some descriptive text.
    # An extra optional space (\s?) was thrown in to make it different from `class_body_regex`.

    param_regex = re.compile(
        r"""\s*\*\s@param\s    # The @param tag starts with opt. whitespace followed by "* @param ".
                                 ([^\s]+)\s(\w+)?   # The data type, a space, and the name of the param.
                                 (?:\s=\s(\w+))?    # The default value: a = surrounded by spaces, followed by text.
                                 (?:\s:\s(.+))?     # The description: a colon surrounded by spaces, followed by text.
                                 """, re.X)
    # This is the same as the @param tag, minus the default value part.
    return_regex = re.compile(
        r"""\s*\*\s@return\s
                                  ([\[\]\w]+)\s(\w+)
                                  (?:\s:\s(.+))?
                                  """, re.X)
    # NOTE: `handle_proto` uses group 1 as the return values and group 2 (the
    # parenthesised list) as the parameters, i.e. the opposite of what the
    # inline comments inside this pattern say.
    proto_regex = re.compile(
        r"""\s*\*\s@proto\s
                                 ([\w\s,]+)?          # The list of arguments.
                                 (?:=\s)?             # An equals sign and a space separate the args and returns.
                                 (?:\(([\w\s,]+)\))?  # The list of return values, in parens.
                                 """, re.X)

    comment_end_regex = re.compile(
        r"\s*\*/")  # The end of the comment portion, i.e. */
    end_regex = re.compile(
        r"\s*int\s(\w+)\s*\("
    )  # The end of the documentation, i.e. int MethodName(

    def __init__(self, class_name):
        # Both body regexes are used as separate dictionary keys below, so
        # they must not be the same object.
        assert ClassParser.class_body_regex is not ClassParser.body_regex
        # The methods that have been parsed.
        self.methods = []
        # The name of the class being parsed.
        self.class_name = class_name
        # The description of the class being parsed.
        self.class_description = ''
        # Reset the parser's state machine.
        self.reset()

    def reset(self):
        """Return to the initial state and discard the partially built method."""
        # What the last handled regex was, to determine what the next should be.
        self.last_regex = None

        # These are used to piece together the next `Method`.
        self.description = ''
        self.params = []
        self.returned = []
        self.method_name = None
        self.prototypes = []

    def handle_class_body(self, match):
        """Append one line of text to the class description."""
        text = match.group(1)
        self.class_description += text + '\n'

    def handle_body(self, match):
        """Append one line of text to the current method description."""
        text = match.group(1)
        self.description += text + '\n'

    def handle_param(self, match):
        """Record a documented parameter of the current method."""
        data_type, name, default, description = match.groups()
        self.params.append(ParameterDoc(name, data_type, description, default))

    def handle_return(self, match):
        """Record a documented return value of the current method."""
        data_type, name, description = match.groups()
        self.returned.append(ParameterDoc(name, data_type, description))

    def handle_proto(self, match):
        """Record an explicit prototype template from an @proto line.

        The method name is not known yet, so a literal ``{0}`` placeholder is
        left in the template and filled in by `handle_end`.
        """
        return_values, parameters = match.group(1), match.group(2)
        parameters = ' ' + parameters + ' ' if parameters else ''
        return_values = return_values + '= ' if return_values else ''

        if self.class_name == 'Global':
            prototype = '{0}{{0}}({1})'.format(return_values, parameters)
        else:
            prototype = '{0}{1}:{{0}}({2})'.format(return_values,
                                                   self.class_name, parameters)

        self.prototypes.append(prototype)

    def handle_end(self, match):
        """Finish the current method: settle its prototypes and store it.

        Without explicit @proto lines, prototypes are generated from the
        documented parameters: one with all required parameters, then one more
        for each defaulted parameter added in order. If a required parameter
        follows a defaulted one, a single prototype with every parameter is
        generated instead.
        """
        self.method_name = match.group(1)

        def make_prototype(parameters):
            # Pad a non-empty parameter list with spaces inside the parens.
            if parameters != '':
                parameters = ' ' + parameters + ' '

            prototype = '{0}({1})'.format(self.method_name, parameters)
            # Methods of a real class are shown as Class:Method(...).
            if self.class_name != 'Global':
                prototype = '{0}:{1}'.format(self.class_name, prototype)
            if self.returned:
                return_values = ', '.join(
                    [param.name for param in self.returned])
                prototype = '{0} = {1}'.format(return_values, prototype)

            return prototype

        # If there's no prototype, make one with all params and returns.
        if not self.prototypes:
            # True once at least one parameter with a default value was seen.
            seen_default = False
            # How many leading parameters are needed to include the last
            # non-default one. Stays 0 when every parameter has a default, so
            # that the zero-argument prototype is generated as well.
            required_count = 0
            # If False, a parameter WITHOUT a default value follows one WITH a default value.
            # In this case, don't bother generating prototypes.
            simple_order = True

            for i, param in enumerate(self.params):
                if param.default_value:
                    seen_default = True
                else:
                    required_count = i + 1
                    if seen_default:
                        simple_order = False

            if not seen_default or not simple_order:
                # Just generate one prototype with all the parameters.
                parameters = ', '.join([param.name for param in self.params])
                self.prototypes.append(make_prototype(parameters))
            else:
                # Generate a prototype for all the non-default parameters,
                #   then one for each default parameter with all the previous parameters.
                for count in range(required_count, len(self.params) + 1):
                    parameters = ', '.join(
                        [param.name for param in self.params[:count]])
                    self.prototypes.append(make_prototype(parameters))

        else:
            # Format the method name into each prototype.
            self.prototypes = [
                proto.format(self.method_name) for proto in self.prototypes
            ]

        self.methods.append(
            MethodDoc(self.method_name, self.description, self.prototypes,
                      self.params, self.returned))

    # Table of which handler is used to handle each regular expressions.
    regex_handlers = {
        class_start_regex: None,
        class_body_regex: handle_class_body,
        class_end_regex: None,
        start_regex: None,
        body_regex: handle_body,
        param_regex: handle_param,
        return_regex: handle_return,
        proto_regex: handle_proto,
        comment_end_regex: None,
        end_regex: handle_end,
    }

    # Table of which regular expressions can follow the last handled regex.
    # `body_regex` must always come LAST when used, since it also matches param, return, and comment_end.
    next_regexes = {
        None: [class_start_regex, start_regex, end_regex],
        class_start_regex: [class_end_regex, class_body_regex],
        class_body_regex: [class_end_regex, class_body_regex],
        class_end_regex: [],
        start_regex: [
            param_regex, return_regex, proto_regex, comment_end_regex,
            body_regex
        ],
        body_regex: [
            param_regex, return_regex, proto_regex, comment_end_regex,
            body_regex
        ],
        proto_regex: [
            param_regex, return_regex, proto_regex, comment_end_regex,
            body_regex
        ],
        param_regex:
        [param_regex, return_regex, comment_end_regex, body_regex],
        return_regex: [return_regex, comment_end_regex],
        comment_end_regex: [end_regex],
        end_regex: [],
    }

    @returns(Nullable(MethodDoc))
    @params(self=object, line=str)
    def next_line(self, line):
        """Parse the next line of the file.

        The first allowed regex that matches the line is handled and becomes
        the new state; when none matches, the parser is reset. This method
        itself returns None: finished methods are appended to `self.methods`.
        """
        # Get the list of expected regular expressions using the last one handled.
        valid_regexes = self.next_regexes[self.last_regex]

        # Try to find a match.
        for regex in valid_regexes:
            match = regex.match(line)

            if match:
                handler = self.regex_handlers[regex]

                if handler:
                    handler(self, match)

                # Not every regex has a handler, but keep track of where we are anyway.
                self.last_regex = regex
                # Break at the first match.
                break
        else:
            # No valid regex was found, reset everything.
            self.reset()

    @returns(MangosClassDoc)
    def to_class_doc(self):
        """Create an instance of `MangosClassDoc` from the parser's data.

        Is called by `parse_file` once parsing is finished.
        """
        return MangosClassDoc(self.class_name, self.class_description,
                              self.methods)

    @staticmethod
    @returns(MangosClassDoc)
    @params(file=FileType)
    def parse_file(file):
        """Parse the file `file` into a documented class."""
        # Get the class name from "ClassMethods.h" by stripping off "Methods.h".
        class_name = file.name[:-len('Methods.h')]
        parser = ClassParser(class_name)

        # Iterating the file yields the same lines as repeated readline() calls.
        for line in file:
            parser.next_line(line)

        return parser.to_class_doc()
Ejemplo n.º 10
0
class ParameterDoc(object):
    """The documentation data of a parameter or return value for an Eluna method."""

    # The integer ranges that each C++ type is valid for. None means valid for all numbers.
    valid_ranges = {
        'float': None,
        'double': None,
        'int': (
            '-2,147,483,647', '2,147,483,647'
        ),  # This should be -32767..32767, but it's pretty safe to assume 32-bit.
        'int8': ('-127', '127'),
        'uint8': ('0', '255'),
        'int16': ('-32,767', '32,767'),
        'uint16': ('0', '65,535'),
        'int32': ('-2,147,483,647', '2,147,483,647'),
        'uint32': ('0', '4,294,967,295'),
        'int64': ('-9,223,372,036,854,775,808', '9,223,372,036,854,775,807'),
        'uint64': ('0', '18,446,744,073,709,551,615'),
        'ObjectGuid': ('0', '18,446,744,073,709,551,615'),
    }

    @params(self=object,
            name=Nullable(unicode),
            data_type=str,
            description=unicode,
            default_value=Nullable(unicode))
    def __init__(self, name, data_type, description, default_value=None):
        """Build the documentation entry, normalising description and data type.

        :param name: name of the parameter or returned value. May only be
            None when `data_type` is ``'...'``, in which case the name
            becomes ``'...'`` as well.
        :param data_type: type name; every key of `valid_ranges` becomes
            ``'number'`` and ``'bool'`` becomes ``'boolean'``. Any other type
            that is not a basic Lua type and does not start with ``[``
            triggers a printed warning.
        :param description: free text; when non-empty its first letter is
            capitalised, a period is appended and it is rendered as Markdown.
        :param default_value: default value of the parameter, if any.
        """
        self.name = name
        self.data_type = data_type
        self.default_value = default_value

        if self.data_type == '...':
            self.name = '...'
        else:
            assert (self.name is not None)

        if description:
            # Capitalize the first letter, add a period, and parse as Markdown.
            # Plain concatenation is used instead of a byte-string
            # '{}{}'.format(...): under Python 2 the latter raises
            # UnicodeEncodeError for a unicode description with non-ASCII text.
            sentence = description[0].capitalize() + description[1:] + '. '
            self.description = markdown.markdown(sentence)
        else:
            self.description = ''

        # If the data type is a C++ number, convert to Lua number and add range info to description.
        if self.data_type in self.valid_ranges:
            # Named `bounds` so the builtin `range` is not shadowed.
            bounds = self.valid_ranges[self.data_type]
            if bounds:
                self.description += '<p><em>Valid numbers</em>: integers from {0} to {1}.</p>'.format(
                    bounds[0], bounds[1])
            else:
                self.description += '<p><em>Valid numbers</em>: all decimal numbers.</p>'

            self.data_type = 'number'

        elif self.data_type == 'bool':
            self.data_type = 'boolean'

        # NOTE(review): a former branch turned 'int64'/'uint64' into
        # '[int64]'/'[uint64]', but it could never run because both types are
        # keys of `valid_ranges` and are handled as numbers above. It was
        # removed without changing behaviour; confirm which is intended.

        elif self.data_type not in (
                'nil', 'boolean', 'number', 'string', 'table', 'function',
                '...') and self.data_type[:1] != '[':
            # Single parenthesised argument: prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print("Missing angle brackets [] around the data type name: `" + self.data_type + "`")