Example 1
class TranspylerKernel(IPythonKernel):
    """
    A meta-kernel based backend for using transpyled languages in
    Jupyter/IPython.
    """

    transpyler = None
    implementation = lazy(lambda self: 'i' + self.transpyler.name)
    implementation_version = lazy(lambda self: self.transpyler.version)
    language = lazy(lambda self: self.transpyler.name)
    language_version = lazy(lambda self: self.transpyler.language_version)
    banner = lazy(lambda self: self.transpyler.console_banner())
    language_info = lazy(lambda self: self.transpyler.info.get_language_info())
    shell_class = Type(TranspylerShell)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.transpyler is None:
            raise ValueError('transpyler was not defined')
        monkey_patch(self.transpyler)
        self.transpyler.init()

    def do_execute(self, code, *args, **kwargs):
        code = self.transpyler.transpile(code)
        return super().do_execute(code, *args, **kwargs)

    def do_is_complete(self, code):
        return super().do_is_complete(self.transpyler.transpile(code))
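A minimal sketch of how a concrete kernel could be declared, assuming a PyBr transpyler singleton like the one defined in Example 4 below (all names here are illustrative):

class PyBrKernel(TranspylerKernel):
    # Attaching the transpyler singleton is all a concrete kernel needs:
    # the lazy() descriptors above derive implementation, language_info,
    # banner, etc. from it on first access.
    transpyler = PyBr()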
Example 2
class Classifier:
    """
    Abstract classifier interface.
    """

    _label_set = lazy(lambda self: sorted(set(self.labels)))
    _labels_map = lazy(
        lambda self: {label: i for i, label in enumerate(self._label_set)})

    @lazy
    def _labels_encoder(self):
        sorted_items = sorted(self._labels_map.items(),
                              key=lambda item: item[1])
        return [label for label, _ in sorted_items]

    @lazy
    def _integer_labels(self):
        label_map = self._labels_map
        return np.array([label_map[label] for label in self.labels])

    def __init__(self, training_data, transformer, labels):
        self.training_data = np.asarray(training_data)
        self.labels = np.asarray(labels)
        self._transformer = transformer

    def __call__(self, pop):
        """
        self(pop) <==> self.classify(pop)
        """
        return self.classify(pop)

    def transform(self, pop):
        """
        Transform an individual or population to a raw np.ndarray data set.
        """
        if getattr(pop, 'is_individual', False):
            pop = population.Population([pop.data])
        return self._transformer(pop)

    def classify(self, pop):
        """
        Classify a population object.
        """
        is_individual = getattr(pop, 'is_individual', False)
        data = self.transform(pop)
        result = self.classify_data(data)
        return result[0] if is_individual else result

    def classify_data(self, data):
        """
        Classify raw data returning a list of labels.
        """
        raise NotImplementedError

    def prob_matrix(self, pop):
        """
        Return a matrix with the probability that each individual is classified
        with each label. Individuals are represented in the rows and labels
        in the columns.

        Label indexes are assigned by ordering, e.g., if the original labels
        contain 'foo', 'bar' and 'baz', 'bar' will be assigned column index 0
        (because it is the first in alphabetical order), 'baz' will be the
        second column and 'foo' the third.
        """

        logp = self.log_prob_matrix(pop)
        # normalize each row (individual) independently
        logp -= logp.max(axis=-1, keepdims=True)
        probs = np.exp(logp)
        probs /= probs.sum(axis=-1, keepdims=True)
        return probs

    def prob_list(self, pop):
        """
        Return a list of :class:`kpop.Prob` objects with the probabilities
        assigned to each label classification.
        """

        values = self._labels_encoder
        return [Prob(zip(values, row)) for row in self.prob_matrix(pop)]

    def prob_table(self, pop):
        """
        Return a pandas dataframe with the probabilities that each individual
        belongs to each label.
        """
        from pandas import DataFrame

        data = self.prob_matrix(pop)
        return DataFrame(data, columns=self._labels_encoder)

    def log_prob_matrix(self, pop):
        """
        Like :meth:`prob_matrix`, but returns the log probabilities.
        """
        if type(self).prob_matrix is not Classifier.prob_matrix:
            return np.log(self.prob_matrix(pop))

        raise NotImplementedError(
            "either 'log_prob_matrix' or 'prob_matrix' must be defined")

    def log_prob_list(self, pop):
        """
        Like :meth:`prob_list`, but returns the log probabilities.
        """
        values = self._labels_encoder
        return [
            Prob(zip(values, row), normalize=False)
            for row in self.log_prob_matrix(pop)
        ]

    def log_prob_table(self, pop):
        """
        Like :meth:`prob_table`, but returns the log probabilities.
        """
        from pandas import DataFrame

        data = self.log_prob_matrix(pop)
        return DataFrame(data, columns=self._labels_encoder)
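A hedged sketch of a concrete subclass: a nearest-centroid rule that fills in classify_data(). The centroid logic is purely illustrative, not a kpop algorithm; it relies only on the base-class attributes defined above.

import numpy as np


class CentroidClassifier(Classifier):
    """Illustrative subclass: assign the label of the nearest centroid."""

    @lazy
    def _centroids(self):
        # one centroid per label, rows ordered like _label_set
        data = self.training_data
        return np.array([data[self.labels == label].mean(axis=0)
                         for label in self._label_set])

    def classify_data(self, data):
        # Euclidean distance of each row to each centroid; pick the closest
        dists = np.linalg.norm(
            np.asarray(data)[:, None, :] - self._centroids[None, :, :],
            axis=-1)
        return [self._labels_encoder[i] for i in dists.argmin(axis=1)]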
Example 3
class PopulationBase(collections.Sequence, metaclass=abc.ABCMeta):
    """
    Base class for Population and MultiPopulation.

    Attrs:
        freqs:
            A list of :class:`kpop.Prob` objects representing the allele
            probabilities at each locus.
        freqs_matrix:
            A full matrix with the shape (num individuals, max num of alleles)
            with the probability for each allele.
        freqs_vector:
            Frequencies for allele 1. This is more useful for biallelic data,
            since the frequency of the second allele is simply the complement.
        hfreqs_vector:
            Vector of frequencies of heterozygotes.
    """

    # General shape
    size = property(len)
    num_loci = lazy(lambda _: _[0].num_loci)
    ploidy = lazy(lambda _: _[0].ploidy)
    shape = property(lambda _: (_.size, _.num_loci, _.ploidy))
    data_size = fn_property(_.size * _.num_loci * _.ploidy)
    dtype = np.dtype('uint8')
    _shape_attrs = (
        'size',
        'num_loci',
        'ploidy',
        'shape',
        'data_size',
    )

    # Frequencies
    freqs = property(get_freqs, set_freqs)
    freqs_matrix = lazy(lambda _: freqs_to_matrix(_.freqs))
    freqs_vector = lazy(lambda _: np.ascontiguousarray(_.freqs_matrix[:, 0]))
    hfreqs_vector = lazy(hfreqs_vector)

    # Allele statistics
    allele_names = None
    is_biallelic = fn_lazy(_.num_alleles == 2)
    num_alleles = lazy(lambda _: max(max(freq) for freq in _.freqs))

    # Multi population
    is_multi_population = False
    num_populations = fn_property(lambda _: len(_.populations))

    # Missing data
    has_missing_data = property(lambda _: any(ind.has_missing for ind in _))
    missing_data_total = property(
        lambda _: sum(ind.missing_total for ind in _))
    missing_data_ratio = fn_property(_.missing_data_total / _.data_size)

    # Meta information
    individual_ids = property(lambda _: _.meta['ids'])

    # Special attributes. These will be inserted later via monkey patching
    populations = ()
    admixture = Admixture()
    clusterization = Clusterization()
    classification = Classification()
    io = Io()
    plot = Plot()
    projection = Projection()
    simulation = Simulation()
    statistics = Statistics()

    # Aliases
    admix = property(lambda self: self.admixture)
    cls = property(lambda self: self.classification)
    cluster = property(lambda self: self.clusterization)
    proj = property(lambda self: self.projection)
    sim = property(lambda self: self.simulation)
    stats = property(lambda self: self.statistics)

    # List of cacheable attributes
    _cacheable_attributes = (
        'has_missing',
        'missing_total',
        'missing_ratio',
        'is_biallelic',
        'num_alleles',
        'admixture',
        'clustering',
        'classification',
        'io',
        'plot',
        'projection',
        'simulation',
        'statistics',
    )

    def __init__(self,
                 freqs=None,
                 allele_names=None,
                 id=None,
                 ploidy=None,
                 num_loci=None,
                 num_alleles=None,
                 individual_ids=None):

        # Normalize frequencies
        if freqs is None:
            self._freqs = None
        elif len(freqs) == 0:
            raise ValueError('cannot initialize from empty frequencies')
        elif isinstance(freqs[0], collections.Mapping):
            self._freqs = [Prob(p) for p in freqs]
        else:
            freqs = np.asarray(freqs)

            if freqs.ndim == 2:
                self._freqs = [Prob(dict(enumerate(p, 1))) for p in freqs]
                self.freqs_matrix = np.array(freqs)
                self.num_alleles = freqs.shape[1]
            elif freqs.ndim == 1:
                self._freqs = [Prob({1: p, 2: 1 - p}) for p in freqs]
                self.freqs_vector = np.array(freqs)
                self.freqs_matrix = fill_freqs_vector(self.freqs_vector)
                self.num_alleles = 2
            else:
                raise ValueError('invalid frequency data')

        # Fix num_loci from data
        if self._freqs is not None:
            self.num_loci = len(self._freqs)
            if num_loci is not None and num_loci != self.num_loci:
                raise ValueError('invalid value for num_loci')
        elif num_loci is not None:
            self.num_loci = num_loci

        # Individual ids
        if individual_ids is None:
            fmt = 'ind%s' if id is None else '%s%%s' % id
            individual_ids = [fmt % i for i in range(1, self.size + 1)]

        # Save required attributes
        self.allele_names = allele_names
        self.id = id
        self._last_id_index = 0
        self.meta = pd.DataFrame({'ids': individual_ids})

        # Save optional given lazy attributes
        if ploidy is not None:
            self.ploidy = ploidy
        if num_alleles is not None:
            self.num_alleles = num_alleles

    def __repr__(self):
        return self.io.render(id_align='best', limit=20, ind_limit=10)

    def __str__(self):
        return self.io.render(id_align='best')

    def __eq__(self, other):
        if not isinstance(other, PopulationBase):
            return NotImplemented
        if self.shape != other.shape:
            return False
        return all(x == y for x, y in zip(self, other))

    def _population(self, *args, **kwargs):
        from kpop import Population
        return Population(*args, **kwargs)

    def _next_id(self):
        self._last_id_index += 1
        return '%s%s' % (self.id or 'ind', self._last_id_index)

    def _clear_caches(self):
        discard_attrs(self, self._cacheable_attributes)

    def _as_array(self):
        raise NotImplementedError('must be implemented on subclasses')

    def as_array(self, which='raw'):
        """
        Convert to a numpy data array using the requested conversion method.
        This is a basic pre-processing step in many dimensionality reduction
        algorithms.

        Genotypes are categorical data and it usually doesn't make sense to
        treat the integer encoding used in kpop as ordinal data (there is
        no ordering implied when comparing, say, allele 1 vs allele 2 vs
        allele 3).

        Conversion methods:
            * raw:
                A 3-dimensional array of shape (size, num_loci, ploidy) with
                raw genotype data. Each component represents the value of a
                single allele.
            * flat:
                Like raw, but flattens the last dimension into a (size,
                num_loci * ploidy) array. This creates a new feature per
                locus for each degree of ploidy in the data.
            * rflat:
                Flattened data, but the positions of alleles at each locus
                are shuffled first. This is recommended if data does not
                carry reliable haplotype information.
            * raw-unity, flat-unity, rflat-unity:
                Normalized versions of "raw", "flat", and "rflat" methods. All
                components are rescaled with zero mean and unity variance.
            * count:
                Force conversion to biallelic data and count the number of
                occurrences of the first allele. Most methods will require
                normalization, so you should probably consider a specific
                method such as count-unity, count-snp, etc.
            * count-unity:
                Normalized version of count scaled to zero mean and unity
                variance.
            * count-snp:
                Normalizes each feature using the standard deviation expected
                under the assumption of Hardy-Weinberg equilibrium. This
                procedure is described in Patterson et al., "Population
                Structure and Eigenanalysis" and is recommended for SNPs
                subject to genetic drift.
            * count-center:
                Instead of normalizing, simply center data by subtracting half
                the ploidy to place it into a symmetric range. This
                normalization puts data into a cube with a predictable
                origin and range. For diploid data, the components will be
                either -1, 0, or 1.

        Returns:
            An ndarray with transformed data.
        """
        data = self._as_array()

        # Raw conversion
        if which == 'raw':
            return data
        elif which == 'raw-unity':
            data = data - data.mean(axis=0)
            std = data.std(axis=0)
            data /= np.where(std, std, 1)
            return data

        # Flattened representations
        elif which in {'flat', 'flat-unity'}:
            data = data.reshape(self.size, self.num_loci * self.ploidy)
            if which == 'flat-unity':
                return preprocessing.scale(data.astype(float))
            return data
        elif which in {'rflat', 'rflat-unity'}:
            return self.shuffle_loci().as_array(which[1:])

        # Counters
        elif which in {'count', 'count-unity', 'count-snp', 'count-center'}:
            count = (np.array(data) == 1).sum(axis=2)
            if which == 'count-unity':
                return preprocessing.scale(count.astype(float))
            elif which == 'count-snp':
                mu = count.mean(axis=0)
                p = mu / self.ploidy
                norm = np.sqrt(p * (1 - p))
                norm = np.where(norm, norm, 1)
                return (count - mu) / norm
            elif which == 'count-center':
                if self.ploidy % 2:
                    return count - self.ploidy / 2
                else:
                    return count - self.ploidy // 2
            else:
                return count

        raise ValueError('invalid conversion method: %r' % which)

    def drop_non_biallelic(self, **kwargs):
        """
        Creates a new population with all non-biallelic loci removed.

        Returns:
            A (population, removed) tuple with the new population and a list
            of all dropped locus indexes.
        """
        bad_loci = self.statistics.non_biallelic()
        return self.drop_loci(bad_loci, **kwargs), bad_loci

    def force_biallelic(self, **kwargs):
        """
        Return a new population with forced biallelic data.

        If a locus has more than 2 alleles, the most common allele is picked
        as allele 1 and the alternate allele 2 comprises all the other alleles.
        """
        alleles_mapping = [biallelic_mapping(prob) for prob in self.freqs]
        return self.map_alleles(alleles_mapping, **kwargs)

    def sort_by_allele_freq(self, **kwargs):
        """
        Return a new population in which the index attributed to each allele
        at each locus is sorted by its frequency in the population: allele 1
        will be the most common, allele 2 the second most common, and so on.
        """
        alleles_mapping = [sorted_allele_mapping(prob) for prob in self.freqs]
        return self.map_alleles(alleles_mapping, **kwargs)

    @abc.abstractmethod
    def map_alleles(self, alleles_mapping, **kwargs):
        """
        Create new population reorganizing all allele values by the given
        list of allele values mappings.

        Args:
            alleles_mapping:
                A list with num_loci elements. Each element must be a mapping
                from the old allele values to the new ones. If an element is
                an empty dictionary, no remapping is done.
        """
        raise NotImplementedError('must be implemented in subclasses')

    def drop_loci(self, indexes, **kwargs):
        """
        Create a new population with all loci in the given indexes removed.
        """
        indexes = set(indexes)
        keep = np.array([i for i in range(self.num_loci) if i not in indexes])
        return self.keep_loci(keep, **kwargs)

    def drop_individuals(self, indexes, **kwargs):
        """
        Creates a new population removing the individuals in the given
        indexes.
        """
        indexes = set(indexes)
        keep = np.array([i for i in range(self.size) if i not in indexes])
        return self.keep_individuals(keep, **kwargs)

    @abc.abstractmethod
    def keep_loci(self, indexes, **kwargs):
        """
        Creates a new population keeping only the loci in the given indexes.
        """
        raise NotImplementedError('must be implemented in subclasses')

    @abc.abstractmethod
    def keep_individuals(self, indexes, **kwargs):
        """
        Creates a new population keeping only the individuals in the given
        indexes.
        """
        raise NotImplementedError('must be implemented in subclasses')

    def shuffle_loci(self, **kwargs):
        """
        Return a copy with shuffled contents of each locus.
        """

        pop = self.copy(**kwargs)
        for ind in pop:
            for loci in ind.data:
                np.random.shuffle(loci)
        return pop

    def copy(self, id=None):
        """
        Return a copy of the population.
        """

        new = copy.copy(self)
        new.populations = copy.copy(self.populations)
        new._clear_caches()
        if id is not None:
            new.id = id
        return new
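The 'count-snp' branch above implements Patterson et al.'s scaling; the same arithmetic can be reproduced standalone. A sketch, assuming only a (size, num_loci, ploidy) integer array of allele values:

import numpy as np


def count_snp(data, ploidy=2):
    # Count occurrences of allele 1 per individual and locus, then scale
    # by the Hardy-Weinberg standard deviation sqrt(p * (1 - p)).
    count = (np.asarray(data) == 1).sum(axis=2)
    mu = count.mean(axis=0)          # per-locus mean count
    p = mu / ploidy                  # estimated allele-1 frequency
    norm = np.sqrt(p * (1 - p))
    norm = np.where(norm, norm, 1)   # guard loci fixed at p in {0, 1}
    return (count - mu) / norm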
Example 4
class Transpyler(metaclass=SingletonMeta):
    """
    Base class for all new Transpylers.

    A transpyler is a singleton object.

    Very simple Python variations can be created by subclassing Transpyler::

        class PyBr(Transpyler):
            translations = {
                'para': 'for',            # single token translations
                'em': 'in',
                ('para', 'cada'): 'for',  # token sequence translations
                ('faça', ':'): ':',
            }

    Now we can create an object whose exec(), eval() and compile() methods
    handle the newly defined transpyler::

        pybr = PyBr()
        global_ns = {}

        pybr.exec('''
        x, y = 1, 1
        para cada i em [1, 2, 3, 4, 5] faça:
            x, y = y, x + y
        ''', global_ns)

        assert global_ns['x'] == 8
        assert global_ns['y'] == 13
    """

    # Cache builtins
    _compile = _compile
    _exec = _exec
    _eval = _eval
    _input = _input
    _print = _print

    # Factories and subclasses
    lexer_factory = Lexer
    info_factory = Info
    introspection_factory = Introspection
    namespace_factory = Namespace

    # Constants
    lang = 'en'
    has_turtle_functions = False
    turtle_backend = None
    standard_lib = None
    translations = None
    invalid_tokens = None
    language_version = '0.1.0'
    version = '0.1.0'
    codemirror_mode = 'python'
    file_extension = 'py'

    # Language info and introspection
    introspection = lazy(lambda self: self.introspection_factory(self))
    info = lazy(lambda self: self.info_factory(self))
    mimetypes = lazy(lambda self: [self.mimetype])
    mimetype = lazy(lambda self: 'text/x-%s' % self.name)
    link_docs = lazy(lambda self: "http://%s.readthedocs.io/%s/latest/" %
                     (self.name, self.lang))
    link_github = lazy(
        lambda self: "http://github.com/transpyler/%s/" % self.name)
    translate = lazy(lambda self: translator_factory(self.lang))

    # Display messages
    display_name = lazy(lambda self: self.name.title().replace('_', ' '))
    short_banner = lazy(lambda self: self.translate(
        '%s %s\n'
        'Type "help", "copyright" or "license" for more information.' %
        (self.display_name, self.version)))
    long_banner = lazy(lambda self: self.short_banner)
    use_short_banner = True

    # Lexer
    lexer = lazy(lambda self: self.lexer_factory(self))

    @lazy
    def name(self):
        cls_name = self.__class__.__name__.lower()
        if cls_name == 'transpyler':
            return 'transpyler'
        elif cls_name.endswith('transpyler'):
            return cls_name[:-10]
        else:
            return cls_name

    @lazy
    def namespace(self):
        return self.recreate_namespace()

    def __init__(self, **kwargs):
        self._forbidden = False
        for k, v in kwargs.items():
            setattr(self, k, v)
        self._has_init = False

        assert self.name, 'Name cannot be empty'

    def __repr__(self):
        return '<%s: %r>' % (self.__class__.__name__, self.name)

    #
    #  System functions
    #
    def init(self, ns=None):
        """
        Initializes transpyler runtime.

        Args:
            ns (dict):
                A dictionary with extra functions to be added to the
                globals namespace at runtime.
        """

        self.apply_curses()
        self.namespace.update(ns or {})

    def apply_curses(self):
        """
        Apply any required curses.

        Default implementation does nothing.
        """

    def compile(self,
                source,
                filename,
                mode,
                flags=0,
                dont_inherit=False,
                compile_function=None):
        """
        Similar to the built-in function compile() for Transpyled code.

        The additional compile_function() argument allows replacing Python's
        builtin compile().

        Args:
            source (str or code):
                Code to be compiled.
            filename:
                File name associated with code. Use '<input>' for strings.
            mode:
                One of 'exec' or 'eval'. The second allows only simple
                statements that generate a value and is used by the eval()
                function.
            compile_function (callable):
                A possible replacement for Python's built-in compile().
        """

        compile_function = compile_function or _compile
        source = self.transpile(source)
        return compile_function(source, filename, mode, flags, dont_inherit)

    def exec(self, source, globals=None, locals=None, exec_function=None):
        """
        Similar to the built-in function exec() for transpyled code.

        The additional exec_function() argument allows replacing Python's
        builtin exec().

        Args:
            source (str or code):
                Code to be executed.
            globals, locals:
                A globals/locals dictionary
            exec_function (callable):
                A possible replacement for Python's built-in exec().
        """

        exec_function = exec_function or _exec
        code = self.transpile(source) if isinstance(source, str) else source
        globals = {} if globals is None else globals
        globals.update(self.namespace)

        args = (globals, ) if locals is None else (globals, locals)
        return exec_function(code, *args)

    def eval(self, source, globals=None, locals=None, eval_function=None):
        """
        Similar to the built-in function eval() for transpyled code.

        The additional eval_function() argument allows replacing Python's
        builtin eval().

        Args:
            source (str or code):
                Code to be executed.
            globals, locals:
                A globals/locals dictionary
            eval_function (callable):
                A possible replacement for Python's built-in eval().
        """
        eval_function = eval_function or _eval
        code = self.transpile(source) if isinstance(source, str) else source
        globals = {} if globals is None else globals
        globals.update(self.namespace)

        args = (globals, ) if locals is None else (globals, locals)
        return eval_function(code, *args)

    def transpile(self, src):
        """
        Convert source to Python.
        """

        return self.lexer.transpile(src)

    def is_incomplete_source(self, src, filename="<input>", symbol="single"):
        """
        Test if a given source code is incomplete.

        Incomplete code may appear in user interactions when the user is
        typing a multi-line command::

            for x in range(10):
                ... should continue here, but the user already pressed enter!
        """

        try:
            transpiled_src = self.transpile(src)
        except SyntaxError:
            return True
        return codeop.compile_command(transpiled_src, filename, symbol) is None

    @classmethod  # noqa: C901 (it only creates functions on a closure)
    def core_functions(cls):
        """
        Return a dictionary with a small namespace for the core functions in
        the transpyler API:

        * init: init runtime
        * compile: compile a string of source code
        * exec: execute a string of source code
        * eval: evaluate a string of source code and return the resulting object
        * transpile: transpile source code to Python
        * namespace: return a dictionary with builtin functions
        * is_incomplete_source: check if a string of source code is
          incomplete and requires additional lines in order to execute.
        """
        def init(ns=None):
            return cls().init(ns)

        def compile(source,
                    filename,
                    mode,
                    flags=0,
                    dont_inherit=False,
                    compile_function=None):
            return cls().compile(source,
                                 filename,
                                 mode,
                                 flags=flags,
                                 dont_inherit=dont_inherit,
                                 compile_function=compile_function)

        def exec(source, globals=None, locals=None, exec_function=None):
            return cls().exec(
                source,
                globals=globals,
                locals=locals,
                exec_function=exec_function,
            )

        def eval(source, globals=None, locals=None, eval_function=None):
            return cls().eval(
                source,
                globals=globals,
                locals=locals,
                eval_function=eval_function,
            )

        def transpile(src):
            return cls().transpile(src)

        def is_incomplete_source(src, filename="<input>", symbol="single"):
            return cls().is_incomplete_source(src, filename, symbol)

        def namespace(turtle=None):
            """
            Return a dictionary with all public functions.

            If turtle is given and is either 'qt' or 'tk', the corresponding
            turtle functions are included in the namespace.
            """
            transpyler = cls()
            transpyler.has_turtle_functions = turtle is not None
            transpyler.turtle_backend = turtle
            transpyler.init()
            return transpyler.namespace

        # Update docstrings
        init.__doc__ = cls.init.__doc__
        compile.__doc__ = cls.compile.__doc__
        exec.__doc__ = cls.exec.__doc__
        eval.__doc__ = cls.eval.__doc__
        transpile.__doc__ = cls.transpile.__doc__
        is_incomplete_source.__doc__ = cls.is_incomplete_source.__doc__

        return dict(
            init=init,
            compile=compile,
            exec=exec,
            eval=eval,
            transpile=transpile,
            is_incomplete_source=is_incomplete_source,
            namespace=namespace,
        )

    #
    # Console helpers
    #
    def console_banner(self, short=None):
        """
        Return a string with the console banner.
        """
        if short is None:
            short = self.use_short_banner

        if short:
            return self.short_banner
        return getattr(self, 'banner', self.short_banner)

    def recreate_namespace(self):
        """
        Recompute the default namespace for the transpyler object.
        """
        ns = self.namespace_factory(self)
        self.namespace = dict(ns)
        return self.namespace

    #
    # External execution
    #
    def start_console(self, console='auto'):
        """
        Starts a regular Python console with the current transpyler.

        Args:
            console:
                One of 'jupyter', 'console', 'qtconsole' or 'auto'. This
                chooses the console application. The default behavior (auto)
                is to try jupyter and fall back to console if it is not
                available.
        """

        # Select the console application
        if console == 'auto':
            try:
                import IPython  # noqa: F401
            except ImportError:
                console = 'console'
            else:
                console = 'jupyter'

        if console == 'qtconsole':
            from .jupyter import start_jupyter
            start_jupyter(transpyler=self, gui=True)

        elif console == 'jupyter':
            from .jupyter import start_jupyter
            start_jupyter(transpyler=self, gui=False)

        elif console == 'console':
            from .console import start_console
            start_console(transpyler=self)

        else:
            raise ValueError('invalid console: %r' % console)

    def start_notebook(self):
        """
        Starts a jupyter notebook with the current transpyler.
        """

        from .jupyter import start_notebook
        start_notebook(self)

    def start_qturtle(self):
        """
        Starts a QTurtle application with the current transpyler.
        """

        if not has_qt():
            raise SystemExit('PyQt5 is necessary to run the turtle '
                             'application.')

        from qturtle.mainwindow import start_application
        start_application(self)

    def start_main(self):
        """
        Starts the default main application.
        """

        import click

        @click.command()
        @click.option('--cli',
                      '-c',
                      is_flag=True,
                      default=False,
                      help='start gui-less console.')
        @click.option('--console',
                      is_flag=True,
                      default=False,
                      help='start a simple gui-less console.')
        @click.option('--notebook/--no-notebook',
                      '-n',
                      default=False,
                      help='starts notebook server.')
        def main(cli, notebook, console):
            if cli:
                return self.start_console('auto')
            if console:
                return self.start_console('console')
            if notebook:
                return self.start_notebook()

            if has_qt():
                return self.start_qturtle()
            else:
                msg = 'Could not start GUI. Do you have Qt installed?'
                click.echo(msg, err=True)
                return self.start_console('jupyter')

        return main()

    #
    # Callbacks: those methods are designed to be overridden by instances
    #
    def exit_callback(self):
        print('bye!')
        raise SystemExit(0)
Example 5
class A:
    x = lazy(lambda self: 42)
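The fragment above is the minimal lazy pattern. A short sketch of the caching behavior it provides, assuming lazyutils semantics (computed on first access, stored on the instance, and overridable by plain assignment, as the examples above rely on):

from lazyutils import lazy


class A:
    x = lazy(lambda self: 42)


a = A()
assert 'x' not in a.__dict__   # nothing computed at instantiation
assert a.x == 42               # first access runs the lambda
assert 'x' in a.__dict__       # the result is cached on the instance
a.x = 0                        # assignment overrides the lazy value
assert a.x == 0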
Example 6
from lazyutils import lazy
from sidekick import fn

fn_property = lambda x: property(fn(x)._)  # noqa: E731
fn_lazy = lambda x: lazy(fn(x)._)  # noqa: E731
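A hedged illustration of what these one-liners build, assuming sidekick's magic placeholder is importable as `_` (the import path is an assumption; the population examples below use `_` this way): `fn(expr)._` recovers a plain function of one argument, which property() or lazy() then wraps.

from lazyutils import lazy
from sidekick import fn, _  # `_` import path is an assumption

fn_property = lambda x: property(fn(x)._)  # noqa: E731
fn_lazy = lambda x: lazy(fn(x)._)  # noqa: E731


class Rect:
    def __init__(self, w, h):
        self.w, self.h = w, h

    area = fn_property(_.w * _.h)   # recomputed on every access
    half_w = fn_lazy(_.w / 2)       # computed once, then cached


assert Rect(3, 4).area == 12
assert Rect(3, 4).half_w == 1.5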
Example 7
class Feedback(HasProgressMixin, models.TimeStampedModel,
               models.PolymorphicModel):
    """
    Feedback for user.

    Usually there will be one feedback per submission, but this figure may
    vary from case to case.
    """
    TITLE_OK = _('Correct answer!')
    TITLE_PARTIAL = _('Partially correct.')
    TITLE_WRONG = _('Wrong answer.')
    TITLE_NOT_GRADED = _('Not graded.')

    MESSAGE_OK = _('*Congratulations!* Your response is correct!')
    MESSAGE_OK_WITH_PENALTIES = _(
        'Your response is correct, but you did not achieve the maximum grade.'
    )
    MESSAGE_WRONG = _('I\'m sorry. Wrong response!')
    MESSAGE_PARTIAL = _(
        'Your answer is partially correct: you achieved %(grade)d%% of '
        'the total grade.')
    MESSAGE_NOT_GRADED = _('Your response has not been graded yet!')

    submission = models.OneToOneField('Submission', related_name='feedback')
    manual_grading = models.BooleanField(
        default=True,
        help_text=_('True if feedback was created manually by a human.'))
    grader_user = models.ForeignKey(
        models.User,
        blank=True,
        null=True,
        help_text=_('User that performed the manual grading.'))
    given_grade_pc = models.DecimalField(
        _('percentage of maximum grade'),
        help_text=_(
            'This grade is given by the auto-grader and represents the grade '
            'for the response before accounting for any bonuses or penalties.'
        ),
        max_digits=6,
        decimal_places=3,
        validators=[grade_validator],
        blank=True,
        null=True,
    )
    final_grade_pc = models.DecimalField(
        _('final grade'),
        help_text=_(
            'Similar to given_grade, but can account for additional factors '
            'such as delay penalties or for any other reason the teacher may '
            'want to override the student\'s grade.'),
        max_digits=6,
        decimal_places=3,
        validators=[grade_validator],
        blank=True,
        null=True,
    )
    is_correct = models.BooleanField(default=False)
    progress = lazy(lambda x: x.submission.progress)

    def get_feedback_title(self):
        """
        Return a title summarizing the feedback result. The default set of
        titles comes from the list:

            * Correct answer!
            * Partially correct.
            * Wrong answer.
            * Not graded.

        Different question types may define additional values to this list.
        """

        grade = self.given_grade_pc

        if grade == 100:
            return self.TITLE_OK
        elif grade is not None and grade > 0:
            return self.TITLE_PARTIAL
        elif grade == 0:
            return self.TITLE_WRONG
        else:
            return self.TITLE_NOT_GRADED

    def update_autograde(self):
        """
        Compute and set self.given_grade.

        This function may change other states in the feedback object, depending
        on the activity.
        """

        activity = self.activity
        submission = self.submission
        self.given_grade_pc = self.get_given_autograde(submission, activity)

    def get_given_autograde(self, submission, activity):
        """
        Atomic and testable version of update_autograde().

        Subclasses should override this method.

        Args:
            submission: a submission object
            activity: the activity the submission refers to

        Returns:
            A numeric value between 0 and 100 with the assigned grade.
        """

        name = self.__class__.__name__
        raise ImproperlyConfigured(
            'Class %s must implement the .get_given_autograde() '
            'method.' % name)

    def update_final_grade(self):
        """
        Compute final grade applying all possible penalties and bonuses.
        """

        self.final_grade_pc = self.given_grade_pc
        if self.given_grade_pc == 100:
            self.is_correct = True

    def render_message(self, **kwargs):
        """
        Renders feedback message.
        """

        if self.is_correct and self.final_grade_pc >= self.given_grade_pc:
            msg = self.MESSAGE_OK
        elif self.is_correct and self.final_grade_pc < self.given_grade_pc:
            msg = self.MESSAGE_OK_WITH_PENALTIES
        elif not self.is_correct and self.given_grade_pc > 0:
            msg = self.MESSAGE_PARTIAL
        else:
            msg = self.MESSAGE_WRONG
        return p(msg, cls='cs-feedback-message').render(**kwargs)
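The branching in get_feedback_title() is a pure function of given_grade_pc. A standalone sketch of the same decision table (no Django required; title strings inlined for illustration):

def feedback_title(grade):
    # mirrors Feedback.get_feedback_title; the `is not None` guard keeps
    # an ungraded (None) value from ever reaching the `>` comparison
    if grade == 100:
        return 'Correct answer!'
    elif grade is not None and grade > 0:
        return 'Partially correct.'
    elif grade == 0:
        return 'Wrong answer.'
    else:
        return 'Not graded.'


assert feedback_title(100) == 'Correct answer!'
assert feedback_title(60) == 'Partially correct.'
assert feedback_title(0) == 'Wrong answer.'
assert feedback_title(None) == 'Not graded.'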
Example 8
from sidekick import fn
from lazyutils import lazy
import importlib

fn_property = lambda x: property(fn(x)._)
fn_lazy = lambda x: lazy(fn(x)._)


class LazyModule:
    """
    A lazy module object.
    """
    def __init__(self, name):
        self.__path = name
        self.__mod = None

    def __load(self):
        self.__mod = importlib.import_module(self.__path)

    def __getattr__(self, item):
        if self.__mod is None:
            self.__load()
        value = getattr(self.__mod, item)
        setattr(self, item, value)
        return value


def lazy_module(mod):
    """
    Load a lazy module.
    """
    return LazyModule(mod)
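A usage sketch for the helper above: the real import is deferred until the first attribute access, and each accessed attribute is then cached on the proxy.

json = lazy_module('json')                   # no import happens here
assert json.dumps({'a': 1}) == '{"a": 1}'    # first access imports json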
Example 9
class DocTemplate:
    """
    Treat an OpenDocument file (.ods, .odt, etc.) as a Jinja2 template and
    execute a template transformation.
    """

    xmlns = {
        'text': 'urn:oasis:names:tc:opendocument:xmlns:text:1.0',
        'office': 'urn:oasis:names:tc:opendocument:xmlns:office:1.0',
        'table': 'urn:oasis:names:tc:opendocument:xmlns:table:1.0',
    }

    zipfile = lazy(lambda self: zipfile.ZipFile(self.path))

    def xml_tree(self):
        file = self.zipfile.open('content.xml')
        return ET.parse(file)

    def __init__(self, path):
        self.is_closed = False
        self.path = path

    def _check_open(self):
        if self.is_closed:
            raise RuntimeError(
                'operation cannot be performed on a closed file.')

    def render_template(self, namespace):
        """
        Apply the template to content.xml and return the rendered XML.
        """
        self._check_open()

        xml_root = self.xml_tree().getroot()
        non_rendered = [xml_root.find('office:body', self.xmlns)]

        while non_rendered:
            node = non_rendered.pop()
            node[:] = render_node(node, namespace)

        return ET.tounicode(xml_root)

    def render_at(self, namespace, dest):
        """
        Render the template and save the result at the given destination.
        """
        self._check_open()
        data = self.render_template(namespace)

        with zipfile.ZipFile(dest, 'w') as zip:
            for file in self.zipfile.namelist():
                if file == 'content.xml':
                    continue
                with zip.open(file, 'w') as dest_file:
                    with self.zipfile.open(file) as src_file:
                        dest_file.write(src_file.read())

            with zip.open('content.xml', 'w') as F:
                F.write(data.encode('utf8'))

        self.close()

    def close(self):
        """
        Close zipfile and flush all data to disk.
        """
        if not self.is_closed:
            self.zipfile.close()
        self.is_closed = True
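A short usage sketch, assuming 'report.odt' is an OpenDocument file whose content.xml contains Jinja2 markup (the file names are illustrative):

doc = DocTemplate('report.odt')
doc.render_at({'client': 'ACME', 'total': 42}, 'report-rendered.odt')
# render_at() copies every other zip member verbatim, writes the rendered
# content.xml into the destination and closes the template.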
Example 10
class Introspection:
    """
    Introspection facilities for a Transpyler instance.
    """

    #
    # Original python names and constants
    #
    py_constants = ['True', 'False', 'None']

    @lazy
    def py_exceptions(self):
        return [
            name for (name, value) in vars(_builtins).items()
            if isinstance(value, type) and issubclass(value, Exception)
        ]

    @lazy
    def py_types(self):
        return [
            name for (name, value) in vars(_builtins).items()
            if isinstance(value, type) and not issubclass(value, Exception)
        ]

    @lazy
    def py_functions(self):
        return [
            name for (name, value) in vars(_builtins).items()
            if name not in self.py_types and name not in self.py_exceptions
        ]

    @lazy
    def py_builtins(self):
        return self.py_types + self.py_functions

    py_submodules = []
    py_keywords = []

    #
    # Names derived from the transpyler
    #
    namespace = lazy(lambda self: self.transpyler.namespace)
    all_names = lazy(lambda self: list(self.namespace))
    constants = lazy(lambda self: [
        name for (name, value) in self.namespace.items()
        if isinstance(value, (int, float, bool))
    ])
    exceptions = lazy(lambda self: [
        name for (name, value) in self.namespace.items()
        if isinstance(value, type) and issubclass(value, Exception)
    ])
    types = lazy(lambda self: [
        name for (name, value) in self.namespace.items()
        if isinstance(value, type) and not issubclass(value, Exception)
    ])
    functions = lazy(lambda self: [
        name for (name, value) in self.namespace.items()
        if not isinstance(value, type) and callable(value)
    ])
    submodules = lazy(lambda self: [
        name for (name, value) in self.namespace.items()
        if isinstance(value, ModuleType)
    ])
    builtins = lazy(lambda self: self.functions + self.types)
    keywords = lazy(lambda self: self._extract_keywords())

    #
    # Combined lists
    #
    all_constants = lazy(
        lambda self: unique(self.constants + self.py_constants))
    all_exceptions = lazy(
        lambda self: unique(self.exceptions + self.py_exceptions))
    all_types = lazy(lambda self: unique(self.types + self.py_types))
    all_functions = lazy(
        lambda self: unique(self.functions + self.py_functions))
    all_submodules = lazy(
        lambda self: unique(self.submodules + self.py_submodules))
    all_builtins = lazy(lambda self: unique(self.builtins + self.py_builtins))
    all_keywords = lazy(lambda self: unique(self.keywords + self.py_keywords))

    def __init__(self, transpyler):
        self.transpyler = transpyler

    def _extract_keywords(self):
        keywords = set()
        for item in self.transpyler.translations:
            if isinstance(item, str):
                keywords.add(item)
            else:
                keywords.update(item)
        return sorted(keywords)
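The keyword extraction at the bottom flattens single-token and tuple translations into one sorted list. A tiny sketch with an inline stand-in for the transpyler:

from types import SimpleNamespace

fake = SimpleNamespace(translations={'para': 'for', ('para', 'cada'): 'for'})
intro = Introspection(fake)
assert intro._extract_keywords() == ['cada', 'para']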
Example 11
class VoteStats:
    """
    Compute statistics and indexes and perform generic mathematical analysis
    of votes and comments in a conversation.
    """

    n_users = lazy(lambda self: len(self.votes['user'].unique()))
    n_comments = lazy(lambda self: len(self.votes['comment'].unique()))
    shape = property(lambda self: self.votes.shape)

    def __init__(self, votes, n_users=None, n_comments=None):
        if not isinstance(votes, pd.DataFrame):
            votes = list(votes)
            votes = pd.DataFrame(votes, columns=['user', 'comment', 'choice'])

        keys = votes.keys()
        if not ('user' in keys and 'comment' in keys and 'choice' in keys):
            msg = (f'must be a dataframe with "user", "comment" and "choice" '
                   f'columns, got: {list(keys)}')
            raise ValueError(msg)
        self.votes = votes
        if n_users is not None:
            self.n_users = n_users
        if n_comments is not None:
            self.n_comments = n_comments

    @lazy
    def pivot_table(self):
        """
        Dataframe with users as index, comments as columns and votes as
        values.
        """
        votes = self.votes
        return votes.pivot_table(index='user', columns='comment',
                                 values='choice')

    # Dataframes with user and comment statistics
    def _datasets(self, which, n_max):
        data = self.votes
        return dict(
            n_votes=num_votes(data, which),
            n_skip=num_votes(data, which, choice=SKIP),
            n_agree=num_votes(data, which, choice=AGREE),
            n_disagree=num_votes(data, which, choice=DISAGREE),
            n_max=n_max,
            avg_all=average_vote(data, which),
            avg_valid=average_vote(data, which, drop_skip=True),
        )

    def comments(self, **kwargs):
        """
        Return a dataframe with information about comments.
        """
        kwargs = dict(self._datasets('comment', self.n_users), **kwargs)
        return base_stats(**kwargs)

    def users(self, **kwargs):
        """
        Return a dataframe with statistics about users.
        """
        data = self._datasets('user', self.n_comments)
        kwargs = dict(data, **kwargs)
        return base_stats(**kwargs)
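A minimal usage sketch: votes may be any iterable of (user, comment, choice) triples and are normalized into a dataframe on construction.

votes = [('ana', 1, 1), ('ana', 2, -1), ('bob', 1, 1)]
stats = VoteStats(votes)
assert stats.n_users == 2 and stats.n_comments == 2
print(stats.pivot_table)   # users as rows, comments as columns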
Example 12
class PopulationBase(collections.Sequence, metaclass=abc.ABCMeta):
    """
    Base class for Population and MultiPopulation.

    Attrs:
        freqs:
            A list of :class:`kpop.Prob` objects representing the allele
            probabilities at each locus.
        freqs_matrix:
            A full matrix with the shape (num individuals, max num of alleles)
            with the probability for each allele.
        freqs_vector:
            Frequencies for allele 1. This is more useful for biallelic data,
            since the frequency of the second allele is simply the complement.
        hfreqs_vector:
            Vector of frequencies of heterozygotes.
    """

    # General shape
    size = property(len)
    num_loci = lazy(lambda _: _[0].num_loci)
    ploidy = lazy(lambda _: _[0].ploidy)
    shape = property(lambda _: (_.size, _.num_loci, _.ploidy))
    data_size = fn_property(_.size * _.num_loci * _.ploidy)
    dtype = lazy(lambda _: np.dtype('uint8'))
    _shape_attrs = (
        'size',
        'num_loci',
        'ploidy',
        'shape',
        'data_size',
    )

    # Frequencies
    freqs = property(get_freqs, set_freqs)
    freqs_matrix = lazy(lambda _: freqs_to_matrix(_.freqs))
    freqs_vector = lazy(lambda _: np.ascontiguousarray(_.freqs_matrix[:, 0]))
    hfreqs_vector = lazy(hfreqs_vector)

    # Allele statistics
    allele_names = None
    is_biallelic = fn_lazy(_.num_alleles == 2)
    num_alleles = lazy(lambda _: max(max(freq) for freq in _.freqs))

    # Multi population
    is_multi_population = False
    num_populations = fn_property(lambda _: len(_.populations))

    # Missing data
    has_missing_data = property(lambda _: any(ind.has_missing for ind in _))
    missing_data_total = property(
        lambda _: sum(ind.missing_data_total for ind in _))
    missing_data_ratio = fn_property(_.missing_data_total / _.data_size)

    # Meta information
    individual_ids = lazy(lambda _: list(_.meta.index))

    # Special attributes. These will be inserted later via monkey patching
    populations = ()
    admixture = Admixture()
    clusterization = Clusterization()
    classification = Classification()
    io = Io()
    plot = Plot()
    projection = Projection()
    simulation = Simulation()
    statistics = Statistics()

    # Aliases
    admix = property(lambda self: self.admixture)
    cls = property(lambda self: self.classification)
    cluster = property(lambda self: self.clusterization)
    proj = property(lambda self: self.projection)
    sim = property(lambda self: self.simulation)
    stats = property(lambda self: self.statistics)

    # List of cacheable attributes
    _cacheable_attributes = (
        'has_missing',
        'missing_total',
        'missing_ratio',
        'is_biallelic',
        'num_alleles',
        'admixture',
        'clustering',
        'classification',
        'io',
        'plot',
        'projection',
        'simulation',
        'statistics',
    )

    @classmethod
    def random(cls,
               size=0,
               num_loci=0,
               alleles=2,
               ploidy=2,
               id=None,
               seed=None):
        """
        Creates a new random population.

        Args:
            size:
                Number of individuals. If a list of numbers is given, creates
                a Multipopulation object with sub-populations of the assigned
                sizes.
            num_loci:
                Number of loci in the genotype.
            alleles:
                Number of alleles for all loci.
            ploidy:
                Ploidy of genotype.
            id:
                Optional population id used to name individuals.
            seed:
                Random seed for reproducible results.

        Returns:
            A new population object.
        """
        if num_loci <= 0:
            raise ValueError('num_loci must be at least one!')

        is_multipopulation = isinstance(size, collections.Sequence)
        sizes = [size] if not is_multipopulation else size
        seeds = get_seeds(len(sizes), seed)

        # Create frequencies and data
        all_data = []
        all_freqs = [
            random_frequencies(num_loci, alleles, seed=k) for k in seeds
        ]
        for pre_seed, freqs, size in zip(seeds, all_freqs, sizes):
            data = []
            ind_seeds = get_seeds(size, pre_seed)
            for seed in ind_seeds:
                ind = random_individual_data(freqs, ploidy=ploidy, seed=seed)
                data.append(ind)
            all_data.append(np.array(data))

        # Return population
        if is_multipopulation:
            sub_populations = []
            for i in range(len(sizes)):
                id_i = None if id is None else '%s%s' % (id, i + 1)
                pop = kpop.Population(all_data[i],
                                      freqs=all_freqs[i],
                                      id=id_i,
                                      num_loci=num_loci,
                                      num_alleles=alleles,
                                      ploidy=ploidy)
                sub_populations.append(pop)
            return kpop.MultiPopulation(sub_populations, id=id)
        else:
            return kpop.Population(all_data[0],
                                   freqs=all_freqs[0],
                                   id=id,
                                   num_loci=num_loci,
                                   num_alleles=alleles,
                                   ploidy=ploidy)

    def __init__(self,
                 freqs=None,
                 allele_names=None,
                 id=None,
                 ploidy=None,
                 num_loci=None,
                 num_alleles=None):

        # Normalize frequencies
        self._init_freqs(freqs)

        # Fix num_loci from data
        if self._freqs is not None:
            self.num_loci = len(self._freqs)
            if num_loci is not None and num_loci != self.num_loci:
                raise ValueError('invalid value for num_loci')
        elif num_loci is not None:
            self.num_loci = num_loci

        # Save required attributes
        self.allele_names = allele_names
        self.id = id

        # Save optional given lazy attributes
        if ploidy is not None:
            self.ploidy = ploidy
        if num_alleles is not None:
            self.num_alleles = num_alleles

    def _init_freqs(self, freqs):
        if freqs is None:
            self._freqs = None
        elif len(freqs) == 0:
            raise ValueError('cannot initialize from empty frequencies')
        elif isinstance(freqs[0], collections.Mapping):
            self._freqs = [Prob(p) for p in freqs]
        else:
            freqs = np.asarray(freqs)

            if freqs.ndim == 2:
                self._freqs = [Prob(dict(enumerate(p, 1))) for p in freqs]
                self.freqs_matrix = np.array(freqs)
                self.num_alleles = freqs.shape[1]
            elif freqs.ndim == 1:
                self._freqs = [Prob({1: p, 2: 1 - p}) for p in freqs]
                self.freqs_vector = np.array(freqs)
                self.freqs_matrix = fill_freqs_vector(self.freqs_vector)
                self.num_alleles = 2
            else:
                raise ValueError('invalid frequency data')

    def __repr__(self):
        return self.io.render(max_loci=20, max_ind=10)

    def __str__(self):
        return self.io.render()

    def __eq__(self, other):
        if not isinstance(other, PopulationBase):
            return NotImplemented
        if self.shape != other.shape:
            return False
        return all(x == y for x, y in zip(self, other))

    def __getitem__(self, idx):
        if isinstance(idx, int):
            return self._getitem_by_index(idx)
        elif isinstance(idx, str):
            return self._getitem_by_label(idx)
        elif isinstance(idx, slice):
            return self._getslice(idx)
        elif isinstance(idx, np.ndarray) and idx.dtype.kind == 'i':
            return self.keep_individuals(idx)
        elif isinstance(idx, np.ndarray) and idx.dtype.kind == 'b':
            idx = np.arange(self.size)[idx]
            return self.keep_individuals(idx)
        else:
            typename = idx.__class__.__name__
            raise TypeError('invalid index type: %s' % typename)

    def _getitem_by_label(self, key):
        idx = self.meta.index.get_loc(key)
        return self._getitem_by_index(idx)

    def _getitem_by_index(self, idx):
        raise NotImplementedError

    def _getslice(self, slice):
        item = self._getitem_by_index
        data = [item(i) for i in range(*slice.indices(self.size))]
        return kpop.Population(data, id=self.id)

    def _population(self, *args, **kwargs):
        from kpop import Population
        return Population(*args, **kwargs)

    def _clear_caches(self):
        discard_attrs(self, self._cacheable_attributes)

    def _as_array(self):
        raise NotImplementedError('must be implemented on subclasses')

    def as_array(self, which='raw'):
        """
        Convert to a numpy data array using the requested conversion method.
        This is a basic pre-processing step in many dimensionality reduction
        algorithms.

        Genotypes are categorical data and it usually doesn't make sense to
        treat the integer encoding used in kpop as ordinal data (there is
        no ordering implied when comparing, say, allele 1 vs allele 2 vs
        allele 3).

        Conversion methods:
            * raw:
                A 3-dimensional array of shape (size, num_loci, ploidy) with
                raw genotype data. Each component represents the value of a
                single allele.
            * flat:
                Like raw, but flattens the last dimension into a (size,
                num_loci * ploidy) array. This creates a new feature per
                locus for each degree of ploidy in the data.
            * rflat:
                Flattened data, but the positions of alleles at each locus
                are shuffled first. This is recommended if data does not
                carry reliable haplotype information.
            * raw-norm, flat-norm, rflat-norm:
                Normalized versions of "raw", "flat", and "rflat" methods. All
                components are rescaled with zero mean and unity variance.
            * count:
                Force conversion to biallelic data and count the number of
                occurrences of the first allele. Most methods will require
                normalization, so you should probably consider a specific
                method such as count-norm, count-snp, etc.
            * count-norm:
                Normalized version of count scaled to zero mean and unity
                variance.
            * count-snp:
                Normalizes each feature using the standard deviation expected
                under the assumption of Hardy-Weinberg equilibrium. This
                procedure is described in Patterson et al., "Population
                Structure and Eigenanalysis" and is recommended for SNPs
                subject to genetic drift.
            * count-center:
                Instead of normalizing, simply center data by subtracting half
                the ploidy to place it into a symmetric range. This
                normalization puts data into a cube with a predictable
                origin and range. For diploid data, the components will be
                either -1, 0, or 1.

        Returns:
            An ndarray with transformed data.
        """
        data_converter = DataConverter(self._as_array())
        return data_converter(which)

    def find_non_biallelic(self):
        """
        Find the indexes of all non-biallelic loci in the population.
        """
        return self.statistics.non_biallelic()

    def drop_non_biallelic(self, **kwargs):
        """
        Creates a new population removing all non-biallelic loci.
        """
        bad_loci = self.find_non_biallelic()
        return self.drop_loci(bad_loci, **kwargs)

    def force_biallelic(self, **kwargs):
        """
        Return a new population with forced biallelic data.

        If a locus has more than 2 alleles, the most common allele is picked
        as allele 1 and the alternate allele 2 comprises all the other alleles.
        """
        alleles_mapping = [biallelic_mapping(prob) for prob in self.freqs]
        return self.map_alleles(alleles_mapping, **kwargs)

    def sort_by_allele_freq(self, **kwargs):
        """
        Return a new population in which the index attributed to each allele
        at each locus is sorted by its frequency in the population: allele 1
        will be the most common, allele 2 the second most common, and so on.
        """
        alleles_mapping = [sorted_allele_mapping(prob) for prob in self.freqs]
        return self.map_alleles(alleles_mapping, **kwargs)

    @abc.abstractmethod
    def map_alleles(self, alleles_mapping, **kwargs):
        """
        Create new population reorganizing all allele values by the given
        list of allele values mappings.

        Args:
            alleles_mapping:
                A list with num_loci elements. Each element must be a mapping
                from the old allele values to the new ones. If an element is
                an empty dictionary, no remapping is done.
        """
        raise NotImplementedError('must be implemented in subclasses')

    def drop_loci(self, indexes, **kwargs):
        """
        Create a new population with all loci in the given indexes removed.
        """
        indexes = set(indexes)
        keep = np.array([i for i in range(self.num_loci) if i not in indexes])
        return self.keep_loci(keep, **kwargs)

    def drop_individuals(self, indexes, **kwargs):
        """
        Creates a new population removing the individuals in the given
        indexes.
        """
        indexes = set(indexes)
        keep = np.array([i for i in range(self.size) if i not in indexes])
        return self.keep_individuals(keep, **kwargs)

    def find_missing_data(self, axis=0, thresh=0.0):
        """
        Return the indexes of all individuals or loci that have a proportion
        of missing data higher than the given threshold.

        Args:
            axis (0 or 1):
                If axis=0 or 'individuals' (default), scan individuals whose
                proportion of missing data exceeds the threshold. If axis=1
                or 'loci', scan loci instead.
            thresh (float, between 0 and 1):
                The maximum proportion of missing data tolerated.

        Returns:
            An array of indexes.
        """

        missing = self._as_array() == 0

        if axis in (0, 'individuals'):
            mask = np.mean(missing, axis=(1, 2)) > thresh
            return np.arange(self.size)[mask]
        elif axis in (1, 'loci'):
            mask = np.mean(missing, axis=(0, 2)) > thresh
            # the mask has one entry per locus, so index into num_loci
            return np.arange(self.num_loci)[mask]
        else:
            raise ValueError('invalid value for axis: %r' % axis)

    def drop_missing_data(self, axis=0, thresh=0.0, **kwargs):
        """
        Drop all individuals or loci that have a proportion of missing data
        higher than the given threshold.

        Args:
            axis (0 or 1):
                If axis=0 or 'individuals' (default), drop individuals whose
                proportion of missing data exceeds the threshold. If axis=1
                or 'loci', drop loci instead.
            thresh (float, between 0 and 1):
                The maximum proportion of missing data tolerated.

        Returns:
            A new population.
        """
        indexes = self.find_missing_data(axis, thresh)
        if axis in (0, 'individuals'):
            return self.drop_individuals(indexes, **kwargs)
        else:
            return self.drop_loci(indexes, **kwargs)

    @abc.abstractmethod
    def keep_loci(self, indexes, **kwargs):
        """
        Creates a new population keeping only the loci in the given indexes.
        """
        raise NotImplementedError('must be implemented in subclasses')

    @abc.abstractmethod
    def keep_individuals(self, indexes, **kwargs):
        """
        Creates a new population keeping only the individuals in the given
        indexes.
        """
        raise NotImplementedError('must be implemented in subclasses')

    def shuffle_loci(self, **kwargs):
        """
        Return a copy with shuffled contents of each locus.
        """

        pop = self.copy(**kwargs)
        for ind in pop:
            for loci in ind.data:
                np.random.shuffle(loci)
        return pop

    def copy(self, id=None):
        """
        Return a copy of the population.
        """

        new = copy.copy(self)
        new.populations = copy.copy(self.populations)
        new._clear_caches()
        if id is not None:
            new.id = id
        return new
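The missing-data scan in find_missing_data() treats allele value 0 as missing. The per-individual and per-locus fractions can be reproduced standalone (a sketch of the arithmetic, not the kpop API):

import numpy as np

data = np.array([[[1, 2], [0, 0]],    # individual 0: second locus missing
                 [[1, 1], [2, 2]]])   # individual 1: complete
missing = data == 0
by_individual = missing.mean(axis=(1, 2))   # fraction missing per individual
by_locus = missing.mean(axis=(0, 2))        # fraction missing per locus
assert list(by_individual) == [0.5, 0.0]
assert list(by_locus) == [0.0, 0.5]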