コード例 #1
0
ファイル: configuration.py プロジェクト: xrotwang/BEASTling
    def build_language_filter(self):
        """
        Identify the languages that carry too little data to be analysed.

        For every language identifier that appears in the data of at least
        one model, the number of non-empty datapoints (summed over all models)
        is divided by a common denominator: the sum, over all models, of the
        largest number of features recorded for any single language.
        Languages whose resulting proportion lies below
        ``self.languages.minimum_data`` are stored in
        ``self.sparse_languages``.

        A warning is logged when ``self.languages.families`` has exactly one
        entry.
        """
        # A single entry in 'families' is treated as suspicious
        if len(self.languages.families) == 1:
            log.warning(
                "value of 'families' has length 1: have you misspelled a filename?"
            )

        # Every language identifier seen in any model's data
        known_langs = set(
            itertools.chain(*[m.data.keys() for m in self.models]))

        # Denominator: for each model the size of its fullest language record
        total_slots = 0
        for m in self.models:
            total_slots += max([len(record.keys()) for record in m.data.values()])

        # Proportion of non-empty datapoints per language
        coverage = {}
        for lang_id in known_langs:
            filled = sum(
                1
                for m in self.models
                for value in m.data[lang_id].values()
                if value
            )
            coverage[lang_id] = float(filled) / total_slots

        # Enforce minimum data constraint
        sparse = []
        for lang_id in known_langs:
            if coverage[lang_id] < self.languages.minimum_data:
                sparse.append(lang_id)
        self.sparse_languages = sparse
コード例 #2
0
ファイル: basemodel.py プロジェクト: lmaurits/BEASTling
    def remove_unwanted_features(self):
        """
        Drop features that should not take part in the analysis.

        A feature is removed from ``self.features`` (and from every language
        record in ``self.data``) when

        * ``self.valuecounts`` reports no values for it,
        * the percentage of non-missing data, derived from
          ``self.missing_ratios``, is below ``self.minimum_data``, or
        * it has exactly one value and ``self.remove_constant_features`` is
          set.

        Constant features that are kept set ``self.constant_feature``.  The
        surviving features are sorted in place.

        Raises ValueError if no features are left.
        """
        rejected = []
        for feat in self.features:
            # No datapoints at all
            if self.valuecounts[feat] == 0:
                log.info(
                    "Feature %s excluded because there are no datapoints for selected languages."
                    % feat,
                    model=self)
                rejected.append(feat)
                continue

            # Too much missing data
            ratio_missing = self.missing_ratios[feat]
            if int(100 * (1.0 - ratio_missing)) < self.minimum_data:
                log.info(
                    "Feature %s excluded because of excessive missing data (%d%%)."
                    % (feat, int(ratio_missing * 100)),
                    model=self)
                rejected.append(feat)
                continue

            # Everything below concerns constant features only
            if self.valuecounts[feat] != 1:
                continue
            if not self.remove_constant_features:
                # Kept, but remembered for the rate variation warning below
                self.constant_feature = True
                continue
            self.constant_feature_removed = True
            log.info(
                "Feature %s excluded because its value is constant across selected "
                "languages.  Set \"remove_constant_features=False\" in config to stop "
                "this." % feat,
                model=self)
            rejected.append(feat)

        # Purge the rejected features from the feature list and the data
        for feat in rejected:
            self.features.remove(feat)
            for lang in self.languages:
                record = self.data[lang]
                if feat in record:
                    record.pop(feat)

        # Make sure there's something left
        if not self.features:
            raise ValueError("No features specified for model %s!" % self.name)
        self.features.sort()
        log.info("Using %d features from data source %s" %
                 (len(self.features), self.data_filename),
                 model=self)
        if self.constant_feature and self.rate_variation:
            log.warning(
                "Rate variation enabled with constant features retained in data. "
                "This *may* skew rate estimates for non-constant features.",
                model=self)
コード例 #3
0
ファイル: basemodel.py プロジェクト: lmaurits/BEASTling
 def process(self):
     """
     Reduce the data set to the languages and features that are compatible
     with the settings, then prepare rates and weights.

     The helper steps run in a fixed order.  ``self.all_rates`` becomes the
     sorted distinct partition names when a rate partition exists, or the
     feature list when rate variation or feature rates are in use.  When a
     rate partition is present without fixed rates or rate variation, a
     warning is logged and ``self.rate_variation`` is switched on.
     """
     for step_name in ("apply_language_filter",
                       "compute_feature_properties",
                       "remove_unwanted_features",
                       "load_rate_partition"):
         getattr(self, step_name)()

     # Decide which names rates are attached to
     if self.rate_partition:
         partition_names = set(self.rate_partition.values())
         self.all_rates = sorted(partition_names)
     elif self.rate_variation or self.feature_rates:
         self.all_rates = self.features

     self.load_feature_rates()

     # A partition without fixed rates only makes sense if rates are estimated
     if (self.rate_partition
             and not self.feature_rates
             and not self.rate_variation):
         log.warning(
             "Estimating rates for feature partitions because no fixed rates "
             "were provided, is this what you wanted?  Use rate_variation=True to make "
             "this implicit.",
             model=self)
         self.rate_variation = True

     self.compute_weights()
     if self.pruned:
         log.dependency("Pruned trees", "BEASTlabs", model=self)
コード例 #4
0
ファイル: configuration.py プロジェクト: xrotwang/BEASTling
    def load_glottolog_data(self):
        """
        Read the Glottolog classification and geographic metadata.

        The classification is parsed from the newick data of the configured
        Glottolog release and stored in ``self.classifications``.  The
        geographic table then fills ``self.glotto_macroareas`` and
        ``self.locations``, keyed by glottocode and by each ISO code of a row.
        Dialects finally take over the location of their nearest ancestor
        that has one.

        Nothing happens when the analysis does not need Glottolog; a warning
        is logged if the data was loaded before.
        """
        # Don't load if the analysis doesn't use it
        if not self.check_glottolog_required():
            return
        # Guard against loading twice
        if self.glottolog_loaded:
            log.warning('Glottolog data has already been loaded')
            return
        self.glottolog_loaded = True

        newick_source = str(
            get_glottolog_data('newick', self.admin.glottolog_release))
        parsed = monophyly.classifications_from_newick(newick_source)
        self.classifications, glottocode2node, label2name = parsed

        # First pass over the geographic metadata
        dialect_rows = []
        geo_rows = reader(
            get_glottolog_data('geo', self.admin.glottolog_release),
            dicts=True)
        for row in geo_rows:
            ids = [row['glottocode']] + row['isocodes'].split()
            if row['level'] == "dialect":
                dialect_rows.append((row, ids))

            area = row['macroarea']
            if area:
                for id_ in ids:
                    self.glotto_macroareas[id_] = area

            if row['latitude'] and row['longitude']:
                point = (float(row['latitude']), float(row['longitude']))
                for id_ in ids:
                    self.locations[id_] = point

        # Second pass: dialects inherit the location of the closest ancestor
        # for which a location is known
        for row, ids in dialect_rows:
            code = row['glottocode']
            if code not in glottocode2node:  # pragma: no cover
                # This may only happen for newick downloads of older Glottolog releases, where
                # possibly isolates may not be included.
                continue
            ancestor = glottocode2node[code].ancestor
            while True:
                anchor = label2name[ancestor.name][1]
                if anchor in self.locations:
                    inherited = self.locations[anchor]
                    for id_ in ids:
                        self.locations[id_] = inherited
                    break
                if not ancestor.ancestor:
                    # Reached the root without finding any located ancestor;
                    # leave this dialect untouched
                    break
                ancestor = ancestor.ancestor
コード例 #5
0
 def handle_monophyly(self):
     """
     Construct a representation of the Glottolog monophyly constraints
     for the languages in self.languages.languages and store it as a Newick
     string in self.languages.monophyly_newick.

     Nothing is done when monophyly is switched off or a Newick string is
     already present.  Monophyly is switched off when fewer than three
     languages are analysed, or when the resulting structure is flat, i.e.
     all languages are classified identically.

     Languages that are missing from self.classifications are reported in a
     warning.

     Raises:
         ValueError: if no explicit end depth is configured and
             self.languages.monophyly_direction is neither "top_down" nor
             "bottom_up".
     """
     if (not self.languages.monophyly) or self.languages.monophyly_newick:
         return
     n_langs = len(self.languages.languages)
     if n_langs < 3:
         # Monophyly constraints are meaningless for so few languages
         self.languages.monophyly = False
         log.info(
             "Disabling Glottolog monophyly constraints because there are only %d languages in "
             "analysis." % n_langs)
         return

     # Only languages known to the classification can be constrained
     langs = [l for l in self.languages.languages if l.lower() in self.classifications]
     if len(langs) != n_langs:
         # Warn the user that some taxa aren't in Glottolog and hence will be
         # forced into an outgroup.
         known = set(langs)
         missing_langs = sorted(
             l for l in self.languages.languages if l not in known)
         missing_str = ",".join(missing_langs[0:3])
         missing_count = len(missing_langs)
         if missing_count > 3:
             missing_str += ",..."
         log.warning(
             "%d languages could not be found in Glottolog (%s). Monophyly constraints will "
             "force them into an outgroup." % (missing_count, missing_str))

     # Work out the range of classification depths to use
     if self.languages.monophyly_end_depth is not None:
         # A power user has explicitly provided start and end depths
         start = self.languages.monophyly_start_depth
         end = self.languages.monophyly_end_depth
     elif self.languages.monophyly_direction == "top_down":
         # Compute start and end in a top-down fashion
         start = self.languages.monophyly_start_depth
         end = start + self.languages.monophyly_levels
     elif self.languages.monophyly_direction == "bottom_up":
         # Compute start and end in a bottom-up fashion
         deepest = max(
             len(self.classifications[name.lower()]) for name in langs)
         end = deepest - self.languages.monophyly_start_depth
         start = max(0, end - self.languages.monophyly_levels)
     else:
         # Without this branch start/end would be unbound further down
         raise ValueError(
             "Unknown monophyly_direction %r: expected \"top_down\" or \"bottom_up\"."
             % (self.languages.monophyly_direction,))

     struct = monophyly.make_structure(self.classifications, langs, depth=start, maxdepth=end)
     # Make sure this struct is not pointlessly flat
     if not monophyly.check_structure(struct):
         self.languages.monophyly = False
         log.info(
             "Disabling Glottolog monophyly constraints because all languages in the analysis "
             "are classified identically.")
     # Serialise the "monophyly structure" into a Newick tree.
     # NOTE(review): this also runs when monophyly has just been disabled
     # above; kept as is, confirm whether an early return is intended.
     self.languages.monophyly_newick = monophyly.make_newick(struct)
コード例 #6
0
ファイル: basemodel.py プロジェクト: xrotwang/BEASTling
    def build_feature_filter(self):
        """
        Create the self.feature_filter attribute, a set of feature names,
        and normalise the reconstruction settings.

        * ``self.features == ["*"]`` is expanded to every feature present in
          ``self.data``; features listed in ``self.exclusions`` are dropped.
        * ``self.reconstruct == ["*"]`` becomes a copy of the feature list;
          otherwise unknown features are reported in a warning and removed.
        * ``self.reconstruct_at == ["*"]`` selects tree-wide reconstruction;
          an explicit location must name a configured language group; with
          no location given, reconstruction defaults to ``["root"]``.

        Raises:
            ValueError: if more than one reconstruction location is given.
            KeyError: if the reconstruction location is not a key of
                ``self.config.language_group_configs``.
        """
        if self.features == ["*"]:
            # Wildcard: collect the features of all languages
            collected = set()
            for lang_features in self.data.values():
                collected |= set(lang_features.keys())
            self.features = list(collected)
        if self.exclusions:
            self.features = [
                f for f in self.features if f not in self.exclusions
            ]
        self.feature_filter = set(self.features)

        if self.reconstruct == ["*"]:
            self.reconstruct = self.features[:]
        elif self.reconstruct:
            fail_to_find = [
                f for f in self.reconstruct if f not in self.feature_filter
            ]
            if fail_to_find:
                log.warning(
                    "Features {:} not found, cannot be reconstructed.".format(
                        fail_to_find),
                    model=self)
            self.reconstruct = [
                f for f in self.reconstruct if f in self.feature_filter
            ]
            log.info("Features {:} will be reconstructed."
                     "".format(self.reconstruct),
                     model=self)
            # Note: That is a lie. Features can still be filtered out by
            # subsequent decisions, eg. because they are constant.
        else:
            self.reconstruct = []

        if self.reconstruct_at == ["*"]:
            self.reconstruct_at = None
            self.treewide_reconstruction = True
        elif self.reconstruct_at:
            if len(self.reconstruct_at) > 1:
                raise ValueError(
                    "Cannot currently reconstruct at more than one location.")
            valid_groups = self.config.language_group_configs
            for f in self.reconstruct_at:
                if f not in valid_groups:
                    # List the same mapping the membership test uses, so the
                    # message cannot disagree with the check
                    raise KeyError(
                        "Language group {:} is undefined. Valid groups are: {:}"
                        .format(f, ", ".join(valid_groups)))
        elif self.reconstruct:
            self.reconstruct_at = ["root"]
コード例 #7
0
ファイル: basemodel.py プロジェクト: lmaurits/BEASTling
    def load_feature_rates(self):
        """
        Read relative feature rates from the configured .csv file.

        Each line is split at the comma into a name and a rate.  Only names
        listed in ``self.all_rates`` are kept.  ``self.feature_rates`` is
        replaced by the resulting name -> rate dictionary, whose values are
        then rescaled so that their mean is 1.

        Warnings are logged when some entry of ``self.all_rates`` has no rate
        and when no usable rate was found at all; in the latter case the
        rescaling is skipped.  Nothing is done when no rate file is set.
        """
        if not self.feature_rates:
            return

        source = str(self.feature_rates)
        parsed = {}
        for row in iterlines(self.feature_rates, name='feature rates file'):
            label, value = row.split(",")
            label = label.strip()
            # Only rates for known features/partitions are of interest
            if label in self.all_rates:
                parsed[label] = float(value.strip())
        self.feature_rates = parsed

        if any(wanted not in self.feature_rates for wanted in self.all_rates):
            log.warning(
                "Rate file %s does not contain rates for every "
                "feature/partition.  Missing rates will default to 1.0, please check that "
                "this is okay." % source,
                model=self)
        if not self.feature_rates:
            log.warning(
                "Could not find any valid feature or partition rates "
                "in the file %s, is this the correct file for this analysis?"
                % source,
                model=self)
            return

        # Rescale so that the rates average to one
        mean_rate = sum(self.feature_rates.values()) / len(
            self.feature_rates.values())
        for label in self.feature_rates:
            self.feature_rates[label] /= mean_rate
コード例 #8
0
ファイル: configuration.py プロジェクト: xrotwang/BEASTling
    def __init__(self,
                 basename="beastling",
                 configfile=None,
                 stdin_data=False,
                 prior=False,
                 force_glottolog_load=False):
        """
        Set all options to their default values and then, if a configuration
        file has been provided, override the default values for those options
        set in the file.

        ``configfile`` may be a dict (handed to ``ConfigParser.read_dict``),
        a single filename given as ``str`` or path-like object, or an
        iterable of filenames.

        Raises:
            ValueError: if the configuration has neither a model section nor
                a geography section, or has a geo_priors section without a
                geography section.
        """
        # Snapshot of the call arguments.  Must stay the first statement so
        # that no other local name leaks into it.
        cli_params = dict(locals())

        # Options set by the user, with default values
        # Calibrations: keys are glottocodes or lowercase Glottolog clade
        # names, values are length-2 tuples of floating point dates (lower and
        # upper bounds of 95% credible interval).
        self.calibration_configs = {}
        # A list of `sections.Clock`s, each of which specifies the
        # configuration for a single clock model.
        self.clocks = []
        self.clocks_by_name = {}
        # Language groups: keys are language group names, values are language
        # group definitions (arbitrary collections of tip languages).
        self.language_groups = {}
        # One configuration entry per evolutionary model ([model ...] section).
        self.models = []
        # Whether or not to read data from stdin as opposed to the file given
        # in the config.
        self.stdin_data = stdin_data

        # Glottolog data
        self.glottolog_loaded = False
        self.force_glottolog_load = force_glottolog_load
        self.classifications = {}
        self.glotto_macroareas = {}
        self.locations = {}

        # Options set from the command line interface
        self.prior = prior

        # Stuff we compute ourselves
        self.processed = False
        self._files_to_embed = []

        # Now read the config ...
        self.cfg = ConfigParser(interpolation=None)
        # Keep option names case-sensitive
        self.cfg.optionxform = str
        if configfile:
            if isinstance(configfile, dict):
                self.cfg.read_dict(configfile)
            else:
                from os import PathLike
                # A lone filename (str or path-like) is wrapped so that it is
                # not iterated character by character / rejected as
                # non-iterable
                if isinstance(configfile, (str, PathLike)):
                    configfile = (configfile, )
                self.cfg.read([str(c) for c in configfile])

        # ... and process the sections:
        # [geography]
        if 'geography' in self.cfg.sections():
            self.geography = sections.Geography.from_config(
                cli_params, 'geography', self.cfg)
        else:
            self.geography = None

        # [calibration]
        for clade, calibration in sections.Calibration.from_config(
            {}, "calibration", self.cfg).options.items():
            self.calibration_configs[clade] = calibration

        # [model ...] and [clock ...]
        for prefix, cfg_cls in [('clock', sections.Clock),
                                ('model', sections.Model)]:
            for section in [
                    s for s in self.cfg.sections()
                    if s.lower().startswith(prefix)
            ]:
                # Appends to self.clocks or self.models respectively
                getattr(self, prefix + 's').append(
                    cfg_cls.from_config({}, section, self.cfg))

        # Make sure analysis is non-empty
        if not (self.models or self.geography):
            raise ValueError(
                "Config file contains no model sections and no geography section."
            )

        # [geo_priors]
        if self.cfg.has_section("geo_priors"):
            if not self.geography:
                raise ValueError(
                    "Config file contains geo_priors section but no geography section."
                )
            for clade, klm in sections.GeoPriors.from_config(
                {}, 'geo_priors', self.cfg).iterpriors():
                if clade not in self.geography.sampling_points:
                    self.geography.sampling_points.append(clade)
                self.geography.priors[clade] = klm

        # [admin]
        self.admin = sections.Admin.from_config(cli_params, 'admin', self.cfg)
        # [mcmc] (the upper-case section name is accepted as a fallback)
        self.mcmc = sections.MCMC.from_config(
            cli_params, 'mcmc' if self.cfg.has_section('mcmc') else 'MCMC',
            self.cfg)
        # [languages]
        self.languages = sections.Languages.from_config(
            cli_params, 'languages', self.cfg)
        # [language_groups]
        self.language_group_configs = sections.LanguageGroups.from_config(
            {}, 'language_groups', self.cfg).options

        # If log_every was not explicitly set to some non-zero
        # value, then set it such that we expect 10,000 log
        # entries
        if not self.admin.log_every:
            # If chainlength < 10000, this results in log_every = zero.
            # This causes BEAST to die.
            # So in this case, just log everything.
            self.admin.log_every = self.mcmc.chainlength // 10000 or 1

        if self.geography \
                and [p for p in self.geography.sampling_points if p.lower() != "root"] \
                and self.languages.sample_topology and not self.languages.monophyly:
            log.warning(
                "Geographic sampling and/or prior specified for clades other than root, but tree "
                "topology is being sampled without monophyly constraints. BEAST may crash."
            )
コード例 #9
0
ファイル: configuration.py プロジェクト: xrotwang/BEASTling
    def link_clocks_to_models(self):
        """
        Ensures that for each model object in self.all_models, the attribute
        "clock" is a reference to one of the clock objects in self.clocks.
        Also determine which clock to estimate the mean of.

        A model's clock is resolved from its explicit clock name, else from a
        clock sharing the model's name, else from the clock named "default".
        Unused clocks are dropped from self.clocks.  Clocks without a user
        preference have their rate estimated; if the tree is arbitrary and
        every model clock would be estimated, the first such clock (in model
        order) is fixed instead.

        Raises:
            ValueError: if a model names a clock that is not defined.
        """
        for model in self.all_models:
            if model.clock:
                # User has explicitly specified a clock
                if model.clock not in self.clocks_by_name:
                    raise ValueError(
                        "Unknown clock '%s' for model section '%s'." %
                        (model.clock, model.name))
                model.clock = self.clocks_by_name[model.clock]
            elif model.name in self.clocks_by_name:
                # Clock is associated by a common name
                model.clock = self.clocks_by_name[model.name]
            else:
                # No clock specification - use default
                model.clock = self.clocks_by_name["default"]
            model.clock.is_used = True

        # Disable pruned trees in models using RLCs
        for model in self.models:
            if model.pruned and isinstance(model.clock,
                                           random_clock.RandomLocalClock):
                model.pruned = False
                log.info(
                    "Disabling pruned trees because associated clock %s is a "
                    "RandomLocalClock. Pruned trees are currently only compatible with "
                    "StrictClocks and RelaxedClocks." % model.clock.name,
                    model=model)

        # Warn user about unused clock(s) (but not the default clock)
        for clock in self.clocks:
            if clock.name != "default" and not clock.is_used:
                log.info(
                    "Clock %s is not being used. Change its name to \"default\", or explicitly "
                    "associate it with a model." % clock.name)

        # Remove unused clocks from the master clock list
        self.clocks = [c for c in self.clocks if c.is_used]

        # Get a list of model (i.e. non-geo) clocks for which the user has not
        # indicated a preference on whether the mean should be estimated.
        # dict.fromkeys de-duplicates while keeping model order, so the clock
        # picked as free_clocks[0] below is the same on every run.
        free_clocks = list(dict.fromkeys(
            m.clock for m in self.models
            if m.clock.is_used and m.clock.estimate_rate is None))
        if free_clocks:
            # To begin with, estimate all free clocks
            for clock in free_clocks:
                clock.estimate_rate = True
            # But if the tree is arbitrary, then fix one free clock, unless the
            # user has fixed an un-free clock
            if self.arbitrary_tree and all(
                    m.clock.estimate_rate for m in self.models):
                free_clocks[0].estimate_rate = False
                log.info(
                    "Clock \"%s\" has had it's mean rate fixed to 1.0. Tree branch lengths are in "
                    "units of expected substitutions for features in models using this "
                    "clock." % free_clocks[0].name)

        # Determine whether or not precision-scaling is required
        if self.geography:
            self.geo_model.scale_precision = False
            geo_clock = self.geo_model.clock
            for m in self.models:
                if m.clock == geo_clock:
                    log.warning(
                        "Geography model is sharing a clock with one or more data models. This may lead to a bad fit."
                    )
                    self.geo_model.scale_precision = True
                    break
            # If geo has its own clock, estimate the mean
            if not self.geo_model.scale_precision:
                self.geo_model.clock.estimate_rate = True
コード例 #10
0
ファイル: configuration.py プロジェクト: xrotwang/BEASTling
    def process(self):
        """
        Prepare this Configuration for being passed to the BeastXml
        constructor.

        All options are checked for invalid or ambiguous settings and for
        internal consistency, and information is read from external files as
        required.  If this method returns without raising any exceptions then
        this should function as a guarantee that a BeastXml object can be
        instantiated from this Configuration with no problems.

        Calling the method a second time only logs a warning.
        """
        if self.processed:
            log.warning('Configuration has already been processed')
            return

        # Add dependency notices if required
        needs_constrained_tree = (self.languages.monophyly
                                  and not self.languages.starting_tree)
        if needs_constrained_tree:
            log.dependency("ConstrainedRandomTree", "BEASTLabs")
        if self.mcmc.path_sampling:
            log.dependency("Path sampling", "MODEL_SELECTION")

        # The preparation stages, in the order they have to run
        for stage in ("load_glottolog_data",
                      "load_user_geo",
                      "instantiate_models",
                      "build_language_filter",
                      "process_models",
                      "build_language_list",
                      "define_language_groups",
                      "handle_monophyly",
                      "instantiate_calibrations"):
            getattr(self, stage)()

        # At this point, we can tell whether or not the tree's length units
        # can be treated as arbitrary
        self.arbitrary_tree = self.languages.sample_branch_lengths and not self.calibrations

        # We also know what kind of tree prior we need to have –
        # instantiate_calibrations may have changed the type if tip
        # calibrations exist.
        prior_classes = {
            "uniform": treepriors.UniformTree,
            "yule": treepriors.YuleTree,
            "birthdeath": treepriors.BirthDeathTree,
            "coalescent": CoalescentTree
        }
        self.treeprior = prior_classes[self.languages.tree_prior]()

        # set_ascertained() relies upon arbitrary_tree, which itself is only
        # known after process_models() has run, hence the separate pass here
        for m in self.models:
            m.set_ascertained()
        self.instantiate_clocks()
        self.link_clocks_to_models()
        self.processed = True

        # Decide whether or not to log trees: with a known, entirely fixed
        # tree and only strict clocks there is nothing worth logging
        self.tree_logging_pointless = bool(
            self.languages.starting_tree
            and not self.languages.sample_topology
            and not self.languages.sample_branch_lengths
            and all([c.is_strict for c in self.clocks if c.is_used]))
        if self.tree_logging_pointless:
            log.info(
                "Tree logging disabled because starting tree is known and fixed and all clocks "
                "are strict.")