def __init__(self, name, dir_=None, default=None, **kw):
    """Set up the configuration and sync it with its file on disk.

    If the config file already exists it is read; otherwise the config
    directory is created (best effort) and the current content is written.

    :param name: Basename for the config file (suffix .ini will be appended).
    :param dir_: Directory for the config file; `CONFIG_DIR` is used if falsy.
    :param default: Default content of the config file (only text is loaded).
    :param kw: Handed to `INI.__init__` as its single positional argument.
    """
    INI.__init__(self, kw, allow_no_value=True)
    self.name = name
    base_dir = Path(dir_ or CONFIG_DIR)

    # Defaults are loaded first, so an existing file read below is applied on top.
    if default and isinstance(default, text_type):
        self.read_string(default)

    target = base_dir.joinpath(name + '.ini')
    if target.exists():
        assert target.is_file()
        self.read(target.as_posix())
    else:
        if not base_dir.exists():
            try:
                base_dir.mkdir()
            except OSError:  # pragma: no cover
                # this happens when run on travis-ci, by a system user.
                pass
        # Only persist when the directory is actually available.
        if base_dir.exists():
            self.write(target.as_posix())
    self.path = target
def from_lff(cls, path, name_and_codes, level, dry_run=False):
    """Build an instance from one "lff"-style entry.

    :param path: Classification string of `name [id]` components separated by \
    `], `; may be empty.
    :param name_and_codes: String of the form `name [glottocode][isocode]`.
    :param level: `Level` member of the new object.
    :param dry_run: Passed on to `Glottocode.from_name` when no glottocode is given.
    :return: The new instance, with `level` (and `iso`, if present) set.
    """
    assert isinstance(level, Level)

    raw_name, raw_codes = name_and_codes.split('[', 1)
    lname = raw_name.strip()
    # Drop the trailing "]" and split "glottocode][isocode".
    glottocode, isocode = raw_codes[:-1].split('][')
    if not glottocode:
        glottocode = Glottocode.from_name(lname, dry_run=dry_run)

    def ancestor_level(index):
        # For dialects the first path component is the language, the rest dialects;
        # for everything else all ancestors are families.
        if level == Level.dialect:
            return Level.language if index == 0 else Level.dialect
        return Level.family

    lineage = []
    components = path.split('], ') if path else []
    for index, component in enumerate(components):
        if component.endswith(']'):
            component = component[:-1]
        anc_name, anc_id = component.split(' [', 1)
        lineage.append((anc_name, anc_id, ancestor_level(index)))

    cfg = INI(interpolation=None)
    cfg.read_dict({'core': {'name': lname, 'glottocode': glottocode}})
    languoid = cls(cfg, lineage)
    languoid.level = level
    if isocode:
        languoid.iso = isocode
    return languoid
def from_name_id_level(cls, name, id, level, **kw):
    """Create an instance with an empty lineage from name, glottocode and level.

    :param name: Stored as `name` in the `core` section.
    :param id: Stored as `glottocode` in the `core` section.
    :param level: Converted with `Level(level)` and assigned to `level`.
    :param kw: Additional attributes set on the new instance.
    """
    core = {'name': name, 'glottocode': id}
    cfg = INI(interpolation=None)
    cfg.read_dict({'core': core})

    languoid = cls(cfg, [])
    languoid.level = Level(level)
    for attr, value in kw.items():
        setattr(languoid, attr, value)
    return languoid
def from_name_id_level(cls, tree, name, id, level, **kw):
    """Create an instance from tree, name, glottocode and level.

    :param tree: Passed to the constructor as `tree`.
    :param name: Stored as `name` in the `core` section.
    :param id: Wrapped in `Glottocode` and passed to the constructor as `id_`.
    :param level: Resolved with `Level.get(level)` and assigned to `level`.
    :param kw: `lineage` (default `[]`) is passed to the constructor; all other \
    items are set as attributes.
    """
    cfg = INI(interpolation=None)
    cfg.read_dict({'core': {'name': name}})

    lineage = kw.pop('lineage', [])
    languoid = cls(cfg, lineage, id_=Glottocode(id), tree=tree)
    languoid.level = Level.get(level)
    for attr, value in kw.items():
        setattr(languoid, attr, value)
    return languoid
def test_encoding(tmppath):
    """A cp1252-encoded file fails to load by default but loads with `encoding`."""
    ini_path = tmppath / 'test.ini'
    write_text(ini_path, '[äöü]\näöü = äöü', encoding='cp1252')

    # Without an explicit encoding the non-ASCII content cannot be decoded.
    with pytest.raises(UnicodeDecodeError):
        INI.from_file(ini_path)

    loaded = INI.from_file(ini_path, encoding='cp1252')
    assert loaded['äöü']['äöü'] == 'äöü'
def from_name_id_level(cls, tree, name, id, level, **kw):
    """Create an instance from tree, name, glottocode and level.

    :param tree: Passed to the constructor as `tree`.
    :param name: Stored as `name` in the `core` section.
    :param id: Wrapped in `Glottocode` and passed to the constructor as `id_`.
    :param level: Assigned to `level` after all other attributes.
    :param kw: `lineage` (default `[]`) is passed to the constructor; all other \
    items are set as attributes.
    """
    cfg = INI(interpolation=None)
    cfg.read_dict({'core': {'name': name}})

    lineage = kw.pop('lineage', [])
    languoid = cls(cfg, lineage, id_=Glottocode(id), tree=tree)
    for attr, value in kw.items():
        setattr(languoid, attr, value)

    # Note: Setting the level behaves differently when `_api` is available, so it
    # has to happen once every other attribute is in place.
    languoid.level = level
    return languoid
def from_name_id_level(cls, tree, name, id, level, **kw):
    """
    Instantiate a `Languoid` for a new node; used in `pyglottolog.lff` for nodes
    encountered in "lff"-format trees.

    :param tree: Passed to the constructor as `tree`.
    :param name: Stored as `name` in the `core` section.
    :param id: Wrapped in `Glottocode` and passed to the constructor as `id_`.
    :param level: Assigned to `level` after all other attributes.
    :param kw: `lineage` (default `[]`) is passed to the constructor; all other \
    items are set as attributes.
    """
    cfg = INI(interpolation=None)
    cfg.read_dict({'core': {'name': name}})

    lineage = kw.pop('lineage', [])
    languoid = cls(cfg, lineage, id_=Glottocode(id), tree=tree)
    for attr, value in kw.items():
        setattr(languoid, attr, value)

    # Note: Setting the level behaves differently when `_api` is available, so it
    # has to happen once every other attribute is in place.
    languoid.level = level
    return languoid
def from_dir(cls, directory: pathlib.Path, nodes=None, _api=None, **kw):
    """
    Create a `Languoid` from a directory, named with the Glottocode and containing `md.ini`.

    This method is used by :class:`pyglottolog.Glottolog` to read `Languoid`s from the
    repository's `languoids/tree` directory.

    :param directory: Directory of the languoid.
    :param nodes: Optional `dict` mapping ids to `(name, id, level)`; it is filled \
    in place with the languoid and any ancestors that had to be loaded.
    :param _api: Optional API object; when it has a non-empty `cache` containing \
    the directory name, the cached object is returned.
    :param kw: Passed on to the constructor (and to recursive `from_dir` calls).
    """
    if _api and _api.cache and directory.name in _api.cache:
        return _api.cache[directory.name]

    nodes = {} if nodes is None else nodes
    cfg = INI.from_file(directory.joinpath(INFO_FILENAME), interpolation=None)

    # Collected nearest-first while walking upwards; reversed for the constructor.
    ancestors = []
    for parent in directory.parents:
        code = parent.name
        assert code != directory.name
        if not Glottocode.pattern.match(code):
            # we ignore leading non-languoid-dir path components.
            break

        if code not in nodes:
            ancestor = Languoid.from_dir(parent, nodes=nodes, _api=_api, **kw)
            nodes[code] = (ancestor.name, ancestor.id, ancestor.level)
        ancestors.append(nodes[code])

    languoid = cls(cfg, ancestors[::-1], directory=directory, _api=_api, **kw)
    nodes[languoid.id] = (languoid.name, languoid.id, languoid.level)
    return languoid
def test_extractor(self):
    """Exercise the extractor on generated BEAST XML and on a hand-written XML file."""
    # Case 1: XML generated from config files; extraction must yield something truthy.
    config = self.make_cfg(
        [config_path(f).as_posix() for f in ("admin", "mk", "embed_data")])
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = self.tmp.joinpath("beastling.xml")
    xml.write_file(xmlfile.as_posix())
    self.assertTrue(bool(self._extract(xmlfile)))
    # Case 2: XML generated from a dict config; the extractor is expected to write
    # `<basename>.conf` relative to the current directory.
    config = self.make_cfg({
        'admin': {
            'basename': 'abcdefg'
        },
        'model': {
            'model': 'mk',
            'data': data_path('basic.csv').as_posix()
        }
    })
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = self.tmp.joinpath("beastling.xml")
    xml.write_file(xmlfile.as_posix())
    beastling.extractor.extract(xmlfile)
    p = Path('abcdefg.conf')
    self.assertTrue(p.exists())
    cfg = INI(interpolation=None)
    cfg.read(p.as_posix())
    # Clean up the extracted file before checking its (already loaded) content.
    remove(p)
    self.assertEqual(cfg['admin']['basename'], 'abcdefg')
    self.assertEqual(cfg['model']['model'], 'mk')
    # Case 3: hand-written XML whose comments carry a config and a data file reference.
    fname = self.tmp.joinpath('test.xml')
    datafile = self.tmp.joinpath(('test.csv'))
    self.assertFalse(datafile.exists())
    with fname.open('w', encoding='utf8') as fp:
        # NOTE(review): the whitespace inside this literal looks collapsed in this
        # view of the file - confirm the line breaks against the repository copy.
        fp.write("""<?xml version="1.0" encoding="UTF-8"?> <r> <!--%s %s [admin] [model] --> <!--%s:%s--> </r> """ % (beastling.extractor._generated_str,
             beastling.extractor._config_file_str,
             beastling.extractor._data_file_str,
             datafile.as_posix()))
    res = self._extract(fname)
    # The extraction output must mention the referenced data file.
    self.assertIn(datafile.name, ''.join(res))
def from_path(cls, path: typing.Union[str, pathlib.Path], api=None) -> 'BibFiles':
    """BibTeX files from `<path>/bibtex/*.bib` if listed in `<path>/BIBFILES.ini`.

    :param path: Base directory, as `str` or `pathlib.Path`.
    :param api: Passed on to `_iterbibfiles`.
    """
    base = path if isinstance(path, pathlib.Path) else pathlib.Path(path)
    listing = INI.from_file(base / 'BIBFILES.ini', interpolation=None)
    bibtex_dir = base / 'bibtex'
    return cls(cls._iterbibfiles(listing, bibtex_dir, api=api))
def get_ini(fname, **kw):
    """Load an INI file, falling back to the copy shipped next to this module.

    :param fname: Path of the INI file.
    :param kw: Passed on to `INI.from_file`.
    """
    ini_path = Path(fname)
    if not ini_path.exists():
        # For old-style (<=3.4) repository layout we ship the config data with pyglottolog:
        if ini_path.name == 'hhtype.ini':
            bundled_name = 'document_types.ini'
        else:
            bundled_name = ini_path.name
        ini_path = Path(__file__).parent / bundled_name
    assert ini_path.exists()
    return INI.from_file(ini_path, **kw)
def test_extractor(self):
    """Exercise the extractor on generated BEAST XML and on a hand-written XML file."""
    # Case 1: XML generated from config files; extraction must yield something truthy.
    config = self.make_cfg(
        [config_path(f).as_posix() for f in ("admin", "mk", "embed_data")])
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = self.tmp.joinpath("beastling.xml")
    xml.write_file(xmlfile.as_posix())
    self.assertTrue(bool(self._extract(xmlfile)))
    # Case 2: XML generated from a dict config; the extractor is expected to write
    # `<basename>.conf` relative to the current directory.
    config = self.make_cfg({
        'admin': {'basename': 'abcdefg'},
        'model': {
            'model': 'mk',
            'data': data_path('basic.csv').as_posix()}})
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = self.tmp.joinpath("beastling.xml")
    xml.write_file(xmlfile.as_posix())
    beastling.extractor.extract(xmlfile)
    p = Path('abcdefg.conf')
    self.assertTrue(p.exists())
    cfg = INI(interpolation=None)
    cfg.read(p.as_posix())
    # Clean up the extracted file before checking its (already loaded) content.
    remove(p)
    self.assertEqual(cfg['admin']['basename'], 'abcdefg')
    self.assertEqual(cfg['model']['model'], 'mk')
    # Case 3: hand-written XML whose comments carry a config and a data file reference.
    fname = self.tmp.joinpath('test.xml')
    datafile = self.tmp.joinpath(('test.csv'))
    self.assertFalse(datafile.exists())
    with fname.open('w', encoding='utf8') as fp:
        # NOTE(review): the whitespace inside this literal looks collapsed in this
        # view of the file - confirm the line breaks against the repository copy.
        fp.write("""<?xml version="1.0" encoding="UTF-8"?> <r> <!--%s %s [admin] [model] --> <!--%s:%s--> </r> """ % (beastling.extractor._generated_str,
             beastling.extractor._config_file_str,
             beastling.extractor._data_file_str,
             datafile.as_posix()))
    res = self._extract(fname)
    # The extraction output must mention the referenced data file.
    self.assertIn(datafile.name, ''.join(res))
def test_extractor(config_factory, tmppath, data_dir):
    """Exercise the extractor on generated BEAST XML and on a hand-written XML file."""
    # Case 1: XML generated from named configs; extraction must yield something truthy.
    config = config_factory("admin", "mk", "embed_data")
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    assert bool(_extract(xmlfile))
    # Case 2: XML generated from a dict config; the extractor is expected to write
    # `<basename>.conf` relative to the current directory.
    config = config_factory({
        'admin': {'basename': 'abcdefg'},
        'model model': {
            'model': 'mk',
            'data': str(data_dir / 'basic.csv')}})
    xml = beastling.beastxml.BeastXml(config)
    xmlfile = str(tmppath / "beastling.xml")
    xml.write_file(xmlfile)
    beastling.extractor.extract(xmlfile)
    p = Path('abcdefg.conf')
    assert p.exists()
    cfg = INI(interpolation=None)
    cfg.read(p.as_posix())
    # Clean up the extracted file before checking its (already loaded) content.
    remove(p)
    assert cfg['admin']['basename'] == 'abcdefg'
    assert cfg['model model']['model'] == 'mk'
    # Case 3: hand-written XML whose comments carry a config and a data file reference.
    fname = tmppath / 'test.xml'
    datafile = tmppath / 'test.csv'
    assert not datafile.exists()
    with fname.open('w', encoding='utf8') as fp:
        # NOTE(review): the whitespace inside this literal looks collapsed in this
        # view of the file - confirm the line breaks against the repository copy.
        fp.write("""<?xml version="1.0" encoding="UTF-8"?> <r> <!--%s %s [admin] [model model] --> <!--%s:%s--> </r> """ % (beastling.extractor._generated_str,
             beastling.extractor._config_file_str,
             beastling.extractor._data_file_str,
             datafile.as_posix()))
    res = _extract(fname)
    # The extraction output must mention the referenced data file.
    assert datafile.name in ''.join(res)
def __init__(self, api):
    """Collect a `BibFile` for every `*.bib` section listed in `BIBFILES.ini`.

    :param api: Object whose `references_path` resolves paths of reference files.
    :raises ValueError: If a listed BibTeX file does not exist.
    """
    ini = INI.from_file(api.references_path('BIBFILES.ini'), interpolation=None)

    bibfiles = []
    for section in ini.sections():
        if not section.endswith('.bib'):
            continue
        fname = api.references_path('bibtex', section)
        if not fname.exists():  # pragma: no cover
            raise ValueError('invalid bibtex file referenced in BIBFILES.ini')
        # The options of the section become keyword arguments of the BibFile.
        bibfiles.append(BibFile(fname=fname, **ini[section]))

    super(BibFiles, self).__init__(bibfiles)
    # Lookup of the contained bibfiles by file name.
    self._map = {bibfile.fname.name: bibfile for bibfile in self}
def from_lff(cls, path, name_and_codes, level):
    """Build an instance from one "lff"-style entry.

    :param path: Classification string of `name [id]` components separated by `], `.
    :param name_and_codes: String of the form `name [glottocode][isocode]`.
    :param level: Level name, stored as `level` in the `core` section.
    :return: The new instance, with `iso` set if an isocode was given.
    """
    raw_name, raw_codes = name_and_codes.split('[', 1)
    lname = raw_name.strip()
    # Drop the trailing "]" and split "glottocode][isocode".
    glottocode, isocode = raw_codes[:-1].split('][')

    lineage = []
    for index, component in enumerate(path.split('], ')):
        if component.endswith(']'):
            component = component[:-1]
        anc_name, anc_id = component.split(' [', 1)
        if anc_id == '-isolate-':
            # The isolate pseudo-node is not part of the lineage.
            continue
        if level == 'dialect':
            anc_level = 'language' if index == 0 else 'dialect'
        else:
            anc_level = 'family'
        lineage.append((anc_name, anc_id, anc_level))

    cfg = INI()
    cfg.read_dict({'core': {'name': lname, 'glottocode': glottocode, 'level': level}})
    languoid = cls(cfg, lineage)
    if isocode:
        languoid.iso = isocode
    return languoid
def from_lff(cls, path, name_and_codes, level):
    """Build an instance from one "lff"-style entry.

    :param path: Classification string of `name [id]` components separated by `], `.
    :param name_and_codes: String of the form `name [glottocode][isocode]`.
    :param level: Level name, stored as `level` in the `core` section.
    :return: The new instance, with `iso` set if an isocode was given.
    """
    raw_name, raw_codes = name_and_codes.split('[', 1)
    lname = raw_name.strip()
    # Drop the trailing "]" and split "glottocode][isocode".
    glottocode, isocode = raw_codes[:-1].split('][')

    lineage = []
    for index, component in enumerate(path.split('], ')):
        if component.endswith(']'):
            component = component[:-1]
        anc_name, anc_id = component.split(' [', 1)
        if anc_id == '-isolate-':
            # The isolate pseudo-node is not part of the lineage.
            continue
        if level == 'dialect':
            anc_level = 'language' if index == 0 else 'dialect'
        else:
            anc_level = 'family'
        lineage.append((anc_name, anc_id, anc_level))

    cfg = INI()
    cfg.read_dict({'core': {'name': lname, 'glottocode': glottocode, 'level': level}})
    languoid = cls(cfg, lineage)
    if isocode:
        languoid.iso = isocode
    return languoid
def from_ini(cls, ini, nodes=None):
    """Create an instance from its INI file, deriving the lineage from parent dirs.

    :param ini: Path of the INI file; its parent directory is the languoid directory.
    :param nodes: Optional `dict` mapping ids to `(name, id, level)`; it is filled \
    in place with the languoid and any ancestors that had to be loaded.
    :return: The new instance.
    """
    # Compare against None explicitly: `nodes or {}` would replace an empty dict
    # passed in by the caller, so the caller's cache would never be populated.
    if nodes is None:
        nodes = {}

    ini = Path(ini)
    directory = ini.parent
    cfg = INI(interpolation=None)
    cfg.read(ini.as_posix(), encoding='utf8')

    lineage = []
    for parent in directory.parents:
        id_ = parent.name
        assert id_ != directory.name
        if not Glottocode.pattern.match(id_):
            # we ignore leading non-languoid-dir path components.
            break

        if id_ not in nodes:
            l_ = Languoid.from_dir(parent, nodes=nodes)
            nodes[id_] = (l_.name, l_.id, l_.level)
        lineage.append(nodes[id_])

    # Parents are visited nearest-first, the constructor gets them reversed.
    res = cls(cfg, list(reversed(lineage)), directory=directory)
    nodes[res.id] = (res.name, res.id, res.level)
    return res
def from_ini(cls, ini, nodes=None):
    """Create an instance from its INI file, deriving the lineage from parent dirs.

    :param ini: Path of the INI file; its parent directory is the languoid directory.
    :param nodes: Optional `dict` mapping ids to `(name, id, level)`; it is filled \
    in place with the languoid and any ancestors that had to be loaded.
    :return: The new instance.
    """
    # A `{}` default would be one dict shared (and mutated) by every call that
    # omits `nodes`; use a fresh dict per call instead.
    if nodes is None:
        nodes = {}

    if not isinstance(ini, Path):
        ini = Path(ini)
    directory = ini.parent
    cfg = INI()
    cfg.read(ini.as_posix(), encoding='utf8')

    # The id is the last dot-separated part of a directory name.
    own_id = directory.name.split('.')[-1]
    lineage = []
    for parent in directory.parents:
        id_ = parent.name.split('.')[-1]
        assert id_ != own_id
        if not cls.id_pattern.match(id_):
            # we ignore leading non-languoid-dir path components.
            break

        if id_ not in nodes:
            l_ = Languoid.from_dir(parent, nodes=nodes)
            nodes[id_] = (l_.name, l_.id, l_.level)
        lineage.append(nodes[id_])

    # Parents are visited nearest-first, the constructor gets them reversed.
    res = cls(cfg, list(reversed(lineage)))
    nodes[res.id] = (res.name, res.id, res.level)
    return res
def from_ini(cls, ini, nodes=None):
    """Create an instance from its INI file, deriving the lineage from parent dirs.

    :param ini: Path of the INI file; its parent directory is the languoid directory.
    :param nodes: Optional `dict` mapping ids to `(name, id, level)`; it is filled \
    in place with the languoid and any ancestors that had to be loaded.
    :return: The new instance.
    """
    nodes = {} if nodes is None else nodes

    ini_path = Path(ini)
    directory = ini_path.parent
    cfg = INI(interpolation=None)
    cfg.read(ini_path.as_posix(), encoding='utf8')

    # Collected nearest-first while walking upwards; reversed for the constructor.
    ancestors = []
    for parent in directory.parents:
        code = parent.name
        assert code != directory.name
        if not Glottocode.pattern.match(code):
            # we ignore leading non-languoid-dir path components.
            break

        if code not in nodes:
            ancestor = Languoid.from_dir(parent, nodes=nodes)
            nodes[code] = (ancestor.name, ancestor.id, ancestor.level)
        ancestors.append(nodes[code])

    languoid = cls(cfg, ancestors[::-1], directory=directory)
    nodes[languoid.id] = (languoid.name, languoid.id, languoid.level)
    return languoid
def make(glottolog, concepticon):
    """Write a `config.ini` with a `paths` section into `tmpdir`; return its path.

    :param glottolog: Value of the `glottolog` option.
    :param concepticon: Value of the `concepticon` option.
    :return: Path of the written file as `str`.
    """
    paths = {'concepticon': concepticon, 'glottolog': glottolog}
    config = INI()
    config.read_dict({'paths': paths})

    target = str(tmpdir.join('config.ini'))
    config.write(target)
    return target
def write_config(comment_text, overwrite):
    """Write the configuration embedded in an XML comment to `<basename>.conf`.

    :param comment_text: Comment text; its second line is a marker, the rest the \
    configuration.
    :param overwrite: Whether an existing configuration file may be replaced.
    :return: A message describing what was done.
    """
    lines = comment_text.split("\n")
    marker = lines[1]
    assert marker in (_config_file_str, _proggen_str)
    if marker == _proggen_str:
        return "Original configuration was generated programmatically, no configuration to extract."

    parser = INI()
    parser.read_string("\n".join(lines[2:]))

    # The file is named after the configured basename, 'beastling' by default.
    if parser.has_option("admin", "basename"):
        basename = parser.get("admin", "basename")
    else:
        basename = 'beastling'
    target = Path(basename + '.conf')

    if target.exists() and not overwrite:
        return "BEASTling configuration file %s already exists! Run beastling with the --overwrite option if you wish to overwrite it.\n" % target

    folder = target.parent
    if not folder.exists():
        folder.mkdir()
    parser.write(target)
    return "Wrote BEASTling configuration file %s.\n" % target
def write_config(comment_text, overwrite):
    """Write the configuration embedded in an XML comment to `<basename>.conf`.

    Empty lines are dropped, and everything from the first line containing the
    do-not-edit marker onwards is ignored.

    :param comment_text: Comment text; its second non-empty line is a marker, the \
    rest the configuration.
    :param overwrite: Whether an existing configuration file may be replaced.
    :return: A message describing what was done.
    """
    lines = [line for line in comment_text.split("\n") if line]
    marker = lines[1]
    assert marker in (_config_file_str, _proggen_str)
    if marker == _proggen_str:
        return "Original configuration was generated programmatically, no configuration to extract."

    # Cut the text off at the first line carrying the do-not-edit marker.
    for index, line in enumerate(lines):
        if _do_not_edit_str in line:
            lines = lines[:index]
            break

    parser = INI()
    parser.read_string("\n".join(lines[2:]))

    # The file is named after the configured basename, 'beastling' by default.
    if parser.has_option("admin", "basename"):
        basename = parser.get("admin", "basename")
    else:
        basename = 'beastling'
    target = Path(basename + '.conf')

    if target.exists() and not overwrite:
        return "BEASTling configuration file %s already exists! Run beastling with the --overwrite option if you wish to overwrite it.\n" % target

    folder = target.parent
    if not folder.exists():
        folder.mkdir()
    parser.write(target)
    return "Wrote BEASTling configuration file %s.\n" % target
def from_dir(cls, directory, nodes=None, **kw):
    """Create an instance from a languoid directory containing `INFO_FILENAME`.

    :param directory: Directory of the languoid.
    :param nodes: Optional `dict` mapping ids to `(name, id, level)`; it is filled \
    in place with the languoid and any ancestors that had to be loaded.
    :param kw: Passed on to the constructor (and to recursive `from_dir` calls).
    :return: The new instance.
    """
    nodes = {} if nodes is None else nodes
    cfg = INI.from_file(directory.joinpath(INFO_FILENAME), interpolation=None)

    # Collected nearest-first while walking upwards; reversed for the constructor.
    ancestors = []
    for parent in directory.parents:
        code = parent.name
        assert code != directory.name
        if not Glottocode.pattern.match(code):
            # we ignore leading non-languoid-dir path components.
            break

        if code not in nodes:
            ancestor = Languoid.from_dir(parent, nodes=nodes, **kw)
            nodes[code] = (ancestor.name, ancestor.id, ancestor.level)
        ancestors.append(nodes[code])

    languoid = cls(cfg, ancestors[::-1], directory=directory, **kw)
    nodes[languoid.id] = (languoid.name, languoid.id, languoid.level)
    return languoid
def test_INI(tmppath):
    """Setting, listing, multi-line text and a write/read round trip of `INI`."""
    ini = INI()
    ini.set('äüü', 'äöü', ('ä', 'ö', 'ü'))
    ini.set('a', 'b', 5)
    assert ini['a'].getint('b') == 5

    # Setting an option to None must not create it.
    ini.set('a', 'c', None)
    assert 'c' not in ini['a']

    assert 'ä\n' in ini.write_string()
    assert len(ini.getlist('äüü', 'äöü')) == 3

    multiline = '- a\n - aa\n - ab\n- b'
    ini.settext('text', 'multi', multiline)

    target = tmppath / 'test'
    ini.write(target.as_posix())
    with target.open(encoding='utf8') as fp:
        written = fp.read()
    assert 'coding: utf-8' in written

    # What is read back must be equivalent to what was written.
    reloaded = INI.from_file(target)
    assert reloaded.gettext('text', 'multi') == multiline
    assert reloaded.write_string() == ini.write_string()
def test_INI(self):
    """Setting, listing and a write/read round trip of `INI`."""
    from clldutils.inifile import INI

    ini = INI()
    ini.set('äüü', 'äöü', ('ä', 'ö', 'ü'))
    ini.set('a', 'b', 5)
    self.assertEqual(ini['a'].getint('b'), 5)

    # Setting an option to None must not create it.
    ini.set('a', 'c', None)
    self.assertNotIn('c', ini['a'])

    self.assertIn('ä\n', ini.write_string())
    self.assertEqual(len(ini.getlist('äüü', 'äöü')), 3)

    target = self.tmp_path('test')
    ini.write(target.as_posix())
    with target.open(encoding='utf8') as fp:
        written = fp.read()
    self.assertIn('coding: utf-8', written)

    # What is read back must serialize exactly like the original.
    reloaded = INI.from_file(target)
    self.assertEqual(reloaded.write_string(), ini.write_string())
def from_name_id_level(cls, name, id, level):
    """Create an instance with an empty lineage from name, glottocode and level.

    :param name: Stored as `name` in the `core` section.
    :param id: Stored as `glottocode` in the `core` section.
    :param level: Stored as `level` in the `core` section.
    """
    core = {'name': name, 'glottocode': id, 'level': level}
    cfg = INI(interpolation=None)
    cfg.read_dict({'core': core})
    return cls(cfg, [])
def __init__(self, api):
    """Load one `HHType` per section of `hhtype.ini`, sorted in descending order.

    :param api: Object whose `references_path` resolves paths of reference files.
    """
    ini = INI.from_file(api.references_path('hhtype.ini'), interpolation=None)

    hhtypes = [HHType(section, ini) for section in ini.sections()]
    hhtypes.sort(reverse=True)
    self._types = hhtypes
    # Lookup of the types by their id.
    self._type_by_id = {hhtype.id: hhtype for hhtype in hhtypes}
class Configuration(object): """ A container object for all of the settings which define a BEASTling analysis. Configuration objects are initialised with default values for all options. """ def __init__(self, basename="beastling", configfile=None, stdin_data=False, prior=False): """ Set all options to their default values and then, if a configuration file has been provided, override the default values for those options set in the file. """ # Options set by the user, with default values self.basename = basename + "_prior" if prior else basename """This will be used as a common prefix for output filenames (e.g. the log will be called basename.log).""" self.calibration_configs = {} """A dictionary whose keys are glottocodes or lowercase Glottolog clade names, and whose values are length-2 tuples of flatoing point dates (lower and upper bounds of 95% credible interval).""" self.chainlength = 10000000 """Number of iterations to run the Markov chain for.""" self.clock_configs = [] """A list of dictionaries, each of which specifies the configuration for a single clock model.""" self.embed_data = False """A list of languages to exclude from the analysis, or a name of a file containing such a list.""" self.exclusions = "" """A boolean value, controlling whether or not to embed data files in the XML.""" self.families = [] """List of families to filter down to, or name of a file containing such a list.""" self.geo_config = {} """A dictionary with keys and values corresponding to a [geography] section in a configuration file.""" self.glottolog_release = '2.7' """A string representing a Glottolog release number.""" self.languages = [] """List of languages to filter down to, or name of a file containing such a list.""" self.location_data = None """Name of a file containing latitude/longitude data.""" self.log_all = False """A boolean value, setting this True is a shortcut for setting log_params, log_probabilities and log_trees True.""" self.log_dp = 4 """An integer value, 
setting the number of decimal points to use when logging rates, locations, etc. Defaults to 4. Use -1 to enable full precision.""" self.log_every = 0 """An integer indicating how many MCMC iterations should occurr between consecutive log entries.""" self.log_params = False """A boolean value, controlling whether or not to log model parameters.""" self.log_probabilities = True """A boolean value, controlling whether or not to log the prior, likelihood and posterior of the analysis.""" self.log_fine_probs = False """A boolean value, controlling whether or not to log individual components of the prior and likelihood.""" self.log_trees = True """A boolean value, controlling whether or not to log the sampled trees.""" self.log_pure_tree = False """A boolean value, controlling whether or not to log a separate file of the sampled trees with no metadata included.""" self.macroareas = [] """A floating point value, indicated the percentage of datapoints, across ALL models, which a language must have in order to be included in the analysis.""" self.minimum_data = 0.0 """List of Glottolog macro-areas to filter down to, or name of a file containing such a list.""" self.model_configs = [] """A list of dictionaries, each of which specifies the configuration for a single clock model.""" self.monophyly = False """A boolean parameter, controlling whether or not to enforce monophyly constraints derived from Glottolog's classification.""" self.monophyly_start_depth = 0 """Integer; Starting depth in the Glottlog classification hierarchy for monophyly constraints""" self.monophyly_end_depth = None """Integer; Ending depth in the Glottlog classification hierarchy for monophyly constraints""" self.monophyly_levels = sys.maxsize """Integer; Number of levels of the Glottolog classification to include in monophyly constraints.""" self.monophyly_direction = "top_down" """Either the string 'top_down' or 'bottom_up', controlling whether 'monophyly_levels' counts from roots (families) or leaves 
(languages) of the Glottolog classification.""" self.monophyly_newick = None """Either a Newick tree string or the name of a file containing a Newick tree string which represents the desired monophyly constraints if a classification other than Glottolog is required.""" self.overlap = "union" """Either the string 'union' or the string 'intersection', controlling how to handle multiple datasets with non-equal language sets.""" self.sample_branch_lengths = True """A boolean value, controlling whether or not to estimate tree branch lengths.""" self.sample_from_prior = False """Boolean parameter; if True, data is ignored and the MCMC chain will sample from the prior.""" self.sample_topology = True """A boolean value, controlling whether or not to estimate tree topology.""" self.screenlog = True """A boolean parameter, controlling whether or not to log some basic output to stdout.""" self.starting_tree = "" """A starting tree in Newick format, or the name of a file containing the same.""" self.stdin_data = stdin_data # Glottolog data self.glottolog_loaded = False self.classifications = {} self.glotto_macroareas = {} self.locations = {} # Options set from the command line interface self.prior = prior # Stuff we compute ourselves self.processed = False self.configfile = None self.files_to_embed = [] self.messages = [] self.message_flags = [] if configfile: self.read_from_file(configfile) def read_from_file(self, configfile): """ Read one or several INI-style configuration files and overwrite default option settings accordingly. 
""" self.configfile = INI(interpolation=None) self.configfile.optionxform = str if isinstance(configfile, dict): self.configfile.read_dict(configfile) else: if isinstance(configfile, six.string_types): configfile = (configfile, ) for conf in configfile: self.configfile.read(conf) p = self.configfile for sec, opts in { 'admin': { 'basename': p.get, 'embed_data': p.getboolean, 'screenlog': p.getboolean, 'log_all': p.getboolean, 'log_dp': p.getint, 'log_every': p.getint, 'log_probabilities': p.getboolean, 'log_fine_probs': p.getboolean, 'log_params': p.getboolean, 'log_trees': p.getboolean, 'log_pure_tree': p.getboolean, 'glottolog_release': p.get, }, 'MCMC': { 'chainlength': p.getint, 'sample_from_prior': p.getboolean, }, 'languages': { 'exclusions': p.get, 'languages': p.get, 'families': p.get, 'macroareas': p.get, 'location_data': p.get, 'overlap': p.get, 'starting_tree': p.get, 'sample_branch_lengths': p.getboolean, 'sample_topology': p.getboolean, 'monophyly_start_depth': p.getint, 'monophyly_end_depth': p.getint, 'monophyly_levels': p.getint, 'monophyly_direction': lambda s, o: p.get(s, o).lower(), }, }.items(): for opt, getter in opts.items(): if p.has_option(sec, opt): setattr(self, opt, getter(sec, opt)) ## MCMC self.sample_from_prior |= self.prior if self.prior and not self.basename.endswith("_prior"): self.basename += "_prior" ## Languages sec = "languages" if self.overlap.lower() not in ("union", "intersection"): # pragma: no cover raise ValueError( "Value for overlap needs to be either 'union', or 'intersection'." 
) if p.has_option(sec, "monophyletic"): self.monophyly = p.getboolean(sec, "monophyletic") elif p.has_option(sec, "monophyly"): self.monophyly = p.getboolean(sec, "monophyly") if p.has_option(sec, "monophyly_newick"): value = p.get(sec, "monophyly_newick") if os.path.exists(value): with io.open(value, encoding="UTF-8") as fp: self.monophyly_newick = fp.read() else: self.monophyly_newick = value if p.has_option(sec, 'minimum_data'): self.minimum_data = p.getfloat(sec, "minimum_data") ## Calibration if p.has_section("calibration"): for clade, calibration in p.items("calibration"): self.calibration_configs[clade] = calibration ## Clocks clock_sections = [ s for s in p.sections() if s.lower().startswith("clock") ] for section in clock_sections: self.clock_configs.append(self.get_clock_config(p, section)) ## Models model_sections = [ s for s in p.sections() if s.lower().startswith("model") ] for section in model_sections: self.model_configs.append(self.get_model_config(p, section)) # Geography if p.has_section("geography"): self.geo_config = self.get_geo_config(p, "geography") else: self.geo_config = {} if p.has_section("geo_priors"): if not p.has_section("geography"): raise ValueError( "Config file contains geo_priors section but no geography section." ) self.geo_config["geo_priors"] = {} for clades, klm in p.items("geo_priors"): for clade in clades.split(','): clade = clade.strip() if clade not in self.geo_config["sampling_points"]: self.geo_config["sampling_points"].append(clade) self.geo_config["geo_priors"][clade] = klm sampled_points = self.geo_config.get("sampling_points", []) if [p for p in sampled_points if p.lower() != "root" ] and self.sample_topology and not self.monophyly: self.messages.append( "[WARNING] Geographic sampling and/or prior specified for clades other than root, but tree topology is being sampled without monophyly constraints. BEAST may crash." 
) # Make sure analysis is non-empty if not model_sections and not self.geo_config: raise ValueError( "Config file contains no model sections and no geography section." ) def get_clock_config(self, p, section): cfg = { 'name': section[5:].strip(), } for key, value in p[section].items(): if key in ('estimate_mean', 'estimate_rate', 'estimate_variance', 'correlated'): value = p.getboolean(section, key) elif key in ('mean', 'rate', 'variance'): value = p.getfloat(section, key) cfg[key] = value return cfg def get_model_config(self, p, section): cfg = { 'name': section[5:].strip(), 'binarised': None, 'rate_variation': False, 'remove_constant_features': True, } for key, value in p[section].items(): # "binarised" is the canonical name for this option and used everywhere # internally, but "binarized" is accepted in the config file. if key in ('binarised', 'binarized'): value = p.getboolean(section, key) key = 'binarised' if key in ("features", "exclusions"): value = self.handle_file_or_list(value) if key in [ 'ascertained', 'pruned', 'rate_variation', 'remove_constant_features', 'use_robust_eigensystem' ]: value = p.getboolean(section, key) if key in ['minimum_data']: value = p.getfloat(section, key) cfg[key] = value return cfg def get_geo_config(self, p, section): cfg = { 'name': 'geography', 'model': 'geo', 'log_locations': True, 'sampling_points': [], } for key, value in p[section].items(): if key == "log_locations": value = p.getboolean(section, key) elif key == "sampling_points": value = self.handle_file_or_list(value) cfg[key] = value return cfg def process(self): """ Prepares a Configuration object for being passed to the BeastXml constructor. This method checks the values of all options for invalid or ambiguous settings, internal consistency, etc. Information is read from external files as required. 
If this method returns without raising any exceptions then this should function as a guarantee that a BeastXml object can be instantiated from this Configuration with no problems. """ # Add dependency notice if required if self.monophyly and not self.starting_tree: self.messages.append( "[DEPENDENCY] ConstrainedRandomTree is implemented in the BEAST package BEASTLabs." ) # BEAST can't handle really long chains if self.chainlength > _BEAST_MAX_LENGTH: self.chainlength = _BEAST_MAX_LENGTH self.messages.append( "[INFO] Chain length truncated to %d, as BEAST cannot handle longer chains." % self.chainlength) # If log_every was not explicitly set to some non-zero # value, then set it such that we expect 10,000 log # entries if not self.log_every: # If chainlength < 10000, this results in log_every = zero. # This causes BEAST to die. # So in this case, just log everything. self.log_every = self.chainlength // 10000 or 1 self.load_glottolog_data() self.load_user_geo() self.instantiate_models() self.build_language_filter() self.process_models() self.build_language_list() self.handle_monophyly() self.instantiate_calibrations() # At this point, we can tell whether or not the tree's length units # can be treated as arbitrary self.arbitrary_tree = self.sample_branch_lengths and not self.calibrations # Now we can set the value of the ascertained attribute of each model # Ideally this would happen during process_models, but this is impossible # as set_ascertained() relies upon the value of arbitrary_tree defined above, # which itself depends on process_models(). Ugly... 
for m in self.models: m.set_ascertained() self.instantiate_clocks() self.link_clocks_to_models() self.starting_tree = self.handle_user_supplied_tree( self.starting_tree, "starting") self.processed = True # Decide whether or not to log trees if (self.starting_tree and not self.sample_topology and not self.sample_branch_lengths and all([c.is_strict for c in self.clocks if c.is_used])): self.tree_logging_pointless = True self.messages.append( "[INFO] Tree logging disabled because starting tree is known and fixed and all clocks are strict." ) else: self.tree_logging_pointless = False def load_glottolog_data(self): """ Loads the Glottolog classification information from the appropriate newick file, parses it and stores the required datastructure in self.classification. """ # Don't load if the analysis doesn't use it if not self.check_glottolog_required(): return # Don't load if we already have - can this really happen? if self.glottolog_loaded: return self.glottolog_loaded = True label2name = {} glottocode2node = {} def parse_label(label): match = GLOTTOLOG_NODE_LABEL.match(label) label2name[label] = (match.group('name').strip().replace( "\\'", "'"), match.group('glottocode')) return (match.group('name').strip(), match.group('glottocode'), match.group('isocode')) def get_classification(node): res = [] ancestor = node.ancestor while ancestor: res.append(label2name[ancestor.name]) ancestor = ancestor.ancestor return list(reversed(res)) # Walk the tree and build the classifications dictionary glottolog_trees = newick.read( get_glottolog_data('newick', self.glottolog_release)) for tree in glottolog_trees: for node in tree.walk(): name, glottocode, isocode = parse_label(node.name) classification = get_classification(node) self.classifications[glottocode] = classification if isocode: self.classifications[isocode] = classification glottocode2node[glottocode] = node # Load geographic metadata for t in reader(get_glottolog_data('geo', self.glottolog_release), namedtuples=True): 
if t.macroarea: self.glotto_macroareas[t.glottocode] = t.macroarea for isocode in t.isocodes.split(): self.glotto_macroareas[isocode] = t.macroarea if self.location_data: continue # Use user-supplied data instead if t.latitude and t.longitude: latlon = (float(t.latitude), float(t.longitude)) self.locations[t.glottocode] = latlon for isocode in t.isocodes.split(): self.locations[isocode] = latlon if self.location_data: return # Second pass of geographic data to handle dialects, which inherit # their parent language's location for t in reader(get_glottolog_data('geo', self.glottolog_release), namedtuples=True): if t.level == "dialect": failed = False node = glottocode2node[t.glottocode] ancestor = node.ancestor while label2name[ancestor.name][1] not in self.locations: if not ancestor.ancestor: # We've hit the root without finding an ancestral node # with location data! failed = True break else: ancestor = ancestor.ancestor if failed: continue latlon = self.locations[label2name[ancestor.name][1]] self.locations[t.glottocode] = latlon for isocode in t.isocodes.split(): self.locations[isocode] = latlon def check_glottolog_required(self): # We need Glottolog if... return ( # ...we've been given a list of families self.families # ...we've been given a list of macroareas or self.macroareas # ...we're using monophyly constraints or self.monophyly # ...we're using calibrations (well, sometimes) or self.calibration_configs # ...we're using geography or self.geo_config) def load_user_geo(self): if not self.location_data: return with io.open(self.location_data, encoding="UTF-8") as fp: # Skip header fp.readline() for line in fp: iso, lat, lon = line.split(",") self.locations[iso.strip().lower()] = float(lat), float(lon) def build_language_filter(self): """ Examines the values of various options, including self.languages and self.families, and constructs self.lang_filter. 
def handle_file_or_list(self, value):
    """
    Normalise an option value which may be a collection, a file name or a
    comma-separated string.

    :param value: A list or set (returned unchanged), the name of an
        existing file holding one entry per line, or a comma-separated
        string of entries.
    :return: The list or set that was passed in, or a list of the stripped,
        non-empty entries.

    When the value names an existing file, that name is also appended to
    self.files_to_embed.
    """
    if isinstance(value, (list, set)):
        # Already a collection (e.g. an untouched default value).
        return value
    if os.path.exists(value):
        with io.open(value, encoding="UTF-8") as fp:
            entries = [line.strip() for line in fp]
        self.files_to_embed.append(value)
    else:
        entries = [item.strip() for item in value.split(",")]
    # An empty option value, blank lines or stray commas must not turn into
    # "" entries: a list like [""] is truthy and would make the language
    # filters reject everything.
    return [entry for entry in entries if entry]
def handle_monophyly(self):
    """
    Construct a representation of the Glottolog monophyly constraints
    for the languages in self.languages.

    Nothing happens unless self.monophyly is set.  The constraints are
    switched off again (self.monophyly becomes False and an [INFO] message
    is recorded) when fewer than three languages are in the analysis or
    when the structure derived from Glottolog is flat.  A user-supplied
    self.monophyly_newick is passed through handle_user_supplied_tree and
    kept.  Otherwise the nested structure built from self.classifications
    is serialised and stored in self.monophyly_newick.

    :raises ValueError: if no explicit end depth is given and
        self.monophyly_direction is neither 'top_down' nor 'bottom_up'.
    """
    if not self.monophyly:
        return
    if len(self.languages) < 3:
        # Monophyly constraints are meaningless for so few languages
        self.monophyly = False
        self.messages.append(
            """[INFO] Disabling Glottolog monophyly constraints because there are only %d languages in analysis."""
            % len(self.languages))
        return
    if self.monophyly_newick:
        # The user has provided a tree, so no need to build our own
        self.monophyly_newick = self.handle_user_supplied_tree(
            self.monophyly_newick, "monophyly")
        return

    # Build a list-based representation of the Glottolog monophyly
    # constraints.  This can be done in either a "top-down" or
    # "bottom-up" way.  Only languages known to Glottolog take part.
    langs = [l for l in self.languages if l.lower() in self.classifications]
    if self.monophyly_end_depth is not None:
        # A power user has explicitly provided start and end depths
        start = self.monophyly_start_depth
        end = self.monophyly_end_depth
    elif self.monophyly_direction == "top_down":
        # Compute start and end in a top-down fashion
        start = self.monophyly_start_depth
        end = start + self.monophyly_levels
    elif self.monophyly_direction == "bottom_up":
        # Compute start and end in a bottom-up fashion.  The "or [0]"
        # keeps max() from failing when no language is classified.
        depths = [len(self.classifications[name.lower()]) for name in langs]
        end = max(depths or [0]) - self.monophyly_start_depth
        start = max(0, end - self.monophyly_levels)
    else:
        # Previously this fell through and died with an UnboundLocalError.
        raise ValueError(
            "Value for monophyly_direction needs to be either 'top_down' or 'bottom_up', not '%s'."
            % self.monophyly_direction)

    struct = self.make_monophyly_structure(langs, depth=start, maxdepth=end)
    # Make sure this struct is not pointlessly flat
    if not self.check_monophyly_structure(struct):
        self.monophyly = False
        self.messages.append(
            """[INFO] Disabling Glottolog monophyly constraints because all languages in the analysis are classified identically."""
        )
        # Constraints are off, so there is nothing to serialise; going on
        # would also crash on an empty structure.
        return
    # At this point everything looks good, so keep monophyly on and
    # serialise the "monophyly structure" into a Newick tree.
    self.monophyly_newick = self.make_monophyly_string(struct)
def check_monophyly_structure(self, struct):
    """
    Decide whether a monophyly structure is "meaningful".

    :param struct: A list as produced by make_monophyly_structure.
    :return: True as soon as one direct member of struct is itself a list,
        i.e. the structure encodes something other than an unstructured
        polytomy; False otherwise.
    """
    # TODO: Make this more rigorous.
    # Current test will fail ['foo', 'bar', 'baz'], but
    # will pass [['foo'], ['bar'], ['baz']], which is no better.
    for member in struct:
        if type(member) is list:
            return True
    # Struct is just a list of language names, with no internal structure
    return False
def instantiate_clocks(self):
    """
    Populates self.clocks with a list of clock objects, one for each
    dictionary of settings in self.clock_configs, and indexes them by name
    in self.clocks_by_name.

    A strict clock named "default" is added when no configured clock
    carries that name.

    :raises ValueError: if a clock configuration has no "type" entry or its
        type is not one of "strict", "relaxed" or "random".
    """
    self.clocks = []
    self.clocks_by_name = {}
    for config in self.clock_configs:
        if "type" not in config:
            raise ValueError(
                "Clock type not specified for clock section %s."
                % config["name"])
        clock_type = config["type"].lower()
        if clock_type == "strict":
            clock = strict.StrictClock(config, self)
        elif clock_type == "relaxed":
            clock = relaxed.relaxed_clock_factory(config, self)
        elif clock_type == "random":
            clock = random.RandomLocalClock(config, self)
        else:
            # Without this branch an unknown type either crashed with an
            # UnboundLocalError or silently registered the clock created
            # in the previous iteration a second time.
            raise ValueError(
                "Unknown clock type '%s' for clock section '%s'."
                % (config["type"], config["name"]))
        self.clocks.append(clock)
        self.clocks_by_name[clock.name] = clock
    # Create default clock if necessary
    if "default" not in self.clocks_by_name:
        config = {"name": "default", "type": "strict"}
        clock = strict.StrictClock(config, self)
        self.clocks.append(clock)
        self.clocks_by_name[clock.name] = clock
def link_clocks_to_models(self):
    """
    Ensures that for each model object in self.all_models, the attribute
    "clock" is a reference to one of the clock objects in self.clocks.
    Also determine which clock to estimate the mean of.

    A model's clock is looked up in self.clocks_by_name by its explicit
    clock name, else by the model's own name, else "default".  Clocks no
    model uses are dropped from self.clocks (with an [INFO] message unless
    it is the default clock).  Clocks whose estimate_rate is None get it
    set to True; if the tree is arbitrary and every model clock would be
    estimated, the first such clock (in model order) is fixed instead.

    :raises ValueError: if a model names a clock which does not exist.
    """
    for model in self.all_models:
        if model.clock:
            # User has explicitly specified a clock
            if model.clock not in self.clocks_by_name:
                raise ValueError(
                    "Unknown clock '%s' for model section '%s'."
                    % (model.clock, model.name))
            model.clock = self.clocks_by_name[model.clock]
        elif model.name in self.clocks_by_name:
            # Clock is associated by a common name
            model.clock = self.clocks_by_name[model.name]
        else:
            # No clock specification - use default
            model.clock = self.clocks_by_name["default"]
        model.clock.is_used = True

    # Disable pruned trees in models using RLCs
    for model in self.models:
        if model.pruned and isinstance(model.clock, random.RandomLocalClock):
            model.pruned = False
            self.messages.append(
                """[INFO] Disabling pruned trees in model %s because associated clock %s is a RandomLocalClock. Pruned trees are currently only compatible with StrictClocks and RelaxedClocks."""
                % (model.name, model.clock.name))

    # Warn user about unused clock(s) (but not the default clock)
    for clock in self.clocks:
        if clock.name != "default" and not clock.is_used:
            self.messages.append(
                """[INFO] Clock %s is not being used. Change its name to "default", or explicitly associate it with a model."""
                % clock.name)

    # Remove unused clocks from the master clock list
    self.clocks = [c for c in self.clocks if c.is_used]

    # Get a list of model (i.e. non-geo) clocks for which the user has not
    # indicated a preference on whether the mean should be estimated.
    # Duplicates are removed while keeping model order, so that the clock
    # picked below is the same on every run (a set gave an arbitrary one).
    free_clocks = []
    for model in self.models:
        clock = model.clock
        if (clock.is_used and clock.estimate_rate is None
                and clock not in free_clocks):
            free_clocks.append(clock)
    if free_clocks:
        # To begin with, estimate all free clocks
        for clock in free_clocks:
            clock.estimate_rate = True
        # But if the tree is arbitrary, then fix one free clock, unless the
        # user has fixed an un-free clock
        if self.arbitrary_tree and all(
                [m.clock.estimate_rate for m in self.models]):
            free_clocks[0].estimate_rate = False
            self.messages.append(
                """[INFO] Clock "%s" has had it's mean rate fixed to 1.0. Tree branch lengths are in units of expected substitutions for features in models using this clock."""
                % free_clocks[0].name)

    # Determine whether or not precision-scaling is required
    if self.geo_config:
        self.geo_model.scale_precision = False
        geo_clock = self.geo_model.clock
        for m in self.models:
            if m.clock == geo_clock:
                self.messages.append(
                    """[WARNING] Geography model is sharing a clock with one or more data models. This may lead to a bad fit."""
                )
                self.geo_model.scale_precision = True
                break
        # If geo has its own clock, estimate the mean
        if not self.geo_model.scale_precision:
            self.geo_model.clock.estimate_rate = True
% len(self.languages)) def instantiate_calibrations(self): self.calibrations = {} useless_calibrations = [] for clade, cs in self.calibration_configs.items(): orig_clade = clade[:] orig_cs = cs[:] originate = False # First parse the clade identifier # Might be "root", or else a Glottolog identifier if clade.lower() == "root": langs = self.languages else: # First check for originate() if clade.lower().startswith("originate(") and clade.endswith( ")"): originate = True clade = clade[10:-1] langs = self.get_languages_by_glottolog_clade(clade) if not langs or (len(langs) == 1 and not originate): self.messages.append( "[INFO] Calibration on clade %s MRCA ignored as one or zero matching languages in analysis." % clade) continue # Next parse the calibration string if cs.count("(") == 1 and cs.count(")") == 1: dist_type, cs = cs.split("(", 1) dist_type = dist_type.lower() cs = cs[0:-1] else: # Default to normal dist_type = "normal" if cs.count(",") == 1 and not any([x in cs for x in ("<", ">")]): # We've got explicit params p1, p2 = map(float, cs.split(",")) elif cs.count("-") == 1 and not any( [x in cs for x in (",", "<", ">")]): # We've got a 95% HPD range lower, upper = map(float, cs.split("-")) mid = (lower + upper) / 2.0 if dist_type == "normal": p1 = (upper + lower) / 2.0 p2 = (upper - mid) / 1.96 elif dist_type == "lognormal": p1 = math.log(mid) p2a = (p1 - math.log(lower)) / 1.96 p2b = (math.log(upper) - p1) / 1.96 p2 = (p2a + p2b) / 2.0 elif dist_type == "uniform": p1 = lower p2 = upper elif (cs.count("<") == 1 or cs.count(">") == 1) and not any([x in cs for x in (",", "-")]): # We've got a single bound dist_type = "uniform" sign, bound = cs.split() if sign.strip() == "<": p1 = 0.0 p2 = float(bound.strip()) else: p1 = float(bound.strip()) p2 = str(sys.maxsize) else: raise ValueError( "Could not parse calibration \"%s\" for clade %s" % (orig_cs, orig_clade)) clade_identifier = "%s_originate" % clade if originate else clade self.calibrations[clade_identifier] = 
def sanitise_tree(self, tree, tree_type):
    """
    Makes any changes to a user-provided tree required to make it
    suitable for passing to BEAST.

    In particular, this method checks that the supplied string:
        * seems to be a valid Newick tree
        * contains no duplicate taxa
        * has taxa which are a superset of the languages in the analysis
        * has no polytomies or unifurcations (starting trees only).

    :param tree: Newick string.
    :param tree_type: "starting" or "monophyly"; starting trees have their
        polytomies resolved, monophyly trees have their branch lengths
        removed.
    :return: The sanitised tree, serialised as a Newick string.
    :raises ValueError: if the tree cannot be parsed, contains duplicate
        taxa or lacks languages which are part of the analysis.
    """
    # Make sure tree can be parsed
    try:
        tree = newick.loads(tree)[0]
    except Exception:
        # Not a bare except, so KeyboardInterrupt/SystemExit still propagate.
        raise ValueError("Could not parse %s tree. Is it valid Newick?" % tree_type)

    # Make sure starting tree contains no duplicate taxa
    leaf_names = tree.get_leaf_names()
    tree_langs = set(leaf_names)
    if len(tree_langs) != len(leaf_names):
        dupes = sorted(l for l in tree_langs if leaf_names.count(l) > 1)
        dupestring = ",".join(
            ["%s (%d)" % (d, leaf_names.count(d)) for d in dupes])
        raise ValueError("%s tree contains duplicate taxa: %s" %
                         (tree_type.capitalize(), dupestring))

    # Make sure languages in tree is a superset of languages in the analysis
    wanted_langs = set(self.languages)
    missing_langs = wanted_langs.difference(tree_langs)
    if missing_langs:
        miss_string = ",".join(sorted(missing_langs))
        raise ValueError(
            "Some languages in the data are not in the %s tree: %s" %
            (tree_type, miss_string))

    # If the tree's language set is a proper superset, prune the tree to
    # fit the analysis.  self.languages may be a list, which never compares
    # equal to a set, so the comparison has to be made set against set.
    if tree_langs != wanted_langs:
        tree.prune_by_names(self.languages, inverse=True)
        self.messages.append(
            "[INFO] %s tree includes languages not present in any data set and will be pruned."
            % tree_type.capitalize())

    # Get the tree looking nice
    tree.remove_redundant_nodes()
    if tree_type == "starting":
        tree.resolve_polytomies()

    # Remove lengths for a monophyly tree
    if tree_type == "monophyly":
        for n in tree.walk():
            n._length = None

    # Checks
    if tree_type == "starting":
        assert all([len(n.descendants) in (0, 2) for n in tree.walk()])
    assert len(tree.get_leaves()) == len(self.languages)
    assert all([l.name for l in tree.get_leaves()])

    # Done
    return newick.dumps(tree)
def read_from_file(self, configfile):
    """
    Load INI-style configuration and override the default option values
    with whatever the configuration sets.

    :param configfile: Either a dictionary (handed to read_dict), a single
        file name, or an iterable of file names which are read in order.
    :raises ValueError: if 'overlap' has an invalid value, or if the
        configuration has neither model sections nor a geography section.
    """
    parser = INI(interpolation=None)
    self.configfile = parser
    if isinstance(configfile, dict):
        parser.read_dict(configfile)
    else:
        if isinstance(configfile, six.string_types):
            filenames = (configfile,)
        else:
            filenames = configfile
        for filename in filenames:
            parser.read(filename)

    def get_lowercased(section, option):
        return parser.get(section, option).lower()

    # (section, ((option, converter), ...)): every option present in the
    # configuration becomes an attribute of the same name.
    typed_options = (
        ('admin', (
            ('basename', parser.get),
            ('embed_data', parser.getboolean),
            ('screenlog', parser.getboolean),
            ('log_every', parser.getint),
            ('log_all', parser.getboolean),
            ('log_probabilities', parser.getboolean),
            ('log_fine_probs', parser.getboolean),
            ('log_params', parser.getboolean),
            ('log_trees', parser.getboolean),
            ('log_pure_tree', parser.getboolean),
            ('glottolog_release', parser.get),
        )),
        ('MCMC', (
            ('chainlength', parser.getint),
            ('sample_from_prior', parser.getboolean),
        )),
        ('languages', (
            ('exclusions', parser.get),
            ('languages', parser.get),
            ('families', parser.get),
            ('macroareas', parser.get),
            ('location_data', parser.get),
            ('overlap', parser.get),
            ('starting_tree', parser.get),
            ('sample_branch_lengths', parser.getboolean),
            ('sample_topology', parser.getboolean),
            ('monophyly_start_depth', parser.getint),
            ('monophyly_end_depth', parser.getint),
            ('monophyly_levels', parser.getint),
            ('monophyly_direction', get_lowercased),
        )),
    )
    for section, options in typed_options:
        for option, convert in options:
            if parser.has_option(section, option):
                setattr(self, option, convert(section, option))

    ## MCMC
    self.sample_from_prior |= self.prior
    if self.prior and not self.basename.endswith("_prior"):
        self.basename += "_prior"

    ## Languages
    if self.overlap.lower() not in ("union", "intersection"):  # pragma: no cover
        raise ValueError(
            "Value for overlap needs to be either 'union', or 'intersection'."
        )
    # "monophyletic" takes precedence over its alias "monophyly"
    for alias in ("monophyletic", "monophyly"):
        if parser.has_option("languages", alias):
            self.monophyly = parser.getboolean("languages", alias)
            break

    ## Calibration
    if parser.has_section("calibration"):
        for clade, calibration in parser.items("calibration"):
            self.calibration_configs[clade] = calibration

    ## Clocks
    for name in parser.sections():
        if name.lower().startswith("clock"):
            self.clock_configs.append(self.get_clock_config(parser, name))

    ## Models
    model_sections = []
    for name in parser.sections():
        if name.lower().startswith("model"):
            model_sections.append(name)
            self.model_configs.append(self.get_model_config(parser, name))

    # Geography
    self.geo_config = (
        self.get_geo_config(parser, "geography")
        if parser.has_section("geography") else {})

    # Make sure analysis is non-empty
    if not (model_sections or self.geo_config):
        raise ValueError("Config file contains no model sections and no geography section.")
class Configuration(object): """ A container object for all of the settings which define a BEASTling analysis. Configuration objects are initialised with default values for all options. """ def __init__(self, basename="beastling", configfile=None, stdin_data=False, prior=False): """ Set all options to their default values and then, if a configuration file has been provided, override the default values for those options set in the file. """ # Options set by the user, with default values self.basename = basename+"_prior" if prior else basename """This will be used as a common prefix for output filenames (e.g. the log will be called basename.log).""" self.calibration_configs = {} """A dictionary whose keys are glottocodes or lowercase Glottolog clade names, and whose values are length-2 tuples of flatoing point dates (lower and upper bounds of 95% credible interval).""" self.chainlength = 10000000 """Number of iterations to run the Markov chain for.""" self.clock_configs = [] """A list of dictionaries, each of which specifies the configuration for a single clock model.""" self.embed_data = False """A list of languages to exclude from the analysis, or a name of a file containing such a list.""" self.exclusions = "" """A boolean value, controlling whether or not to embed data files in the XML.""" self.families = [] """List of families to filter down to, or name of a file containing such a list.""" self.geo_config = {} """A dictionary with keys and values corresponding to a [geography] section in a configuration file.""" self.glottolog_release = '2.7' """A string representing a Glottolog release number.""" self.languages = [] """List of languages to filter down to, or name of a file containing such a list.""" self.location_data = None """Name of a file containing latitude/longitude data.""" self.log_all = False """A boolean value, setting this True is a shortcut for setting log_params, log_probabilities and log_trees True.""" self.log_every = 0 """An integer indicating 
how many MCMC iterations should occurr between consecutive log entries.""" self.log_params = False """A boolean value, controlling whether or not to log model parameters.""" self.log_probabilities = True """A boolean value, controlling whether or not to log the prior, likelihood and posterior of the analysis.""" self.log_fine_probs = True """A boolean value, controlling whether or not to log individuaal components of the prior and likelihood,.""" self.log_trees = True """A boolean value, controlling whether or not to log the sampled trees.""" self.log_pure_tree = False """A boolean value, controlling whether or not to log a separate file of the sampled trees with no metadata included.""" self.macroareas = [] """List of Glottolog macro-areas to filter down to, or name of a file containing such a list.""" self.model_configs = [] """A list of dictionaries, each of which specifies the configuration for a single clock model.""" self.monophyly = False """A boolean parameter, controlling whether or not to enforce monophyly constraints derived from Glottolog's classification.""" self.monophyly_start_depth = 0 """Integer; Starting depth in the Glottlog classification hierarchy for monophyly constraints""" self.monophyly_end_depth = None """Integer; Ending depth in the Glottlog classification hierarchy for monophyly constraints""" self.monophyly_levels = sys.maxsize """Integer; Number of levels of the Glottolog classification to include in monophyly constraints.""" self.monophyly_direction = "top_down" """Either the string 'top_down' or 'bottom_up', controlling whether 'monophyly_levels' counts from roots (families) or leaves (languages) of the Glottolog classification.""" self.overlap = "union" """Either the string 'union' or the string 'intersection', controlling how to handle multiple datasets with non-equal language sets.""" self.sample_branch_lengths = True """A boolean value, controlling whether or not to estimate tree branch lengths.""" self.sample_from_prior = False 
"""Boolean parameter; if True, data is ignored and the MCMC chain will sample from the prior.""" self.sample_topology = True """A boolean value, controlling whether or not to estimate tree topology.""" self.screenlog = True """A boolean parameter, controlling whether or not to log some basic output to stdout.""" self.starting_tree = "" """A starting tree in Newick format, or the name of a file containing the same.""" self.stdin_data = stdin_data # Glottolog data self.glottolog_loaded = False self.classifications = {} self.glotto_macroareas = {} self.locations = {} # Options set from the command line interface self.prior = prior # Stuff we compute ourselves self.processed = False self.configfile = None self.files_to_embed = [] self.messages = [] self.message_flags = [] if configfile: self.read_from_file(configfile) def read_from_file(self, configfile): """ Read one or several INI-style configuration files and overwrite default option settings accordingly. """ self.configfile = INI(interpolation=None) if isinstance(configfile, dict): self.configfile.read_dict(configfile) else: if isinstance(configfile, six.string_types): configfile = (configfile,) for conf in configfile: self.configfile.read(conf) p = self.configfile for sec, opts in { 'admin': { 'basename': p.get, 'embed_data': p.getboolean, 'screenlog': p.getboolean, 'log_every': p.getint, 'log_all': p.getboolean, 'log_probabilities': p.getboolean, 'log_fine_probs': p.getboolean, 'log_params': p.getboolean, 'log_trees': p.getboolean, 'log_pure_tree': p.getboolean, 'glottolog_release': p.get, }, 'MCMC': { 'chainlength': p.getint, 'sample_from_prior': p.getboolean, }, 'languages': { 'exclusions': p.get, 'languages': p.get, 'families': p.get, 'macroareas': p.get, 'location_data': p.get, 'overlap': p.get, 'starting_tree': p.get, 'sample_branch_lengths': p.getboolean, 'sample_topology': p.getboolean, 'monophyly_start_depth': p.getint, 'monophyly_end_depth': p.getint, 'monophyly_levels': p.getint, 'monophyly_direction': 
def get_clock_config(self, p, section):
    """
    Turn one clock section of a parsed configuration into a settings dict.

    :param p: The parsed configuration.
    :param section: Name of the section; whatever follows its first five
        characters, stripped, becomes the default 'name' entry.
    :return: Dictionary holding 'name' plus every option of the section;
        'estimate_mean' is converted to a boolean, all other values are
        kept as found.
    """
    settings = {'name': section[5:].strip()}
    settings.update(
        (option, p.getboolean(section, option) if option == 'estimate_mean' else raw)
        for option, raw in p[section].items())
    return settings
def get_geo_config(self, p, section):
    """
    Turn the geography section of a parsed configuration into a settings
    dict.

    :param p: The parsed configuration.
    :param section: Name of the section to read.
    :return: Dictionary pre-filled with 'name', 'model' and 'log_locations'
        defaults and updated with every option of the section;
        'log_locations' is converted to a boolean.
    """
    settings = dict(name='geography', model='geo', log_locations=True)
    settings.update(
        (option, p.getboolean(section, option) if option == "log_locations" else raw)
        for option, raw in p[section].items())
    return settings
self.log_every = self.chainlength // 10000 or 1 self.load_glottolog_data() self.load_user_geo() self.build_language_filter() self.instantiate_models() self.build_language_list() self.handle_monophyly() self.instantiate_calibrations() # At this point, we can tell whether or not the tree's length units # can be treated as arbitrary self.arbitrary_tree = self.sample_branch_lengths and not self.calibrations self.instantiate_clocks() self.link_clocks_to_models() self.handle_starting_tree() self.processed = True # Decide whether or not to log trees if ( self.starting_tree and not self.sample_topology and not self.sample_branch_lengths and all([c.is_strict for c in self.clocks if c.is_used]) ): self.tree_logging_pointless = True self.messages.append( "[INFO] Tree logging disabled because starting tree is known and fixed and all clocks are strict.") else: self.tree_logging_pointless = False def load_glottolog_data(self): """ Loads the Glottolog classification information from the appropriate newick file, parses it and stores the required datastructure in self.classification. """ # Don't load if the analysis doesn't use it if not self.check_glottolog_required(): return # Don't load if we already have - can this really happen? 
if self.glottolog_loaded: return self.glottolog_loaded = True label2name = {} glottocode2node = {} def parse_label(label): match = GLOTTOLOG_NODE_LABEL.match(label) label2name[label] = (match.group('name').strip().replace("\\'","'"), match.group('glottocode')) return ( match.group('name').strip(), match.group('glottocode'), match.group('isocode')) def get_classification(node): res = [] ancestor = node.ancestor while ancestor: res.append(label2name[ancestor.name]) ancestor = ancestor.ancestor return list(reversed(res)) # Walk the tree and build the classifications dictionary glottolog_trees = newick.read(get_glottolog_data('newick', self.glottolog_release)) for tree in glottolog_trees: for node in tree.walk(): name, glottocode, isocode = parse_label(node.name) classification = get_classification(node) self.classifications[glottocode] = classification if isocode: self.classifications[isocode] = classification glottocode2node[glottocode] = node # Load geographic metadata for t in reader( get_glottolog_data('geo', self.glottolog_release), namedtuples=True): if t.macroarea: self.glotto_macroareas[t.glottocode] = t.macroarea for isocode in t.isocodes.split(): self.glotto_macroareas[isocode] = t.macroarea if self.location_data: continue # Use user-supplied data instead if t.latitude and t.longitude: latlon = (float(t.latitude), float(t.longitude)) self.locations[t.glottocode] = latlon for isocode in t.isocodes.split(): self.locations[isocode] = latlon if self.location_data: return # Second pass of geographic data to handle dialects, which inherit # their parent language's location for t in reader( get_glottolog_data('geo', self.glottolog_release), namedtuples=True): if t.level == "dialect": failed = False node = glottocode2node[t.glottocode] ancestor = node.ancestor while label2name[ancestor.name][1] not in self.locations: if not ancestor.ancestor: # We've hit the root without finding an ancestral node # with location data! 
failed = True break else: ancestor = ancestor.ancestor if failed: continue latlon = self.locations[label2name[ancestor.name][1]] self.locations[t.glottocode] = latlon for isocode in t.isocodes.split(): self.locations[isocode] = latlon def check_glottolog_required(self): # We need Glottolog if... return ( # ...we've been given a list of families self.families # ...we've been given a list of macroareas or self.macroareas # ...we're using monophyly constraints or self.monophyly # ...we're using calibrations (well, sometimes) or self.calibration_configs # ...we're using geography or self.geo_config ) def load_user_geo(self): if not self.location_data: return with io.open(self.location_data, encoding="UTF-8") as fp: # Skip header fp.readline() for line in fp: iso, lat, lon = line.split(",") self.locations[iso.strip().lower()] = map(float, (lat, lon)) def build_language_filter(self): """ Examines the values of various options, including self.languages and self.families, and constructs self.lang_filter. self.lang_filter is a Set object containing all ISO and glotto codes which are compatible with the provided settings (e.g. belong to the requested families). This set is later used as a mask with data sets. Datapoints with language identifiers not in this set will not be used in an analysis. 
""" # Load requirements self.languages = self.handle_file_or_list(self.languages) if len(self.families) == 1: self.messages.append("""[WARNING] value of 'families' has length 1: have you misspelled a filename?""") self.families = self.handle_file_or_list(self.families) self.exclusions = set(self.handle_file_or_list(self.exclusions)) self.macroareas = self.handle_file_or_list(self.macroareas) def handle_file_or_list(self, value): if not (isinstance(value, list) or isinstance(value, set)): if os.path.exists(value): with io.open(value, encoding="UTF-8") as fp: result = [x.strip() for x in fp.readlines()] self.files_to_embed.append(value) else: result = [x.strip() for x in value.split(",")] else: result = value return result def filter_language(self, l): if self.languages and l not in self.languages: return False if self.families and not any([name in self.families or glottocode in self.families for (name, glottocode) in self.classifications.get(l,[])]): return False if self.macroareas and self.glotto_macroareas.get(l,None) not in self.macroareas: return False if self.exclusions and l in self.exclusions: return False if self.geo_config and l not in self.locations: self.messages.append("""[INFO] All models: Language %s excluded due to lack of loation data.""" % l) return False return True def handle_monophyly(self): """ Construct a representation of the Glottolog monophyly constraints for the languages in self.languages. If the constraints are meaningful, create and store a Newick tree representation of them. If the constraints are not meaningful, e.g. all languages are classified identically by Glottolog, then override the monophyly=True setting. 
""" if not self.monophyly: return if len(self.languages) < 3: # Monophyly constraints are meaningless for so few languages self.monophyly = False self.messages.append("""[INFO] Disabling Glottolog monophyly constraints because there are only %d languages in analysis.""" % len(self.languages)) # Build a list-based representation of the monophyly constraints # This can be done in either a "top-down" or "bottom-up" way. langs = [l for l in self.languages if l.lower() in self.classifications] if self.monophyly_end_depth is not None: # A power user has explicitly provided start and end depths start = self.monophyly_start_depth end = self.monophyly_end_depth elif self.monophyly_direction == "top_down": # Compute start and end in a top-down fashion start = self.monophyly_start_depth end = start + self.monophyly_levels elif self.monophyly_direction == "bottom_up": # Compute start and end in a bottom-up fashion classifications = [self.classifications[name.lower()] for name in langs] end = max([len(c) for c in classifications]) - self.monophyly_start_depth start = max(0, end - self.monophyly_levels) struct = self.make_monophyly_structure(langs, depth=start, maxdepth=end) # Make sure this struct is not pointlessly flat if not self.check_monophyly_structure(struct): self.monophyly = False self.messages.append("""[INFO] Disabling Glottolog monophyly constraints because all languages in the analysis are classified identically.""") # At this point everything looks good, so keep monophyly on and serialise the "monophyly structure" into a Newick tree. self.monophyly_newick = self.make_monophyly_string(struct) def make_monophyly_structure(self, langs, depth, maxdepth): """ Recursively partition a list of languages (ISO or Glottocodes) into lists corresponding to their Glottolog classification. The process may be halted part-way down the Glottolog tree. 
""" if depth > maxdepth: # We're done, so terminate recursion return langs def subgroup(name, depth): ancestors = self.classifications[name.lower()] return ancestors[depth][0] if depth < len(ancestors) else '' def sortkey(i): """ Callable to pass into `sorted` to port sorting behaviour from py2 to py3. :param i: Either a string or a list (of lists, ...) of strings. :return: Pair (nesting level, first string) """ d = 0 while isinstance(i, list): d -= 1 i = i[0] if i else '' return d, i # Find the ancestor of all the given languages at at particular depth # (i.e. look `depth` nodes below the root of the Glottolog tree) levels = list(set([subgroup(l, depth) for l in langs])) if len(levels) == 1: # If all languages belong to the same classificatio at this depth, # there are two possibilities if levels[0] == "": # If the common classification is an empty string, then we know # that there is no further refinement possible, so stop # the recursion here. langs.sort() return langs else: # If the common classification is non-empty, we need to # descend further, since some languages might get # separated later return self.make_monophyly_structure(langs, depth+1, maxdepth) else: # If the languages belong to multiple classifications, split them # up accordingly and then break down each classification # individually. partition = [[l for l in langs if subgroup(l, depth) == level] for level in levels] partition = [part for part in partition if part] return sorted( [self.make_monophyly_structure(group, depth+1, maxdepth) for group in partition], key=sortkey) def check_monophyly_structure(self, struct): """ Return True if the monophyly structure represented by struct is considered "meaningful", i.e. encodes something other than an unstructured polytomy. """ # TODO: Make this more rigorous. # Current test will fail ['foo', 'bar', 'baz'], but # will pass [['foo'], ['bar'], ['baz']], which is no better. 
if not any([type(x) == list for x in struct]): # Struct is just a list of language names, with no internal structure return False return True def make_monophyly_string(self, struct, depth=0): """ Converts a structure of nested lists into Newick string. """ if not type([]) in [type(x) for x in struct]: return "(%s)" % ",".join(struct) else: return "(%s)" % ",".join([self.make_monophyly_string(substruct) for substruct in struct]) def instantiate_clocks(self): """ Populates self.clocks with a list of BaseClock subclasses, one for each dictionary of settings in self.clock_configs. """ self.clocks = [] self.clocks_by_name = {} for config in self.clock_configs: if config["type"].lower() == "strict": clock = strict.StrictClock(config, self) elif config["type"].lower() == "relaxed": clock = relaxed.relaxed_clock_factory(config, self) elif config["type"].lower() == "random": clock = random.RandomLocalClock(config, self) self.clocks.append(clock) self.clocks_by_name[clock.name] = clock # Create default clock if necessary if "default" not in self.clocks_by_name: config = {} config["name"] = "default" config["type"] = "strict" clock = strict.StrictClock(config, self) self.clocks.append(clock) self.clocks_by_name[clock.name] = clock def instantiate_models(self): """ Populates self.models with a list of BaseModel subclasses, one for each dictionary of settings in self.model_configs. """ if not (self.model_configs or self.geo_config): raise ValueError("No models or geography specified!") # Handle request to read data from stdin if self.stdin_data: for config in self.model_configs: config["data"] = "stdin" self.models = [] for config in self.model_configs: # Validate config if "model" not in config: raise ValueError("Model not specified for model section %s." % config["name"]) if "data" not in config: raise ValueError("Data source not specified in model section %s." 
% config["name"]) # Instantiate model if config["model"].lower() == "bsvs": model = bsvs.BSVSModel(config, self) if "bsvs_used" not in self.message_flags: self.message_flags.append("bsvs_used") self.messages.append(bsvs.BSVSModel.package_notice) elif config["model"].lower() == "covarion": model = covarion.CovarionModel(config, self) elif config["model"].lower() == "mk": model = mk.MKModel(config, self) if "mk_used" not in self.message_flags: self.message_flags.append("mk_used") self.messages.append(mk.MKModel.package_notice) else: try: sys.path.insert(0, os.getcwd()) module_path, class_name = config["model"].rsplit(".",1) module = importlib.import_module(module_path) UserClass = getattr(module, class_name) except: raise ValueError("Unknown model type '%s' for model section '%s', and failed to import a third-party model." % (config["model"], config["name"])) model = UserClass(config, self) self.messages.extend(model.messages) self.models.append(model) if self.geo_config: self.geo_model = geo.GeoModel(self.geo_config, self) self.messages.extend(self.geo_model.messages) self.all_models = [self.geo_model] + self.models else: self.all_models = self.models def link_clocks_to_models(self): """ Ensures that for each model object in self.models, the attribute "clock" is a reference to one of the clock objects in self.clocks. Also determine which clock to estimate the mean of. """ for model in self.all_models: if model.clock: # User has explicitly specified a clock if model.clock not in self.clocks_by_name: raise ValueError("Unknown clock '%s' for model section '%s'." 
% (model.clock, model.name)) model.clock = self.clocks_by_name[model.clock] elif model.name in self.clocks_by_name: # Clock is associated by a common name model.clock = self.clocks_by_name[model.name] else: # No clock specification - use default model.clock = self.clocks_by_name["default"] model.clock.is_used = True # Warn user about unused clock(s) (but not the default clock) for clock in self.clocks: if clock.name != "default" and not clock.is_used: self.messages.append("""[INFO] Clock %s is not being used. Change its name to "default", or explicitly associate it with a model.""" % clock.name) # Remove unused clocks from the master clock list self.clocks = [c for c in self.clocks if c.is_used] # Get a list of model (i.e. non-geo) clocks for which the user has not # indicated a preference on whether the mean should be estimated free_clocks = list(set([m.clock for m in self.models if m.clock.is_used and m.clock.estimate_mean == None])) if free_clocks: # To begin with, estimate all free clocks for clock in free_clocks: clock.estimate_mean = True # But if the tree is arbitrary, then fix one free clock, unless the # user has fixed an un-free clock if self.arbitrary_tree and all( [m.clock.estimate_mean for m in self.models]): free_clocks[0].estimate_mean = False self.messages.append("""[INFO] Clock "%s" has had it's mean fixed to 1.0. Tree branch lengths are in units of expected substitutions for features in models using this clock.""" % free_clocks[0].name) # Determine whether or not precision-scaling is required if self.geo_config: self.geo_model.scale_precision = False geo_clock = self.geo_model.clock for m in self.models: if m.clock == geo_clock: self.messages.append("""[WARNING] Geography model is sharing a clock with one or more data models. 
This may lead to a bad fit.""") self.geo_model.scale_precision = True break # If geo has it's own clock, estimate the mean if not self.geo_model.scale_precision: self.geo_model.clock.estimate_mean = True def build_language_list(self): """ Combines the language sets of each model's data set, according to the value of self.overlap, to construct a final list of all the languages in the analysis. """ if self.models: self.languages = set(self.models[0].data.keys()) else: # There are no models # So this must be a geography-only analysis # Start with all languages in Glottolog, then apply filters self.languages = [l for l in self.classifications if self.filter_language(l)] self.overlap_warning = False for model in self.models: addition = set(model.data.keys()) # If we're about to do a non-trivial union/intersect, alert the # user. if addition != self.languages and not self.overlap_warning: self.messages.append("""[INFO] Not all data files have equal language sets. BEASTling will use the %s of all language sets. Set the "overlap" option in [languages] to change this.""" % self.overlap.lower()) self.overlap_warning = True if self.overlap.lower() == "union": self.languages = set.union(self.languages, addition) elif self.overlap.lower() == "intersection": self.languages = set.intersection(self.languages, addition) ## Make sure there's *something* left if not self.languages: raise ValueError("No languages specified!") ## Convert back into a sorted list self.languages = sorted(self.languages) self.messages.append("[INFO] %d languages included in analysis." 
% len(self.languages)) def instantiate_calibrations(self): self.calibrations = {} useless_calibrations = [] for clade, cs in self.calibration_configs.items(): orig_clade = clade[:] orig_cs = cs[:] originate = False # First parse the clade identifier # Might be "root", or else a Glottolog identifier if clade.lower() == "root": langs = self.languages else: # First check for originate() if clade.lower().startswith("originate(") and clade.endswith(")"): originate = True clade = clade[10:-1] langs = self.get_languages_by_glottolog_clade(clade) if len(langs) < 2: self.messages.append("[INFO] Calibration on clade %s ignored as no matching languages in analysis." % clade) continue # Next parse the calibration string if cs.count("(") == 1 and cs.count(")") == 1: dist_type, cs = cs.split("(", 1) dist_type = dist_type.lower() cs = cs[0:-1] else: # Default to normal dist_type = "normal" if cs.count(",") == 1 and not any([x in cs for x in ("<", ">")]): # We've got explicit params p1, p2 = map(float,cs.split(",")) elif cs.count("-") == 1 and not any([x in cs for x in (",", "<", ">")]): # We've got a 95% HPD range lower, upper = map(float, cs.split("-")) mid = (lower+upper) / 2.0 if dist_type == "normal": p1 = (upper + lower) / 2.0 p2 = (upper - mid) / 1.96 elif dist_type == "lognormal": p1 = math.log(mid) p2a = (p1 - math.log(lower)) / 1.96 p2b = (math.log(upper) - p1) / 1.96 p2 = (p2a+p2b)/2.0 elif dist_type == "uniform": p1 = lower p2 = upper elif (cs.count("<") == 1 or cs.count(">") == 1) and not any([x in cs for x in (",", "-")]): # We've got a single bound dist_type = "uniform" sign, bound = cs.split() if sign.strip() == "<": p1 = 0.0 p2 = float(bound.strip()) else: p1 = float(bound.strip()) p2 = "Infinity" else: raise ValueError("Could not parse calibration \"%s\" for clade %s" % (orig_cs, orig_clade)) clade_identifier = "originate_%s" % clade if originate else clade self.calibrations[clade_identifier] = Calibration(langs, originate, dist_type, p1, p2) def 
get_languages_by_glottolog_clade(self, clade): langs = [] for l in self.languages: for name, glottocode in self.classifications.get(l.lower(),""): if clade.lower() == name.lower() or clade.lower() == glottocode: langs.append(l) break return langs def handle_starting_tree(self): """ Makes any changes to the user-provided starting tree required to make it suitable for passing to BEAST. In particular, this method checks that the supplied string or the contents of the supplied file: * seems to be a valid Newick tree * contains no duplicate taxa * has taxa which are a superset of the languages in the analysis * has no polytomies or unifurcations. """ if os.path.exists(self.starting_tree): with io.open(self.starting_tree, encoding="UTF-8") as fp: self.starting_tree = fp.read().strip() if self.starting_tree: # Make sure starting tree can be parsed try: tree = newick.loads(self.starting_tree)[0] except: raise ValueError("Could not parse starting tree. Is it valid Newick?") # Make sure starting tree contains no duplicate taxa tree_langs = [n.name for n in tree.walk() if n.is_leaf] if not len(set(tree_langs)) == len(tree_langs): dupes = [l for l in tree_langs if tree_langs.count(l) > 1] dupestring = ",".join(["%s (%d)" % (d, tree_langs.count(d)) for d in dupes]) raise ValueError("Starting tree contains duplicate taxa: %s" % dupestring) tree_langs = set(tree_langs) # Make sure languges in tree is a superset of languages in the analysis if not tree_langs.issuperset(self.languages): missing_langs = set(self.languages).difference(tree_langs) miss_string = ",".join(missing_langs) raise ValueError("Some languages in the data are not in the starting tree: %s" % miss_string) # If the trees' language set is a proper superset, prune the tree to fit the analysis if not tree_langs == self.languages: tree.prune_by_names(self.languages, inverse=True) self.messages.append("[INFO] Starting tree includes languages not present in any data set and will be pruned.") # Get the tree looking nice 
tree.remove_redundant_nodes() tree.resolve_polytomies() # Replace the starting_tree from the config with the new one self.starting_tree = newick.dumps(tree)
def from_path(cls, path, api=None):
    """Build an instance from a directory holding `BIBFILES.ini` and a `bibtex/` folder.

    :param path: Directory, as a `Path` or anything `Path()` accepts.
    :param api: Passed through unchanged to `cls._iterbibfiles`.
    :return: `cls` constructed from the bibfiles that `cls._iterbibfiles` yields.
    """
    root = path if isinstance(path, Path) else Path(path)
    ini = INI.from_file(root / 'BIBFILES.ini', interpolation=None)
    bibfiles = cls._iterbibfiles(ini, root / 'bibtex', api=api)
    return cls(bibfiles)
def test_existing_config(self):
    # Seed the temporary directory with an ini file ...
    seed = INI()
    seed.read_dict({'section': {'option': '12'}})
    seed.write(self.tmp_path('test.ini'))

    # ... and check that Config picks up its content instead of starting empty.
    loaded = Config('test', dir_=self.tmp_path())
    self.assertEqual(loaded.get('section', 'option'), '12')
def read_from_file(self, configfile):
    """
    Read one or several INI-style configuration files and overwrite
    default option settings accordingly.

    :param configfile: A dict (loaded via ``read_dict``), a single file
        name, or an iterable of file names read in order.
    :raises ValueError: if ``overlap`` is neither "union" nor
        "intersection", if a geo_priors section is present without a
        geography section, or if there is neither a model section nor a
        geography section.
    """
    self.configfile = INI(interpolation=None)
    # Keep option names case-sensitive.
    self.configfile.optionxform = str
    if isinstance(configfile, dict):
        self.configfile.read_dict(configfile)
    else:
        if isinstance(configfile, six.string_types):
            configfile = (configfile, )
        for conf in configfile:
            self.configfile.read(conf)
    p = self.configfile

    # section -> option -> getter used to parse that option. Only options
    # actually present in the file overwrite the current attribute value.
    option_getters = {
        'admin': {
            'basename': p.get,
            'embed_data': p.getboolean,
            'screenlog': p.getboolean,
            'log_all': p.getboolean,
            'log_dp': p.getint,
            'log_every': p.getint,
            'log_probabilities': p.getboolean,
            'log_fine_probs': p.getboolean,
            'log_params': p.getboolean,
            'log_trees': p.getboolean,
            'log_pure_tree': p.getboolean,
            'glottolog_release': p.get,
        },
        'MCMC': {
            'chainlength': p.getint,
            'sample_from_prior': p.getboolean,
        },
        'languages': {
            'exclusions': p.get,
            'languages': p.get,
            'families': p.get,
            'macroareas': p.get,
            'location_data': p.get,
            'overlap': p.get,
            'starting_tree': p.get,
            'sample_branch_lengths': p.getboolean,
            'sample_topology': p.getboolean,
            'monophyly_start_depth': p.getint,
            'monophyly_end_depth': p.getint,
            'monophyly_levels': p.getint,
            'monophyly_direction': lambda s, o: p.get(s, o).lower(),
        },
    }
    for sec, opts in option_getters.items():
        for opt, getter in opts.items():
            if p.has_option(sec, opt):
                setattr(self, opt, getter(sec, opt))

    ## MCMC
    self.sample_from_prior |= self.prior
    if self.prior and not self.basename.endswith("_prior"):
        self.basename += "_prior"

    ## Languages
    sec = "languages"
    if self.overlap.lower() not in ("union", "intersection"):  # pragma: no cover
        raise ValueError(
            "Value for overlap needs to be either 'union', or 'intersection'."
        )
    # "monophyletic" takes precedence over "monophyly" if both are given.
    if p.has_option(sec, "monophyletic"):
        self.monophyly = p.getboolean(sec, "monophyletic")
    elif p.has_option(sec, "monophyly"):
        self.monophyly = p.getboolean(sec, "monophyly")
    if p.has_option(sec, "monophyly_newick"):
        value = p.get(sec, "monophyly_newick")
        # The option is either a file name or the Newick string itself.
        if os.path.exists(value):
            with io.open(value, encoding="UTF-8") as fp:
                self.monophyly_newick = fp.read()
        else:
            self.monophyly_newick = value
    if p.has_option(sec, 'minimum_data'):
        self.minimum_data = p.getfloat(sec, "minimum_data")

    ## Calibration
    if p.has_section("calibration"):
        for clade, calibration in p.items("calibration"):
            self.calibration_configs[clade] = calibration

    ## Clocks
    clock_sections = [
        s for s in p.sections() if s.lower().startswith("clock")
    ]
    for section in clock_sections:
        self.clock_configs.append(self.get_clock_config(p, section))

    ## Models
    model_sections = [
        s for s in p.sections() if s.lower().startswith("model")
    ]
    for section in model_sections:
        self.model_configs.append(self.get_model_config(p, section))

    # Geography
    if p.has_section("geography"):
        self.geo_config = self.get_geo_config(p, "geography")
    else:
        self.geo_config = {}

    if p.has_section("geo_priors"):
        if not p.has_section("geography"):
            raise ValueError(
                "Config file contains geo_priors section but no geography section."
            )
        self.geo_config["geo_priors"] = {}
        # The geography config is not guaranteed to carry this key, so
        # create it on demand instead of indexing it directly.
        sampling_points = self.geo_config.setdefault("sampling_points", [])
        for clades, klm in p.items("geo_priors"):
            # One option may list several comma-separated clades sharing
            # the same prior.
            for clade in clades.split(','):
                clade = clade.strip()
                if clade not in sampling_points:
                    sampling_points.append(clade)
                self.geo_config["geo_priors"][clade] = klm
    sampled_points = self.geo_config.get("sampling_points", [])
    # Loop variable deliberately not called ``p``: under Python 2 a list
    # comprehension would rebind the parser.
    non_root_points = [pt for pt in sampled_points if pt.lower() != "root"]
    if non_root_points and self.sample_topology and not self.monophyly:
        self.messages.append(
            "[WARNING] Geographic sampling and/or prior specified for clades other than root, but tree topology is being sampled without monophyly constraints. BEAST may crash."
        )

    # Make sure analysis is non-empty
    if not model_sections and not self.geo_config:
        raise ValueError(
            "Config file contains no model sections and no geography section."
        )
def read_ini(filename, interpolation=None):
    """Load *filename* with ``INI.from_file`` and return the result.

    :param filename: Passed to ``INI.from_file`` as its first argument.
    :param interpolation: Forwarded as the ``interpolation`` keyword.
    """
    ini = INI.from_file(filename, interpolation=interpolation)
    return ini
def _bibfiles(cls, directory, config, endwith):
    """Yield a BibFile for every matching section of an INI file.

    The INI file is looked up one level above *directory*; only sections
    whose name ends with *endwith* are used, and each section name is
    taken as a file name inside *directory*.

    :param directory: Directory containing the files named by the sections.
    :param config: Name of the INI file, relative to the parent of *directory*.
    :param endwith: Suffix a section name must have to be considered.
    """
    ini_path = os.path.join(directory, '..', config)
    cfg = INI(interpolation=None)
    cfg.read(ini_path)

    for section in (name for name in cfg.sections() if name.endswith(endwith)):
        filepath = os.path.join(directory, section)
        assert os.path.exists(filepath)

        # A literal "none" (any case) in the INI file means: no sort key.
        raw_sortkey = cfg.get(section, 'sortkey')
        sortkey = None if raw_sortkey.lower() == 'none' else raw_sortkey

        # Options are read in the same order as they are passed on.
        options = dict(
            filepath=filepath,
            encoding=cfg.get(section, 'encoding'),
            sortkey=sortkey,
            use_pybtex=cfg.getboolean(section, 'use_pybtex'),
            priority=cfg.getint(section, 'priority'),
            name=cfg.get(section, 'name'),
            title=cfg.get(section, 'title'),
            description=cfg.get(section, 'description'),
            abbr=cfg.get(section, 'abbr'))
        yield BibFile(**options)