def get_score(wl, ref, mode, taxA, taxB, concepts_attr='concepts',
              ignore_missing=False):
    if mode in ['shared', 'jaccard']:
        listA, listB = [wl.get_list(col=tax, entry=ref) for tax in [taxA, taxB]]
        shared = [x for x in listA if x in listB and x != 0]
        if mode == 'jaccard':
            return 1 - len(set(shared)) / len(set(listA + listB))
        return len(shared)

    assert mode == 'swadesh'
    # get the two dictionaries mapping concepts to cognate IDs
    dictA, dictB = [wl.get_dict(col=tax, entry=ref) for tax in [taxA, taxB]]

    # count the number of shared concepts
    shared, missing = 0, 0
    for concept in getattr(wl, concepts_attr):
        if concept not in dictA or concept not in dictB:
            if not ignore_missing:
                missing += 1
        elif [k for k in dictA[concept] if k in dictB[concept]]:
            shared += 1

    try:
        return 1 - shared / (wl.height - missing)
    except ZeroDivisionError:
        log.get_logger().exception(
            "Zero-division error encountered in '{0}' and '{1}'.".format(
                taxA, taxB))
        return 1.0

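# Usage sketch for get_score (hedged): assumes a LingPy Wordlist built from a
# QLC file that carries cognate IDs in a 'cogid' column; the file name and
# the doculect labels below are hypothetical placeholders.
from lingpy import Wordlist

wl = Wordlist('wordlist.qlc')  # hypothetical input file
# number of shared non-zero cognate IDs between two doculects
n_shared = get_score(wl, 'cogid', 'shared', 'German', 'English')
# Jaccard distance between the two cognate-ID inventories
d_jaccard = get_score(wl, 'cogid', 'jaccard', 'German', 'English')
# Swadesh-style distance: 1 minus the share of concepts with a common cognate
d_swadesh = get_score(wl, 'cogid', 'swadesh', 'German', 'English')
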
def write_text_file(path, content, normalize=None, log=True):
    """Write a text file encoded in utf-8.

    :param path: File-system path of the file.
    :param content: The text content to be written.
    :param normalize: If not `None`, a valid unicode normalization mode must
        be passed.
    """
    if not isinstance(content, text_type):
        content = lines_to_text(content)
    with io.open(_str_path(path, mkdir=True), 'w', encoding='utf8') as fp:
        fp.write(unicodedata.normalize(normalize, content) if normalize else content)
    if log:
        get_logger().info("Data has been written to file <{0}>.".format(_str_path(path)))

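# Usage sketch for write_text_file (hedged): the file names are placeholders.
# content may be a single string or, via lines_to_text, an iterable of lines;
# normalize accepts any mode understood by unicodedata.normalize ('NFC',
# 'NFD', 'NFKC', 'NFKD').
write_text_file('output.txt', 'pater noster', normalize='NFC')
write_text_file('lines.txt', ['first line', 'second line'], log=False)
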
def _recreate_unpicklables(self):
    """Run `eval` on the string representations."""
    self.log = log.get_logger()
    self._class = {key: eval(value) for key, value in self._class_string.items()}

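# Hedged sketch of the round-trip _recreate_unpicklables performs: classes
# themselves cannot be pickled safely, so the parser stores their string
# representations and evals them back into classes on unpickling. The mapping
# below is a hypothetical illustration, not lingpy's actual configuration.
_class_string = {'ipa': 'str', 'cogid': 'int'}
_class = {key: eval(value) for key, value in _class_string.items()}
assert _class['cogid'] is int and _class['ipa'] is str
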
def _log(method='debug', with_context_manager=False, level=logging.DEBUG):
    logger = log.get_logger(
        test=True, force_default_config=True, config_dir=self.tmp_path())
    method = getattr(logger, method)
    if with_context_manager:
        with log.Logging(logger=logger, level=level):
            method('')
    else:
        method('')

def _log(method='debug', with_context_manager=False, level=logging.DEBUG):
    logger = log.get_logger(test=True, force_default_config=True, config_dir=tmppath)
    method = getattr(logger, method)
    if with_context_manager:
        with log.Logging(logger=logger, level=level):
            method('')
    else:
        method('')
    return capsys.readouterr().err

def __exit__(self, type, value, traceback):
    self.fp.close()
    if self.log:
        get_logger().info("Data has been written to file <{0}>.".format(
            _str_path(self.path)))

def test_new_config(self):
    new_cfg = log.get_logger(config_dir=self.tmp.as_posix(), test=True)
    self.assertTrue(hasattr(new_cfg, 'info'))

def calculate_data(wordlist, data, taxa='taxa', concepts='concepts',
                   ref='cogid', **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data that shall be calculated. Currently supports

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods
        * "diversity": calculate the diversity of the wordlist based on
          its cognate sets

    """
    logger = log.get_logger()
    util.setdefaults(
        keywords,
        distances=False,
        tree_calc="upgma",
        cluster="upgma",
        force=False,
        threshold=0.5,
        cluster_method='upgma')

    # get taxa for the current calculation
    these_taxa = getattr(wordlist, taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(wordlist, taxa, concepts, ref, **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warning(
                "Reference tree has already been calculated, "
                "force overwrite by setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'], keywords['distances'])
    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warning(
                "Groups have already been calculated, "
                "force overwrite by setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa, keywords['cluster_method'])
    logger.info("Successfully calculated {0}.".format(data))

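# Usage sketch for calculate_data (hedged): assumes a LingPy Wordlist whose
# 'cogid' column is already populated (e.g. by cognate detection); the file
# name below is a hypothetical placeholder. Results land in the meta data.
from lingpy import Wordlist

wl = Wordlist('wordlist-with-cognates.qlc')  # hypothetical input file
calculate_data(wl, 'dst')                          # -> wl._meta['distances']
calculate_data(wl, 'tree', tree_calc='neighbor')   # -> wl._meta['tree']
calculate_data(wl, 'cluster', threshold=0.45)      # -> wl._meta['groups']
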
def __init__(self, filename, conf=''):
    """
    Parse data regularly if the data has not been loaded from a pickled version.
    """
    self.log = log.get_logger()

    # try to load the data
    internal_import = False

    # check whether it's a dictionary from which we load
    if isinstance(filename, dict):
        input_data = filename
        if 'filename' not in input_data:
            self.filename = rcParams['filename']
        internal_import = True

        # check for correct input; there was a bug with a wrong evaluation
        # which is hopefully fixed by now
        tmp_keys = [k for k in input_data if isinstance(k, int)]
        if len(input_data[0]) != len(input_data[tmp_keys[0]]):
            print(input_data[0], input_data[tmp_keys[0]])
            raise ValueError("[!] Wrong input format!")  # pragma: no cover
    # check whether it's another wordlist-object
    elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
        input_data = {key: list(value) for key, value in filename._data.items()}
        input_data.update(filename._meta.items())
        input_data[0] = [a for a, b in sorted(
            filename.header.items(), key=lambda x: x[1], reverse=False)]
        internal_import = True
        self.filename = rcParams['filename']
    # or whether the data is an actual file
    elif isinstance(filename, string_types) and os.path.isfile(filename):
        input_data = read_qlc(filename)
        self.filename = filename
    # raise an error otherwise
    elif isinstance(filename, string_types):
        raise IOError("Input file '{0}' does not exist.".format(filename))
    else:
        raise TypeError("Unrecognized type for 'filename' argument: {0}".format(
            type(filename).__name__))

    # load the configuration file
    if not conf:
        conf = util.data_path('conf', 'qlc.rc')

    # read the file defined by its path in conf
    tmp = [line.split('\t') for line in util.read_config_file(conf)]

    # define the attributes _alias, _class, _class_string and _alias2, which
    # store the aliases and the datatypes (classes) of the given entries
    self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}
    for name, cls, alias in tmp:
        # make sure the name itself is there
        self._alias[name.lower()] = self._alias[name.upper()] = name
        self._class[name.lower()] = self._class[name.upper()] = eval(cls)
        self._class_string[name.lower()] = self._class_string[name.upper()] = cls

        # add the aliases
        for a in alias.split(','):
            self._alias[a.lower()] = self._alias[a.upper()] = name
            self._class[a.lower()] = self._class[a.upper()] = eval(cls)
            self._class_string[a.lower()] = self._class_string[a.upper()] = cls

        self._alias2[name] = sorted(set(alias.split(','))) + [name]

    # append the names in data[0] to self.conf to make sure that all data is
    # covered, even the types which are not specifically defined in the conf
    # file; the datatype defaults to "str"
    for name in input_data[0]:
        if name.lower() not in self._alias:
            self._alias[name.lower()] = name.lower()
            self._class[name.lower()] = str
        if name.upper() not in self._alias:
            self._alias[name.upper()] = name.lower()
            self._class[name.upper()] = str

    # add an empty alias for empty strings XXX why was that? I can't remember
    # why this was important XXX
    self._alias[''] = ''

    # the header stores the indices of the data in the original data dictionary
    self.header = dict(
        zip([self._alias[x] for x in input_data[0]], range(len(input_data[0]))))

    # now create a specific header which has all aliases
    self._header = {k: v for k, v in self.header.items()}

    # add a sorted header for reference
    self.columns = sorted(self.header, key=lambda x: self.header[x])

    # assign all aliases to the header
    for alias in self._alias:
        try:
            self._header[alias] = self._header[self._alias[alias]]
        except KeyError:
            pass

    # assign the data as attribute to the word list class. Note that we need
    # to check for the type here, but since numpy also offers integer types,
    # we don't check for type(x) == int; instead we use str.isnumeric, which
    # is True only for integers
    self._data = {
        int(k): v for k, v in input_data.items()
        if k != 0 and str(k).isnumeric()}

    # check that all rows have the same number of fields as the header
    check_errors = ''
    for k, v in self._data.items():
        if len(v) != len(self.header):
            check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                k, len(v), len(self.header))
    if check_errors:
        raise ValueError(check_errors + '\n' + ', '.join(sorted(self.header)))

    # iterate over self._data and coerce the values to their declared classes
    # (only needed when reading from file)
    if not internal_import:
        heads = sorted(self._header.items(), key=lambda x: x[1])
        for key in self._data:
            check = []
            for head, i in heads:
                if i not in check:
                    logstring = (
                        'Problem with row {0} in col {1}, expected'
                        ' «{4}» as datatype but received «{3}»'
                        ' (ROW: {2}, entry {5}).')
                    try:
                        self._data[key][i] = self._class[head](self._data[key][i])
                        check.append(i)
                    except (KeyError, ValueError):
                        log.warning(logstring.format(
                            key, i,
                            '|'.join([str(x) for x in self._data[key]]),
                            self._data[key][i],
                            self._class[head],
                            head))

    # create the entries attribute of the wordlist
    self.entries = sorted(set([b.lower() for a, b in self._alias.items() if b]))

    # assign meta-data
    self._meta = {}
    for key in [k for k in input_data if type(k) != int]:
        self._meta[key] = input_data[key]

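# A hedged sketch of the conf format parsed above: each line of the rc file
# is expected to hold a tab-separated triple of canonical column name, class
# name, and comma-separated aliases (the concrete entries in lingpy's
# conf/qlc.rc may differ from this made-up example).
sample_conf_line = 'ipa\tstr\tphonetic,transcription'
name, cls, alias = sample_conf_line.split('\t')
assert name == 'ipa' and eval(cls) is str
assert alias.split(',') == ['phonetic', 'transcription']
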
def test_default_config(self):
    from lingpy.log import get_logger
    logger = get_logger(config_dir=self.tmp, force_default_config=True)
    self.assertTrue(hasattr(logger, 'info'))

def test_new_config(fresh_logger, tmppath):
    new_cfg = log.get_logger(config_dir=tmppath, test=True)
    assert hasattr(new_cfg, 'info')

def test_default_config(self):
    default_cfg = log.get_logger(
        config_dir=self.tmp.as_posix(), force_default_config=True)
    self.assertTrue(hasattr(default_cfg, 'info'))

def test_new_config(self):
    new_cfg = log.get_logger(config_dir=self.tmp, test=True)
    self.assertTrue(hasattr(new_cfg, 'info'))

def test_new_config(self):
    from lingpy.log import get_logger
    logger = get_logger(config_dir=self.tmp, test=True)
    self.assertTrue(hasattr(logger, 'info'))

def __init__(self, filename, conf=''):
    """
    Parse data regularly if the data has not been loaded from a pickled version.
    """
    self.log = log.get_logger()

    # try to load the data
    internal_import = False

    # check whether it's a dictionary from which we load
    if isinstance(filename, dict):
        input_data = filename
        if 'filename' not in input_data:
            self.filename = rcParams['filename']
        internal_import = True

        # check for correct input; there was a bug with a wrong evaluation
        # which is hopefully fixed by now
        tmp_keys = [k for k in input_data if isinstance(k, int)]
        if len(input_data[0]) != len(input_data[tmp_keys[0]]):
            log.warning('{0} {1}'.format(input_data[0], input_data[tmp_keys[0]]))
            raise ValueError("[!] Wrong input format!")  # pragma: no cover
    # check whether it's another wordlist-object
    elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
        input_data = {key: list(value) for key, value in filename._data.items()}
        input_data.update(filename._meta.items())
        input_data[0] = [a for a, b in sorted(
            filename.header.items(), key=lambda x: x[1], reverse=False)]
        internal_import = True
        self.filename = rcParams['filename']
    # or whether the data is an actual file
    elif isinstance(filename, string_types) and os.path.isfile(filename):
        input_data = read_qlc(filename)
        self.filename = filename
    # raise an error otherwise
    elif isinstance(filename, string_types):
        raise IOError("Input file '{0}' does not exist.".format(filename))
    else:
        raise TypeError("Unrecognized type for 'filename' argument: {0}".format(
            type(filename).__name__))

    self._alias, self._class, self._class_string, self._alias2 = read_conf(conf)

    # make sure that all columns in the data are covered, even the types
    # which are not specifically defined in the conf file; the datatype
    # defaults to "str"
    for name in input_data[0]:
        if name.lower() not in self._alias:
            self._alias[name.lower()] = name.lower()
            self._class[name.lower()] = str
        if name.upper() not in self._alias:
            self._alias[name.upper()] = name.lower()
            self._class[name.upper()] = str

    # add an empty alias for empty strings XXX why was that? I can't remember
    # why this was important XXX
    self._alias[''] = ''

    # the header stores the indices of the data in the original data dictionary
    self.header = dict(
        zip([self._alias[x] for x in input_data[0]], range(len(input_data[0]))))

    # now create a specific header which has all aliases
    self._header = {k: v for k, v in self.header.items()}

    # add a sorted header for reference
    self.columns = sorted(self.header, key=lambda x: self.header[x])

    # assign all aliases to the header
    for alias in self._alias:
        try:
            self._header[alias] = self._header[self._alias[alias]]
        except KeyError:
            pass

    # assign the data as attribute to the word list class. Note that we need
    # to check for the type here, but since numpy also offers integer types,
    # we don't check for type(x) == int; instead we use str.isnumeric, which
    # is True only for integers
    self._data = {
        int(k): v for k, v in input_data.items()
        if k != 0 and str(k).isnumeric()}

    # check that all rows have the same number of fields as the header
    check_errors = ''
    for k, v in self._data.items():
        if len(v) != len(self.header):
            check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                k, len(v), len(self.header))
    if check_errors:
        raise ValueError(check_errors + '\n' + ', '.join(sorted(self.header)))

    # iterate over self._data and coerce the values to their declared classes
    # (only needed when reading from file)
    if not internal_import:
        heads = sorted(self._header.items(), key=lambda x: x[1])
        for key in self._data:
            check = []
            for head, i in heads:
                if i not in check:
                    logstring = (
                        'Problem with row {0} in col {1}, expected'
                        ' «{4}» as datatype but received «{3}»'
                        ' (ROW: {2}, entry {5}).')
                    try:
                        self._data[key][i] = self._class[head](self._data[key][i])
                        check.append(i)
                    except (KeyError, ValueError):
                        log.warning(logstring.format(
                            key, i,
                            '|'.join([str(x) for x in self._data[key]]),
                            self._data[key][i],
                            self._class[head],
                            head))

    # create the entries attribute of the wordlist
    self.entries = sorted(set([b.lower() for a, b in self._alias.items() if b]))

    # assign meta-data
    self._meta = {}
    for key in [k for k in input_data if type(k) != int]:
        self._meta[key] = input_data[key]

def test_default_config(fresh_logger, tmppath):
    default_cfg = log.get_logger(config_dir=tmppath, force_default_config=True)
    assert hasattr(default_cfg, 'info')

def test_default_config(self):
    default_cfg = log.get_logger(config_dir=self.tmp, force_default_config=True)
    self.assertTrue(hasattr(default_cfg, 'info'))