Example #1
def get_score(
        wl, ref, mode, taxA, taxB, concepts_attr='concepts',
        ignore_missing=False):
    if mode in ['shared', 'jaccard']:
        listA, listB = [
                wl.get_list(col=tax, entry=ref) for tax in [taxA, taxB]]
        shared = [x for x in listA if x in listB if x != 0]

        if mode == 'jaccard':
            return 1 - len(set(shared)) / len(set(listA + listB))
        return len(shared)

    assert mode == 'swadesh'
    # get the two dictionaries
    dictA, dictB = [wl.get_dict(col=tax, entry=ref) for tax in [taxA, taxB]]

    # count the number of shared concepts
    shared, missing = 0, 0

    for concept in getattr(wl, concepts_attr):
        if concept not in dictA or concept not in dictB:
            if not ignore_missing:
                missing += 1
        elif [k for k in dictA[concept] if k in dictB[concept]]:
            shared += 1

    try:
        return 1 - shared / (wl.height - missing)
    except ZeroDivisionError:
        log.get_logger().exception(
            "Zero-division error encountered in '{0}' and '{1}'.".format(
                taxA, taxB))
        return 1.0
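A minimal usage sketch for get_score, assuming a lingpy-style wordlist that provides the get_list/get_dict methods and the height attribute used above; the file name and taxon names are placeholders:

from lingpy import Wordlist

wl = Wordlist('harry.qlc')  # hypothetical QLC file with a 'cogid' column

# Jaccard distance between two taxa, based on shared cognate IDs
dist = get_score(wl, 'cogid', 'jaccard', 'German', 'English')

# Swadesh-style distance; with ignore_missing=True, concepts absent from
# either taxon are not subtracted from the denominator
swadesh = get_score(wl, 'cogid', 'swadesh', 'German', 'English',
                    ignore_missing=True)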
Example #2
def write_text_file(path, content, normalize=None, log=True):
    """Write a text file encoded in utf-8.

    :param path: File-system path of the file.
    :param content: The text content to be written.
    :param normalize: If not `None`, a valid unicode normalization form
        (e.g. "NFC") must be passed.
    :param log: If `True`, log a message once the file has been written.
    """
    if not isinstance(content, text_type):
        content = lines_to_text(content)
    with io.open(_str_path(path, mkdir=True), 'w', encoding='utf8') as fp:
        fp.write(unicodedata.normalize(normalize, content) if normalize else content)
    if log:
        get_logger().info("Data has been written to file <{0}>.".format(_str_path(path)))
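A usage sketch, assuming the function lives in a util module next to lines_to_text and _str_path as above; note that content may also be an iterable of lines, which lines_to_text joins. The path is a placeholder:

write_text_file('output/data.txt', ['first line', 'second line'],
                normalize='NFC')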
Example #3
def write_text_file(path, content, normalize=None, log=True):
    """Write a text file encoded in utf-8.

    :param path: File-system path of the file.
    :param content: The text content to be written.
    :param normalize: If not `None`, a valid unicode normalization form
        (e.g. "NFC") must be passed.
    :param log: If `True`, log a message once the file has been written.
    """
    if not isinstance(content, text_type):
        content = lines_to_text(content)
    with io.open(_str_path(path, mkdir=True), 'w', encoding='utf8') as fp:
        fp.write(
            unicodedata.normalize(normalize, content) if normalize else content
        )
    if log:
        get_logger().info("Data has been written to file <{0}>.".format(
            _str_path(path)))
Example #4
def _recreate_unpicklables(self):
    """run `eval` on the string representations."""
    self.log = log.get_logger()
    self._class = {
        key: eval(value)
        for key, value in self._class_string.items()
    }
Example #5
def _log(method='debug', with_context_manager=False, level=logging.DEBUG):
    logger = log.get_logger(
        test=True, force_default_config=True, config_dir=self.tmp_path())
    method = getattr(logger, method)
    if with_context_manager:
        with log.Logging(logger=logger, level=level):
            method('')
    else:
        method('')
Example #6
    def _log(method='debug', with_context_manager=False, level=logging.DEBUG):
        logger = log.get_logger(test=True, force_default_config=True, config_dir=tmppath)
        method = getattr(logger, method)

        if with_context_manager:
            with log.Logging(logger=logger, level=level):
                method('')
        else:
            method('')
        return capsys.readouterr().err
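Since this variant returns whatever the logger wrote to stderr, a test can assert on the captured output. A sketch of such a test, assuming _log is nested inside it (so it can see the capsys and tmppath fixtures) and that the test logger writes to stderr:

def test_warning_level_filters_debug(capsys, tmppath):
    # _log as defined above, nested here so it closes over the fixtures.
    # With log.Logging raising the level to WARNING, the DEBUG record
    # should be filtered out, so (assuming nothing else writes to
    # stderr) the captured output should be empty.
    err = _log(method='debug', with_context_manager=True,
               level=logging.WARNING)
    assert not err.strip()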
Example #7
        def _log(method='debug', with_context_manager=False,
                 level=logging.DEBUG):
            logger = log.get_logger(test=True, force_default_config=True,
                                    config_dir=self.tmp_path())

            method = getattr(logger, method)

            if with_context_manager:
                with log.Logging(logger=logger, level=level):
                    method('')
            else:
                method('')
Example #8
def test_new_config(self):
    l = log.get_logger(config_dir=self.tmp.as_posix(), test=True)
    self.assertTrue(hasattr(l, 'info'))
Example #9
def __exit__(self, type, value, traceback):
    self.fp.close()
    if self.log:
        get_logger().info("Data has been written to file <{0}>.".format(
            _str_path(self.path)))
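The __exit__ above implies an enclosing writer context manager. A sketch of what the rest of the class might look like, inferred from the attributes it uses; the class name and constructor are assumptions, and io/_str_path are assumed to be the same helpers as in the write_text_file examples:

class TextFile(object):
    """Hypothetical context manager matching the __exit__ shown above."""

    def __init__(self, path, log=True):
        self.path = path
        self.log = log

    def __enter__(self):
        # _str_path with mkdir=True is assumed to create parent
        # directories, as in the write_text_file examples above
        self.fp = io.open(_str_path(self.path, mkdir=True), 'w',
                          encoding='utf8')
        return self.fp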
Example #10
def test_new_config(self):
    new_cfg = log.get_logger(config_dir=self.tmp.as_posix(), test=True)
    self.assertTrue(hasattr(new_cfg, 'info'))
Example #11
def calculate_data(wordlist,
                   data,
                   taxa='taxa',
                   concepts='concepts',
                   ref='cogid',
                   **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data to calculate. Currently supported:

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods
    """
    logger = log.get_logger()
    util.setdefaults(keywords,
                     distances=False,
                     tree_calc="upgma",
                     cluster="upgma",
                     force=False,
                     threshold=0.5,
                     cluster_method='upgma')

    # get taxa for current calculation; getattr is the safe equivalent of
    # the original eval('wordlist.' + taxa)
    these_taxa = getattr(wordlist, taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(wordlist, taxa, concepts, ref,
                                             **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
            logger.warning("Reference tree has already been calculated, "
                           "force overwrite by "
                           "setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])

    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
            logger.warning("Distance matrix has already been calculated, "
                           "force overwrite by "
                           "setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))
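A usage sketch, assuming a cognate-coded lingpy wordlist (the file name is a placeholder); results are stored in the wordlist's _meta dictionary:

from lingpy import Wordlist

wl = Wordlist('harry.qlc')                     # hypothetical cognate-coded data
calculate_data(wl, 'dst')                      # -> wl._meta['distances']
calculate_data(wl, 'tree')                     # -> wl._meta['tree']
calculate_data(wl, 'cluster', threshold=0.4)   # -> wl._meta['groups']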
Example #12
    def __init__(self, filename, conf=''):
        """
        Parse data regularly if the data has not been loaded from a pickled version.
        """
        self.log = log.get_logger()

        # try to load the data
        internal_import = False

        # check whether it's a dictionary from which we load
        if isinstance(filename, dict):
            input_data = filename
            if 'filename' not in input_data:
                self.filename = rcParams['filename']
            internal_import = True
            # make check for correct input, there was a bug with a wrong
            # evaluation which is hopefully fixed by now
            tmp_keys = [k for k in input_data if isinstance(k, int)]
            if len(input_data[0]) != len(input_data[tmp_keys[0]]):
                print(input_data[0], input_data[tmp_keys[0]])
                raise ValueError("[!] Wrong input format!")  # pragma: no cover
        # check whether it's another wordlist-object
        elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
            input_data = {key: list(value)
                          for key, value in filename._data.items()}
            input_data.update(filename._meta.items())
            input_data[0] = [a for a, b in sorted(
                filename.header.items(),
                key=lambda x: x[1],
                reverse=False)]
            internal_import = True
            self.filename = rcParams['filename']
        # or whether the data is an actual file
        elif isinstance(filename, string_types) and os.path.isfile(filename):
            input_data = read_qlc(filename)
            self.filename = filename
        # raise an error otherwise
        elif isinstance(filename, string_types):
            raise IOError("Input file '{0}' does not exist.".format(filename))
        else:
            raise TypeError("Unrecognized type for 'filename' argument: {0}".format(
                type(filename).__name__))

        # load the configuration file
        if not conf:
            conf = util.data_path('conf', 'qlc.rc')

        # read the file defined by its path in conf
        tmp = [line.split('\t') for line in util.read_config_file(conf)]

        # define two attributes, _alias, and _class which store the aliases and
        # the datatypes (classes) of the given entries
        self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}
        for name, cls, alias in tmp:
            # make sure the name itself is there
            self._alias[name.lower()] = self._alias[name.upper()] = name
            self._class[name.lower()] = self._class[name.upper()] = eval(cls)
            self._class_string[name.lower()] = self._class_string[name.upper()] = cls

            # add the aliases
            for a in alias.split(','):
                self._alias[a.lower()] = self._alias[a.upper()] = name
                self._class[a.lower()] = self._class[a.upper()] = eval(cls)
                self._class_string[a.lower()] = self._class_string[a.upper()] = cls

            self._alias2[name] = sorted(set(alias.split(','))) + [name]

        # append the names in data[0] to self.conf to make sure that all data
        # is covered, even the types which are not specifically defined in the
        # conf file. the datatype defaults here to "str"
        for name in input_data[0]:
            if name.lower() not in self._alias:
                self._alias[name.lower()] = name.lower()
                self._class[name.lower()] = str
            if name.upper() not in self._alias:
                self._alias[name.upper()] = name.lower()
                self._class[name.upper()] = str

        # add empty alias for empty strings XXX why was that? I can't remember
        # why this was important XXX
        self._alias[''] = ''

        # the header stores the indices of the data in the original data dictionary
        self.header = dict(
            zip([self._alias[x] for x in input_data[0]], range(len(input_data[0]))))

        # now create a specific header which has all aliases
        self._header = {k: v for k, v in self.header.items()}

        # add a sorted header for reference
        self.columns = sorted(self.header, key=lambda x: self.header[x])

        # assign all aliases to the header
        for alias in self._alias:
            try:
                self._header[alias] = self._header[self._alias[alias]]
            except KeyError:
                pass

        # assign the data as attribute to the word list class. Note that we
        # need to check for the type here, but since numpy also offers integer
        # types, we don't check for type(x) == int and instead use the
        # str.isnumeric method, which returns True only for strings that
        # represent integers
        self._data = {
            int(k): v for k, v in input_data.items() if k != 0 and str(k).isnumeric()}
        # check for same length of all columns
        check_errors = ''
        for k, v in self._data.items():
            if len(v) != len(self.header):
                check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                    k, len(v), len(self.header))
        if check_errors:
            raise ValueError(check_errors + '\n' + ', '.join(sorted(self.header)))

        # iterate over self._data and change the values according to the
        # functions (only needed when reading from file)
        if not internal_import:
            heads = sorted(self._header.items(), key=lambda x: x[1])
            for key in self._data:
                check = []
                for head, i in heads:
                    if i not in check:
                        logstring = 'Problem with row {0} in col {1}, expected' + \
                                    ' «{4}» as datatype but received «{3}» ' + \
                                    ' (ROW: {2}, entry {5}).'
                        try:
                            self._data[key][i] = self._class[head](self._data[key][i])
                            check.append(i)
                        except (KeyError, ValueError):
                            log.warning(
                                logstring.format(
                                    key,
                                    i,
                                    '|'.join([str(x) for x in self._data[key]]),
                                    self._data[key][i],
                                    self._class[head],
                                    head))

        # create entry attribute of the wordlist
        self.entries = sorted({b.lower() for b in self._alias.values() if b})

        # assign meta-data
        self._meta = {}
        for key in [k for k in input_data if not isinstance(k, int)]:
            self._meta[key] = input_data[key]
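From the length checks above one can infer the dictionary input format: key 0 holds the header row, and every integer key holds one data row of the same length. A minimal sketch (the enclosing class is not shown in this snippet, so Parser and the column names below are placeholders):

input_data = {
    0: ['doculect', 'concept', 'ipa'],   # header row under key 0
    1: ['German', 'hand', 'hant'],       # integer keys hold the rows
    2: ['English', 'hand', 'hænd'],
}
parser = Parser(input_data)              # hypothetical enclosing class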
Example #13
    def test_default_config(self):
        from lingpy.log import get_logger

        log = get_logger(config_dir=self.tmp, force_default_config=True)
        self.assertTrue(hasattr(log, 'info'))
Example #14
def test_new_config(fresh_logger, tmppath):
    new_cfg = log.get_logger(config_dir=tmppath, test=True)
    assert hasattr(new_cfg, 'info')
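This pytest variant relies on two fixtures that are not part of the snippet. A sketch of what they might look like; the names match the test signature, but the reset logic is an assumption:

import pytest
from lingpy import log

@pytest.fixture
def tmppath(tmp_path):
    # pytest's built-in tmp_path fixture provides a per-test directory
    return str(tmp_path)

@pytest.fixture
def fresh_logger():
    # Assumption: clear any module-level logger cache so every test
    # configures its own logger from scratch
    log._logger = None
    yield
    log._logger = None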
Example #15
def __exit__(self, type, value, traceback):
    self.fp.close()
    if self.log:
        get_logger().info("Data has been written to file <{0}>.".format(
            _str_path(self.path)))
Example #16
def test_default_config(self):
    default_cfg = log.get_logger(config_dir=self.tmp.as_posix(),
                                 force_default_config=True)
    self.assertTrue(hasattr(default_cfg, 'info'))
Example #17
def calculate_data(
        wordlist,
        data,
        taxa='taxa',
        concepts='concepts',
        ref='cogid',
        **keywords):
    """
    Manipulate a wordlist object by adding different kinds of data.

    Parameters
    ----------
    data : str
        The type of data to calculate. Currently supported:

        * "tree": calculate a reference tree based on shared cognates
        * "dst": get distances between taxa based on shared cognates
        * "cluster": cluster the taxa into groups using different methods
    """
    logger = log.get_logger()
    util.setdefaults(
        keywords,
        distances=False,
        tree_calc="upgma",
        cluster="upgma",
        force=False,
        threshold=0.5,
        cluster_method='upgma')

    # get taxa for current calculation; getattr is the safe equivalent of
    # the original eval('wordlist.' + taxa)
    these_taxa = getattr(wordlist, taxa)

    # calculate distances
    if data in ['distances', 'dst']:
        wordlist._meta['distances'] = wl2dst(
                wordlist, taxa, concepts, ref, **keywords)
    elif data in ['diversity', 'div']:
        etd = wordlist.get_etymdict(ref=ref)
        wordlist._meta['diversity'] = \
            (len(etd) - wordlist.height) / (len(wordlist) - wordlist.height)
    elif data in ['tre', 'tree', 'nwk']:
        if 'distances' not in wordlist._meta:
            wordlist._meta['distances'] = \
                wl2dst(wordlist, taxa, concepts, ref, **keywords)
        distances = wordlist._meta['distances']
        if 'tree' in wordlist._meta and not keywords['force']:
        logger.warning(
                "Reference tree has already been calculated, "
                "force overwrite by "
                "setting 'force' to 'True'.")
            return
        wordlist._meta['tree'] = clustering.matrix2tree(
            distances, these_taxa, keywords['tree_calc'],
            keywords['distances'])

    elif data in ['groups', 'cluster']:
        if 'distances' not in wordlist._meta:
            distances = wl2dst(wordlist, taxa, concepts, ref, **keywords)
        else:
            distances = wordlist._meta['distances']
        if 'groups' in wordlist._meta and not keywords['force']:
        logger.warning(
                "Groups have already been calculated, "
                "force overwrite by "
                "setting 'force' to 'True'.")
            return
        wordlist._meta['groups'] = clustering.matrix2groups(
            keywords['threshold'], distances, these_taxa,
            keywords['cluster_method'])
    log.info("Successfully calculated {0}.".format(data))
Example #18
def test_new_config(self):
    l = log.get_logger(config_dir=self.tmp, test=True)
    self.assertTrue(hasattr(l, 'info'))
Example #19
    def test_new_config(self):
        from lingpy.log import get_logger

        log = get_logger(config_dir=self.tmp, test=True)
        self.assertTrue(hasattr(log, 'info'))
Example #20
def test_default_config(self):
    l = log.get_logger(config_dir=self.tmp.as_posix(),
                       force_default_config=True)
    self.assertTrue(hasattr(l, 'info'))
Example #21
    def __init__(self, filename, conf=''):
        """
        Parse data regularly if the data has not been loaded from a pickled version.
        """
        self.log = log.get_logger()

        # try to load the data
        internal_import = False

        # check whether it's a dictionary from which we load
        if isinstance(filename, dict):
            input_data = filename
            if 'filename' not in input_data:
                self.filename = rcParams['filename']
            internal_import = True
            # make check for correct input, there was a bug with a wrong
            # evaluation which is hopefully fixed by now
            tmp_keys = [k for k in input_data if isinstance(k, int)]
            if len(input_data[0]) != len(input_data[tmp_keys[0]]):
                log.warning("{0} {1}".format(
                    input_data[0], input_data[tmp_keys[0]]))
                raise ValueError("[!] Wrong input format!")  # pragma: no cover
        # check whether it's another wordlist-object
        elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
            input_data = {key: list(value)
                          for key, value in filename._data.items()}
            input_data.update(filename._meta.items())
            input_data[0] = [
                a for a, b in sorted(
                    filename.header.items(), key=lambda x: x[1], reverse=False)
            ]
            internal_import = True
            self.filename = rcParams['filename']
        # or whether the data is an actual file
        elif isinstance(filename, string_types) and os.path.isfile(filename):
            input_data = read_qlc(filename)
            self.filename = filename
        # raise an error otherwise
        elif isinstance(filename, string_types):
            raise IOError("Input file '{0}' does not exist.".format(filename))
        else:
            raise TypeError(
                "Unrecognized type for 'filename' argument: {0}".format(
                    type(filename).__name__))

        self._alias, self._class, self._class_string, self._alias2 = read_conf(
            conf)
        for name in input_data[0]:
            if name.lower() not in self._alias:
                self._alias[name.lower()] = name.lower()
                self._class[name.lower()] = str
            if name.upper() not in self._alias:
                self._alias[name.upper()] = name.lower()
                self._class[name.upper()] = str

        # add empty alias for empty strings XXX why was that? I can't remember
        # why this was important XXX
        self._alias[''] = ''

        # the header stores the indices of the data in the original data dictionary
        self.header = dict(
            zip([self._alias[x] for x in input_data[0]],
                range(len(input_data[0]))))

        # now create a specific header which has all aliases
        self._header = {k: v for k, v in self.header.items()}

        # add a sorted header for reference
        self.columns = sorted(self.header, key=lambda x: self.header[x])

        # assign all aliases to the header
        for alias in self._alias:
            try:
                self._header[alias] = self._header[self._alias[alias]]
            except KeyError:
                pass

        # assign the data as attribute to the word list class. Note that we
        # need to check for the type here, but since numpy also offers integer
        # types, we don't check for type(x) == int and instead use the
        # str.isnumeric method, which returns True only for strings that
        # represent integers
        self._data = {
            int(k): v
            for k, v in input_data.items() if k != 0 and str(k).isnumeric()
        }
        # check for same length of all columns
        check_errors = ''
        for k, v in self._data.items():
            if len(v) != len(self.header):
                check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                    k, len(v), len(self.header))
        if check_errors:
            raise ValueError(check_errors + '\n' +
                             ', '.join(sorted(self.header)))

        # iterate over self._data and change the values according to the
        # functions (only needed when reading from file)
        if not internal_import:
            heads = sorted(self._header.items(), key=lambda x: x[1])
            for key in self._data:
                check = []
                for head, i in heads:
                    if i not in check:
                        logstring = 'Problem with row {0} in col {1}, expected' + \
                                    ' «{4}» as datatype but received «{3}» ' + \
                                    ' (ROW: {2}, entry {5}).'
                        try:
                            self._data[key][i] = self._class[head](
                                self._data[key][i])
                            check.append(i)
                        except (KeyError, ValueError):
                            log.warning(
                                logstring.format(
                                    key, i,
                                    '|'.join([str(x) for x in self._data[key]]),
                                    self._data[key][i],
                                    self._class[head], head))

        # create entry attribute of the wordlist
        self.entries = sorted(
            {b.lower() for b in self._alias.values() if b})

        # assign meta-data
        self._meta = {}
        for key in [k for k in input_data if not isinstance(k, int)]:
            self._meta[key] = input_data[key]
Example #22
def _recreate_unpicklables(self):
    """run `eval` on the string representations."""
    self.log = log.get_logger()
    self._class = {key: eval(value) for key, value in self._class_string.items()}
Example #23
def test_default_config(fresh_logger, tmppath):
    default_cfg = log.get_logger(config_dir=tmppath, force_default_config=True)
    assert hasattr(default_cfg, 'info')
Example #24
def test_default_config(self):
    l = log.get_logger(config_dir=self.tmp, force_default_config=True)
    self.assertTrue(hasattr(l, 'info'))