Example #1
def tsv2triple(wordlist, outfile=None):
    """
    Convert a wordlist to a triple data structure.

    Notes
    -----
    Each triple consists of three basic values:
      * ID (the ID in the TSV file)
      * COLUMN (the column in the TSV file)
      * VALUE (the entry in the TSV file)
    """
    tstore = []
    for head in wordlist.header:
        log.debug('tsv2triple: ' + head)
        for key in wordlist:
            tstore.append((key, head.upper(), wordlist[key, head]))

    if outfile:
        out = ''
        for a, b, c in tstore:
            if isinstance(c, list):
                c = ' '.join([str(x) for x in c])
            if c != '-':
                out += '{0}\t{1}\t{2}\n'.format(a, b, c)
        util.write_text_file(outfile, out, normalize='NFC')
    return tstore
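
A minimal usage sketch (hedged: `data.tsv` is a placeholder file name, and `Wordlist` is LingPy's standard wordlist class):

from lingpy import Wordlist

wl = Wordlist('data.tsv')
triples = tsv2triple(wl, outfile='data.triples.tsv')
# every triple has the form (ID, COLUMN, VALUE), e.g.:
# (1, 'DOCULECT', 'German')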
Example #2
def tsv2triple(wordlist, outfile=None):
    """
    Convert a wordlist to a triple data structure.

    Notes
    -----
    Each triple consists of three basic values:
      * ID (the ID in the TSV file)
      * COLUMN (the column in the TSV file)
      * VALUE (the entry in the TSV file)
    """
    tstore = []
    for head in wordlist.header:
        log.debug('tsv2triple: ' + head)
        for key in wordlist:
            tstore.append((key, head.upper(), wordlist[key, head]))

    if outfile:
        out = ''
        for a, b, c in tstore:
            if isinstance(c, list):
                c = ' '.join([text_type(x) for x in c])
            if c != '-':
                out += '{0}\t{1}\t{2}\n'.format(a, b, c)
        util.write_text_file(outfile, out, normalize='NFC')
    return tstore
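
This variant is identical to Example #1 except that list entries are stringified with `text_type` rather than `str`; `text_type` is presumably a Python 2/3 compatibility alias for the Unicode string type (equal to `str` on Python 3, as provided, e.g., by the `six` library).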
Example #3
def test_convenience():
    # the import mirrors Example #4; this variant calls 'warning' instead of 'warn'
    from lingpy.log import info, warning, debug, error, deprecated, missing_module, file_written

    info('m')
    warning('m')
    debug('m')
    error('m')
    deprecated('o', 'n')
    missing_module('m')
    file_written('f')
Example #4
    def test_convenience(self):
        from lingpy.log import info, warn, debug, error, deprecated, missing_module, file_written

        info('m')
        warn('m')
        debug('m')
        error('m')
        deprecated('o', 'n')
        missing_module('m')
        file_written('f')
Example #5
def normalize_alignment(alignment):
    """
    Normalize an alignment.

    Normalization here means that columns consisting only of gaps are
    deleted, and all sequences are stretched to equal length by adding
    gap characters at the end of shorter sequences.
    """
    # clone the alignment
    alm_clone = [[x for x in y] for y in alignment]

    # first check for alms of different length
    alm_lens = [len(alm) for alm in alm_clone]
    if alm_lens.count(1) == len(alm_lens):
        for i, alm in enumerate(alm_clone):
            alm_clone[i] = alm[0].split(' ')
            alm_lens[i] = len(alm_clone[i])

    if len(set(alm_lens)) > 1:
        max_len = max(alm_lens)
        for i, alm in enumerate(alm_clone):
            new_alm = alm + ['-' for x in range(max_len)]
            alm_clone[i] = new_alm[:max_len]

    # then check for alms consisting only of gaps
    cols = misc.transpose(alm_clone)
    idxs = []
    for i, col in enumerate(cols):
        if set(col) == set('-'):
            idxs += [i]
    for idx in idxs[::-1]:
        for i, alm in enumerate(alm_clone):
            del alm_clone[i][idx]
    if alignment != alm_clone:
        lgtxt = 'Modified the alignment:\n'
        for i in range(len(alignment)):
            lgtxt += '[!] ' + ' '.join(alignment[i]) + '->'
            lgtxt += ' '.join(alm_clone[i]) + '\n'
        log.debug(lgtxt)
        return alm_clone
    else:
        return alignment
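
A quick illustration with made-up data (assuming the module context above, which provides `misc` and `log`): the shorter rows are padded, and the gap-only third column is removed.

alm = [['t', 'o', '-', 'x'],
       ['t', 'a', '-'],
       ['d', 'o', '-', 'x', 'i']]
assert normalize_alignment(alm) == [
    ['t', 'o', 'x', '-'],
    ['t', 'a', '-', '-'],
    ['d', 'o', 'x', 'i'],
]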
Example #6
def normalize_alignment(alignment):
    """
    Normalize an alignment.

    Normalization here means that columns consisting only of gaps are
    deleted, and all sequences are stretched to equal length by adding
    gap characters at the end of shorter sequences.
    """
    # clone the alignment
    alm_clone = [[x for x in y] for y in alignment]

    # first check for alms of different length
    alm_lens = [len(alm) for alm in alm_clone]
    if alm_lens.count(1) == len(alm_lens):
        for i, alm in enumerate(alm_clone):
            alm_clone[i] = alm[0].split(' ')
            alm_lens[i] = len(alm_clone[i])

    if len(set(alm_lens)) > 1:
        max_len = max(alm_lens)
        for i, alm in enumerate(alm_clone):
            new_alm = alm + ['-' for x in range(max_len)]
            alm_clone[i] = new_alm[:max_len]

    # then check for alms consisting only of gaps
    cols = misc.transpose(alm_clone)
    idxs = []
    for i, col in enumerate(cols):
        if set(col) == set('-'):
            idxs += [i]
    for idx in idxs[::-1]:
        for i, alm in enumerate(alm_clone):
            del alm_clone[i][idx]
    if alignment != alm_clone:
        lgtxt = 'Modified the alignment:\n'
        for i in range(len(alignment)):
            lgtxt += '[!] ' + ' '.join(alignment[i]) + '->'
            lgtxt += ' '.join(alm_clone[i]) + '\n'
        debug(lgtxt)
        return alm_clone
    else:
        return alignment
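
This variant is identical to Example #5 except that it logs through the module-level `debug` convenience function (cf. the `lingpy.log` imports in Example #4) instead of `log.debug`.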
Example #7
    def _output(self, fileformat, **keywords):
        """
        Internal output routine, kept separate so that daughter classes can modify it easily.
        """
        # check for stamp attribute
        keywords["stamp"] = getattr(self, '_stamp', '')

        # add the default parameters, they will be checked against the keywords
        util.setdefaults(
            keywords,
            cols=False,
            distances=False,
            entries=("concept", "counterpart"),
            entry='concept',
            fileformat=fileformat,
            filename=rcParams['filename'],
            formatter='concept',
            modify_ref=False,
            meta=self._meta,
            missing=0,
            prettify='false',
            ignore='all',
            ref='cogid',
            rows=False,
            subset=False,  # set up a subset of the data
            taxa='taxa',
            threshold=0.6,  # threshold for flat clustering
            tree_calc='neighbor')

        if fileformat in ['triple', 'triples', 'triples.tsv']:
            return tsv2triple(self, keywords['filename'] + '.' + fileformat)

        if fileformat in ['paps.nex', 'paps.csv']:
            paps = self.get_paps(
                ref=keywords['ref'], entry=keywords['entry'], missing=keywords['missing'])
            kw = dict(filename=keywords['filename'] + '.paps')
            if fileformat == 'paps.nex':
                kw['missing'] = keywords['missing']
                return pap2nex(self.cols, paps, **kw)
            return pap2csv(self.cols, paps, **kw)

        # simple printing of taxa
        if fileformat == 'taxa':
            assert hasattr(self, 'taxa')
            return util.write_text_file(keywords['filename'] + '.taxa', self.cols)

        # csv-output
        if fileformat in ['csv', 'qlc', 'tsv']:

            # get the header line
            header = sorted(
                [s for s in set(self._alias.values()) if s in self._header],
                key=lambda x: self._header[x])
            header = [h.upper() for h in header]

            self._meta.setdefault('taxa', self.cols)

            # get the data, in case a subset is chosen
            if not keywords['subset']:
                # write stuff to file
                return wl2qlc(header, self._data, **keywords)

            cols, rows = keywords['cols'], keywords['rows']

            if not isinstance(cols, (list, tuple, bool)):
                raise ValueError("[i] Argument 'cols' should be list or tuple.")
            if not isinstance(rows, (dict, bool)):
                raise ValueError("[i] Argument 'rows' should be a dictionary.")

            # check for chosen header
            if cols:
                # get indices for header
                indices = [self._header[x] for x in cols]
                header = [c.upper() for c in cols]
            else:
                indices = [r for r in range(len(self.header))]

            if rows:
                stmts = []
                for key, value in rows.items():
                    if key == 'ID':
                        stmts += ["key " + value]
                    else:
                        idx = self._header[key]
                        stmts += ["line[{0}] ".format(idx) + value]

            log.debug("calculated what should be excluded")

            # get the data
            out = {}
            for key, line in self._data.items():
                log.debug(key)

                if rows:
                    if eval(" and ".join(stmts)):
                        out[key] = [line[i] for i in indices]
                else:
                    out[key] = [line[i] for i in indices]

            log.debug("passing data to wl2qlc")
            return wl2qlc(header, out, **keywords)

        # output dst-format (phylip)
        if fileformat == 'dst':
            # check for distances as keyword
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self, **keywords)

            out = matrix2dst(self._meta['distances'], self.taxa,
                    stamp=keywords['stamp'], taxlen=keywords.get('taxlen', 0))
            return _write_file(keywords['filename'], out, fileformat)

        # output tre-format (newick)
        if fileformat in ['tre', 'nwk']:
            if 'tree' not in self._meta:
                # check for distances
                if 'distances' not in self._meta:
                    self._meta['distances'] = wl2dst(self)
                # we look up a function to calculate a tree in the cluster module:
                tree = getattr(cluster, keywords['tree_calc'])(
                    self._meta['distances'], self.cols, distances=keywords['distances'])
            else:
                tree = self._meta['tree']

            return _write_file(keywords['filename'], '{0}'.format(tree), fileformat)

        if fileformat in ['cluster', 'groups']:
            if 'distances' not in self._meta:
                self._meta['distances'] = wl2dst(self)  # check for keywords

            if 'groups' not in self._meta:
                self._meta['groups'] = cluster.matrix2groups(
                    keywords['threshold'], self._meta['distances'], self.taxa)
            lines = []
            for taxon, group in sorted(self._meta['groups'].items(), key=lambda x: x[0]):
                lines.append('{0}\t{1}'.format(taxon, group))
            return _write_file(keywords['filename'], lines, fileformat)

        if fileformat in ['starling', 'star.csv']:
            # helper rendering empty (0) entries as '-'
            l = lambda x: '-' if x == 0 else x

            lines = []
            if 'cognates' not in keywords:
                lines.append('ID\tConcept\t' + '\t'.join(self.taxa))
                for i, concept in enumerate(self.concepts):
                    for line in self.get_list(row=concept, entry=keywords['entry']):
                        lines.append(
                            str(i + 1) + '\t' + concept + '\t' + '\t'.join(
                                [l(t) for t in line]))
            else:
                lines.append(
                    'ID\tConcept\t' + '\t'.join(
                        ['{0}\t COG'.format(t) for t in self.taxa]))
                for i, concept in enumerate(self.concepts):
                    cogs = self.get_list(row=concept, entry=keywords['cognates'])
                    for j, line in enumerate(
                            self.get_list(row=concept, entry=keywords['entry'])):
                        part = '\t'.join(
                            '{0}\t{1}'.format(l(a), b) for a, b in zip(line, cogs[j]))
                        lines.append(util.tabjoin(i + 1, concept, part))

            return _write_file(
                keywords['filename'], lines, 'starling_' + keywords['entry'] + '.csv')

        if fileformat == 'multistate.nex':
            if not keywords['filename'].endswith('.multistate.nex'):
                keywords['filename'] += '.multistate.nex'

            matrix = wl2multistate(self, keywords['ref'], keywords['missing'])
            return multistate2nex(self.taxa, matrix, keywords['filename'])

        if fileformat == 'separated':
            if not os.path.isdir(keywords['filename']):
                os.mkdir(keywords['filename'])

            for l in self.cols:
                lines = [''] if 'ignore_keys' in keywords else ['ID\t']
                lines[0] += '\t'.join(x.upper() for x in keywords['entries'])
                for key in self.get_list(col=l, flat=True):
                    line = [] if 'ignore_keys' in keywords else [key]
                    for entry in keywords['entries']:
                        tmp = self[key, entry]
                        if isinstance(tmp, list):
                            tmp = ' '.join([str(x) for x in tmp])
                        line += [tmp]
                    lines.append('\t'.join('{0}'.format(x) for x in line))
                _write_file('{0}/{1}'.format(keywords['filename'], l), lines, 'tsv')
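
In LingPy this private helper is normally reached through the public `output` method of a wordlist. A hedged usage sketch (file names are placeholders; the keywords correspond to the defaults set above):

from lingpy import Wordlist

wl = Wordlist('data.tsv')
wl.output('tsv', filename='export')  # plain QLC/TSV dump
wl.output('dst', filename='export')  # Phylip distance matrix
wl.output('tre', filename='export', tree_calc='neighbor')  # Newick tree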
Example #8
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model folder is stored.

    Notes
    -----
    A model is defined by a folder placed in the :file:`data/models` directory
    of the LingPy package. The name of the folder reflects the name of the
    model. It contains the file :file:`converter`, the file :file:`INFO`, and,
    optionally, the files :file:`matrix` and :file:`scorer`. The format
    requirements for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::
            
            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII letter) representing the
        class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...
    
    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...
        
        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.
    
    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")
    # get the path to the models
    new_path = lambda *cmps: os.path.join(path or util.data_path('models'),
                                          model, *cmps)

    log.debug("Model-Path: %s" % new_path)

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
Example #9
def star2qlc(filename, clean_taxnames=False, debug=False):
    """
    Convert a file exported directly from STARLING to LingPy's QLC format.
    """
    cleant = clean_taxnames or identity
    data = csv2list(filename)

    # strip a potential byte-order mark ('\ufeff') left in the data by editors such as Notepad
    data[0][0] = data[0][0].replace('\ufeff', '')

    # get the header
    header = data[0]

    # debugging
    if debug:
        error = False
        log.info("Header line has length {0}.".format(len(header)))
        for line in data[1:]:
            if len(line) != len(header):  # pragma: no cover
                log.error(
                    "Error for item {0} with length {1}, expected {2}.".format(
                        '/'.join(line[0:2]), len(line), len(header)))
                error = True
        if error:  # pragma: no cover
            log.error("Errors were found, aborting function call.")
            return
        else:
            log.info("Everything went fine, carrying on with function call.")

    # determine language names in header
    taxa = []
    for i in range(len(header) - 1):
        prev = header[i]
        post = header[i + 1]

        if prev in post and '#' in post:
            taxa += [prev]

            if len(taxa) == 1:
                lngIdx = i

        if prev == 'Number':
            numIdx = i

        if prev == 'Word':
            wrdIdx = i

    log.info('starling, indices (%s, %s, %s)' % (lngIdx, numIdx, wrdIdx))
    log.info('starling, taxa: %s' % taxa)

    # start filling in the dictionary
    D = {
        0: [
            'DOCULECT', 'CONCEPT', 'GLOSSID', 'WORDINSOURCE', 'ORTHOGRAPHY',
            'IPA', 'COGID'
        ]
    }

    idx = 1
    cognate_counter = 0
    current_concept = ''
    cognate_sets = []
    for line in data[2:]:
        gloss = line[wrdIdx]
        gnum = line[numIdx]

        # switch to next cognate set if there is a switch in concepts
        if current_concept != gloss and len(cognate_sets) != 0:
            max_cog = max(cognate_sets)
            cognate_counter = max_cog
            cognate_sets = []
            current_concept = gloss
        else:
            log.debug('starling, indices (%s, %s, %s)' %
                      (gloss, current_concept, cognate_counter))

        for i in range(lngIdx, len(header), 2):
            word = line[i]

            if '{' in word:
                ipa = word[:word.index('{')].strip()
                ortho = word[word.index('{') + 1:word.index('}')].strip()
            else:
                ipa = word
                ortho = word

            cogid = int(line[i + 1])

            if cogid != 0 and word:
                if cogid > 0:
                    cogid = cogid + cognate_counter

                # append cognate sets, essential for raising the counter
                cognate_sets += [int(cogid)]

                taxon = cleant(header[i])

                D[idx] = [taxon, gloss, gnum, word, ortho, ipa, cogid]
                idx += 1

    # re-iterate through data and reassign cognate sets with negative ids
    for k in D:
        if k:
            cogid = D[k][-1]
            if cogid < 0:
                cogid = -cognate_counter
                cognate_counter += 1
                D[k][-1] = cogid

    return D
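
A hedged usage sketch (`starling.csv` is a placeholder for a STARLING export): since `D[0]` holds the column names and all other keys hold data rows, the result can be loaded directly into a wordlist.

from lingpy import Wordlist

D = star2qlc('starling.csv', debug=True)
wl = Wordlist(D)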
Example #10
def mcl(
        threshold,
        matrix,
        taxa,
        max_steps=1000,
        inflation=2,
        expansion=2,
        add_self_loops=True,
        revert=False,
        logs=True,
        matrix_type="distances"):
    """
    Carry out a clustering using the MCL algorithm (:evobib:`Dongen2000`).

    Parameters
    ----------
    threshold : {float, bool}
        The threshold that shall be used for the initial selection of links
        assigned to the data. If set to c{False}, the weights from the matrix
        will be used directly.

    matrix : list
        A two-dimensional list containing the distances.

    taxa : list
        A list containing the names of all taxa corresponding to the distances
        in the matrix.

    max_steps : int (default=1000)
        Maximal number of iterations.

    inflation : int (default=2)
        Inflation parameter for the MCL algorithm.

    expansion : int (default=2)
        Expansion parameter of the MCL algorithm.

    add_self_loops : {True, False, builtins.function} (default=True)
        Determine whether self-loops should be added, and if so, how they
        should be weighted. If a function for the calculation of self-loops is
        given, it will take the whole column of the matrix for each taxon as
        input.

    logs : { bool, function } (default=True)
        If set to c{True}, the logarithm of the score beyond the threshold
        will be assigned as the weight of a link in the graph. If set to
        c{False}, the raw scores will be used as weights. Use a custom
        function to define individual ways to calculate the weights.

    matrix_type : { "distances", "similarities" }
        Specify the type of the matrix. If the matrix contains distance data,
        it will be adapted to similarity data. If it contains "similarities",
        no adaptation is needed.

    Examples
    --------

    The function is automatically imported along with LingPy.

    >>> from lingpy import *
    >>> from lingpy.algorithm import squareform

    Create a list of arbitrary taxa.

    >>> taxa = ['German','Swedish','Icelandic','English','Dutch']

    Create an arbitrary distance matrix.

    >>> matrix = squareform([0.5,0.67,0.8,0.2,0.4,0.7,0.6,0.8,0.8,0.3])
    >>> matrix
    [[0.0, 0.5, 0.67, 0.8, 0.2],
     [0.5, 0.0, 0.4, 0.7, 0.6],
     [0.67, 0.4, 0.0, 0.8, 0.8],
     [0.8, 0.7, 0.8, 0.0, 0.3],
     [0.2, 0.6, 0.8, 0.3, 0.0]]

    Carry out the MCL clustering analysis.

    >>> mcl(0.5,matrix,taxa)
    {1: ['German', 'English', 'Dutch'], 2: ['Swedish', 'Icelandic']}

    """
    # check for type of matrix
    if type(matrix) != np.ndarray:
        imatrix = np.array(matrix)
    else:
        imatrix = matrix.copy()

    # check for matrix type and decide how to handle logs
    if matrix_type == 'distances':
        evaluate = lambda x: x < threshold
        if logs == True:
            logs = lambda x: -np.log2((1 - x)**2)
        elif logs == False:
            logs = lambda x: x
    elif matrix_type == 'similarities':
        evaluate = lambda x: x > threshold
        if logs == True:
            logs = lambda x: -np.log(x**2)
        else:
            logs = lambda x: x
    else:
        raise ValueError(matrix_type)

    # check for threshold
    if threshold:
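        # 'combinations' is presumably itertools.combinations, imported at module level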
        for i, j in combinations(range(len(imatrix)), 2):
            score = imatrix[i][j]
            evaluation = logs(score) if evaluate(score) else 0
            imatrix[i][j] = evaluation
            imatrix[j][i] = evaluation

    # check for self_loops
    if add_self_loops == True:
        for i in range(len(imatrix)):
            imatrix[i][i] = 1
    elif add_self_loops == False:
        pass
    else:
        for i in range(len(imatrix)):
            imatrix[i][i] = add_self_loops(imatrix[:, i])

    # normalize the matrix
    imatrix = _normalize_matrix(imatrix)

    # iterate expansion, inflation, and normalization until convergence
    steps = 0
    while True:
        # expansion
        imatrix = np.linalg.matrix_power(imatrix, expansion)

        # inflation
        imatrix = imatrix ** inflation

        # normalization
        imatrix = _normalize_matrix(imatrix)

        # increase steps
        steps += 1

        # check for matrix convergence
        if steps >= max_steps or _is_idempotent(imatrix):
            log.debug("Number of steps {0}.".format(steps))
            break

    # retrieve the clusters
    clusters = _interprete_matrix(imatrix)

    # modify clusters
    if revert:
        return dict(zip(range(len(taxa)), clusters))

    clr = defaultdict(list)
    for i, t in enumerate(taxa):
        clr[clusters[i]].append(t)

    return clr
Example #11
def compile_model(model, path=None):
    """
    Function compiles customized sound-class models.

    Parameters
    ----------

    model : str
        A string indicating the name of the model which shall be created.

    path : str
        A string indicating the path where the model folder is stored.

    Notes
    -----
    A model is defined by a folder placed in the :file:`data/models` directory
    of the LingPy package. The name of the folder reflects the name of the
    model. It contains the file :file:`converter`, the file :file:`INFO`, and,
    optionally, the files :file:`matrix` and :file:`scorer`. The format
    requirements for these files are as follows:

    :file:`INFO`
        The ``INFO``-file serves as a reference for a given sound-class model.
        It can contain arbitrary information (and also be empty). If one wants
        to define specific characteristics, like the ``source``, the
        ``compiler``, the ``date``, or a ``description`` of a given model,
        this can be done by employing a key-value structure in which the key is
        preceded by an ``@`` and followed by a colon and the value is written
        right next to the key in the same line, e.g.::
            
            @source: Dolgopolsky (1986)

        This information will then be read from the ``INFO`` file and rendered
        when printing the model to screen with help of the :py:func:`print`
        function.

    :file:`converter`
        The ``converter`` file contains all sound classes which are matched
        with their respective sound values. Each line is reserved for one
        class, preceded by the key (preferably an ASCII letter) representing the
        class::

            B : ɸ, β, f, p͡f, p͜f, ƀ
            E : ɛ, æ, ɜ, ɐ, ʌ, e, ᴇ, ə, ɘ, ɤ, è, é, ē, ě, ê, ɚ
            D : θ, ð, ŧ, þ, đ
            G : x, ɣ, χ
            ...
    
    :file:`matrix`
        A scoring matrix indicating the alignment scores of all sound-class
        characters defined by the model. The scoring is structured as a simple
        tab-delimited text file. The first cell contains the character names,
        the following cells contain the scores in redundant form (with both
        triangles being filled)::

            B  10.0 -10.0   5.0 ...
            E -10.0   5.0 -10.0 ...
            F   5.0 -10.0  10.0 ...
            ...

    :file:`scorer`
        The ``scorer`` file (which is optional) contains the graph of
        class-transitions which is used for the calculation of the scoring
        dictionary. Each class is listed in a separate line, followed by the
        symbols ``v``, ``c``, or ``t`` (indicating whether the class
        represents vowels, consonants, or tones), and by the classes it is
        directly connected to. The strength of this connection is indicated by
        digits (the smaller the value, the shorter the path between the
        classes)::

            A : v, E:1, O:1
            C : c, S:2
            B : c, W:2
            E : v, A:1, I:1
            D : c, S:2
            ...
        
        The information in such a file is automatically converted into a
        scoring dictionary (see :evobib:`List2012b` for details).

    Based on the information provided by the files, a dictionary for the
    conversion of IPA-characters to sound classes and a scoring dictionary are
    created and stored as a binary.  The model can be loaded with help of the
    :py:class:`~lingpy.data.model.Model` class and used in the various classes
    and functions provided by the library.
    
    See also
    --------
    lingpy.data.model.Model
    compile_dvt

    """
    log.info("Compiling model <" + model + ">...")
    # get the path to the models
    new_path = lambda *cmps: os.path.join(path or util.data_path('models'), model, *cmps)

    log.debug("Model-Path: %s" % new_path)

    # load the sound classes
    sound_classes = _import_sound_classes(new_path('converter'))

    # dump the data
    cache.dump(sound_classes, model + '.converter')
    log.info("... successfully created the converter.")

    # try to load the scoring function or the score tree
    scorer = False

    if os.path.isfile(new_path('matrix')):
        scorer = read_scorer(new_path('matrix'))
    elif os.path.isfile(new_path('scorer')):
        score_tree = _import_score_tree(new_path('scorer'))

        # calculate the scoring dictionary
        score_dict = _make_scoring_dictionary(score_tree)

        # make score_dict a ScoreDict instance
        chars = sorted(set([s[0] for s in score_dict.keys()]))
        matrix = [[0 for i in range(len(chars))] for j in range(len(chars))]
        for (i, charA), (j, charB) in util.multicombinations2(enumerate(chars)):
            if i < j:
                matrix[i][j] = score_dict.get((charA, charB), -100)
                matrix[j][i] = score_dict.get((charB, charA), -100)
            elif i == j:
                matrix[i][j] = score_dict[charA, charB]

        scorer = misc.ScoreDict(chars, matrix)
        util.write_text_file(new_path('matrix'), scorer2str(scorer))

    if scorer:
        cache.dump(scorer, model + '.scorer')
        log.info("... successfully created the scorer.")
    else:
        log.info("... no scoring dictionary defined.")

    log.info("Model <" + model + "> was compiled successfully.")
Example #12
def mcl(threshold,
        matrix,
        taxa,
        max_steps=1000,
        inflation=2,
        expansion=2,
        add_self_loops=True,
        revert=False,
        logs=True,
        matrix_type="distances"):
    """
    Carry out a clustering using the MCL algorithm (:evobib:`Dongen2000`).

    Parameters
    ----------
    threshold : {float, bool}
        The threshold that shall be used for the initial selection of links
        assigned to the data. If set to c{False}, the weights from the matrix
        will be used directly.

    matrix : list
        A two-dimensional list containing the distances.

    taxa : list
        A list containing the names of all taxa corresponding to the distances
        in the matrix.

    max_steps : int (default=1000)
        Maximal number of iterations.

    inflation : int (default=2)
        Inflation parameter for the MCL algorithm.

    expansion : int (default=2)
        Expansion parameter of the MCL algorithm.

    add_self_loops : {True, False, builtins.function} (default=True)
        Determine whether self-loops should be added, and if so, how they
        should be weighted. If a function for the calculation of self-loops is
        given, it will take the whole column of the matrix for each taxon as
        input.

    logs : { bool, function } (default=True)
        If set to c{True}, the logarithm of the score beyond the threshold
        will be assigned as the weight of a link in the graph. If set to
        c{False}, the raw scores will be used as weights. Use a custom
        function to define individual ways to calculate the weights.

    matrix_type : { "distances", "similarities" }
        Specify the type of the matrix. If the matrix contains distance data,
        it will be adapted to similarity data. If it contains "similarities",
        no adaptation is needed.

    Examples
    --------

    The function is automatically imported along with LingPy.

    >>> from lingpy import *
    >>> from lingpy.algorithm import squareform

    Create a list of arbitrary taxa.

    >>> taxa = ['German','Swedish','Icelandic','English','Dutch']

    Create an arbitrary distance matrix.

    >>> matrix = squareform([0.5,0.67,0.8,0.2,0.4,0.7,0.6,0.8,0.8,0.3])
    >>> matrix
    [[0.0, 0.5, 0.67, 0.8, 0.2],
     [0.5, 0.0, 0.4, 0.7, 0.6],
     [0.67, 0.4, 0.0, 0.8, 0.8],
     [0.8, 0.7, 0.8, 0.0, 0.3],
     [0.2, 0.6, 0.8, 0.3, 0.0]]

    Carry out the MCL clustering analysis.

    >>> mcl(0.5,matrix,taxa)
    {1: ['German', 'English', 'Dutch'], 2: ['Swedish', 'Icelandic']}

    """
    # check for type of matrix
    if type(matrix) != np.ndarray:
        imatrix = np.array(matrix)
    else:
        imatrix = matrix.copy()

    # check for matrix type and decide how to handle logs
    if matrix_type == 'distances':
        evaluate = lambda x: x < threshold
        if logs == True:
            logs = lambda x: -np.log2((1 - x)**2)
        elif logs == False:
            logs = lambda x: x
    elif matrix_type == 'similarities':
        evaluate = lambda x: x > threshold
        if logs == True:
            logs = lambda x: -np.log(x**2)
        else:
            logs = lambda x: x
    else:
        raise ValueError(matrix_type)

    # check for threshold
    if threshold:
        for i, j in util.combinations2(range(len(imatrix))):
            score = imatrix[i][j]
            evaluation = logs(score) if evaluate(score) else 0
            imatrix[i][j] = evaluation
            imatrix[j][i] = evaluation

    # check for self_loops
    if add_self_loops == True:
        for i in range(len(imatrix)):
            imatrix[i][i] = 1
    elif add_self_loops == False:
        pass
    else:
        for i in range(len(imatrix)):
            imatrix[i][i] = add_self_loops(imatrix[:, i])

    # normalize the matrix
    imatrix = _normalize_matrix(imatrix)

    # iterate expansion, inflation, and normalization until convergence
    steps = 0
    while True:
        # expansion
        imatrix = np.linalg.matrix_power(imatrix, expansion)

        # inflation
        imatrix = imatrix**inflation

        # normalization
        imatrix = _normalize_matrix(imatrix)

        # increase steps
        steps += 1

        # check for matrix convergence
        if steps >= max_steps or _is_idempotent(imatrix):
            log.debug("Number of steps {0}.".format(steps))
            break

    # retrieve the clusters
    clusters = _interprete_matrix(imatrix)

    # modify clusters
    if revert:
        return dict(zip(range(len(taxa)), clusters))

    clr = defaultdict(list)
    for i, t in enumerate(taxa):
        clr[clusters[i]].append(t)

    return clr
Example #13
def star2qlc(filename, clean_taxnames=False, debug=False):
    """
    Convert a file exported directly from STARLING to LingPy's QLC format.
    """
    cleant = clean_taxnames or identity
    data = csv2list(filename)

    # strip a potential byte-order mark ('\ufeff') left in the data by editors such as Notepad
    data[0][0] = data[0][0].replace('\ufeff', '')

    # get the header
    header = data[0]

    # debugging
    if debug:
        error = False
        log.info("Header line has length {0}.".format(len(header)))
        for line in data[1:]:
            if len(line) != len(header):  # pragma: no cover
                log.error("Error for item {0} with length {1}, expected {2}.".format(
                    '/'.join(line[0:2]), len(line), len(header)))
                error = True
        if error:  # pragma: no cover
            log.error("Errors were found, aborting function call.")
            return
        else:
            log.info("Everything went fine, carrying on with function call.")

    # determine language names in header
    taxa = []
    for i in range(len(header) - 1):
        prev = header[i]
        post = header[i + 1]

        if prev in post and '#' in post:
            taxa += [prev]

            if len(taxa) == 1:
                lngIdx = i

        if prev == 'Number':
            numIdx = i

        if prev == 'Word':
            wrdIdx = i

    log.info('starling, indices (%s, %s, %s)' % (lngIdx, numIdx, wrdIdx))
    log.info('starling, taxa: %s' % taxa)

    # start filling in the dictionary
    D = {0: [
        'DOCULECT', 'CONCEPT', 'GLOSSID', 'WORDINSOURCE', 'ORTHOGRAPHY', 'IPA', 'COGID']}

    idx = 1
    cognate_counter = 0
    current_concept = ''
    cognate_sets = []
    for line in data[2:]:
        gloss = line[wrdIdx]
        gnum = line[numIdx]

        # switch to next cognate set if there is a switch in concepts
        if current_concept != gloss and len(cognate_sets) != 0:
            max_cog = max(cognate_sets)
            cognate_counter = max_cog
            cognate_sets = []
            current_concept = gloss
        else:
            log.debug('starling, indices (%s, %s, %s)' % (
                gloss, current_concept, cognate_counter))

        for i in range(lngIdx, len(header), 2):
            word = line[i]

            if '{' in word:
                ipa = word[:word.index('{')].strip()
                ortho = word[word.index('{') + 1:word.index('}')].strip()
            else:
                ipa = word
                ortho = word

            cogid = int(line[i + 1])

            if cogid != 0 and word:
                if cogid > 0:
                    cogid = cogid + cognate_counter

                # append cognate sets, essential for raising the counter
                cognate_sets += [int(cogid)]

                taxon = cleant(header[i])

                D[idx] = [taxon, gloss, gnum, word, ortho, ipa, cogid]
                idx += 1

    # re-iterate through data and reassign cognate sets with negative ids
    for k in D:
        if k:
            cogid = D[k][-1]
            if cogid < 0:
                cogid = -cognate_counter
                cognate_counter += 1
                D[k][-1] = cogid

    return D