Beispiel #1
0
def reduce_alignment(alignment):
    """
    Strip the user-ignored (bracketed) columns from an alignment.

    Notes
    -----
    The alignment is transposed with ``misc.transpose`` and scanned column by
    column. A column made up of nothing but ``(`` opens an ignored stretch and
    a column made up of nothing but ``)`` closes it; both bracket columns and
    every bracket-free column between them are dropped. A ``(`` that shares
    its column with other symbols merely ends the current ignored stretch,
    while a ``)`` that shares its column with other symbols discards all
    column indices collected so far. Bracket cells that survive in the
    remaining columns are replaced by the gap symbol ``-``.
    """
    columns = misc.transpose(alignment)

    # indices of the columns to drop, and whether we are inside "(" ... ")"
    dropped = []
    inside = False
    for idx, column in enumerate(columns):
        symbols = sorted(set(column))
        lone = len(symbols) == 1

        if '(' in symbols:
            # a pure "(" column starts an ignored stretch, a mixed one ends it
            inside = lone
            if lone:
                dropped.append(idx)
        elif ')' in symbols:
            if lone:
                dropped.append(idx)
                inside = False
            else:
                # improperly coded closing bracket: forget everything so far
                dropped = []
        elif inside:
            dropped.append(idx)

    if dropped:
        kept = [col for idx, col in enumerate(columns) if idx not in dropped]
    else:
        kept = columns

    reduced = misc.transpose(kept)

    # substring test on purpose: it matches "(", ")" and also "" and "()"
    for row in reduced:
        for pos, cell in enumerate(row):
            if cell in '()':
                row[pos] = '-'

    return reduced
Beispiel #2
0
def reduce_alignment(alignment):
    """
    Strip the user-ignored (bracketed) columns from an alignment.

    Notes
    -----
    The alignment is transposed with ``misc.transpose`` and scanned column by
    column. A column made up of nothing but ``(`` opens an ignored stretch and
    a column made up of nothing but ``)`` closes it; both bracket columns and
    every bracket-free column between them are dropped. A ``(`` that shares
    its column with other symbols merely ends the current ignored stretch,
    while a ``)`` that shares its column with other symbols discards all
    column indices collected so far. Bracket cells that survive in the
    remaining columns are replaced by the gap symbol ``-``.
    """
    columns = misc.transpose(alignment)

    # indices of the columns to drop, and whether we are inside "(" ... ")"
    dropped = []
    inside = False
    for idx, column in enumerate(columns):
        symbols = sorted(set(column))
        lone = len(symbols) == 1

        if '(' in symbols:
            # a pure "(" column starts an ignored stretch, a mixed one ends it
            inside = lone
            if lone:
                dropped.append(idx)
        elif ')' in symbols:
            if lone:
                dropped.append(idx)
                inside = False
            else:
                # improperly coded closing bracket: forget everything so far
                dropped = []
        elif inside:
            dropped.append(idx)

    if dropped:
        kept = [col for idx, col in enumerate(columns) if idx not in dropped]
    else:
        kept = columns

    reduced = misc.transpose(kept)

    # substring test on purpose: it matches "(", ")" and also "" and "()"
    for row in reduced:
        for pos, cell in enumerate(row):
            if cell in '()':
                row[pos] = '-'

    return reduced
Beispiel #3
0
    def c_scores(self):
        """
        Calculate the c-scores of the test alignment against the gold one.

        Both ``alm_matrix`` attributes are transposed with ``misc.transpose``;
        every entry of the transposed gold matrix that also occurs in the
        transposed test matrix counts as shared.

        Returns
        -------
        Scores : namedtuple
            ``cp`` is the shared count divided by the number of test entries,
            ``cr`` the shared count divided by the number of gold entries,
            ``c_`` twice the shared count divided by the sum of both sizes,
            and ``cf`` the harmonic mean of ``cp`` and ``cr`` (``0.0`` when
            both are zero).
        """
        gold_cols = misc.transpose(self.gold.alm_matrix)
        test_cols = misc.transpose(self.test.alm_matrix)

        shared = sum(1 for col in gold_cols if col in test_cols)

        cp = shared / len(test_cols)
        cr = shared / len(gold_cols)
        c_ = 2 * shared / (len(test_cols) + len(gold_cols))
        # the harmonic mean is undefined when both components vanish
        cf = 2 * cp * cr / (cp + cr) if cp + cr else 0.0

        Scores = namedtuple('Scores', 'cp cr c_ cf')
        return Scores(cp, cr, c_, cf)
Beispiel #4
0
    def c_scores(self):
        """
        Calculate the c-scores of the test alignment against the gold one.

        Both ``alm_matrix`` attributes are transposed with ``misc.transpose``;
        every entry of the transposed gold matrix that also occurs in the
        transposed test matrix counts as shared.

        Returns
        -------
        Scores : namedtuple
            ``cp`` is the shared count divided by the number of test entries,
            ``cr`` the shared count divided by the number of gold entries,
            ``c_`` twice the shared count divided by the sum of both sizes,
            and ``cf`` the harmonic mean of ``cp`` and ``cr`` (``0.0`` when
            both are zero).
        """
        gold_cols = misc.transpose(self.gold.alm_matrix)
        test_cols = misc.transpose(self.test.alm_matrix)

        shared = sum(1 for col in gold_cols if col in test_cols)

        cp = shared / len(test_cols)
        cr = shared / len(gold_cols)
        c_ = 2 * shared / (len(test_cols) + len(gold_cols))
        # the harmonic mean is undefined when both components vanish
        cf = 2 * cp * cr / (cp + cr) if cp + cr else 0.0

        Scores = namedtuple('Scores', 'cp cr c_ cf')
        return Scores(cp, cr, c_, cf)
Beispiel #5
0
def wl2multistate(wordlist, ref, missing):
    """
    Convert a wordlist into a multistate matrix (compatible with PAUP).

    Parameters
    ----------
    wordlist : object
        Must offer ``get_etymdict``, ``get_dict``, ``concepts`` and ``taxa``.
    ref : object
        Passed as ``ref`` to ``get_etymdict`` and as ``entry`` to ``get_dict``;
        selects the cognate-set data.
    missing : str
        Cell value used for a taxon whose state collection is empty.

    Returns
    -------
    The result of ``misc.transpose`` applied to the per-concept lines, each
    line holding one encoded cell per taxon.
    """
    # get etymological dictionary
    wordlist.get_etymdict(ref=ref)

    # the symbol inventory is limited to ASCII letters and digits
    symbols = ascii_letters + digits

    rows = []
    for concept in wordlist.concepts:
        cogsets = wordlist.get_dict(concept=concept, entry=ref)

        # every state attested for this concept; absent taxa contribute 0
        seen = set().union(
            *[cogsets.get(taxon, [0]) for taxon in wordlist.taxa])

        if len(seen) > len(symbols):  # pragma: no cover
            # FIXME: This shouldn't just be a warning, because we
            # will get a KeyError
            # down below, since zip just returns a list of length
            # len(symbols)!
            log.warning('more distinct states than available characters!')
        encode = dict(zip(sorted(seen), symbols))
        encode['-'] = '-'

        row = []
        for taxon in wordlist.taxa:
            present = set(cogsets.get(taxon, ['-']))
            if not present:
                # taxon is listed but carries no state at all
                cell = missing
            elif len(present) == 1:
                cell = encode[present.pop()]
            else:
                # several states for one taxon are wrapped in brackets
                cell = '({0})'.format(
                    "".join(encode[state] for state in sorted(present)))
            row.append(cell)

        rows.append(row)

    return misc.transpose(rows)
Beispiel #6
0
def wl2multistate(wordlist, ref, missing):
    """
    Convert a wordlist into a multistate matrix (compatible with PAUP).

    Parameters
    ----------
    wordlist : object
        Must offer ``get_etymdict``, ``get_dict``, ``concepts`` and ``taxa``.
    ref : object
        Passed as ``ref`` to ``get_etymdict`` and as ``entry`` to ``get_dict``;
        selects the cognate-set data.
    missing : str
        Cell value used for a taxon whose state collection is empty.

    Returns
    -------
    The result of ``misc.transpose`` applied to the per-concept lines, each
    line holding one encoded cell per taxon.
    """
    # get etymological dictionary
    wordlist.get_etymdict(ref=ref)

    # the symbol inventory is limited to ASCII letters and digits
    symbols = ascii_letters + digits

    rows = []
    for concept in wordlist.concepts:
        cogsets = wordlist.get_dict(concept=concept, entry=ref)

        # every state attested for this concept; absent taxa contribute 0
        seen = set().union(
            *[cogsets.get(taxon, [0]) for taxon in wordlist.taxa])

        if len(seen) > len(symbols):  # pragma: no cover
            # FIXME: This shouldn't just be a warning, because we
            # will get a KeyError
            # down below, since zip just returns a list of length
            # len(symbols)!
            log.warning('more distinct states than available characters!')
        encode = dict(zip(sorted(seen), symbols))
        encode['-'] = '-'

        row = []
        for taxon in wordlist.taxa:
            present = set(cogsets.get(taxon, ['-']))
            if not present:
                # taxon is listed but carries no state at all
                cell = missing
            elif len(present) == 1:
                cell = encode[present.pop()]
            else:
                # several states for one taxon are wrapped in brackets
                cell = '({0})'.format(
                    "".join(encode[state] for state in sorted(present)))
            row.append(cell)

        rows.append(row)

    return misc.transpose(rows)
Beispiel #7
0
def normalize_alignment(alignment):
    """
    Normalize an alignment.

    Three steps are applied to a copy of the input: if every sequence has
    exactly one element, that element is split on single spaces; sequences
    of unequal length are padded at the end with ``-`` up to the longest
    one; finally, columns consisting only of ``-`` are removed.

    Returns the modified copy (after logging the change at debug level) when
    it differs from the input, otherwise the input object itself.
    """
    # work on a copy so the caller's data is never touched
    rows = [list(seq) for seq in alignment]

    # single-element sequences are treated as space-separated strings
    lengths = [len(row) for row in rows]
    if lengths.count(1) == len(lengths):
        rows = [row[0].split(' ') for row in rows]
        lengths = [len(row) for row in rows]

    # pad the shorter sequences with gaps at the end
    if len(set(lengths)) > 1:
        widest = max(lengths)
        rows = [(row + ['-'] * widest)[:widest] for row in rows]

    # drop gap-only columns, right to left so earlier indices stay valid
    gap_only = set('-')
    empty_cols = [
        idx for idx, col in enumerate(misc.transpose(rows))
        if set(col) == gap_only]
    for idx in reversed(empty_cols):
        for row in rows:
            del row[idx]

    if alignment != rows:
        changes = [
            '[!] ' + ' '.join(before) + '->' + ' '.join(after) + '\n'
            for before, after in zip(alignment, rows)]
        log.debug('Modified the alignment:\n' + ''.join(changes))
        return rows
    return alignment
Beispiel #8
0
def normalize_alignment(alignment):
    """
    Normalize an alignment.

    Three steps are applied to a copy of the input: if every sequence has
    exactly one element, that element is split on single spaces; sequences
    of unequal length are padded at the end with ``-`` up to the longest
    one; finally, columns consisting only of ``-`` are removed.

    Returns the modified copy (after reporting the change via ``debug``) when
    it differs from the input, otherwise the input object itself.
    """
    # work on a copy so the caller's data is never touched
    rows = [list(seq) for seq in alignment]

    # single-element sequences are treated as space-separated strings
    lengths = [len(row) for row in rows]
    if lengths.count(1) == len(lengths):
        rows = [row[0].split(' ') for row in rows]
        lengths = [len(row) for row in rows]

    # pad the shorter sequences with gaps at the end
    if len(set(lengths)) > 1:
        widest = max(lengths)
        rows = [(row + ['-'] * widest)[:widest] for row in rows]

    # drop gap-only columns, right to left so earlier indices stay valid
    gap_only = set('-')
    empty_cols = [
        idx for idx, col in enumerate(misc.transpose(rows))
        if set(col) == gap_only]
    for idx in reversed(empty_cols):
        for row in rows:
            del row[idx]

    if alignment != rows:
        changes = [
            '[!] ' + ' '.join(before) + '->' + ' '.join(after) + '\n'
            for before, after in zip(alignment, rows)]
        debug('Modified the alignment:\n' + ''.join(changes))
        return rows
    return alignment