Example no. 1
from collections import defaultdict
import unicodedata

from lingpy import log


def _get_brackets(brackets):
    out = defaultdict(str)
    for b in brackets:
        # look up the closing counterpart via the Unicode name of the opening
        # bracket, e.g. LEFT PARENTHESIS -> RIGHT PARENTHESIS
        out[b] = unicodedata.lookup(unicodedata.name(b).replace('LEFT', 'RIGHT'))
        if b == out[b]:
            log.warn('lingpy.sequence.sound_classes.get_brackets: '
                     'Item «{0}» does not have a counterpart!'.format(b))
    return out
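A short usage sketch of the Unicode trick above (stdlib behavior only; the surrounding lingpy context is not needed for this part):

import unicodedata

unicodedata.name('(')                    # 'LEFT PARENTHESIS'
unicodedata.lookup('RIGHT PARENTHESIS')  # ')'
# so _get_brackets('([') maps '(' to ')' and '[' to ']'; a character whose
# name contains no 'LEFT' (e.g. '"', QUOTATION MARK) maps to itself and
# triggers the warning above.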
Example no. 2
    def test_convenience(self):
        from lingpy.log import info, warn, debug, error, deprecated, missing_module, file_written

        info('m')
        warn('m')
        debug('m')
        error('m')
        deprecated('o', 'n')
        missing_module('m')
        file_written('f')
Example no. 3
def string2html(
    taxon,
    string,
    swaps=[],
    tax_len=None
    ):
    """
    Function converts an (aligned) string into colored html-format.
    
    @deprecated
    """

    # determine the length of the string
    if not tax_len:
        tax_len = len(taxon)

    # template for a table row (defined but not used in this function)
    tr = '<tr class="msa">\n{0}\n</tr>'

    # set the td_taxon-line
    td_taxon = '<td class="taxon" width="' + str(15 * tax_len) + '">{0}</td>\n'

    # percentage scaling factor (defined but not used in this function)
    perc = int(80 / len(string) + 0.5)

    # get vals for residue and swaps
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50"' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start with filling the taxon
    out = ''
    out += td_taxon.format(taxon)

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except KeyError:
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warn("Unknown character '" + char + "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'

        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)

    return out
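A minimal usage sketch; it assumes lingpy is importable and that rcParams['_color'] is populated, as the lookup above requires. The otherwise unused tr template suggests the intended wrapping:

cell = string2html('German', 'hant')
row = '<tr class="msa">\n{0}\n</tr>'.format(cell)  # one row of an MSA table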
Example no. 4
def tokens2html(
    string,
    swaps=[],
    tax_len=None,
):
    """
    Function converts an (aligned) string into colored html-format.

    Notes
    -----
    This function is currently not used by any other program. So it might be
    useful to just deprecate it.

    @deprecated
    """
    # template for a table row (defined but not used in this function)
    tr = '<tr class="msa">\n{0}\n</tr>'

    # percentage scaling factor (defined but not used in this function)
    perc = int(80 / len(string) + 0.5)

    # get vals for residue and swaps
    td_residue = '<td class="residue" width="50" align="center" bgcolor="{1}">' + \
                 '<font color="{2}">{0}</font></td>\n'
    td_swap = '<td class="residue swap" style="border:solid 3px black" width="50"' + \
              'align="center" bgcolor="{1}"><font color="{2}">{0}</font></td>\n'

    # start with filling the taxon
    out = '<table>'

    # go on with the colors
    for i, char in enumerate(string):
        try:
            c = rcParams['_color'][char]
            fg = '#000000'
        except KeyError:
            try:
                c = rcParams['_color'][char[0]]
                fg = '#000000'
            except KeyError:
                log.warn("Unknown character '" + char +
                         "', press ANY key to continue. ")
                c = '#ffffff'
                fg = '#eb3410'

        if i in swaps:
            out += td_swap.format(char, c, fg)
        else:
            out += td_residue.format(char, c, fg)

    return out + '</table>'
Example no. 5
def npoint_ap(scores, cognates, reverse=False):
    """
    Calculate the n-point average precision.
    
    Parameters
    ----------
    scores : list
        The scores of your algorithm for pairwise string comparison. 
    cognates : list
        The cognate codings of the word pairs you compared. 1 indicates that
        the pair is cognate, 0 indicates that it is not cognate.
    reverse : bool (default=False)
        The order of your ranking mechanism. If your algorithm yields high
        scores for words which are probably cognate, and low scores for
        non-cognate words, you should set this keyword to "True".

    Notes
    -----
    This follows the description in :evobib:`Kondrak2002`. The n-point average
    precision is useful to compare the discriminative force of different
    algorithms for string similarity, or to train the parameters of a given
    algorithm.

    Examples
    --------
    
    >>> scores = [1, 2, 3, 4, 5]
    >>> cognates = [1, 1, 1, 0, 0]
    >>> from lingpy.evaluate.acd import npoint_ap
    >>> npoint_ap(scores, cognates)
    1.0

    """
    p = 0.0
    cognate_count = 0
    for k, (score, cognate) in enumerate(
            sorted(zip(scores, cognates), key=lambda x: x[0],
                   reverse=reverse)):
        if cognate == 1:
            cognate_count += 1
            p += cognate_count / (k + 1.0)
    try:
        return p / cognates.count(1)
    except ZeroDivisionError:
        log.warn(
            "Encountered Zero Division in npoint_ap, your data seems to contain no cognates."
        )
        return 0
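To make the metric concrete, here is a hand-worked case with an imperfect ranking; each cognate pair contributes one precision term at its 1-based rank in the ascending ordering:

scores, cognates = [1, 2, 3, 4, 5], [1, 1, 0, 1, 0]
# the cognate pairs sit at ranks 1, 2 and 4, so
# p = 1/1 + 2/2 + 3/4 = 2.75, and npoint_ap returns 2.75 / 3 ≈ 0.917
npoint_ap(scores, cognates)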
Example no. 6
def wl2multistate(wordlist, ref, missing):
    """
    Function converts a wordlist to multistate format (compatible with PAUP).
    """

    # convert the data to a multistate matrix
    # get etymological dictionary
    wordlist.get_etymdict(ref=ref)

    # define chars, we only have a limited set, unfortunately
    chars = ascii_letters + digits

    # iterate over all cognate sets and assign the chars
    matrix = []
    for c in wordlist.concepts:
        taxon_to_cognate_set = wordlist.get_dict(concept=c, entry=ref)

        distinct_states = set()
        for taxon in wordlist.taxa:
            distinct_states.update(taxon_to_cognate_set.get(taxon, [0]))

        # make converter
        if len(distinct_states) > len(chars):  # pragma: no cover
            # FIXME: This shouldn't just be a warning, because we will get a
            # KeyError below, since zip only pairs up len(chars) states!
            log.warn('more distinct states than available characters!')
        char_map = dict(zip(sorted(distinct_states), chars))
        char_map['-'] = '-'

        line = []
        for taxon in wordlist.taxa:
            states = set(taxon_to_cognate_set.get(taxon, ['-']))
            # exclude the case len(taxon_to_cognate_set[taxon]) == 0
            if len(states) == 1:
                line.append(char_map[states.pop()])
            elif not states:
                line.append(missing)
            else:
                line.append('({0})'.format("".join(
                    [char_map[x] for x in sorted(states)])))

        matrix.append(line)

    return misc.transpose(matrix)
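The central step is the per-concept recoding of cognate-set IDs as single characters; a self-contained sketch of just that step (the IDs are made up for illustration):

from string import ascii_letters, digits

chars = ascii_letters + digits         # 62 available state symbols
distinct_states = {0, 17, 342}         # cognate-set IDs for one concept
char_map = dict(zip(sorted(distinct_states), chars))
char_map['-'] = '-'                    # gaps are passed through
# char_map == {0: 'a', 17: 'b', 342: 'c', '-': '-'}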
Example no. 7
def normalize_alignment(alignment):
    """
    Function normalizes an alignment.

    Normalization here means that columns consisting only of gaps will be
    deleted, and all sequences will be stretched to equal length by adding
    additional gap characters at the end of shorter sequences.
    """
    # clone the alignment
    alm_clone = [[x for x in y] for y in alignment]

    # first check for alms of different length
    alm_lens = [len(alm) for alm in alm_clone]
    if alm_lens.count(1) == len(alm_lens):
        for i, alm in enumerate(alm_clone):
            alm_clone[i] = alm[0].split(' ')
            alm_lens[i] = len(alm_clone[i])

    if len(set(alm_lens)) > 1:
        max_len = max(alm_lens)
        for i, alm in enumerate(alm_clone):
            new_alm = alm + ['-' for x in range(max_len)]
            alm_clone[i] = new_alm[:max_len]

    # then check for alms consisting only of gaps
    cols = misc.transpose(alm_clone)
    idxs = []
    for i, col in enumerate(cols):
        if set(col) == set('-'):
            idxs += [i]
    for idx in idxs[::-1]:
        for i, alm in enumerate(alm_clone):
            del alm_clone[i][idx]
    if alignment != alm_clone:
        lgtxt = 'Modified the alignment:\n'
        for i in range(len(alignment)):
            lgtxt += '[!] ' + ' '.join(alignment[i]) + ' -> '
            lgtxt += ' '.join(alm_clone[i]) + '\n'
        warn(lgtxt)
        return alm_clone
    else:
        return alignment
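A hand-traced example of the two steps above (stretching, then deletion of all-gap columns); note that the function logs a warning whenever it modifies its input:

alignment = [['t', 'a', '-'],
             ['t', '-', '-'],
             ['t', 'a']]
# the third row is stretched to ['t', 'a', '-']; the last column then
# consists only of gaps and is removed:
normalize_alignment(alignment)
# -> [['t', 'a'], ['t', '-'], ['t', 'a']]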
Example no. 8
def context_profile(wordlist,
                    ref='ipa',
                    col="doculect",
                    semi_diacritics='hsʃ̢ɕʂʐʑʒw',
                    merge_vowels=False,
                    brackets=None,
                    splitters='/,;~',
                    merge_geminates=True,
                    clts=False,
                    bad_word="<???>",
                    bad_sound="<?>",
                    unknown_sound="!{0}",
                    examples=2,
                    max_entries=100):
    """
    Create an advanced Orthography Profile with context and doculect information.

    Parameters
    ----------
    wordlist : ~lingpy.basic.wordlist.Wordlist
        A wordlist from which you want to derive an initial
        orthography profile.
    ref : str (default="ipa")
        The name of the reference column in which the words are stored.
    col : str (default="doculect")
        Indicate in which column the information on the language variety is
        stored.
    semi_diacritics : str
        Indicate characters which can occur both as "diacritics" (second part
        in a sound) or alone.
    merge_vowels : bool (default=False)
        Indicate whether consecutive vowels should be merged.
    brackets : str
        A string of opening and closing bracket characters. Words in which
        brackets or splitters remain after cleaning are retained as a whole
        for manual inspection. Defaults to a pre-defined set of frequently
        occurring brackets.
    splitters : str
        The characters which force the automatic splitting of an entry.
    merge_geminates : bool (default=True)
        Indicate whether geminates should be merged into single segments.
    clts : dict (default=False)
        A dictionary(-like) object that converts a given source sound into a
        potential target sound, using the get()-method of the dictionary.
        Normally, we think of a CLTS instance here (that is: a cross-linguistic
        transcription system as defined in the pyclts package).
    bad_word : str (default="<???>")
        Indicate how words that could not be parsed should be handled. Note
        that both "bad_word" and "bad_sound" are format-strings, so you can add
        formatting information here.
    bad_sound : str (default="<?>")
        Indicate how sounds that could not be converted to a sound class
        should be handled. Note that both "bad_word" and "bad_sound" are
        format-strings, so you can add formatting information here.
    unknown_sound : str (default="!{0}")
        If clts is passed, use this format-string to indicate that sounds are
        classified as "unknown sound" in the CLTS framework.
    examples : int (default=2)
        Indicate the number of examples that should be printed out.
    max_entries : int (default=100)
        The maximal number of entries per segment that is considered when
        collecting example words and languages.

    Returns
    -------
    profile : generator
        A generator of tuples with six items: the segment (with context
        markers), its conversion (IPA value, sound class, or NULL), example
        words, the languages in which it occurs, its frequency, and its
        unicode-codepoints.
    """
    clts_ = clts or {}
    nulls = set()
    bad_words = set()
    brackets = brackets or "([{『(₍⁽«)]})』⁾₎"
    profile = defaultdict(list)
    errors = set()
    for idx, word, language in pb(wordlist.iter_rows(ref, col),
                                  desc='iter words',
                                  total=len(wordlist)):
        log.info('processing {0}-{1}'.format(idx, word))
        if isinstance(word, list):
            word = ' '.join(word)
        if word.strip():
            try:
                cleaned_string = clean_string(
                    word,
                    semi_diacritics=semi_diacritics,
                    merge_vowels=merge_vowels,
                    brackets=None,
                    ignore_brackets=False,
                    split_entries=False,
                    preparse=None,
                    rules=None,
                    merge_geminates=merge_geminates)[0].split(' ')

                # retain whole word if there are splitters in the word
                if [x for x in cleaned_string if x in brackets + splitters]:
                    profile[word] += [(language, word)]
                    bad_words.add(word)
                else:
                    context_pre = ['^'] + (len(cleaned_string) - 1) * ['']
                    context_post = (len(cleaned_string) - 1) * [''] + ['$']
                    for ctxA, ctxB, segment in zip(context_pre, context_post,
                                                   cleaned_string):
                        profile[ctxA + segment + ctxB] += [(language, word)]
                    for segment in [
                            x for x in word
                            if x not in ' '.join(cleaned_string)
                    ]:
                        profile[segment] += [(language, word)]
                        nulls.add(segment)
            except Exception:
                errors.add(idx)
                log.warn('problem parsing {0}'.format(word))

    for s in '^$':
        yield s, 'NULL', '', '', '', ''

    for idx, (s, entries) in pb(enumerate(
            sorted(profile.items(), key=lambda x: len(x[1]), reverse=True)),
                                desc='yielding entries',
                                total=len(profile)):
        sclass = token2class(s.strip('^$'), 'dolgo')
        words = [l[1] for l in entries][:max_entries]
        langs = [l[0] for l in entries][:max_entries]
        languages = ', '.join(
            sorted(set(langs), key=lambda x: langs.count(x), reverse=True))
        frequency = str(len(langs))
        codepoints = codepoint(s)
        examples_ = ', '.join(
            sorted(set(words), key=lambda x: words.count(x),
                   reverse=True)[:examples])
        if s in bad_words:
            ipa = bad_word.format(s)
        elif sclass == '0':
            ipa = bad_sound.format(s)
        elif s in nulls:
            ipa = 'NULL'
        elif clts_:
            sound = clts_.get(s.strip('^$'), False)
            if not sound:
                ipa = '!' + s.strip('^$')
            else:
                ipa = text_type(sound)
        else:
            ipa = s.strip('^$')

        yield s, ipa, examples_, languages, frequency, codepoints
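A minimal usage sketch; the input file name is hypothetical and the import path is an assumption, not confirmed by the excerpt above:

from lingpy import Wordlist
from lingpy.sequence.profile import context_profile  # assumed import path

wl = Wordlist('wordlist.tsv')  # hypothetical file with "ipa" and "doculect"
for segment, ipa, examples, languages, freq, codepoints in context_profile(wl):
    print(segment, ipa, freq, sep='\t')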
Example no. 9
def plot_heatmap(
    wordlist,
    filename="heatmap",
    fileformat="pdf",
    ref='cogid',
    normalized=False,
    refB='',
    **keywords
):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    refB : str (default='')
        The name of a second cognate-identifier column. If it is given, the
        lower triangle of the heatmap is filled with the scores computed from
        this column.
    normalized : {bool, str} (default=False)
        If set to False, don't normalize the data. Otherwise, select the
        normalization method, choose between:

        * "jaccard" for the Jaccard-distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for traditional lexicostatistical calculation of shared
          cognate percentages.

    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=5)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=45)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,10))
        Specify the size of the figure.
    tree : str (default='')
        A tree passed for the taxa in Newick-format. If no tree is specified,
        the method looks for a tree object in the Wordlist.

    Notes
    -----
    This function plots shared cognate percentages.

    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90
    )
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            raise ValueError("[i] No tree could be found")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized:
        if normalized not in ["jaccard", "swadesh"]:
            raise ValueError(
                "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes(
            [
                keywords['left'],
                keywords['bottom'],
                0.25 * keywords['width'],
                keywords['height']
            ]
        )
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=[t for t in taxa],
            orientation='left',
        )
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        ax1.spines['bottom'].set_color('#ffffff')
        ax1.spines['top'].set_color('#ffffff')
        ax1.spines['left'].set_color('#ffffff')
        ax1.spines['right'].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']

    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if keywords['matrix']:
        matrix = keywords['matrix']
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(
                            taxa=taxonA,
                            flat=True,
                            entry=ref
                        )
                        cogsB = wordlist.get_list(
                            taxa=taxonB,
                            flat=True,
                            entry=ref
                        )

                        cogsA, cogsB = set(cogsA), set(cogsB)

                        shared = len(cogsA.intersection(cogsB))

                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(
                            taxa=taxonA,
                            entry=ref
                        )
                        cogsB = wordlist.get_dict(
                            taxa=taxonB,
                            entry=ref
                        )

                        shared = 0
                        slots = 0

                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:

                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warn(str(
                                [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
                            shared = 0.0

                    matrix[i][j] = shared

                    # mirror the value unless a second reference column is given
                    if not refB:
                        matrix[j][i] = shared

                elif i > j and refB:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(
                            taxa=taxonA,
                            flat=True,
                            entry=refB
                        )
                        cogsB = wordlist.get_list(
                            taxa=taxonB,
                            flat=True,
                            entry=refB
                        )

                        cogsA, cogsB = set(cogsA), set(cogsB)

                        shared = len(cogsA.intersection(cogsB))

                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(
                            taxa=taxonA,
                            entry=refB
                        )
                        cogsB = wordlist.get_dict(
                            taxa=taxonB,
                            entry=refB
                        )

                        shared = 0
                        slots = 0

                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:

                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warn(str(
                                [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
                            shared = 0.0

                    matrix[i][j] = shared

                elif i == j:
                    cogs = wordlist.get_list(
                        taxa=taxonA,
                        flat=True,
                        entry=ref
                    )
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        matrix[i][j] = len(set(cogs))
    ax2 = fig.add_axes(
        [
            left,  # keywords['left']+0.25 * keywords['width']+0.05,
            keywords['bottom'],
            keywords['width'],
            keywords['height']
        ]
    )

    im = ax2.matshow(matrix, aspect='auto', origin='lower',
                     interpolation='nearest', cmap=keywords['cmap'],
                     vmax=keywords['vmax'], vmin=keywords['vmin']
                     )

    # set the xticks
    steps = int(len(taxa) / keywords['steps'] + 0.5)
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default"
    )
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:
        plt.imshow(matrix, cmap=keywords['cmap'], visible=False, vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"], size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )
    plt.savefig(filename + '.' + fileformat)

    with open(filename + '.matrix', 'w') as f:
        for i, t in enumerate(taxa):
            f.write('{0:20}'.format(t))
            for c in matrix[i]:
                if not normalized:
                    f.write('\t{0:3}'.format(int(c)))
                else:
                    f.write('\t{0:.2f}'.format(c))
            f.write('\n')
    log.file_written(filename + '.' + fileformat)
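A minimal usage sketch, assuming matplotlib is installed and the wordlist stores cognate IDs in a "cogid" column; the import path is an assumption:

from lingpy import Wordlist
from lingpy.convert.plot import plot_heatmap  # assumed import path

wl = Wordlist('cognates.tsv')  # hypothetical input file
plot_heatmap(wl, filename='shared', fileformat='png',
             normalized='jaccard', steps=10)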
Example no. 10
    def __init__(self, filename, conf=''):
        """
        Parse data regularly if the data has not been loaded from a pickled version.
        """
        self.log = log.get_logger()

        # try to load the data
        internal_import = False

        # check whether it's a dictionary from which we load
        if isinstance(filename, dict):
            input_data = filename
            if 'filename' not in input_data:
                self.filename = rcParams['filename']
            internal_import = True

            # make check for correct input, there was a bug with a wrong
            # evaluation which is hopefully fixed by now
            tmp_keys = [k for k in input_data if isinstance(k, int)]
            if len(input_data[0]) != len(input_data[tmp_keys[0]]):
                raise ValueError("[!] Wrong input format!")  # pragma: no cover
        # check whether it's another wordlist-object
        elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
            input_data = dict(filename._data.items())
            input_data.update(filename._meta.items())
            input_data[0] = [a for a, b in sorted(
                filename.header.items(),
                key=lambda x:x[1],
                reverse=False)]
            internal_import = True
            self.filename = rcParams['filename']
        # or whether the data is an actual file
        elif isinstance(filename, string_types) and os.path.isfile(filename):
            input_data = read_qlc(filename)
            self.filename = filename
        # raise an error otherwise
        elif isinstance(filename, string_types):
            raise IOError("Input file '{0}' does not exist.".format(filename))
        else:
            raise TypeError("Unrecognized type for 'filename' argument: {0}".format(
                type(filename).__name__))

        # load the configuration file
        if not conf:
            conf = util.data_path('conf', 'qlc.rc')

        # read the file defined by its path in conf
        tmp = [line.split('\t') for line in util.read_config_file(conf)]

        # define two attributes, _alias, and _class which store the aliases and
        # the datatypes (classes) of the given entries
        self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}
        for name, cls, alias in tmp:
            # make sure the name itself is there
            self._alias[name.lower()] = name
            self._alias[name.upper()] = name

            self._class[name.lower()] = eval(cls)
            self._class[name.upper()] = eval(cls)

            self._class_string[name.lower()] = cls
            self._class_string[name.upper()] = cls

            # add the aliases
            for a in alias.split(','):
                self._alias[a.lower()] = name
                self._alias[a.upper()] = name
                self._class[a.lower()] = eval(cls)
                self._class[a.upper()] = eval(cls)
                self._class_string[a.lower()] = cls
                self._class_string[a.upper()] = cls

            self._alias2[name] = sorted(set(alias.split(','))) + [name]

        # append the names in data[0] to self.conf to make sure that all data
        # is covered, even the types which are not specifically defined in the
        # conf file. the datatype defaults here to "str"
        for name in input_data[0]:
            if name.lower() not in self._alias:
                self._alias[name.lower()] = name.lower()
                self._class[name.lower()] = str
            if name.upper() not in self._alias:
                self._alias[name.upper()] = name.lower()
                self._class[name.upper()] = str

        # add empty alias for empty strings XXX why was that? I can't remember
        # why this was important XXX
        self._alias[''] = ''

        # the header stores the indices of the data in the original data
        # dictionary
        self.header = dict(
            zip([self._alias[x] for x in input_data[0]], range(len(input_data[0]))))

        # now create a specific header which has all aliases
        self._header = dict([(k, v) for k, v in self.header.items()])

        # assign all aliases to the header
        for alias in self._alias:
            try:
                idx = self._header[self._alias[alias]]
                self._header[alias] = idx
            except KeyError:
                pass

        # assign the data as attribute to the word list class. Note that we
        # need to check for the type here, but since numpy also offers integer
        # types, we don't check for type(x) == int, but instead use the
        # str.numeric-function that returns numeric values only if it is an
        # integer
        self._data = {
            int(k): v for k, v in input_data.items() if k != 0 and str(k).isnumeric()}

        # iterate over self._data and change the values according to the
        # functions (only needed when reading from file)
        if not internal_import:
            heads = sorted(self._header.items(), key=lambda x: x[1])
            for key in self._data:
                check = []
                for head, i in heads:
                    if i not in check:
                        try:
                            self._data[key][i] = self._class[head](self._data[key][i])
                            check.append(i)
                        except (KeyError, ValueError):  # pragma: no cover
                            log.warn(
                                'Problem with row {0} in col {1}, expected'
                                ' «{4}» as datatype but received «{3}» '
                                ' (ROW: {2}, entry {5}).'.format(
                                    key,
                                    i,
                                    '|'.join([str(x) for x in self._data[key]]),
                                    self._data[key][i],
                                    self._class[head],
                                    head))

        # create entry attribute of the wordlist
        self.entries = sorted(set([b.lower() for a, b in self._alias.items() if b]))

        # assign meta-data
        self._meta = {}
        for key in [k for k in input_data if type(k) != int]:
            self._meta[key] = input_data[key]
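For orientation, each line of the configuration file parsed above carries three tab-separated fields: a column name, the name of a Python type that is eval()-ed, and a comma-separated list of aliases. A hypothetical line illustrating the format (not the shipped qlc.rc):

# name <TAB> class <TAB> aliases
#
#   doculect    str    language,taxon
#
# "doculect" becomes the canonical name, its values are cast with str, and
# both "language" and "taxon" are registered as aliases for it.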
Example no. 11
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        warn(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {
                    k: v[1:-1]
                    for k, v in [key.split('=') for key in tmp.split(' ')[1:]]
                }
            else:
                dtype = tmp.strip()
                keys = {}

            tmp = []

            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]

            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except KeyError:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append(
                            [x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                if 'id' not in keys:
                    keys['id'] = 'basic'
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(data[1:][j]) + '\nOriginal error message: ' +
                        str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
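To make the parsing rules concrete, here is a hypothetical minimal input and the dictionary it would produce under the logic above:

# A hypothetical input file test.qlc (columns separated by tabs):
#
#   @author: someone
#   ID   DOCULECT   CONCEPT   IPA
#   1    German     hand      hant
#   2    English    hand      hænd
#
# read_qlc('test.qlc') returns the lowercased header under key 0 (the local
# ID column is consumed as the dictionary key), the rows, and the metadata:
#
#   {0: ['doculect', 'concept', 'ipa'],
#    1: ['German', 'hand', 'hant'],
#    2: ['English', 'hand', 'hænd'],
#    'author': 'someone'}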
Example no. 12
    def __init__(self, filename, conf=''):
        """
        Parse data regularly if the data has not been loaded from a pickled version.
        """
        self.log = log.get_logger()

        # try to load the data
        internal_import = False

        # check whether it's a dictionary from which we load
        if isinstance(filename, dict):
            input_data = filename
            if 'filename' not in input_data:
                self.filename = rcParams['filename']
            internal_import = True

            # make check for correct input, there was a bug with a wrong
            # evaluation which is hopefully fixed by now
            tmp_keys = [k for k in input_data if isinstance(k, int)]
            if len(input_data[0]) != len(input_data[tmp_keys[0]]):
                raise ValueError("[!] Wrong input format!")  # pragma: no cover
        # check whether it's another wordlist-object
        elif hasattr(filename, '_data') and hasattr(filename, '_meta'):
            input_data = dict([(key, [v for v in value]) for key, value in \
                    filename._data.items()])
            input_data.update(filename._meta.items())
            input_data[0] = [
                a for a, b in sorted(
                    filename.header.items(), key=lambda x: x[1], reverse=False)
            ]
            internal_import = True
            self.filename = rcParams['filename']
        # or whether the data is an actual file
        elif isinstance(filename, string_types) and os.path.isfile(filename):
            input_data = read_qlc(filename)
            self.filename = filename
        # raise an error otherwise
        elif isinstance(filename, string_types):
            raise IOError("Input file '{0}' does not exist.".format(filename))
        else:
            raise TypeError(
                "Unrecognized type for 'filename' argument: {0}".format(
                    type(filename).__name__))

        # load the configuration file
        if not conf:
            conf = util.data_path('conf', 'qlc.rc')

        # read the file defined by its path in conf
        tmp = [line.split('\t') for line in util.read_config_file(conf)]

        # define two attributes, _alias, and _class which store the aliases and
        # the datatypes (classes) of the given entries
        self._alias, self._class, self._class_string, self._alias2 = {}, {}, {}, {}
        for name, cls, alias in tmp:
            # make sure the name itself is there
            self._alias[name.lower()] = self._alias[name.upper()] = name
            self._class[name.lower()] = self._class[name.upper()] = eval(cls)
            self._class_string[name.lower()] = self._class_string[
                name.upper()] = cls

            # add the aliases
            for a in alias.split(','):
                self._alias[a.lower()] = self._alias[a.upper()] = name
                self._class[a.lower()] = self._class[a.upper()] = eval(cls)
                self._class_string[a.lower()] = self._class_string[
                    a.upper()] = cls

            self._alias2[name] = sorted(set(alias.split(','))) + [name]

        # append the names in data[0] to self.conf to make sure that all data
        # is covered, even the types which are not specifically defined in the
        # conf file. the datatype defaults here to "str"
        for name in input_data[0]:
            if name.lower() not in self._alias:
                self._alias[name.lower()] = name.lower()
                self._class[name.lower()] = str
            if name.upper() not in self._alias:
                self._alias[name.upper()] = name.lower()
                self._class[name.upper()] = str

        # add empty alias for empty strings XXX why was that? I can't remember
        # why this was important XXX
        self._alias[''] = ''

        # the header stores the indices of the data in the original data dictionary
        self.header = dict(
            zip([self._alias[x] for x in input_data[0]],
                range(len(input_data[0]))))

        # now create a specific header which has all aliases
        self._header = {k: v for k, v in self.header.items()}

        # add a sorted header for reference
        self.columns = sorted(self.header, key=lambda x: self.header[x])

        # assign all aliases to the header
        for alias in self._alias:
            try:
                self._header[alias] = self._header[self._alias[alias]]
            except KeyError:
                pass

        # assign the data as attribute to the word list class. Note that we
        # need to check for the type here, but since numpy also offers integer
        # types, we don't check for type(x) == int, but instead use the
        # str.numeric-function that returns numeric values only if it is an
        # integer
        self._data = {
            int(k): v
            for k, v in input_data.items() if k != 0 and str(k).isnumeric()
        }
        # check for same length of all columns
        check_errors = ''
        for k, v in self._data.items():
            if len(v) != len(self.header):
                check_errors += 'Row {0} in your data contains {1} fields (expected {2})\n'.format(
                    k, len(v), len(self.header))
        if check_errors:
            raise ValueError(check_errors + '\n' +
                             ', '.join(sorted(self.header)))

        # iterate over self._data and change the values according to the
        # functions (only needed when reading from file)
        if not internal_import:
            heads = sorted(self._header.items(), key=lambda x: x[1])
            for key in self._data:
                check = []
                for head, i in heads:
                    if i not in check:
                        logstring = 'Problem with row {0} in col {1}, expected' + \
                                    ' «{4}» as datatype but received «{3}» ' + \
                                    ' (ROW: {2}, entry {5}).'
                        try:
                            self._data[key][i] = self._class[head](
                                self._data[key][i])
                            check.append(i)
                        except (KeyError, ValueError):
                            log.warn(
                                logstring.format(
                                    key, i,
                                    '|'.join([str(x) for x in self._data[key]]),
                                    self._data[key][i],
                                    self._class[head], head))

        # create entry attribute of the wordlist
        self.entries = sorted(
            set([b.lower() for a, b in self._alias.items() if b]))

        # assign meta-data
        self._meta = {}
        for key in [k for k in input_data if type(k) != int]:
            self._meta[key] = input_data[key]
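A sketch of the dictionary input this parser accepts; the column names are illustrative, and the class name is an assumption (in lingpy, Wordlist builds on this parser):

data = {
    0: ['doculect', 'concept', 'ipa'],   # key 0 holds the header
    1: ['German', 'hand', 'hant'],       # positive integer keys hold rows
    2: ['English', 'hand', 'hænd'],      # of equal length
}
wl = Wordlist(data)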
Esempio n. 19
0
def read_qlc(infile, comment='#'):
    """
    Simple function that loads qlc-format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys corresponding to the order of the lines
        of the input file. The header is given 0 as a specific key.
    """
    lines = read_text_file(infile, lines=True, normalize="NFC")
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.pop(0)
        if line.startswith(comment) or not line:
            continue

        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        warn(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key]] + [value]
        # line starts with complex stuff
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # check for specific keywords
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {k: v[1:-1]
                        for k, v in [key.split('=') for key in tmp.split(' ')[1:]]}
            else:
                dtype = tmp.strip()
                keys = {}

            tmp = []

            while True:
                line = lines.pop(0)
                if line.startswith('</' + dtype + '>'):
                    break
                tmp += [line]

            tmp = '\n'.join(tmp)

            # check for data stuff
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                elif keys:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                # split tmp into lines
                tmp = tmp.split('\n')
                for l in tmp:
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        l = l.split('\t')
                        a = l[0]
                        b = [transf(b) for b in l[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                try:
                    tmp_msa['dataset'] = meta['dataset']
                except KeyError:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = []
                for l in tmp:
                    if not l.startswith(comment):
                        msad.append([x.strip().rstrip('.') for x in l.split('\t')])
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                taxa, matrix = read_dst(tmp)
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, line in enumerate(matrix):
                    for j, cell in enumerate(line):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                if 'id' not in keys:
                    keys['id'] = 'basic'
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # if the first column of the header is a local ID (or simply "ID"), use
    # that column as the row key, otherwise create sequential keys
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    try:
        i = 1
        for j, line in enumerate(data[1:]):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(data[1:][j]) + '\nOriginal error message: ' + str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    for m in meta:
        d[m] = meta[m]

    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
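A minimal usage sketch for the reader above, assuming it is exposed as
read_qlc (the file name 'wordlist.qlc' is invented for illustration):

    # hypothetical call to the QLC reader shown above
    d = read_qlc('wordlist.qlc')

    header = d[0]          # lower-cased column names
    first_row = d[1]       # data rows are keyed by integer IDs starting at 1
    tree = d.get('tree')   # set from an @tree line or a <tre>/<nwk> block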
Example n. 20
0
def plot_heatmap(
    wordlist,
    filename="heatmap",
    fileformat="pdf",
    ref='cogid',
    normalized=False,
    refB='',
    **keywords
):
    """
    Create a heatmap-representation of shared cognates for a given wordlist.

    Parameters
    ----------
    wordlist : lingpy.basic.wordlist.Wordlist
        A Wordlist object containing cognate IDs.
    filename : str (default="heatmap")
        Name of the file to which the heatmap will be written.
    fileformat : str (default="pdf")
        A regular matplotlib-fileformat (pdf, png, pgf, svg).
    ref : str (default="cogid")
        The name of the column that contains the cognate identifiers.
    normalized : {bool, str} (default=False)
        If set to False, don't normalize the data. Otherwise, select the
        normalization method, choosing between:
        
        * "jaccard" for the Jaccard distance (see :evobib:`Bategelj1995` for
          details), and
        * "swadesh" for the traditional lexicostatistical calculation of
          shared cognate percentages.

    cmap : matplotlib.cm (default=matplotlib.cm.jet)
        The color scheme to be used for the heatmap.
    steps : int (default=5)
        The number of steps in which names of taxa will be written to the axes.
    xrotation : int (default=45)
        The rotation of the taxon-names on the x-axis.
    colorbar : bool (default=True)
        Specify, whether a colorbar should be added to the plot.
    figsize : tuple (default=(10,10))
        Specify the size of the figure.
    tree : str (default='')
        A tree for the taxa in Newick format. If no tree is specified, the
        method looks for a tree attached to the Wordlist object.

    Notes
    -----
    This function plots the number (or, with normalization, the proportion)
    of shared cognates for all pairs of taxa as a heatmap.

    """
    defaults = dict(
        bottom=0.01,  # rcParams['phybo_ylimb']
        cmap=mpl.cm.jet,
        colorbar=True,
        colorbar_label="Shared Cognates",
        colorbar_shrink=0.75,
        colorbar_textsize=10,
        figsize=(10, 5),
        height=0.8,
        labels={},  # taxon labels passed for the taxa,
        left=0.01,  # rcParams['phybo_xlimr'],
        matrix=False,
        normalization="jaccard",
        right=0.95,  # rcParams['phybo_xliml'],
        scale=0.075,
        show_tree=True,
        steps=20,
        textsize=5,
        top=0.95,  # rcParams['phybo_ylimt'],
        tree='',
        tree_bottom=0.1,
        tree_left=0.1,
        tree_width=0.2,
        vmax=1.0,
        vmin=0.0,
        width=0.8,
        xrotation=90,
        distances=False
    )
    for k in defaults:
        if k not in keywords:
            keywords[k] = defaults[k]

    # access the reference tree of the wordlist and create a function that
    # orders the taxa accordingly
    if not keywords['tree']:
        try:
            tree = wordlist.tree
        except Exception:
            raise ValueError("No tree could be found.")
    else:
        tree = keywords["tree"]

    # check for normalization
    if normalized:
        if normalized not in ["jaccard", "swadesh"]:
            raise ValueError(
                "Keyword 'normalized' must be one of 'jaccard','swadesh',False.")

    # create an empty matrix
    if not normalized:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=int)
    else:
        matrix = np.zeros((wordlist.width, wordlist.width), dtype=float)

    # create the figure
    fig = plt.figure(figsize=keywords['figsize'])

    # plot the reference tree
    if keywords['show_tree']:
        tree_matrix, taxa = nwk2tree_matrix(tree)
        ax1 = fig.add_axes(
            [
                keywords['left'],
                keywords['bottom'],
                0.25 * keywords['width'],
                keywords['height']
            ]
        )
        d = sch.dendrogram(
            np.array(tree_matrix),
            labels=list(taxa),
            orientation='left',
        )
        taxa = d['ivl'][::-1]
        ax1.set_xticks([])
        ax1.set_yticks([])
        ax1.spines['bottom'].set_color('#ffffff')
        ax1.spines['top'].set_color('#ffffff')
        ax1.spines['left'].set_color('#ffffff')
        ax1.spines['right'].set_color('#ffffff')
        left = keywords['left'] + keywords['scale'] * keywords['width']

    else:
        left = keywords['left']
        taxa = tree.taxa

    # start iterating over taxa in order of the reference tree and fill in the
    # matrix with numbers of shared cognates
    if keywords['matrix']:
        matrix = keywords['matrix']
    else:
        for i, taxonA in enumerate(taxa):
            for j, taxonB in enumerate(taxa):
                if i < j:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(
                            taxa=taxonA,
                            flat=True,
                            entry=ref
                        )
                        cogsB = wordlist.get_list(
                            taxa=taxonB,
                            flat=True,
                            entry=ref
                        )

                        cogsA, cogsB = set(cogsA), set(cogsB)

                        shared = len(cogsA.intersection(cogsB))

                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(
                            taxa=taxonA,
                            entry=ref
                        )
                        cogsB = wordlist.get_dict(
                            taxa=taxonB,
                            entry=ref
                        )

                        shared = 0
                        slots = 0

                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:

                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warn(str(
                                [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
                            shared = 0.0

                    matrix[i][j] = shared

                    # mirror the value unless refB fills the lower triangle
                    if not refB:
                        matrix[j][i] = shared

                elif i > j and refB:
                    if normalized in [False, "jaccard"]:
                        cogsA = wordlist.get_list(
                            taxa=taxonA,
                            flat=True,
                            entry=refB
                        )
                        cogsB = wordlist.get_list(
                            taxa=taxonB,
                            flat=True,
                            entry=refB
                        )

                        cogsA, cogsB = set(cogsA), set(cogsB)

                        shared = len(cogsA.intersection(cogsB))

                        if normalized:
                            shared = shared / len(cogsA.union(cogsB))
                    else:
                        cogsA = wordlist.get_dict(
                            taxa=taxonA,
                            entry=refB
                        )
                        cogsB = wordlist.get_dict(
                            taxa=taxonB,
                            entry=refB
                        )

                        shared = 0
                        slots = 0

                        # iterate over cognate sets in meaning slots
                        for key in cogsA.keys():
                            # check whether keys are present, we follow the
                            # STARLING procedure in ignoring missing data
                            if key in cogsA and key in cogsB:

                                # check for shared items
                                if [k for k in cogsA[key] if k in cogsB[key]]:
                                    shared += 1
                                slots += 1
                        try:
                            shared = shared / slots
                        except ZeroDivisionError:
                            log.warn(str(
                                [shared, slots, len(cogsA), len(cogsB), taxonA, taxonB]))
                            shared = 0.0

                    matrix[i][j] = shared

                elif i == j:
                    cogs = wordlist.get_list(
                        taxa=taxonA,
                        flat=True,
                        entry=ref
                    )
                    if normalized:
                        matrix[i][j] = 1.0
                    else:
                        matrix[i][j] = len(set(cogs))
    ax2 = fig.add_axes(
        [
            left,  # keywords['left']+0.25 * keywords['width']+0.05,
            keywords['bottom'],
            keywords['width'],
            keywords['height']
        ]
    )
    # invert the values if distances rather than similarities are requested
    if keywords['distances']:
        for i, line in enumerate(matrix):
            for j, cell in enumerate(line):
                matrix[i][j] = 1 - matrix[i][j]

    im = ax2.matshow(matrix, aspect='auto', origin='lower',
                     interpolation='nearest', cmap=keywords['cmap'],
                     vmax=keywords['vmax'], vmin=keywords['vmin']
                     )

    # set the xticks
    steps = max(1, int(len(taxa) / keywords['steps'] + 0.5))
    start = int(steps / 2 + 0.5)
    idxs = [0] + list(range(start, len(taxa), steps))
    selected_taxa = [taxa[i] for i in idxs]

    # modify taxon names if this is specified
    for i, t in enumerate(selected_taxa):
        if t in keywords['labels']:
            selected_taxa[i] = keywords['labels'][t]

    ax2.set_xticks([])
    ax2.set_yticks([])

    plt.xticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
        rotation=keywords['xrotation'],
        rotation_mode="default"
    )
    plt.yticks(
        idxs,
        selected_taxa,
        size=keywords['textsize'],
    )

    if keywords["colorbar"]:
        plt.imshow(matrix, cmap=keywords['cmap'], visible=False, vmax=keywords['vmax'])
        c = plt.colorbar(im, shrink=keywords['colorbar_shrink'])
        c.set_label(keywords["colorbar_label"], size=keywords['colorbar_textsize'])

    plt.subplots_adjust(
        left=keywords['left'],
        right=keywords['right'],
        top=keywords['top'],
        bottom=keywords['bottom']
    )
    plt.savefig(filename + '.' + fileformat)

    with open(filename + '.matrix', 'w') as f:
        for i, t in enumerate(taxa):
            f.write('{0:20}'.format(t))
            for j, c in enumerate(matrix[i]):
                if not normalized:
                    f.write('\t{0:3}'.format(int(c)))
                else:
                    f.write('\t{0:.2f}'.format(c))
            f.write('\n')
    log.file_written(filename + '.' + fileformat)
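A minimal usage sketch, assuming a Wordlist with a 'cogid' column and a
reference tree (the input file 'wordlist.qlc' is invented for illustration):

    from lingpy import Wordlist

    wl = Wordlist('wordlist.qlc')
    plot_heatmap(wl, filename='shared_cognates', fileformat='png',
                 ref='cogid', normalized='jaccard', steps=10)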
Example n. 21
0
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(kw,
                     template=False,
                     css=False,
                     comment='#',
                     filename=infile[:-4] + '.html',
                     compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append(('.'.join([k for k in almA if k != '-']),
                          '.'.join([k for k in almB if k != '-'])))
            alignments.append(
                ([str(a) for a in almA], [str(b) for b in almB], 0))
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warn("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # remap three sound classes whose names differ from the CSS
                # class names used in the template
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1, seq_ids[i], ids)
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
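A minimal usage sketch, assuming a PSA file 'test.psa' (invented for
illustration) in the working directory:

    # writes 'test.html' next to the input file
    psa2html('test.psa', compact=True)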
Example n. 22
0
def psa2html(infile, **kw):
    """
    Function converts a PSA-file into colored html-format.
    """
    util.setdefaults(
        kw,
        template=False,
        css=False,
        comment='#',
        filename=infile[:-4]+'.html',
        compact=True)

    template = util.read_text_file(kw['template'] or template_path('psa.html'))
    css = util.read_text_file(kw['css'] or template_path('psa.css'))

    data = []
    for line in util.read_text_file(infile, lines=True):
        if not line.startswith(kw['comment']):
            data.append(line)

    seq_ids = []
    pairs = []
    taxa = []
    alignments = []

    del data[0]

    i = 0
    while i <= len(data) - 3:
        try:
            seq_ids.append(data[i])

            datA = data[i + 1].split('\t')
            datB = data[i + 2].split('\t')

            taxonA = datA[0].strip('.')
            taxonB = datB[0].strip('.')
            almA = datA[1:]
            almB = datB[1:]

            taxa.append((taxonA, taxonB))
            pairs.append(
                (
                    '.'.join([k for k in almA if k != '-']),
                    '.'.join([k for k in almB if k != '-'])
                )
            )
            alignments.append(
                (
                    [str(a) for a in almA],
                    [str(b) for b in almB],
                    0)
            )
            assert len(alignments[-1][0]) == len(alignments[-1][1])
            i += 4
        except AssertionError:
            log.warn("Line {0} of the data is probably miscoded.".format(i + 1))
            i += 1

    def get_classes(alm):
        classes = []
        residue = '<div class="residue {1}">{0}</div>'
        for j, char in enumerate(alm):
            if char == '-':
                d = 'dolgo_GAP'
            else:
                d = 'dolgo_' + token2class(char, rcParams['dolgo'])

                # remap three sound classes whose names differ from the CSS
                # class names used in the template
                if d == 'dolgo__':
                    d = 'dolgo_X'
                elif d == 'dolgo_1':
                    d = 'dolgo_TONE'
                elif d == 'dolgo_0':
                    d = 'dolgo_ERROR'
            classes += [residue.format(char, d)]
        return ''.join(classes)

    out = '<table>\n'  # codecs.open(kw['filename'], 'w', 'utf-8')
    for i, (a, b, c) in enumerate(alignments):
        clsA = get_classes(a)
        clsB = get_classes(b)

        ids = int(100 * pid(a, b) + 0.5)

        out += '<tr class="head">'
        out += '<td colspan=2 class="head"><b>Alignment {0}:</b> <i>{1}</i>, PID: {2}</td></tr>'.format(
            i + 1,
            seq_ids[i],
            ids
        )
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][0])
        out += '<td class="psa">{0}</td>'.format(clsA)
        out += '</tr>'
        out += '<tr class="psa">'
        out += '<td class="taxon">{0}</td>'.format(taxa[i][1])
        out += '<td class="psa">{0}</td>'.format(clsB)
        out += '</tr>'
        out += '<tr><td colspan=2></td></tr>'

    out += '</table>'

    html = template.format(alignments=out, css=css)

    if kw['compact']:
        html = html.replace('\n', ' ')
        html = re.sub(r'\s+', r' ', html)
        html = html.replace('> ', '>')
        html = html.replace(' >', '>')

    util.write_text_file(kw['filename'], html)
Example n. 23
0
def pid(almA, almB, mode=2):
    """
    Calculate the Percentage Identity (PID) score for aligned sequence pairs.

    Parameters
    ----------

    almA, almB : string or list
        The aligned sequences which can be either a string or a list.

    mode : { 1, 2, 3, 4, 5 }
        Indicate which of the four PID scores described in
        :evobib:`Raghava2006` should be calculated; a fifth option is added
        for linguistic purposes:
        
        1. identical positions / (aligned positions + internal gap positions),
        
        2. identical positions / aligned positions,
        
        3. identical positions / shortest sequence, or
        
        4. identical positions / shortest sequence (including internal gap
           pos.)

        5. identical positions / (aligned positions + 2 * number of gaps)

    Returns
    -------

    score : float
        The PID score of the given alignment as a floating point number between
        0 and 1.

    Notes
    -----
    
    The PID score is a common measure for the diversity of a given alignment.
    The implementation employed by LingPy follows the description of
    :evobib:`Raghava2006` where four different variants of PID scores are
    distinguished. Essentially, the PID score is based on the comparison of
    identical residue pairs with the total number of residue pairs in a given
    alignment.

    Examples
    --------
    Load an alignment from the test suite.

    >>> from lingpy import *
    >>> pairs = PSA(get_file('test.psa'))

    Extract the alignments of the first aligned sequence pair.

    >>> almA,almB,score = pairs.alignments[0]

    Calculate the PID score of the alignment.

    >>> pid(almA,almB)
    0.44444444444444442

    See also
    --------
    lingpy.compare.Multiple.get_pid

    .. todo:: change debug for ZeroDivisionError

    """

    zipped = zip(almA, almB)
    idn_pos = 0
    int_gps = 0
    aln_pos = 0

    for charA, charB in zipped:
        tmp = [charA, charB].count('-')
        if tmp == 1:
            int_gps += 1
        elif tmp == 0 and charA == charB:
            idn_pos += 1
            aln_pos += 1
        elif tmp == 0:
            aln_pos += 1

    # NB: as implemented, mode 2 counts internal gap positions in the
    # denominator while mode 1 does not -- the reverse of the numbering
    # given in the docstring above
    if mode == 2:
        try:
            return idn_pos / (aln_pos + int_gps)
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 1:
        try:
            return idn_pos / aln_pos
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 3:
        srt_seq = min(
            len([i for i in almA if i != '-']), len([i for i in almB if i != '-']))
        try:
            return idn_pos / srt_seq
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 4:
        srt_seq = min(
            len(''.join([i[0] for i in almA]).strip('-')),
            len(''.join([i[0] for i in almB]).strip('-')))
        try:
            return idn_pos / srt_seq
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 5:
        # the denominator is the full alignment length, gap columns included
        return idn_pos / len(almA)
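To see how the counters interact, a small worked example (the alignment is
invented for illustration; the counts follow directly from the loop above):

    almA = ['w', 'o', 'l', 'd', 'e', 'm', 'o', 'r', 't']
    almB = ['w', 'a', 'l', 'd', 'e', 'm', 'a', 'r', '-']
    # 8 gap-free columns, 1 internal gap column
    # identical: w, l, d, e, m, r  ->  idn_pos=6, aln_pos=8, int_gps=1
    pid(almA, almB, mode=2)   # 6 / (8 + 1) = 0.666...
    pid(almA, almB, mode=1)   # 6 / 8 = 0.75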
Example n. 24
0
def pid(almA, almB, mode=2):
    """
    Calculate the Percentage Identity (PID) score for aligned sequence pairs.

    Parameters
    ----------

    almA, almB : string or list
        The aligned sequences which can be either a string or a list.

    mode : { 1, 2, 3, 4, 5 }
        Indicate which of the four PID scores described in
        :evobib:`Raghava2006` should be calculated; a fifth option is added
        for linguistic purposes:
        
        1. identical positions / (aligned positions + internal gap positions),
        
        2. identical positions / aligned positions,
        
        3. identical positions / shortest sequence, or
        
        4. identical positions / shortest sequence (including internal gap
           pos.)

        5. identical positions / (aligned positions + 2 * number of gaps)

    Returns
    -------

    score : float
        The PID score of the given alignment as a floating point number between
        0 and 1.

    Notes
    -----
    
    The PID score is a common measure for the diversity of a given alignment.
    The implementation employed by LingPy follows the description of
    :evobib:`Raghava2006` where four different variants of PID scores are
    distinguished. Essentially, the PID score is based on the comparison of
    identical residue pairs with the total number of residue pairs in a given
    alignment.

    Examples
    --------
    Load an alignment from the test suite.

    >>> from lingpy import *
    >>> pairs = PSA(get_file('test.psa'))

    Extract the alignments of the first aligned sequence pair.

    >>> almA,almB,score = pairs.alignments[0]

    Calculate the PID score of the alignment.

    >>> pid(almA,almB)
    0.44444444444444442

    See also
    --------
    lingpy.compare.Multiple.get_pid

    .. todo:: change debug for ZeroDivisionError

    """

    zipped = zip(almA, almB)
    idn_pos = 0
    int_gps = 0
    aln_pos = 0

    for charA, charB in zipped:
        tmp = [charA, charB].count('-')
        if tmp == 1:
            int_gps += 1
        elif tmp == 0 and charA == charB:
            idn_pos += 1
            aln_pos += 1
        elif tmp == 0:
            aln_pos += 1

    # NB: as implemented, mode 2 counts internal gap positions in the
    # denominator while mode 1 does not -- the reverse of the numbering
    # given in the docstring above
    if mode == 2:
        try:
            return idn_pos / (aln_pos + int_gps)
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 1:
        try:
            return idn_pos / aln_pos
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 3:
        srt_seq = min(
            len([i for i in almA if i != '-']), len([i for i in almB if i != '-']))
        try:
            return idn_pos / srt_seq
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 4:
        srt_seq = min(
            len(''.join([i[0] for i in almA]).strip('-')),
            len(''.join([i[0] for i in almB]).strip('-')))
        try:
            return idn_pos / srt_seq
        except ZeroDivisionError:
            log.warn('Zero Division Error in {0} and {1}'.format(almA, almB))
            return 0

    elif mode == 5:
        # the denominator is the full alignment length, gap columns included
        return idn_pos / len(almA)