예제 #1
0
def getLetters(typ='nt', fontname='monospace', dpi=500):
    '''
    Generates a temporary image file for every letter of the colour scheme
    (one per key returned by the nt or aa colour lookup).
    Each letter is drawn on its own 1 x 1 inch figure whose axes are
    stretched over the whole figure, and is written to the working
    directory as "<letter>_temp.png" ("*" is written as "stop").

    Parameters
    ----------
    typ: string
        nt (default) or aa

    fontname: string
        name of a font (default: monospace)

    dpi: int
        DPI value used both to create and to save each letter image
        (default: 500)

    Returns
    -------
    none

    Raises
    ------
    ValueError
        if typ is neither 'nt' nor 'aa'
    '''
    # obtain colour scheme depending on nt or aa alignment
    if typ == 'nt':
        colours = utilityFunctions.getNtColours()
    elif typ == 'aa':
        colours = utilityFunctions.getAAColours()
    else:
        # fail with a clear message instead of an unbound local further down
        raise ValueError("typ must be 'nt' or 'aa', got %r" % (typ,))

    # for each possible base/aa create temporary plot
    for base, colour in colours.items():
        f = plt.figure(figsize=(1, 1), dpi=dpi, edgecolor='black')
        try:
            a = f.add_subplot(1, 1, 1)
            a.set_xlim(0, 1)
            a.set_ylim(0, 1)
            fs = getFontSize(f, a, 1)
            a.text(0.5,
                   0.02,
                   base,
                   fontsize=fs * 0.95,
                   fontdict={
                       'family': 'monospace',
                       'name': fontname
                   },
                   color=colour,
                   va='baseline',
                   ha='center')
            # strip every decoration so only the glyph is left in the image
            a.set_axis_off()
            a.margins(0, 0)
            f.subplots_adjust(top=1,
                              bottom=0,
                              right=1,
                              left=0,
                              wspace=None,
                              hspace=None)
            a.set_frame_on(False)
            # temporarily save plot in working directory; "*" is replaced
            # in the file name only, the drawn glyph is unchanged
            f.savefig("%s_temp.png" % base.replace("*", "stop"),
                      dpi=dpi,
                      pad_inches=0.1)
        finally:
            # close exactly this figure, even if drawing or saving failed
            plt.close(f)
예제 #2
0
def arrNumeric(arr, typ):
    '''
    Translates the sequence array into a matrix of numbers plus a matching
    colour map, so that matplotlib can draw the alignment as an image
    (similar to https://bit.ly/2CIKOEr)
    The rows are reversed first, so that the drawn image lists the rows in
    the same order as the input alignment.

    Parameters
    ----------
    arr: np.array
        The alignment stored as a numpy array

    typ: str
        Either 'aa' - amino acid - or 'nt' - nucleotide

    Returns
    -------
    arr2: np.array
        The flipped alignment as an array of integers
    cmap: matplotlib.colors.ListedColormap
        A colour map with the colours corresponding to each base
        or amino acid
    '''
    # reverse the row order
    flipped = np.flip(arr, axis=0)

    # anything other than 'nt' falls back to the amino acid scheme
    palette = (utilityFunctions.getNtColours() if typ == 'nt'
               else utilityFunctions.getAAColours())

    n_rows, n_cols = np.shape(flipped)

    # only symbols that actually occur in the alignment receive a number,
    # numbered in the order in which the colour scheme lists them
    present = [symbol for symbol in palette.keys() if symbol in flipped]
    code_of = {symbol: code for code, symbol in enumerate(present)}
    colour_list = [palette[symbol] for symbol in present]

    numeric = np.empty([n_rows, n_cols])

    # fill column by column with the number assigned to each symbol
    for col in range(n_cols):
        for row in range(n_rows):
            numeric[row, col] = code_of[flipped[row, col]]

    colour_map = matplotlib.colors.ListedColormap(colour_list)
    return (numeric, colour_map)
예제 #3
0
def calc_entropy(count, seq_count, typ):
    '''
    Calculates, for a single set of nt/aa counts, the entropy based
    information content and from that the height each nt/aa gets in a
    sequence logo.
    Gaps ("-") are excluded from the frequencies; the resulting heights are
    scaled down by the proportion of non-gap sequences.

    Parameters
    ----------
    count: dict
        of nt/aa with counts

    seq_count: int
        number of sequences in alignment

    typ: str
        nt or aa

    Returns
    -------
    height_per_base: dictionary
        height for each nt/aa (0 for everything if there are no sequences
        or only gaps)

    info_per_base: dictionary
        information content for each nt/aa

    Raises
    ------
    ValueError
        if typ is neither 'nt' nor 'aa'
    '''
    # obtain nt/aa lists, use colour scheme for that w/o using colours here
    # just because another list of nt/aa would be obsolete
    if typ == "nt":
        element_list = utilityFunctions.getNtColours()
        s = 4
    elif typ == "aa":
        element_list = utilityFunctions.getAAColours()
        s = 20
    else:
        raise ValueError("typ must be 'nt' or 'aa', got %r" % (typ,))
    max_entropy = log(s, 2)

    info_per_base = dict.fromkeys(element_list, 0)
    freq_per_base = dict.fromkeys(element_list, 0)
    height_per_base = dict.fromkeys(element_list, 0)

    # keep the full number of sequences (gaps included) for the corrections
    total_count = seq_count
    gap_count = count.get("-")
    if gap_count:
        seq_count -= gap_count

    # nothing to measure without sequences or with gaps only; checking this
    # before the corrections also avoids dividing by zero
    if total_count == 0 or seq_count == 0:
        return height_per_base, info_per_base

    # correct for small sample sizes
    sample_size_correction = (1 / max_entropy) * ((s - 1) /
                                                  (2 * total_count))
    # correct for gaps, since they lower the information content
    gap_correction = seq_count / total_count

    # calculate entropy, from that information, from that height
    entropy = 0
    for base, quantity in count.items():
        if base == "-":
            continue
        frequency = quantity / seq_count
        weighted_log = frequency * log(frequency, 2)
        freq_per_base[base] = frequency
        entropy -= weighted_log
        info_per_base[base] = max_entropy + weighted_log
    information_per_column = max_entropy - entropy - sample_size_correction

    # the sample size correction can push the information content below
    # zero; a negative value does not add any information, so the height
    # is set to 0 in that case
    for base in info_per_base:
        if freq_per_base[base] * information_per_column < 0:
            height_per_base[base] = 0
        else:
            # scale to accommodate gaps
            height_per_base[base] = (gap_correction * freq_per_base[base] *
                                     information_per_column)

    return height_per_base, info_per_base
예제 #4
0
def sequence_bar_logo(alignment,
                      figname,
                      typ='nt',
                      figdpi=300,
                      figrowlength=50,
                      start=0,
                      end=0):
    '''
    Creates a sequence logo based on an entropy calculation using bars
    Scales the bars according to the information content of the alignment
    Representation of the consensus sequence of the alignment

    Parameters
    ----------
    alignment: np.array
        The alignment stored as a numpy array

    figname: str
        name of figure

    typ: str
        Either 'aa' - amino acid - or 'nt' - nucleotide

    figdpi: int
            DPI (default: 300)

    figrowlength: int
            number of alignment columns per row of the figure (default: 50)

    start: int
           start pos to be turned into logo

    end: int
         end pos to be turned into logo (0, the default, means up to the
         last column)

    Returns
    -------
    none

    Raises
    ------
    ValueError
        if typ is neither 'nt' nor 'aa'
    '''
    # colour scheme and y axis range depend on the alignment type only,
    # so they are looked up once instead of once per row
    if typ == 'nt':
        colours = utilityFunctions.getNtColours()
        ymax = 2.1
    elif typ == 'aa':
        colours = utilityFunctions.getAAColours()
        ymax = 4.6
    else:
        raise ValueError("typ must be 'nt' or 'aa', got %r" % (typ,))

    if start == 0 and end == 0:
        alignment_width = len(alignment[0, :])
    else:
        if end == 0:
            end = len(alignment[0, :])
        alignment_width = len(alignment[0, start:end])

    if alignment_width < figrowlength:
        figrowlength = alignment_width
    nsegs = math.ceil(alignment_width / figrowlength)
    seq_count = len(alignment[:, 0])
    width = 0.75

    f = plt.figure(figsize=(figrowlength / 5, nsegs * 2), dpi=figdpi)
    gs = gridspec.GridSpec(ncols=1, nrows=nsegs)
    rstart = start
    rend = rstart + figrowlength

    for n in range(nsegs):
        # the last row may be shorter than figrowlength
        rend = min(rend, alignment_width + start)
        axes = f.add_subplot(gs[n])
        axes.set_xlim(rstart - 0.5, rend - 0.5)
        axes.set_ylim(0, ymax)
        # ticks follow the nt/aa specific range and are not overwritten
        # with the nt range afterwards
        axes.set_yticks(np.arange(0, ymax, 1))
        ind = np.arange(rstart, rend)

        # need a list of each nt/aa separately to plot them as bars
        height_list = {element: [] for element in colours}

        # for each column calculate heights via entropy
        for i in range(rstart, rend):
            unique, counts = np.unique(alignment[:, i], return_counts=True)
            count = dict(zip(unique, counts))
            height_per_base, _ = calc_entropy(count, seq_count, typ)
            for base, height in height_per_base.items():
                height_list[base].append(height)

        # stack bars on top of each other
        bottom_height = [0] * (rend - rstart)
        for base, heights in height_list.items():
            axes.bar(ind,
                     heights,
                     width,
                     bottom=bottom_height,
                     color=colours[base])
            bottom_height = [
                low + high for low, high in zip(bottom_height, heights)
            ]

        # label the first and last column of the row with 1-based positions
        axes.set_xticks([rstart, rend - 1])
        axes.set_xticklabels([rstart + 1, rend])
        axes.set_xlabel("Position")
        axes.set_ylabel("Bit Score")

        axes.spines['right'].set_visible(False)
        axes.spines['top'].set_visible(False)
        rstart += figrowlength
        rend += figrowlength
    # save plot as figname
    f.savefig(figname, bbox_inches='tight', dpi=figdpi)
    plt.close(f)
예제 #5
0
def sequence_logo(alignment,
                  figname,
                  typ='nt',
                  figfontname='Arial',
                  figdpi=300,
                  figrowlength=50,
                  start=0,
                  end=0):
    '''
    Creates a sequence logo based on an entropy calculation using letters
    Scales the letters according to the information content of the alignment
    Representation of the consensus sequence of the alignment

    The letters are rendered by getLetters into temporary
    "<letter>_temp.png" files in the working directory, which are removed
    again before returning, also when drawing or saving fails.

    Parameters
    ----------
    alignment: np.array
        The alignment stored as a numpy array

    figname: str
        name of figure

    typ: str
        Either 'aa' - amino acid - or 'nt' - nucleotide

    figfontname: str
            Name of font, default: Arial

    figdpi: int
            DPI (default: 300)

    figrowlength: int
            number of alignment columns per row of the figure (default: 50)

    start: int
           start pos to be turned into logo

    end: int
         end pos to be turned into logo (0, the default, means up to the
         last column)

    Returns
    -------
    none

    Raises
    ------
    ValueError
        if typ is neither 'nt' nor 'aa'
    '''
    # obtain colours; their keys name the temporary letter images
    if typ == 'nt':
        allbases = utilityFunctions.getNtColours()
        ymax = 2.1
    elif typ == 'aa':
        allbases = utilityFunctions.getAAColours()
        ymax = 4.6
    else:
        raise ValueError("typ must be 'nt' or 'aa', got %r" % (typ,))

    if start == 0 and end == 0:
        alignment_width = len(alignment[0, :])
    else:
        if end == 0:
            end = len(alignment[0, :])
        alignment_width = len(alignment[0, start:end])

    if alignment_width < figrowlength:
        figrowlength = alignment_width
    nsegs = math.ceil(alignment_width / figrowlength)
    seq_count = len(alignment[:, 0])

    f = plt.figure(figsize=(figrowlength, nsegs * 2), dpi=figdpi)
    # every letter image is read from disk once and reused for all columns
    letter_images = {}
    try:
        gs = gridspec.GridSpec(ncols=1, nrows=nsegs)
        getLetters(typ=typ, fontname=figfontname, dpi=figdpi)
        rstart = start
        rend = rstart + figrowlength

        for n in range(nsegs):
            # the last row may be shorter than figrowlength
            rend = min(rend, alignment_width + start)
            # add the row to this figure explicitly rather than to
            # whichever figure pyplot currently considers active
            a = f.add_subplot(gs[n])
            a.set_xlim(rstart, rstart + figrowlength)
            a.set_ylim(0, ymax)
            a.set_yticks(np.arange(0, ymax, 1))
            # imshow changes the axis limits, remember them to restore later
            limits = a.axis()

            # for each column calculate heights via entropy
            # and scale letters accordingly
            for i in range(rstart, rend):
                unique, counts = np.unique(alignment[:, i],
                                           return_counts=True)
                count = dict(zip(unique, counts))
                height_per_base, _ = calc_entropy(count, seq_count, typ=typ)
                height_sum_higher = 0
                # smallest letters first, so the tallest ends up on top
                stacked = sorted(height_per_base.items(),
                                 key=lambda item: item[1])
                for base, height in stacked:
                    if height > 0:
                        if base not in letter_images:
                            letter_images[base] = plt.imread(
                                "%s_temp.png" % base.replace("*", "stop"))
                        a.imshow(letter_images[base],
                                 extent=(i, i + 1, height_sum_higher,
                                         height_sum_higher + height),
                                 filternorm=False)

                        height_sum_higher += height
            a.axis(limits)
            a.set_xticks([rstart, rend])
            a.set_xticklabels([rstart, rend])

            a.spines['right'].set_visible(False)
            a.spines['top'].set_visible(False)
            if n == (nsegs - 1):
                a.set_xlabel("Position")
            a.set_ylabel("Bit Score")
            rstart += figrowlength
            rend += figrowlength
        # save plot using figname
        f.savefig(figname, dpi=figdpi, bbox_inches='tight')
    finally:
        plt.close(f)
        # remove the temporary letter images; a file can be missing if
        # getLetters failed part way through, which must not hide the
        # original error
        for base in allbases:
            try:
                os.unlink("%s_temp.png" % base.replace("*", "stop"))
            except FileNotFoundError:
                pass
예제 #6
0
    def testGetAAColours(self):
        '''
        The amino acid colour scheme is expected to contain 28 entries
        '''
        expected_size = 28
        palette = utilityFunctions.getAAColours()

        self.assertEqual(len(palette), expected_size)