Esempio n. 1
0
def plotter(title,
            df,
            kind = 'line',
            x_label = None,
            y_label = None,
            style = 'ggplot',
            figsize = (8, 4),
            save = False,
            legend_pos = 'best',
            reverse_legend = 'guess',
            num_to_plot = 7,
            tex = 'try',
            colours = 'Accent',
            cumulative = False,
            pie_legend = True,
            partial_pie = False,
            show_totals = False,
            transparent = False,
            output_format = 'png',
            interactive = False,
            black_and_white = False,
            show_p_val = False,
            indices = False,
            **kwargs):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: pandas.core.frame.DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool/'guess'
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool/'try'
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: bool
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: bool
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: matplotlib figure
    """
    import corpkit
    import os

    try:
        from IPython.utils.shimmodule import ShimWarning
        import warnings
        warnings.simplefilter('ignore', ShimWarning)
    except:
        pass

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except:
        pass   
    
    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt
    
    import pandas
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    # check what environment we're in
    tk = check_t_kinter()
    running_python_tex = check_pytex()
    running_spider = check_spider()

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """Build a new colourmap from the [minval, maxval] slice of *cmap*.

        Used to cut the extreme ends off a colourmap (e.g. to avoid pure white).

        :param cmap: Colourmap to sample; must be callable and have a ``name``
        :param minval: Lower end of the range to sample
        :type minval: float
        :param maxval: Upper end of the range to sample
        :type maxval: float
        :param n: Number of evenly spaced samples taken between the two ends
        :type n: int
        :returns: matplotlib.colors.LinearSegmentedColormap
        """
        import matplotlib.colors as mpl_colors
        import numpy as np
        trimmed_name = 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name,
                                                            a=minval,
                                                            b=maxval)
        sampled_colours = cmap(np.linspace(minval, maxval, n))
        return mpl_colors.LinearSegmentedColormap.from_list(trimmed_name,
                                                            sampled_colours)

    def get_savename(imagefolder, save = False, title = False, ext = 'png'):
        """Come up with the savename for the image.

        The file name is taken from *save* when it is a string, otherwise from
        *title*. The name is lowercased, stripped of punctuation, and has runs
        of whitespace turned into hyphens before *ext* is appended.

        :param imagefolder: Directory the image will be saved in
        :type imagefolder: str
        :param save: Explicit file name; ignored unless it is a str
        :type save: bool/str
        :param title: Plot title, used as file name when *save* is not a str
        :type title: bool/str
        :param ext: File extension, with or without the leading dot
        :type ext: str
        :returns: *imagefolder* joined with the sanitised file name
        :raises ValueError: if neither *save* nor *title* provides a name
        """
        import os
        import re

        def urlify(s):
            "Turn title into filename"
            s = s.lower()
            s = re.sub(r"[^\w\s-]", '', s)
            s = re.sub(r"\s+", '-', s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s)
            return s

        if not ext.startswith('.'):
            ext = '.' + ext

        if isinstance(save, str):
            name = save
        elif title:
            name = title
        else:
            # without this guard the function died with UnboundLocalError
            raise ValueError('Cannot build a file name: pass a string as '
                             'save, or provide a title.')

        # drop an extension the caller already typed; urlify() removes dots,
        # so leaving it in would turn 'plot.png' into 'plotpng.png'
        if name.lower().endswith(ext.lower()):
            name = name[:-len(ext)]

        return os.path.join(imagefolder, urlify(name) + ext)

    def rename_data_with_total(dataframe, was_series = False, using_tex = False, absolutes = True):
        """Append each entry's total to its name, e.g. for legend labels.

        With *absolutes*, every entry becomes ``'name (n=<sum>)'``. Without
        it, a series entry becomes ``'name (<value> %)'`` (with the percent
        sign escaped when *using_tex*), while the columns of a multi-column
        frame are left as they are.

        When *was_series* is False the columns of *dataframe* are renamed in
        place and the same object is returned. When it is True a new
        one-column frame called ``'Total'`` is built from the first column.

        :param dataframe: Data whose entries are to be renamed
        :type dataframe: pandas.core.frame.DataFrame
        :param was_series: Entries are in the index rather than the columns
        :type was_series: bool
        :param using_tex: Escape the percent sign for TeX
        :type using_tex: bool
        :param absolutes: Data holds absolute frequencies, not percentages
        :type absolutes: bool
        :returns: pandas.core.frame.DataFrame with renamed entries
        """
        if was_series:
            entries = list(dataframe.index)
            # transpose once so each entry can be looked up as a column
            by_entry = dataframe.T
        else:
            entries = list(dataframe.columns)
            by_entry = dataframe

        if absolutes:
            the_labs = ['%s (n=%d)' % (w, by_entry[w].sum()) for w in entries]
        elif was_series:
            # raw string: the backslash has to reach TeX untouched
            template = r'%s (%.2f\%%)' if using_tex else '%s (%.2f %%)'
            # .iloc: take the first value by position, whatever the labels are
            the_labs = [template % (w, by_entry[w].iloc[0]) for w in entries]
        else:
            # relative multi-column data: nothing sensible to append
            the_labs = entries

        if not was_series:
            dataframe.columns = the_labs
            return dataframe

        vals = list(dataframe[list(dataframe.columns)[0]].values)
        renamed = pandas.DataFrame(vals, index = the_labs)
        renamed.columns = ['Total']
        return renamed

    def auto_explode(dataframe, input, was_series = False, num_to_plot = 7):
        """Build the ``explode`` list for a pie chart.

        :param dataframe: Data being plotted; entry names are read from its
                          index if *was_series*, otherwise from its columns
        :param input: Slice(s) to pull out: a name, a position, or a list
                      mixing both. Anything else leaves every slice in place
        :type input: str/int/list
        :param was_series: Look up names in the index instead of the columns
        :type was_series: bool
        :param num_to_plot: Length of the list to return
        :type num_to_plot: int
        :returns: list with 0.1 at each requested position and 0 elsewhere
        """
        offsets = [0] * num_to_plot
        names = list(dataframe.index) if was_series else list(dataframe.columns)

        # a lone name or position is treated as a one-item list
        wanted = [input] if type(input) in (str, int) else input
        if type(wanted) != list:
            return offsets

        for item in wanted:
            # strings are entry names; everything else is used as a position
            position = names.index(item) if type(item) == str else item
            offsets[position] = 0.1
        return offsets

    # check if we're doing subplots
    sbplt = False
    if 'subplots' in kwargs:
        if kwargs['subplots'] is True:
            sbplt = True
    kwargs['subplots'] = sbplt

    if colours is True:
        colours = 'Paired'

    # todo: get this dynamically instead.
    styles = ['dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight', 'matplotlib', False, 'mpl-white']
    #if style not in styles:
        #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = 'matplotlib'

    if style is not False and style.startswith('seaborn'):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.get('draggable', False)
    kwargs.pop('draggable', None)

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True
    
    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except:
            matplotlib.rc('font', family='sans-serif') 
            matplotlib.rc('font', serif='Helvetica Neue') 
            matplotlib.rc('text', usetex='false') 
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)  

    if interactive:
        using_tex = False 

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind.lower()

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            interactive_types = None
            warnings.warn('Interactive plotting not yet available for pie plots.')
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == 'pie':
        piemode = True
        # always the best spot for pie
        #if legend_pos == 'best':
            #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True
    
    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series and not all(x.lower() == 'total' for x in list(dataframe.columns)):
        for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis = ax, errors = 'ignore')
            except:
                pass
    else:
        dataframe = dataframe.drop('tkintertable-order', errors = 'ignore')
        dataframe = dataframe.drop('tkintertable-order', axis = 1, errors = 'ignore')
            
    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':
            def isint(x):
                """Return True if *x* can be read as a whole number.

                :param x: Value to test, e.g. a column name
                :returns: True when ``float(x)`` succeeds and has no
                          fractional part, otherwise False
                :rtype: bool
                """
                try:
                    as_float = float(x)
                    as_int = int(as_float)
                except (TypeError, ValueError, OverflowError):
                    # a tuple is needed to catch several exception types:
                    # non-numeric text and nan give ValueError, inf gives
                    # OverflowError, None and containers give TypeError
                    return False
                return as_float == as_int

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = ['svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf']
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' % (output_format, ', '.join(output_formats)))
    
    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format) 
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe, 
                                        kwargs['explode'], 
                                        was_series = was_series, 
                                        num_to_plot = num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', False)

    #cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == 'Total':
            plotting_a_totals_column = True
            if not 'legend' in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis = 1, errors = 'ignore')
    except:
        pass    
    try:
        dataframe.ix['p']
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]['p']

                def p_string_formatter(val):
                    """Format a p value for display next to an entry name.

                    Values below 0.001 are reported as "p < 0.001", with the
                    less-than sign in TeX math mode when using_tex is set;
                    everything else is shown to three decimal places.
                    """
                    if not val < 0.001:
                        return 'p = %s' % format(val, '.3f')
                    return r'p $<$ 0.001' if using_tex else 'p < 0.001'

                pstr = p_string_formatter(pval)
                newname = '%s (%s)' % (col, pstr)
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')
        else:
            warnings.warn('No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.')
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis = 0, inplace = True, errors = 'ignore')

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0,:].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):        
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if kind in ['pie', 'line', 'area']:
                if colours:
                    if not plotting_a_totals_column:
                        if colours == 'Default':
                            colours = 'Paired'
                        kwargs['colormap'] = colours
        #else:

            if colours:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == 'Default':
                colours = 'Paired'
            kwargs['colormap'] = colours
        else:
            if num_to_plot > 0:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours
    
    # multicoloured bar charts
    if colours:
        if kind.startswith('bar'):
            if len(list(dataframe.columns)) == 1:
                if not black_and_white:
                    import numpy as np
                    the_range = np.linspace(0, 1, num_to_plot)
                    cmap = plt.get_cmap(colours)
                    kwargs['colors'] = [cmap(n) for n in the_range]
                # make a bar width ... ? ...
                #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5


    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ['barh', 'area']:
        if reverse_legend == 'guess':
            rev_leg = True
    if not 'rev_leg' in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except:
        pass
    
    # rotate automatically
    if 'rot' not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            #if 'kind' in kwargs:
                #if kwargs['kind'] in ['barh', 'area']:
                    #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs['rot'] = 45

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title
        
    # no interactive subplots yet:
    if sbplt and interactive:
        import warnings
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return
        
    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot / 7
        # kwarg options go in leg_options
        leg_options = {'framealpha': .8,
                       'shadow': kwargs.get('shadow', False),
                       'ncol': kwargs.pop('ncol', 1)}    

        # determine legend position based on this dict
        if legend_pos:
            possible = {'best': 0, 'upper right': 1, 'upper left': 2, 'lower left': 3, 'lower right': 4, 
                        'right': 5, 'center left': 6, 'center right': 7, 'lower center': 8, 'upper center': 9, 
                        'center': 10, 'o r': 2, 'outside right': 2, 'outside upper right': 2, 
                        'outside center right': 'center left', 'outside lower right': 'lower left'}

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError('legend_pos value must be one of:\n%s\n or an int between 0-10.' %', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['loc'] == 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)
        
        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe, 
                                           was_series = was_series, 
                                           using_tex = using_tex, 
                                           absolutes = absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.index)   
    
    def filler(df):
        """Rescale every row of *df* so that its values sum to 100.

        Works on a transposed copy, so *df* itself is not modified.

        :param df: Data to rescale
        :type df: pandas.core.frame.DataFrame
        :returns: pandas.core.frame.DataFrame of the same shape
        """
        transposed = df.T.copy()
        # after transposing, each column holds one row of the original data
        for label in list(transposed.columns):
            row_values = transposed[label]
            row_total = row_values.sum()
            transposed[label] = row_values * 100.0 / row_total
        return transposed.T

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1
    
    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A')
                    dataframe = dataframe.set_index(n)

        if kwargs.get('filled'):
            if areamode or kind.startswith('bar'):
                dataframe = filler(dataframe)
            kwargs.pop('filled', None)

    MARKERSIZE = 4
    COLORMAP = {
            0: {'marker': None, 'dash': (None,None)},
            1: {'marker': None, 'dash': [5,5]},
            2: {'marker': "o", 'dash': (None,None)},
            3: {'marker': None, 'dash': [1,3]},
            4: {'marker': "s", 'dash': [5,2,5,2,5,10]},
            5: {'marker': None, 'dash': [5,3,1,2,1,10]},
            6: {'marker': 'o', 'dash': (None,None)},
            7: {'marker': None, 'dash': [5,3,1,3]},
            8: {'marker': "1", 'dash': [1,3]},
            9: {'marker': "*", 'dash': [5,5]},
            10: {'marker': "2", 'dash': [5,2,5,2,5,10]},
            11: {'marker': "s", 'dash': (None,None)}
            }

    HATCHES = {
            0:  {'color': '#dfdfdf', 'hatch':"/"},
            1:  {'color': '#6f6f6f', 'hatch':"\\"},
            2:  {'color': 'b', 'hatch':"|"},
            3:  {'color': '#dfdfdf', 'hatch':"-"},
            4:  {'color': '#6f6f6f', 'hatch':"+"},
            5:  {'color': 'b', 'hatch':"x"}
            }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs['colormap'] = new_cmap

    class dummy_context_mgr(object):
        """Do-nothing context manager, for plotting without a style context.

        Entering yields None; leaving never suppresses an exception.
        May be obsolete now that newer matplotlib ships a 'classic' style.
        """

        def __enter__(self):
            return None

        def __exit__(self, exc_type, exc_value, traceback):
            # False: let any exception raised in the with-body propagate
            return False

    with plt.style.context((style)) if style != 'matplotlib' else dummy_context_mgr():

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            ax = dataframe.plot(figsize = figsize, **kwargs)
            if areamode:
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels
        else:
            plt.gcf().set_tight_layout(False)
            if not piemode:
                ax = dataframe.plot(figsize = figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize = figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend( handles, labels, loc = leg_options['loc'], bbox_to_anchor = (0,-0.1,1,1),
                bbox_transform = plt.gcf().transFigure )

                # this line allows layouts with missing plots
                # i.e. layout = (5, 2) with only nine plots
                plt.gcf().set_tight_layout(False)
                
        if 'rot' in kwargs:
            if kwargs['rot'] != 0 and kwargs['rot'] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation = kwargs['rot'], ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color('black')
                    #line.set_width(1)
                    line.set_dashes(COLORMAP[c]['dash'])
                    line.set_marker(COLORMAP[c]['marker'])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(list(COLORMAP.keys())):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    if areamode:
                        handles = handles[-len(handles) / 2:]
                        labels = labels[-len(labels) / 2:]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(plt.gcf(), InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = ['%s (%s: %d)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            else:
                ls = ['%s (%s: %.2f%%)' % (labels[i], x_val, y_val) for x_val, y_val in zip(x_vals, y_vals)]
            if 2 in interactive_types:
                #if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                #else:
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l, labels = ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)
        
    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = 'Year'

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(dataframe.index)[0] # get first entry# get second entry of first entry (year, count)
            try:
                if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
                    x_label = 'Year'
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except:
                x_label = 'Group'

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    def is_number(s):
        """Check whether *s* can be converted to a float or complex number.

        :param s: Value to test, e.g. an index or column label
        :returns: True if ``float(s)`` or ``complex(s)`` succeeds, False if
                  both reject the value
        :rtype: bool
        """
        for convert in (float, complex):
            try:
                convert(s)
            except (TypeError, ValueError):
                # ValueError: unparseable string; TypeError: unsupported
                # type such as None or a list -- try the next converter
                continue
            else:
                return True
        return False

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter
    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter()) 
        except:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter()) 
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'
    
    def suplabel(axis,label,label_prop=None,
                 labelpad=5,
                 ha='center',va='center'):
        '''Attach a figure-level x or y label to the current figure.

        The label is positioned relative to the smallest xmin/ymin found
        among the positions of the figure's axes.

        axis       - string: "x" or "y" (case-insensitive)
        label      - string
        label_prop - keyword dictionary passed on to the text call
        labelpad   - padding from the axis, divided by the figure dpi (default: 5)
        ha         - horizontal alignment (default: "center")
        va         - vertical alignment (default: "center")

        Raises Exception if axis is neither "x" nor "y".
        '''
        figure = plt.gcf()
        boxes = [axes.get_position() for axes in figure.axes]
        left_edge = min([box.xmin for box in boxes])
        bottom_edge = min([box.ymin for box in boxes])
        resolution = figure.dpi
        which = axis.lower()
        if which == "y":
            angle = 90.
            x_pos = left_edge - float(labelpad) / resolution
            y_pos = 0.5
        elif which == 'x':
            angle = 0.
            x_pos = 0.5
            y_pos = bottom_edge - float(labelpad) / resolution
        else:
            raise Exception("Unexpected axis: x or y")
        text_options = dict() if label_prop is None else label_prop
        plt.gcf().text(x_pos, y_pos, label,
                       rotation=angle,
                       transform=figure.transFigure,
                       ha=ha, va=va,
                       **text_options)

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)
        else:
            if type(y_label) == str:
                the_y = y_label
            else:
                the_y = y_l
            #suplabel('y', the_y, labelpad = 1.5)
            plt.gcf().text(0.04, 0.5, the_y, va='center', rotation='vertical')
            #plt.subplots_adjust(left=0.5)
        
        #    if not piemode:
        #        if type(y_label) == str:
        #            plt.ylabel(y_label)
        #        else:
        #            plt.ylabel(y_l)


    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)
    
        # set subplot titles
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # show grid
            a.grid(b=kwargs.get('grid', False))
            kwargs.pop('grid', None)
    
    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        if kind.startswith('bar'):
            width = ax.containers[0][0].get_width()

        # show grid
        ax.grid(b=kwargs.get('grid', False))
        kwargs.pop('grid', None)

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0,the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score), ha = 'center', va = 'bottom')
                else:
                    plt.annotate(score, (i, score), ha = 'center', va = 'bottom')        

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if 'layout' not in kwargs:
        if not sbplt:
            plt.tight_layout()

    if save:
        import os
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder, save = save, title = title, ext = output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith('o'):
            plt.gcf().savefig(savename, dpi=150, bbox_extra_artists=(lgd,), bbox_inches='tight', format = output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format = output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()


    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    if not interactive and not running_python_tex and not running_spider \
        and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
Esempio n. 2
0
def editor(interrogation, 
            operation = None,
            denominator = False,
            sort_by = False,
            keep_stats = False,
            keep_top = False,
            just_totals = False,
            threshold = 'medium',
            just_entries = False,
            skip_entries = False,
            merge_entries = False,
            newname = 'combine',
            multiple_merge = False,
            just_subcorpora = False,
            skip_subcorpora = False,
            span_subcorpora = False,
            merge_subcorpora = False,
            new_subcorpus_name = False,
            replace_names = False,
            projection = False,
            remove_above_p = False,
            p = 0.05, 
            revert_year = True,
            print_info = True,
            spelling = False,
            selfdrop = True,
            calc_all = True,
            **kwargs
            ):
    """Edit results of interrogations, do keywording, sort, etc.

    ``just/skip_entries`` and ``just/skip_subcorpora`` can take a few different kinds of input:

    * str: treated as regular expression to match
    * list: 

      * of integers: indices to match
      * of strings: entries/subcorpora to match

    ``merge_entries`` and ``merge_subcorpora``, however, are best entered as dicts:

    ``{newname: criteria, newname2: criteria2}``

    where criteria is a string, list, etc.

    :param interrogation: Results to edit
    :type interrogation: pandas.core.frame.DataFrame
    
    :param operation: Kind of maths to do on inputted lists:

        '+', '-', '/', '*', '%': self explanatory
        'k': log likelihood (keywords)
        'a': get distance metric (for use with interrogator 'a' option)
        'd': get percent difference (alternative approach to keywording)

    :type operation: str
    
    :param denominator: List of results or totals.

        If list of results, for each entry in dataframe 1, locate
        entry with same name in dataframe 2, and do maths there
        if 'self', do all merging/keeping operations, then use
        edited interrogation as denominator

    :type denominator: pandas.core.series.Series/pandas.core.frame.DataFrame/dict/'self'
    
    :param sort_by: Calculate slope, stderr, r, p values, then sort by:

        increase: highest to lowest slope value
        decrease: lowest to highest slope value
        turbulent: most change in y axis values
        static: least change in y axis values
        total/most: largest number first
        infreq/least: smallest number first
        name: alphabetically
        
    :type sort_by: str

    :param keep_stats: Keep/drop stats values from dataframe after sorting
    :type keep_stats: bool
    
    :param keep_top: After sorting, remove all but the top *keep_top* results
    :type keep_top: int
    
    :param just_totals: Sum each column and work with sums
    :type just_totals: bool
    
    :param threshold: When using results list as denominator, drop values occurring
                        fewer than n times. If not keywording, you can use:
                            ``'high'``: denominator total / 2500
                            ``'medium'``: denominator total / 5000
                            ``'low'``: denominator total / 10000
                        Note: if keywording, there are smaller default thresholds
    :type threshold: int/bool
    :param just_entries: Keep matching entries
    :type just_entries: see above
    :param skip_entries: Skip matching entries
    :type skip_entries: see above
    :param merge_entries: Merge matching entries
    :type merge_entries: see above
    :param newname: New name for merged entries
    :type newname: str/'combine'
    :param just_subcorpora: Keep matching subcorpora
    :type just_subcorpora: see above
    :param skip_subcorpora: Skip matching subcorpora
    :type skip_subcorpora: see above
    :param span_subcorpora: If subcorpora are numerically named, span all from *int* to *int2*, inclusive
    :type span_subcorpora: tuple -- ``(int, int2)``
    :param merge_subcorpora: Merge matching subcorpora
    :type merge_subcorpora: see above
    :param new_subcorpus_name: Name for merged subcorpora
    :type new_subcorpus_name: str/``'combine'``

    :param replace_names: Edit result names and then merge duplicate names.
    :type replace_names: dict -- ``{criteria: replacement_text}``; str -- a regex to delete from names
    :param projection: Multiply the results in a subcorpus by n
    :type projection: tuple -- ``(subcorpus_name, n)``
    :param remove_above_p: Delete any result over p
    :type remove_above_p: bool
    :param p:                  set the p value
    :type p: float
    
    :param revert_year:        when doing linear regression on years, turn annual subcorpora into 1, 2 ...
    :type revert_year: bool
    
    :param print_info: Print stuff to console showing what's being edited
    :type print_info: bool
    
    :param spelling: Convert/normalise spelling:
    :type spelling: str -- ``'US'``/``'UK'``
    
    :param selfdrop: When keywording, try to remove target corpus from reference corpus
    :type selfdrop: bool
    
    :param calc_all: When keywording, calculate words that appear in either corpus
    :type calc_all: bool

    :returns: corpkit.interrogation.Interrogation
    """

    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import corpkit
    import pandas
    import signal
    import re
    import collections
    import pandas as pd
    import numpy as np

    from pandas import DataFrame, Series
    from time import localtime, strftime
    
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    return_conc = False
    from interrogation import Interrodict, Interrogation, Concordance
    if interrogation.__class__ == Interrodict:
        locs.pop('interrogation', None)
        from collections import OrderedDict
        outdict = OrderedDict()
        from editor import editor
        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i != 0:
                locs['print_info'] = False
            # if df2 is also a dict, get the relevant entry
            if type(denominator) == dict or denominator.__class__ == Interrodict:
                #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \
                #   sorted(set([i.lower() for i in list(denominator.keys())])):
                #   locs['denominator'] = denominator[k]
                    if kwargs.get('denominator_totals'):
                        locs['denominator'] = denominator[k].totals
                    else:
                        locs['denominator'] = denominator[k].results

            outdict[k] = editor(v.results, **locs)
        if print_info:
            from time import localtime, strftime
            thetime = strftime("%H:%M:%S", localtime())
            print("\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (thetime, "'\n         '".join(sorted(outdict.keys()))))
        return Interrodict(outdict)

    elif type(interrogation) in [pandas.core.frame.DataFrame, pandas.core.series.Series]:
        dataframe1 = interrogation
    elif interrogation.__class__ == Interrogation:
        #if interrogation.__dict__.get('concordance', None) is not None:
        #    concordances = interrogation.concordance
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('r') :
            dataframe1 = interrogation.results
        elif branch.lower().startswith('t'):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith('c'):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results
    
    elif interrogation.__class__ == Concordance or \
                        all(x in list(dataframe1.columns) for x in ['l', 'm', 'r']):
            return_conc = True
            dataframe1 = interrogation
    # hope for the best
    else:
        dataframe1 = interrogation

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    try:
        from tests import check_pytex
    except ImportError:
        from corpkit.tests import check_pytex
        
    if check_pytex():
        print_info = False

    def combiney(df, df2, operation = '%', threshold = 'medium', prinf = True):
        """Combine df with df2 according to *operation*.

        Reads ``just_totals``, ``using_totals``, ``single_totals`` and
        ``selfdrop`` from the enclosing scope.

        :param df: data to be edited
        :param df2: second dataset, used as denominator / other operand
        :param operation: ``'%'``, ``'+'``, ``'-'``, ``'*'``, ``'/'`` for
                          arithmetic; ``'d'`` for percent difference;
                          ``'a'`` for the distance calculation; anything
                          starting with ``'c'`` concatenates df and df2
        :param threshold: value that sums are compared against with ``<``
                          when dropping infrequent entries
                          (NOTE(review): the default is the string
                          ``'medium'``; presumably the caller passes a
                          number by the time this runs -- confirm)
        :param prinf: print a summary of entries removed by the threshold
        :returns: tuple ``(df, totals)``; ``totals`` stays ``False`` unless
                  the operation is ``'%'``, ``'*'`` or ``'/'``
        """
        totals = False
        # delete under threshold
        # NOTE(review): ``denom`` is only bound when just_totals, using_totals
        # and single_totals are all truthy, or when just_totals is falsy. Any
        # other combination that reaches the single_totals branch below would
        # hit an unbound ``denom`` -- confirm the callers never do that.
        if just_totals:
            if using_totals:
                if not single_totals:
                    # entries whose 'Combined total' in df2 is under threshold
                    to_drop = list(df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        # show the first five dropped names and, when more
                        # than ten were dropped, an ellipsis and the last five
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append('...')
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print('Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show)))
                        if len(to_drop) > 10:
                            print('... and %d more ... \n' % (len(to_drop) - len(to_show) + 1))
                        else:
                            print('')
                else:
                    denom = df2
        else:
            denom = list(df2)
        # single_totals: every operation works against ``denom`` along axis 0
        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                # df is scaled before the division, so if div() raises
                # ValueError the returned df keeps the * 100.0 scaling
                df = df * 100.0
                try:
                    df = df.div(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '+':
                try:
                    df = df.add(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '-':
                try:
                    df = df.sub(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: cannot combine DataFrame 1 and 2: different shapes' % thetime)
            elif operation == 'd':
                #df.ix['Combined total'] = df.sum()
                #to_drop = to_drop = list(df.T[df.T['Combined total'] < threshold].index)
                # drop columns whose sum is under the threshold
                to_drop = [n for n in list(df.columns) if df[n].sum() < threshold]
                df = df.drop([e for e in to_drop if e in list(df.columns)], axis = 1)
                #df.drop('Combined total')
                if prinf:
                    to_show = []
                    [to_show.append(w) for w in to_drop[:5]]
                    if len(to_drop) > 10:
                        to_show.append('...')
                        [to_show.append(w) for w in to_drop[-5:]]
                    if len(to_drop) > 0:
                        print('Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show)))
                    if len(to_drop) > 10:
                        print('... and %d more ... \n' % (len(to_drop) - len(to_show) + 1))
                    else:
                        print('')

                # get normalised num in target corpus
                norm_in_target = df.div(denom, axis = 0)
                # get normalised num in reference corpus, with or without selfdrop
                tot_in_ref = df.copy()
                for c in list(tot_in_ref.index):
                    # each row becomes the column sums of df, minus the row's
                    # own values when selfdrop is set
                    if selfdrop:
                        tot_in_ref.ix[c] = df.sum() - tot_in_ref.ix[c]
                    else:
                        tot_in_ref.ix[c] = df.sum()
                norm_in_ref = tot_in_ref.div(df.sum().sum())
                # percent difference between target and reference
                df = (norm_in_target - norm_in_ref) / norm_in_ref * 100.0
                # a value of exactly -100 is replaced with NaN
                df = df.replace(float(-100.00), np.nan)

            elif operation == 'a':
                # column names must be int-convertible: columns named with a
                # value above 1 are scaled by 1/name before rows are summed
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis = 1) / df2
            
            elif operation.startswith('c'):
                # concatenate side by side, silencing any warnings raised
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pandas.concat([df, df2], axis = 1)
            return df, totals

        # otherwise: combine entry by entry with the same-named column of df2
        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work 
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis = 1).T

                # any failure for an entry (the bare excepts below) leaves
                # that column of df unchanged and moves on to the next one
                for index, entry in enumerate(list(df.columns)):
                    #p.animate(index)
                    if operation == '%':
                        try:
                            df[entry] = df[entry] * 100.0 / df2[entry]
                        except:
                            continue
                        #df.drop(entry, axis = 1, inplace = True)
                        #df[entry] = maths_done
                    elif operation == '+':
                        try:
                            df[entry] = df[entry] + df2[entry]
                        except:
                            continue
                    elif operation == '-':
                        try:
                            df[entry] = df[entry] - df2[entry]
                        except:
                            continue
                    elif operation == '*':
                        try:
                            df[entry] = df[entry] * df2[entry]
                        except:
                            continue
                    elif operation == '/':
                        try:
                            df[entry] = df[entry] / df2[entry]
                        except:
                            continue

            else:
                # 'a' operation: same scaling as above, divided by the
                # per-row sums of df2
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis = 1) / df2.T.sum()

        return df, totals

    def parse_input(df, the_input):
        """Turn a user-supplied selector into a list of names found in *df*.

        :param df: data whose entries are being selected
        :param the_input: one of

            * ``'all'``: every entry
            * str: regular expression searched for in each item of
              ``list(df)``; if it cannot be compiled or applied, the string
              is treated as a literal name
            * int: converted to str and treated as a literal name
            * list of int: positions within ``list(df)``
            * list of str: names, kept if present in ``df.columns`` (or in
              ``df.index`` when *df* has no columns)

        :returns: list of matching names, an empty list for an empty list
                  input, or ``False`` when the input is not understood
        """
        import re

        def is_plain_int(value):
            # bool is a subclass of int but is never meant as a position/name
            return isinstance(value, int) and not isinstance(value, bool)

        if isinstance(the_input, str):
            if the_input == 'all':
                the_input = r'.*'
            try:
                regex = re.compile(the_input)
                return [w for w in list(df) if re.search(regex, w)]
            except (re.error, TypeError):
                # invalid pattern, or entries that are not strings:
                # fall back to treating the input as a literal name
                the_input = [the_input]
        elif is_plain_int(the_input):
            the_input = [str(the_input)]

        if not isinstance(the_input, list):
            return False
        if not the_input:
            # nothing requested, nothing matched
            return []

        first = the_input[0]
        if is_plain_int(first):
            return [word for index, word in enumerate(list(df)) if index in the_input]
        if isinstance(first, str):
            try:
                return [word for word in the_input if word in df.columns]
            except AttributeError: # if series
                return [word for word in the_input if word in df.index]
        return False

    def synonymise(df, pos = 'n'):
        """Rename each column of *df* to its most common WordNet synonym.

        For every column name, the lemma names of all of its synsets with
        part of speech *pos* are pooled, and the most frequent one becomes
        the new name. Names with no synsets, or that cannot be looked up,
        are left as they are.

        :param df: data whose columns are to be renamed (modified in place)
        :param pos: WordNet part-of-speech tag passed to ``wn.synsets``
        :returns: *df* with its columns renamed
        """
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for name in list(df.columns):
            try:
                lemmas = [lemma
                          for synset in wn.synsets(name, pos = pos)
                          for lemma in synset.lemma_names()]
            except (LookupError, TypeError, AttributeError):
                # WordNet data unavailable, or a column name that is not a
                # string: keep the original name
                lemmas = []
            if lemmas:
                fixed.append(Counter(lemmas).most_common(1)[0][0])
            else:
                fixed.append(name)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to = 'US', print_info = print_info):
        """Rename the columns of *df* using the ``usa_convert`` mapping.

        :param df: data whose columns are renamed (modified in place)
        :param convert_to: ``'UK'`` inverts the mapping (values become keys)
                           before it is applied; any other value uses it as is
        :param print_info: print a progress message
        :returns: *df* with converted column names; names missing from the
                  mapping are kept unchanged
        """
        from dictionaries.word_transforms import usa_convert as table
        if print_info:
            print('Converting spelling ... \n')
        if convert_to == 'UK':
            table = {target: source for source, target in list(table.items())}

        def converted(name):
            # any lookup failure means "leave this name alone"
            try:
                return table[name]
            except:
                return name

        df.columns = [converted(name) for name in list(df.columns)]
        return df

    def merge_duplicates(df, print_info = print_info):
        """Sum together columns that share a name, so each name occurs once.

        :param df: data that may contain repeated column names
        :param print_info: print a progress message
        :returns: a frame in which every duplicated name has been replaced
                  by a single column (appended at the end) holding the
                  row-wise sum of the duplicates
        """
        if print_info:
            print('Merging duplicate entries ... \n')
        # Index.get_duplicates() no longer exists in current pandas; this is
        # the replacement its deprecation notice recommends. Names come out
        # in order of first repetition rather than sorted.
        duplicated_names = df.columns[df.columns.duplicated()].unique()
        # now we have to merge all duplicates
        for dup in duplicated_names:
            # df[dup] selects every column carrying this name
            temp = df[dup].sum(axis = 1)
            df = df.drop(dup, axis = 1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info = print_info):
        """Apply regex substitutions to column names, then merge duplicates.

        :param df: data whose columns are renamed (modified in place before
                   the merge)
        :param replace_names: a str (pattern to delete from names), a single
                              ``(pattern, replacement)`` pair, a list of such
                              pairs, or a dict. For a dict, each value is
                              used as the pattern and its key as the
                              replacement.
        :param print_info: print each substitution as it is applied
        :returns: result of ``merge_duplicates`` on the renamed data
        """
        import re
        # normalise every accepted input shape to a list of pairs
        if type(replace_names) == dict:
            pairs = [(value, key) for key, value in replace_names.items()]
        elif type(replace_names) == str:
            pairs = [(replace_names, '')]
        elif type(replace_names[0]) == str:
            pairs = [replace_names]
        else:
            pairs = replace_names

        for pattern, substitute in pairs:
            if print_info:
                try:
                    print('Replacing "%s" with "%s" ...\n' % (pattern, substitute))
                except:
                    print('Deleting "%s" from entry names ...\n' % (pattern))
            compiled = re.compile(pattern)
            df.columns = [re.sub(compiled, substitute, name) for name in list(df.columns)]
        return merge_duplicates(df, print_info = False)

    def just_these_entries(df, parsed_input, prinf = True):
        """Return *df* with every column not named in *parsed_input* dropped.

        :param df: data to filter
        :param parsed_input: names of the columns to keep
        :param prinf: print the (first ten) names being kept
        :returns: the filtered data
        """
        unwanted = []
        for column in list(df):
            if column not in parsed_input:
                unwanted.append(column)
        if prinf:
            kept = len(parsed_input)
            preview = '\n    '.join(parsed_input[:10])
            print('Keeping %d entries:\n    %s' % (kept, preview))
            if kept > 10:
                print('... and %d more ... \n' % (kept - 10))
            else:
                print('')
        return df.drop(unwanted, axis = 1)

    def skip_these_entries(df, parsed_input, prinf = True):
        """Return *df* with the columns named in *parsed_input* dropped.

        :param df: data to filter
        :param parsed_input: names of the columns to remove
        :param prinf: print the (first ten) names being removed
        :returns: the filtered data
        """
        if prinf:
            skipped = len(parsed_input)
            preview = '\n    '.join(parsed_input[:10])
            print('Skipping %d entries:\n    %s' % (skipped, preview))
            if skipped > 10:
                print('... and %d more ... \n' % (skipped - 10))
            else:
                print('')
        return df.drop(parsed_input, axis = 1)

    def newname_getter(df, parsed_input, newname = 'combine', prinf = True, merging_subcorpora = False):
        """Work out the name to give a set of merged entries.

        :param df: data being edited
        :param parsed_input: names of the items about to be merged
        :param newname: how to name the merged item:

            * ``'combine'``: join up to three names with ``/`` (adding
              ``...`` when there are more than three)
            * any other str: used as is
            * int: the name of the column at that position
            * ``False``: the name of the item in *parsed_input* with the
              largest sum (treated as ``'combine'`` when
              *merging_subcorpora* is set)

        :param prinf: unused here; kept for a uniform helper signature
        :param merging_subcorpora: whether subcorpora rather than entries
                                   are being merged
        :returns: the new name, always as ``str``
        :raises ValueError: if *newname* is of an unsupported type
        """
        if merging_subcorpora and newname is False:
            newname = 'combine'

        if newname is False:
            # name the merged item after its biggest member
            sums = {item: sum(list(df[item])) for item in parsed_input}
            the_newname = max(sums.items(), key=lambda pair: pair[1])[0]
        elif isinstance(newname, int) and not isinstance(newname, bool):
            the_newname = list(df.columns)[newname]
        elif isinstance(newname, str):
            if newname == 'combine':
                the_newname = '/'.join(parsed_input[:3])
                if len(parsed_input) > 3:
                    the_newname += '...'
            else:
                the_newname = newname
        else:
            raise ValueError('Cannot make a name from newname=%r' % (newname,))

        # str(obj, errors=...) only accepts bytes, so decode bytes and
        # stringify everything else (e.g. integer column labels)
        if isinstance(the_newname, bytes):
            the_newname = the_newname.decode(errors = 'ignore')
        elif not isinstance(the_newname, str):
            the_newname = str(the_newname)
        return the_newname

    def merge_these_entries(df, parsed_input, the_newname, prinf = True, merging = 'entries'):
        """Replace the items in *parsed_input* with a single summed item.

        Reads ``multiple_merge`` from the enclosing scope: when it is set,
        the original items are kept alongside the merged one.

        :param df: DataFrame (columns are merged) or Series (items are merged)
        :param parsed_input: names of the items to sum
        :param the_newname: name for the summed item
        :param prinf: print the (first ten) names being merged
        :param merging: word used in messages for what is being merged
        :returns: the edited data; *df* itself, unchanged, when
                  *parsed_input* is empty
        """
        if len(parsed_input) == 0:
            import warnings
            warnings.warn('No %s could be automatically merged.\n' % merging)
            # nothing to sum: do not add an all-zero item under the new name
            return df
        if prinf:
            preview = '\n    '.join([str(i) for i in parsed_input[:10]])
            print('Merging %d %s as "%s":\n    %s' % (len(parsed_input), merging, the_newname, preview))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        # make new entry with sum of parsed input
        temp = sum([df[i] for i in parsed_input])
        # remove old entries
        if not multiple_merge:
            if isinstance(df, pandas.Series):
                df = df.drop(parsed_input)
            else:
                df = df.drop(parsed_input, axis = 1)
        df[the_newname] = temp
        return df

    def just_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        """Keep only the rows of *df* whose index label is listed.

        :param df: data to filter
        :param lst_of_subcorpora: a list of subcorpus names, or a single int.
                                  When the first item is an int, every item
                                  is converted to str before matching.
        :param prinf: print the (first ten) subcorpora being kept
        :returns: the filtered data
        """
        # accept a bare int the same way skip_these_subcorpora does
        if type(lst_of_subcorpora) == int:
            lst_of_subcorpora = [lst_of_subcorpora]
        if len(lst_of_subcorpora) > 0 and type(lst_of_subcorpora[0]) == int:
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:
            # labels are stringified so non-string index values can be shown
            print('Keeping %d subcorpora:\n    %s' % (len(good_years), '\n    '.join([str(i) for i in good_years[:10]])))
            if len(good_years) > 10:
                print('... and %d more ... \n' % (len(good_years) - 10))
            else:
                print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis = 0)
        return df

    def skip_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        """Drop the rows of *df* whose index label is listed.

        :param df: data to filter
        :param lst_of_subcorpora: a list of subcorpus names, or a single int.
                                  When the first item is an int, every item
                                  is converted to str before matching.
        :param prinf: print the (first ten) subcorpora being dropped
        :returns: the filtered data; a warning is issued when nothing matched
        """
        wanted = lst_of_subcorpora
        if type(wanted) == int:
            wanted = [wanted]
        if type(wanted[0]) == int:
            wanted = [str(item) for item in wanted]

        bad_years = []
        for label in list(df.index):
            if label in wanted:
                bad_years.append(label)

        if not bad_years:
            import warnings
            warnings.warn('No subcorpora skipped.\n')
        elif prinf:
            preview = '\n    '.join([str(i) for i in bad_years[:10]])
            print('Skipping %d subcorpora:\n    %s' % (len(bad_years), preview))
            if len(bad_years) > 10:
                print('... and %d more ... \n' % (len(bad_years) - 10))
            else:
                print('')
        # bad_years already holds exactly the index labels to remove
        return df.drop(bad_years, axis = 0)

    def span_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        """Select only a span of numerically named subcorpora (first, last).

        :param df: data whose index labels can all be converted with ``int()``
        :param lst_of_subcorpora: sequence whose first and last items give the
                                  inclusive bounds of the span
        :param prinf: print the span being kept
        :type prinf: bool
        :returns: *df* restricted to rows whose label lies inside the span;
                  *df* unchanged (after a warning) if the span is empty
        :raises ValueError: if a bound or an index label is not numeric
        """
        # test for an empty span *before* indexing into it, otherwise the
        # warning can never be reached and an IndexError is raised instead
        if len(lst_of_subcorpora) == 0:
            import warnings
            warnings.warn('Span not identified.\n')
            return df
        first = int(lst_of_subcorpora[0])
        last = int(lst_of_subcorpora[-1])
        outside = [subcorpus for subcorpus in df.index if not first <= int(subcorpus) <= last]
        if prinf:
            print('Keeping subcorpora:\n    %d--%d\n' % (first, last))
        # retotal needed here
        return df.drop(outside, axis = 0)

    def projector(df, list_of_tuples, prinf = True):
        """Project absolute values: multiply chosen subcorpora by a factor.

        :param df: data whose index holds the subcorpus names
        :param list_of_tuples: ``{subcorpus: factor}`` dict, a list of
                               ``(subcorpus, factor)`` pairs, or a single
                               ``(subcorpus, factor)`` pair; int subcorpus
                               names are converted to strings
        :param prinf: print each projection that is applied
        :type prinf: bool
        :returns: a new object with the selected rows multiplied; *df* itself
                  is left untouched
        :raises KeyError: if a subcorpus is not in the index of *df*
        """
        # a lone (subcorpus, factor) pair is wrapped so it behaves like a list
        if (type(list_of_tuples) == tuple and len(list_of_tuples) == 2
                and type(list_of_tuples[0]) not in (tuple, list)):
            list_of_tuples = [list_of_tuples]
        if type(list_of_tuples) in (list, tuple):
            list_of_tuples = dict(list_of_tuples)

        factors = {}
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if type(subcorpus) == int:
                subcorpus = str(subcorpus)
            if subcorpus not in df.index:
                raise KeyError(subcorpus)
            # 2001 and '2001' name the same row, so their factors compound
            factors[subcorpus] = factors.get(subcorpus, 1) * projection_value
            if prinf:
                print('Projection: %s * %s' % (subcorpus, projection_value))
        if prinf:
            print('')
        # one row-wise multiplication instead of assigning through the legacy
        # ``.ix`` indexer; untouched rows are multiplied by 1
        return df.mul([factors.get(label, 1) for label in df.index], axis = 0)

    def do_stats(df):
        """Run scipy's linregress on every column and append the results as rows.

        The x values are the index labels converted to int and offset by the
        first label; if a label cannot be converted, row positions are used
        instead. Five rows ('slope', 'intercept', 'r', 'p', 'stderr') are
        added below the data.

        Reads ``operation`` from the enclosing scope: unless it is 'd',
        infinities and NaNs in the result are replaced with 0.0.

        :param df: data with one column per entry and one row per subcorpus
        :returns: *df* with the five stats rows appended, or ``False`` if
                  scipy cannot be imported
        """
        try: 
            from scipy.stats import linregress
        except ImportError:
            from time import localtime, strftime
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this verion of corpkit.' % thetime)
            return False

        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
        indices = list(df.index)
        first_year = indices[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            # non-numeric subcorpus names: fall back to their positions
            x = list(range(len(indices)))

        # one list per statistic, filled in column order
        collected = dict((field, []) for field in statfields)
        for entry in list(df.columns):
            y = list(df[entry])
            slope, intercept, r, p, stderr = linregress(x, y)
            for field, value in zip(statfields, (slope, intercept, r, p, stderr)):
                collected[field].append(value)
        sl = pd.DataFrame([collected[field] for field in statfields],
                          index = statfields,
                          columns = list(df.columns))
        # pd.concat instead of DataFrame.append, which newer pandas lacks
        df = pd.concat([df, sl])
        # drop infinites and nans
        if operation != 'd':
            df = df.replace([np.inf, -np.inf], np.nan)
            df = df.fillna(0.0)
        return df

    def recalc(df, operation = '%'):
        """Add 'temp-Total' sums to both axes of the dataframe.

        A 'temp-Total' column holding the sum of each row is added, then a
        'temp-Total' row holding the sum of each column (including the new
        column). When all five stats rows are present they are left out of
        the row sums; otherwise everything is summed.

        Note that the 'temp-Total' column is added to *df* in place before the
        transposed copies are made.

        :param df: data to total
        :param operation: accepted for call compatibility; not used here
        :returns: the dataframe with the extra 'temp-Total' row and column
        """
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']

        def add_row_sums(frame):
            # drop() raises when the stats labels are missing (KeyError in
            # current pandas, ValueError in old releases): sum everything then
            try:
                frame['temp-Total'] = frame.drop(statfields).sum(axis = 1)
            except (KeyError, ValueError):
                frame['temp-Total'] = frame.sum(axis = 1)
            return frame

        # total the rows, flip, total the (former) columns, flip back
        df = add_row_sums(df).T
        df = add_row_sums(df).T
        return df

    def resort(df, sort_by = False, keep_stats = False):
        """Sort results, potentially using scipy's linregress.

        Besides its arguments, this reads ``operation``, ``just_totals``,
        ``df1_istotals``, ``remove_above_p`` and ``p`` from the enclosing scope.

        :param df: results to sort
        :param sort_by: ``True`` (same as 'total') or one of 'total'/'most',
                        'infreq'/'least', 'name', 'increase', 'decrease',
                        'static', 'turbulent', 'p', 'none'
        :param keep_stats: run ``do_stats`` before sorting and leave the stats
                           rows in the result
        :returns: the sorted data, or ``False`` if ``do_stats`` returned
                  ``False``
        :raises ValueError: if *sort_by* is truthy but not a recognised option
        """
        
        # translate options and make sure they are parseable
        options = ['total', 'name', 'infreq', 'increase', 'turbulent',
                   'decrease', 'static', 'most', 'least', 'none', 'p']

        # normalise the aliases: True/'most' -> 'total', 'least' -> 'infreq'
        if sort_by is True:
            sort_by = 'total'
        if sort_by == 'most':
            sort_by = 'total'
        if sort_by == 'least':
            sort_by = 'infreq'
        if sort_by not in options and sort_by:
            raise ValueError("sort_by parameter error: '%s' not recognised. Must be True, False, %s" % (sort_by, ', '.join(options)))

        # keywording on a Series: sort the values or the index and stop here
        # NOTE(review): Series.order looks like a legacy pandas API -- confirm
        # which pandas version this is meant to run on
        if operation.startswith('k'):
            if type(df) == pandas.core.series.Series:
                if sort_by == 'total':
                    df = df.order(ascending = False)

                elif sort_by == 'infreq':
                    df = df.order(ascending = True)

                elif sort_by == 'name':
                    df = df.sort_index()
                return df

        # just_totals: sort on the 'Combined total' label (or on the index
        # for 'name'); any other sort_by returns df as it came in
        if just_totals:
            if sort_by == 'infreq':
                df = df.sort_values(by = 'Combined total', ascending = True, axis = 1)
            elif sort_by == 'total':
                df = df.sort_values(by = 'Combined total', ascending = False, axis = 1)
            elif sort_by == 'name':
                df = df.sort_index()
            return df

        # the branches below sort by hand rather than with sort_values as above
        if keep_stats:
            df = do_stats(df)
            # do_stats returns False when scipy is unavailable; pass that on
            if type(df) == bool:
                if df is False:
                    return False
        if sort_by == 'total':
            # transposed while sorting when the input was a totals Series
            if df1_istotals:
                df = df.T
            df = recalc(df, operation = operation)
            # order the columns by their 'temp-Total' row, largest first
            # NOTE(review): relies on .ix and on indexing columns with the
            # integer result of argsort -- confirm against the pandas in use
            tot = df.ix['temp-Total']
            df = df[tot.argsort()[::-1]]
            # remove the helper totals that recalc added on both axes
            df = df.drop('temp-Total', axis = 0)
            df = df.drop('temp-Total', axis = 1)
            if df1_istotals:
                df = df.T
        elif sort_by == 'infreq':
            # same as 'total', but smallest first
            if df1_istotals:
                df = df.T
            df = recalc(df, operation = operation)
            tot = df.ix['temp-Total']
            df = df[tot.argsort()]
            df = df.drop('temp-Total', axis = 0)
            df = df.drop('temp-Total', axis = 1)
            if df1_istotals:
                df = df.T
        elif sort_by == 'name':
            # currently case sensitive...
            df = df.reindex_axis(sorted(df.columns), axis=1)
        elif sort_by == 'p':
            # needs a 'p' row, which do_stats adds when keep_stats is set
            df = df.T.sort_values(by='p').T
        else:
            # slope-based sorts ('increase', 'decrease', 'static',
            # 'turbulent'); 'none' and a falsy sort_by also end up here
            statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
            
            # stats were already added above when keep_stats is set
            if not keep_stats:
                df = do_stats(df)
                if type(df) == bool:
                    if df is False:
                        return False

            slopes = df.ix['slope']
            if sort_by == 'increase':
                df = df[slopes.argsort()[::-1]]
            elif sort_by == 'decrease':
                df = df[slopes.argsort()]
            elif sort_by == 'static':
                df = df[slopes.abs().argsort()]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().argsort()[::-1]]
            # p filtering is only applied on this branch
            if remove_above_p:
                # the easy way to do it!
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

            # remove stats field by default
            if not keep_stats:
                df = df.drop(statfields, axis = 0)

        return df

    def set_threshold(big_list, threshold, prinf = True, for_keywords = False):
        """Work out a numeric frequency threshold.

        A string threshold is resolved by its first letter into a fraction of
        the grand total of *big_list*: 'l...' -> total / 10000,
        'm...' -> total / 5000, 'h...' -> total / 2500. Any other kind of
        threshold is returned as given.

        :param big_list: data to total when *threshold* is a string
        :type big_list: pandas DataFrame or Series
        :param threshold: 'low'/'medium'/'high' (or any string starting with
                          l, m or h), or an explicit value
        :param prinf: print the resulting threshold
        :type prinf: bool
        :param for_keywords: accepted for call compatibility; not used here
        :returns: the threshold
        :raises ValueError: if *threshold* is a string that does not start
                            with l, m or h
        :raises TypeError: if *threshold* is a string and *big_list* is not a
                           DataFrame or Series
        """
        if isinstance(threshold, str):
            denominators = {'l': 10000, 'm': 5000, 'h': 2500}
            denominator = denominators.get(threshold[:1].lower())
            # fail clearly instead of tripping over an unbound local below
            if denominator is None:
                raise ValueError("threshold string must start with 'l', 'm' or 'h', not %r" % threshold)

            if isinstance(big_list, pandas.DataFrame):
                tot = big_list.sum().sum()
            elif isinstance(big_list, pandas.Series):
                tot = big_list.sum()
            else:
                raise TypeError('cannot total %r for a threshold' % type(big_list))
            the_threshold = float(tot) / float(denominator)
            #if for_keywords:
                #the_threshold = the_threshold / 2
        else:
            the_threshold = threshold
        if prinf:
            print('Threshold: %d\n' % the_threshold)
        return the_threshold

    # copy dataframe to be very safe
    df = dataframe1.copy()
    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    # do concordance work
    if return_conc:
        if just_entries:
            if type(just_entries) == int:
                just_entries = [just_entries]
            if type(just_entries) == str:
                df = df[df['m'].str.contains(just_entries)]
            if type(just_entries) == list:
                if all(type(e) == str for e in just_entries):
                    mp = df['m'].map(lambda x: x in just_entries)
                    df = df[mp]
                else:
                    df = df.ix[just_entries]

        if skip_entries:
            if type(skip_entries) == int:
                skip_entries = [skip_entries]
            if type(skip_entries) == str:
                df = df[~df['m'].str.contains(skip_entries)]
            if type(skip_entries) == list:
                if all(type(e) == str for e in skip_entries):
                    mp = df['m'].map(lambda x: x not in skip_entries)
                    df = df[mp]
                else:
                    df = df.drop(skip_entries, axis = 0)

        if just_subcorpora:
            if type(just_subcorpora) == int:
                just_subcorpora = [just_subcorpora]
            if type(just_subcorpora) == str:
                df = df[df['c'].str.contains(just_subcorpora)]
            if type(just_subcorpora) == list:
                if all(type(e) == str for e in just_subcorpora):
                    mp = df['c'].map(lambda x: x in just_subcorpora)
                    df = df[mp]
                else:
                    df = df.ix[just_subcorpora]

        if skip_subcorpora:
            if type(skip_subcorpora) == int:
                skip_subcorpora = [skip_subcorpora]
            if type(skip_subcorpora) == str:
                df = df[~df['c'].str.contains(skip_subcorpora)]
            if type(skip_subcorpora) == list:
                if all(type(e) == str for e in skip_subcorpora):
                    mp = df['c'].map(lambda x: x not in skip_subcorpora)
                    df = df[mp]
                else:
                    df = df.drop(skip_subcorpora, axis = 0)

        return Concordance(df)

    if print_info:
        print('\n***Processing results***\n========================\n')

    df1_istotals = False
    if type(df) == pandas.core.series.Series:
        df1_istotals = True
        df = pandas.DataFrame(df)
        # if just a single result
    else:
        df = pandas.DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T
    
    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    if denominator.__class__ == Interrogation:
        try:
            denominator = denominator.results
        except AttributeError:
            denominator = denominator.totals

    if denominator is not False and type(denominator) != str:
        df2 = denominator.copy()
        using_totals = True
        if type(df2) == pandas.core.frame.DataFrame:
            if len(df2.columns) > 1:
                single_totals = False
            else:
                df2 = pandas.Series(df2)
            if operation == 'd':
                df2 = df2.sum(axis = 1)
                single_totals = True
        elif type(df2) == pandas.core.series.Series:
            single_totals = True
            #if operation == 'k':
                #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?')
        else:
            raise ValueError('Denominator not recognised.')
    else:
        if operation in ['k', 'd', 'a', '%', '/', '*', '-', '+']:
            denominator = 'self'         
        if denominator == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to = spelling)
        df = merge_duplicates(df, print_info = False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to = spelling, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis = 0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis = 0)
        except:
            pass

    # remove totals and tkinter order
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        if name == 'Total' and df1_istotals:
            continue
        try:
            df = df.drop(name, axis = ax, errors = 'ignore')
        except:
            pass
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        if name == 'Total' and single_totals:
            continue

        try:

            df2 = df2.drop(name, axis = ax, errors = 'ignore')
        except:
            pass

    # merging: make dicts if they aren't already, so we can iterate
    if merge_entries:
        if type(merge_entries) != list:
            if type(merge_entries) == str or type(merge_entries) == str:
                merge_entries = {newname: merge_entries}
            # for newname, criteria    
            for name, the_input in sorted(merge_entries.items()):
                the_newname = newname_getter(df, parse_input(df, the_input), newname = name, prinf = print_info)
                df = merge_these_entries(df, parse_input(df, the_input), the_newname, prinf = print_info)
                if not single_totals:
                    df2 = merge_these_entries(df2, parse_input(df2, the_input), the_newname, prinf = False)
        else:
            for i in merge_entries:
                the_newname = newname_getter(df, parse_input(df, merge_entries), newname = newname, prinf = print_info)
                df = merge_these_entries(df, parse_input(df, merge_entries), the_newname, prinf = print_info)
                if not single_totals:
                    df2 = merge_these_entries(df2, parse_input(df2, merge_entries), the_newname, prinf = False)
    
    if merge_subcorpora:
        if type(merge_subcorpora) != dict:
            if type(merge_subcorpora) == list:
                if type(merge_subcorpora[0]) == tuple:
                    merge_subcorpora = {x: y for x, y in merge_subcorpora}
                elif type(merge_subcorpora[0]) == str or type(merge_subcorpora[0]) == str:
                    merge_subcorpora = {new_subcorpus_name: [x for x in merge_subcorpora]}
                elif type(merge_subcorpora[0]) == int:
                    merge_subcorpora = {new_subcorpus_name: [str(x) for x in merge_subcorpora]}
            else:
                merge_subcorpora = {new_subcorpus_name: merge_subcorpora}
        for name, the_input in sorted(merge_subcorpora.items()):
            the_newname = newname_getter(df.T, parse_input(df.T, the_input), 
                                     newname = name, 
                                     merging_subcorpora = True,
                                     prinf = print_info)
            df = merge_these_entries(df.T, parse_input(df.T, the_input), the_newname, merging = 'subcorpora', prinf = print_info).T
            if using_totals:
                df2 = merge_these_entries(df2.T, parse_input(df2.T, the_input), the_newname, merging = 'subcorpora', prinf = False).T
    
    if just_subcorpora:
        df = just_these_subcorpora(df, just_subcorpora, prinf = print_info)
        if using_totals:
            df2 = just_these_subcorpora(df2, just_subcorpora, prinf = False)
    
    if skip_subcorpora:
        df = skip_these_subcorpora(df, skip_subcorpora, prinf = print_info)
        if using_totals:
            df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf = False)
    
    if span_subcorpora:
        df = span_these_subcorpora(df, span_subcorpora, prinf = print_info)
        if using_totals:
            df2 = span_these_subcorpora(df2, span_subcorpora, prinf = False)

    if just_entries:
        df = just_these_entries(df, parse_input(df, just_entries), prinf = print_info)
        if not single_totals:
            df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf = False)
    
    if skip_entries:
        df = skip_these_entries(df, parse_input(df, skip_entries), prinf = print_info)
        if not single_totals:
            df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf = False)

    # drop infinites and nans
    if operation != 'd':
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # make just_totals as dataframe
    just_one_total_number = False
    if just_totals:
        df = pd.DataFrame(df.sum(), columns = ['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = pd.DataFrame(df2.sum(), columns = ['Combined total'])
            else:
                just_one_total_number = True
                df2 = df2.sum()

    tots = df.sum(axis = 1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            the_threshold = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    the_threshold = set_threshold(df2, threshold, prinf = print_info)
            if operation == 'd':
                the_threshold = set_threshold(df2, threshold, prinf = print_info) 
            df, tots = combiney(df, df2, operation = operation, threshold = the_threshold, prinf = print_info)
    
    # if doing keywording...
    if operation.startswith('k'):
        from keys import keywords

        # allow saved dicts to be df2, etc
        try:
            if denominator == 'self':
                df2 = df.copy()
        except TypeError:
            pass
        if type(denominator) == str:
            if denominator != 'self':
                df2 = denominator
    
        else:
            the_threshold = False

        df = keywords(df, df2, 
                      selfdrop = selfdrop, 
                      threshold = threshold, 
                      printstatus = print_info,
                      editing = True,
                      calc_all = calc_all,
                      **kwargs)

        # eh?
        df = df.T
    
    # drop infinites and nans
    if operation != 'd':
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats = keep_stats, sort_by = sort_by)
        if type(df) == bool:
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = pd.Series(df['Combined total'], name = 'Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = pd.Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0,:]
                df.name = 'keyness' % df.name

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = pd.Series(df['Total'], name = 'Total')
            except:
                pass
                total = 'none'
            #total = df.copy()
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis = 1)
        except:
            total = 'none'
    
    if type(tots) != pandas.core.frame.DataFrame and type(tots) != pandas.core.series.Series:
        total = df.sum(axis = 1)
    else:
        total = tots

    if type(df) == pandas.core.frame.DataFrame:
        datatype = df.ix[0].dtype
    else:
        datatype = df.dtype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except:
        pass

    def add_tkt_index(df):
        """Add a 'tkintertable-order' row that numbers the columns of *df*.

        Any existing 'tkintertable-order' row or column is dropped first.
        A Series is returned untouched.

        :param df: data to annotate
        :returns: *df* with the extra row, or the Series as passed in
        """
        if type(df) == pandas.core.series.Series:
            return df

        # work on the transpose so the order can be added as a plain column
        flipped = df.T
        for axis_number in (0, 1):
            flipped = flipped.drop('tkintertable-order', errors = 'ignore', axis = axis_number)

        labels = list(flipped.index)
        positions = list(range(len(labels)))
        flipped['tkintertable-order'] = pd.Series(positions, index = labels)
        return flipped.T

    # while tkintertable can't sort rows
    try:
        from tests import check_t_kinter
    except ImportError:
        from corpkit.tests import check_t_kinter
    tk = check_t_kinter()
    if tk:
        df = add_tkt_index(df)

    if kwargs.get('df1_always_df'):
        if type(df) == pandas.core.series.Series:
            df = pandas.DataFrame(df)

    #outputnames = collections.namedtuple('edited_interrogation', ['query', 'results', 'totals'])
    #output = outputnames(the_options, df, total)

    # delete non-appearing conc lines
    if interrogation.__dict__.get('concordance', None) is None:
        lns = None
    else:
        col_crit = interrogation.concordance['m'].map(lambda x: x in list(df.columns))
        ind_crit = interrogation.concordance['c'].map(lambda x: x in list(df.index))
        lns = interrogation.concordance[col_crit]
        lns = lns.loc[ind_crit]
        lns = Concordance(lns)
    
    output = Interrogation(results = df, totals = total, query = locs, concordance = lns)

    #print '\nResult (sample)\n'
    if print_info:
        #if merge_entries or merge_subcorpora or span_subcorpora or just_subcorpora or \
           #just_entries or skip_entries or skip_subcorpora or printed_th or projection:
        print('***Done!***\n========================\n')
    #print df.head().T
    #print ''
    if operation.startswith('k') or just_totals or df1_istotals:
        pd.set_option('display.max_rows', 30)
    else:
        pd.set_option('display.max_rows', 15)
    pd.set_option('display.max_columns', 8)
    pd.set_option('max_colwidth',70)
    pd.set_option('display.width', 800)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    return output
Esempio n. 3
0
def editor(
    interrogation,
    operation=None,
    denominator=False,
    sort_by=False,
    keep_stats=False,
    keep_top=False,
    just_totals=False,
    threshold="medium",
    just_entries=False,
    skip_entries=False,
    merge_entries=False,
    newname="combine",
    multiple_merge=False,
    just_subcorpora=False,
    skip_subcorpora=False,
    span_subcorpora=False,
    merge_subcorpora=False,
    new_subcorpus_name=False,
    replace_names=False,
    projection=False,
    remove_above_p=False,
    p=0.05,
    revert_year=True,
    print_info=True,
    spelling=False,
    selfdrop=True,
    calc_all=True,
    **kwargs
):
    """Edit results of interrogations, do keywording, sort, etc.

    ``just/skip_entries`` and ``just/skip_subcorpora`` can take a few different kinds of input:

    * str: treated as regular expression to match
    * list: 

      * of integers: indices to match
      * of strings: entries/subcorpora to match

    ``merge_entries`` and ``merge_subcorpora``, however, are best entered as dicts:

    ``{newname: criteria, newname2: criteria2}``

    where criteria is a string, list, etc.

    :param interrogation: Results to edit
    :type interrogation: pandas.core.frame.DataFrame
    
    :param operation: Kind of maths to do on inputted lists:

        '+', '-', '/', '*', '%': self explanatory
        'k': log likelihood (keywords)
        'a': get distance metric (for use with interrogator 'a' option)
        'd': get percent difference (alternative approach to keywording)

    :type operation: str
    
    :param denominator: List of results or totals.

        If list of results, for each entry in dataframe 1, locate
        entry with same name in dataframe 2, and do maths there
        if 'self', do all merging/keeping operations, then use
        edited interrogation as denominator

    :type denominator: pandas.core.series.Series/pandas.core.frame.DataFrame/dict/'self'
    
    :param sort_by: Calculate slope, stderr, r, p values, then sort by:

        increase: highest to lowest slope value
        decrease: lowest to highest slope value
        turbulent: most change in y axis values
        static: least change in y axis values
        total/most: largest number first
        infreq/least: smallest number first
        name: alphabetically
        
    :type sort_by: str

    :param keep_stats: Keep/drop stats values from dataframe after sorting
    :type keep_stats: bool
    
    :param keep_top: After sorting, remove all but the top *keep_top* results
    :type keep_top: int
    
    :param just_totals: Sum each column and work with sums
    :type just_totals: bool
    
    :param threshold: When using results list as denominator, drop values occurring
                        fewer than n times. If not keywording, you can use:
                            ``'high'``: denominator total / 2500
                            ``'medium'``: denominator total / 5000
                            ``'low'``: denominator total / 10000
                        Note: if keywording, there are smaller default thresholds
    :type threshold: int/bool
    :param just_entries: Keep matching entries
    :type just_entries: see above
    :param skip_entries: Skip matching entries
    :type skip_entries: see above
    :param merge_entries: Merge matching entries
    :type merge_entries: see above
    :param newname: New name for merged entries
    :type newname: str/'combine'
    :param just_subcorpora: Keep matching subcorpora
    :type just_subcorpora: see above
    :param skip_subcorpora: Skip matching subcorpora
    :type skip_subcorpora: see above
    :param span_subcorpora: If subcorpora are numerically named, span all from *int* to *int2*, inclusive
    :type span_subcorpora: tuple -- ``(int, int2)``
    :param merge_subcorpora: Merge matching subcorpora
    :type merge_subcorpora: see above
    :param new_subcorpus_name: Name for merged subcorpora
    :type new_subcorpus_name: str/``'combine'``

    :param replace_names: Edit result names and then merge duplicate names.
    :type replace_names: dict -- ``{criteria: replacement_text}``; str -- a regex to delete from names
    :param projection: A (subcorpus name, n) pair: multiply the results in that subcorpus by n
    :type projection: tuple -- ``(subcorpus_name, n)``
    :param remove_above_p: Delete any result over p
    :type remove_above_p: bool
    :param p:                  set the p value
    :type p: float
    
    :param revert_year:        when doing linear regression on years, turn annual subcorpora into 1, 2 ...
    :type revert_year: bool
    
    :param print_info: Print stuff to console showing what's being edited
    :type print_info: bool
    
    :param spelling: Convert/normalise spelling:
    :type spelling: str -- ``'US'``/``'UK'``
    
    :param selfdrop: When keywording, try to remove target corpus from reference corpus
    :type selfdrop: bool
    
    :param calc_all: When keywording, calculate words that appear in either corpus
    :type calc_all: bool

    :returns: corpkit.interrogation.Interrogation
    """

    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import corpkit
    import pandas
    import signal
    import re
    import collections
    import pandas as pd
    import numpy as np

    from pandas import DataFrame, Series
    from time import localtime, strftime

    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    return_conc = False
    from interrogation import Interrodict, Interrogation, Concordance

    if interrogation.__class__ == Interrodict:
        from collections import OrderedDict

        outdict = OrderedDict()
        from editor import editor

        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i == 0:
                pass
                # saved_args['print_info'] = True
            else:
                locs["print_info"] = False
            # if df2 is also a dict, get the relevant entry
            if type(denominator) == dict:
                if sorted(set([i.lower() for i in list(dataframe1.keys())])) == sorted(
                    set([i.lower() for i in list(denominator.keys())])
                ):
                    locs["denominator"] = denominator[k]

                    if kwargs.get("use_df2_totals"):
                        saved_args["denominator"] = denominator[k].totals
            outdict[k] = editor(v.results, **saved_args)
        if print_info:
            from time import localtime, strftime

            thetime = strftime("%H:%M:%S", localtime())
            print(
                "\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n"
                % (thetime, "'\n         '".join(sorted(outdict.keys())))
            )
        return Interrodict(outdict)

    elif type(interrogation) in [pandas.core.frame.DataFrame, pandas.core.series.Series]:
        dataframe1 = interrogation
    elif interrogation.__class__ == Interrogation:
        # if interrogation.__dict__.get('concordance', None) is not None:
        #    concordances = interrogation.concordance
        branch = kwargs.pop("branch", "results")
        if branch.lower().startswith("r"):
            dataframe1 = interrogation.results
        elif branch.lower().startswith("t"):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith("c"):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results

    elif interrogation.__class__ == Concordance or all(x in list(dataframe1.columns) for x in ["l", "m", "r"]):
        return_conc = True
        dataframe1 = interrogation
    # hope for the best
    else:
        dataframe1 = interrogation

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None
    pd.set_option("display.float_format", lambda x: "%.2f" % x)

    from tests import check_pytex

    if check_pytex():
        print_info = False

    def combiney(df, df2, operation="%", threshold="medium", prinf=True):
        """Combine df with df2 according to *operation*.

        Reads ``just_totals``, ``using_totals``, ``single_totals`` and
        ``selfdrop`` from the enclosing scope.

        :param df: data to be edited
        :param df2: second dataset, used as denominator/operand
        :param operation: '%', '+', '-', '*', '/', 'd', 'a', or a string
                          starting with 'c' (concatenate)
        :param threshold: cutoff compared directly against sums, so it is
                          presumably numeric by the time it gets here
                          (TODO confirm against the caller)
        :param prinf: print information about entries that get removed
        :returns: tuple ``(df, totals)``; ``totals`` stays False unless the
                  operation is '%', '*' or '/'
        """
        totals = False
        # delete under threshold
        # NOTE(review): ``denom`` is only bound when just_totals is falsy, or
        # when just_totals, using_totals and single_totals are all truthy;
        # other combinations reach the single_totals arithmetic with it unset.
        if just_totals:
            if using_totals:
                if not single_totals:
                    # entries whose 'Combined total' in df2 is below the cutoff
                    to_drop = list(df2[df2["Combined total"] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        # show the first five and, for long lists, the last five
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append("...")
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print(
                                "Removing %d entries below threshold:\n    %s" % (len(to_drop), "\n    ".join(to_show))
                            )
                        if len(to_drop) > 10:
                            # +1 compensates for the "..." marker inside to_show
                            print("... and %d more ... \n" % (len(to_drop) - len(to_show) + 1))
                        else:
                            print("")
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            # df2 is one set of totals: apply the operation row-wise (axis=0)
            if operation == "%":
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis=0)
                except ValueError:
                    # shapes do not line up: report it and carry on with df
                    # as it stands (already multiplied by 100 at this point)
                    from time import localtime, strftime

                    thetime = strftime("%H:%M:%S", localtime())
                    print("%s: cannot combine DataFrame 1 and 2: different shapes" % thetime)
            elif operation == "+":
                try:
                    df = df.add(denom, axis=0)
                except ValueError:
                    from time import localtime, strftime

                    thetime = strftime("%H:%M:%S", localtime())
                    print("%s: cannot combine DataFrame 1 and 2: different shapes" % thetime)
            elif operation == "-":
                try:
                    df = df.sub(denom, axis=0)
                except ValueError:
                    from time import localtime, strftime

                    thetime = strftime("%H:%M:%S", localtime())
                    print("%s: cannot combine DataFrame 1 and 2: different shapes" % thetime)
            elif operation == "*":
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis=0)
                except ValueError:
                    from time import localtime, strftime

                    thetime = strftime("%H:%M:%S", localtime())
                    print("%s: cannot combine DataFrame 1 and 2: different shapes" % thetime)
            elif operation == "/":
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis=0)
                except ValueError:
                    from time import localtime, strftime

                    thetime = strftime("%H:%M:%S", localtime())
                    print("%s: cannot combine DataFrame 1 and 2: different shapes" % thetime)
            elif operation == "d":
                # df.ix['Combined total'] = df.sum()
                # to_drop = to_drop = list(df.T[df.T['Combined total'] < threshold].index)
                # drop columns whose overall sum is below the cutoff
                to_drop = [n for n in list(df.columns) if df[n].sum() < threshold]
                df = df.drop([e for e in to_drop if e in list(df.columns)], axis=1)
                # df.drop('Combined total')
                if prinf:
                    to_show = []
                    [to_show.append(w) for w in to_drop[:5]]
                    if len(to_drop) > 10:
                        to_show.append("...")
                        [to_show.append(w) for w in to_drop[-5:]]
                    if len(to_drop) > 0:
                        print("Removing %d entries below threshold:\n    %s" % (len(to_drop), "\n    ".join(to_show)))
                    if len(to_drop) > 10:
                        print("... and %d more ... \n" % (len(to_drop) - len(to_show) + 1))
                    else:
                        print("")

                # get normalised num in target corpus
                norm_in_target = df.div(denom, axis=0)
                # get normalised num in reference corpus, with or without selfdrop
                tot_in_ref = df.copy()
                for c in list(tot_in_ref.index):
                    if selfdrop:
                        # reference counts exclude the row's own counts
                        tot_in_ref.ix[c] = df.sum() - tot_in_ref.ix[c]
                    else:
                        tot_in_ref.ix[c] = df.sum()
                norm_in_ref = tot_in_ref.div(df.sum().sum())
                # percentage difference between target and reference frequency
                df = (norm_in_target - norm_in_ref) / norm_in_ref * 100.0
                # a value of exactly -100 is replaced with NaN
                df = df.replace(float(-100.00), np.nan)

            elif operation == "a":
                # column names are read as integers; every column whose name
                # is greater than 1 is scaled by 1/name before rows are summed
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2

            elif operation.startswith("c"):
                import warnings

                # concatenate side by side, silencing any pandas warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pandas.concat([df, df2], axis=1)
            return df, totals

        elif not single_totals:
            # df2 has several columns: combine column by column
            if not operation.startswith("a"):
                # generate totals
                if operation == "%":
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == "*":
                    totals = df.sum() * float(df2.sum().sum())
                if operation == "/":
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith("c"):
                    # add here the info that merging will not work
                    # with identical colnames
                    import warnings

                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        # stack both frames entry-wise, then sum rows that
                        # share an entry name
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby("index").sum()
                        dx = d.reset_index("index")
                        dx.index = list(dx["index"])
                        df = dx.drop("index", axis=1).T

                for index, entry in enumerate(list(df.columns)):
                    # p.animate(index)
                    # NOTE(review): the bare excepts below skip a column on any
                    # failure (presumably a column missing from df2), leaving
                    # that column of df unmodified
                    if operation == "%":
                        try:
                            df[entry] = df[entry] * 100.0 / df2[entry]
                        except:
                            continue
                        # df.drop(entry, axis = 1, inplace = True)
                        # df[entry] = maths_done
                    elif operation == "+":
                        try:
                            df[entry] = df[entry] + df2[entry]
                        except:
                            continue
                    elif operation == "-":
                        try:
                            df[entry] = df[entry] - df2[entry]
                        except:
                            continue
                    elif operation == "*":
                        try:
                            df[entry] = df[entry] * df2[entry]
                        except:
                            continue
                    elif operation == "/":
                        try:
                            df[entry] = df[entry] / df2[entry]
                        except:
                            continue

            else:
                # 'a' with a multi-column df2: same scaling as above, divided
                # by the sums of df2 transposed
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2.T.sum()

        return df, totals

    def parse_input(df, the_input):
        """Turn user input into a list of labels usable as pandas indices.

        :param df: DataFrame (or Series) whose labels are being selected
        :param the_input: one of

            - ``'all'``: every label
            - a str: treated as a regular expression searched in each label;
              if it cannot be used as one, treated as a literal label
            - an int: converted to its string form and looked up as a label
            - a list of ints: positions within ``list(df)``
            - a list of strs: labels, filtered to those actually present
        :returns: list of matching labels, ``[]`` for an empty list, or
                  ``False`` when the input type is not recognised
        """
        import re

        if isinstance(the_input, str):
            pattern = r".*" if the_input == "all" else the_input
            try:
                regex = re.compile(pattern)
                return [w for w in list(df) if regex.search(w)]
            except (re.error, TypeError):
                # invalid regex, or labels that are not strings: fall back to
                # treating the input as a single literal label
                the_input = [pattern]
        elif isinstance(the_input, int) and not isinstance(the_input, bool):
            the_input = [str(the_input)]

        if not isinstance(the_input, list):
            return False
        if not the_input:
            # nothing requested, so nothing matches (previously an IndexError)
            return []

        first = the_input[0]
        if isinstance(first, int) and not isinstance(first, bool):
            return [word for index, word in enumerate(list(df)) if index in the_input]
        if isinstance(first, str):
            try:
                labels = df.columns
            except AttributeError:  # a Series has an index but no columns
                labels = df.index
            return [word for word in the_input if word in labels]
        return False

    def synonymise(df, pos="n"):
        """Rename the columns of df to their most common WordNet synonym.

        For every column name, the lemma names of all its WordNet synsets
        (restricted to part of speech *pos*) are counted, and the most frequent
        one becomes the new column name. Names with no synsets, or names that
        WordNet cannot look up, are left unchanged.

        :param df: DataFrame whose columns are words (modified in place)
        :param pos: WordNet part-of-speech tag, e.g. 'n' or 'v'
        :returns: df with renamed columns
        """
        from nltk.corpus import wordnet as wn

        # from dictionaries.taxonomies import taxonomies
        from collections import Counter

        fixed = []
        for word in list(df.columns):
            try:
                lemma_names = [name for synset in wn.synsets(word, pos=pos) for name in synset.lemma_names()]
            except (LookupError, TypeError, AttributeError):
                # best effort: unusable word or unavailable corpus keeps the name
                lemma_names = []
            if lemma_names:
                fixed.append(Counter(lemma_names).most_common(1)[0][0])
            else:
                fixed.append(word)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to="US", print_info=print_info):
        """Respell the column names of df using the ``usa_convert`` table.

        The table is used as imported unless *convert_to* is ``"UK"``, in which
        case its keys and values are swapped first. Column names without an
        entry in the table are kept as they are.

        :param df: DataFrame whose columns are respelled (modified in place)
        :param convert_to: ``"UK"`` to use the inverted table
        :param print_info: print a progress message
        :returns: df with converted column names
        """
        from dictionaries.word_transforms import usa_convert

        if print_info:
            print("Converting spelling ... \n")
        mapping = usa_convert
        if convert_to == "UK":
            mapping = {target: source for source, target in list(usa_convert.items())}
        df.columns = [mapping.get(name, name) for name in list(df.columns)]
        return df

    def merge_duplicates(df, print_info=print_info):
        """Sum columns that share a name so each name appears only once.

        Every duplicated name is replaced by a single column holding the
        row-wise sum of its duplicates; that column is appended at the end.

        :param df: DataFrame that may contain repeated column names
        :param print_info: print a progress message
        :returns: DataFrame with unique column names
        """
        if print_info:
            print("Merging duplicate entries ... \n")
        # Index.get_duplicates() was removed in pandas 1.0; Index.duplicated()
        # exists in old and new releases. Sorting mirrors the old sorted result.
        duplicated_names = sorted(set(df.columns[df.columns.duplicated()]))
        for dup in duplicated_names:
            # df[dup] is a frame of all same-named columns; collapse it
            summed = df[dup].sum(axis=1)
            df = df.drop(dup, axis=1)
            df[dup] = summed
        return df

    def name_replacer(df, replace_names, print_info=print_info):
        """Rewrite entry names with regular expressions, then merge duplicates.

        :param df: DataFrame whose column names are rewritten (in place)
        :param replace_names: one of

            - a str: regex whose matches are deleted from every name
            - a ``[pattern, replacement]`` pair (or ``[pattern]`` to delete)
            - a list of such pairs
            - a dict mapping replacement -> pattern
        :param print_info: print what is being replaced
        :returns: DataFrame with rewritten, de-duplicated column names
        """
        import re

        # normalise every accepted form into a list of (pattern, replacement)
        if isinstance(replace_names, str):
            pairs = [(replace_names, "")]
        elif isinstance(replace_names, dict):
            pairs = [(pattern, new) for new, pattern in list(replace_names.items())]
        elif len(replace_names) > 0 and isinstance(replace_names[0], str):
            # a single flat pair; a lone pattern means "delete the match"
            flat = list(replace_names)
            pairs = [(flat[0], flat[1] if len(flat) > 1 else "")]
        else:
            pairs = replace_names

        for to_find, replacement in pairs:
            if print_info:
                if replacement == "":
                    print('Deleting "%s" from entry names ...\n' % (to_find))
                else:
                    print('Replacing "%s" with "%s" ...\n' % (to_find, replacement))
            pattern = re.compile(to_find)
            df.columns = [pattern.sub(replacement, name) for name in list(df.columns)]
        # different names may now be identical, so collapse them
        df = merge_duplicates(df, print_info=False)
        return df

    def just_these_entries(df, parsed_input, prinf=True):
        """Return df with every column not named in parsed_input removed.

        :param df: DataFrame to filter
        :param parsed_input: list of column names to keep
        :param prinf: print the first ten names being kept
        :returns: filtered DataFrame
        """
        unwanted = [name for name in list(df) if name not in parsed_input]
        if prinf:
            preview = "\n    ".join(parsed_input[:10])
            print("Keeping %d entries:\n    %s" % (len(parsed_input), preview))
            overflow = len(parsed_input) - 10
            if overflow > 0:
                print("... and %d more ... \n" % overflow)
            else:
                print("")
        return df.drop(unwanted, axis=1)

    def skip_these_entries(df, parsed_input, prinf=True):
        """Return df with the columns named in parsed_input removed.

        :param df: DataFrame to filter
        :param parsed_input: list of column names to drop
        :param prinf: print the first ten names being skipped
        :returns: filtered DataFrame
        """
        if prinf:
            preview = "\n    ".join(parsed_input[:10])
            print("Skipping %d entries:\n    %s" % (len(parsed_input), preview))
            overflow = len(parsed_input) - 10
            if overflow > 0:
                print("... and %d more ... \n" % overflow)
            else:
                print("")
        return df.drop(parsed_input, axis=1)

    def newname_getter(df, parsed_input, newname="combine", prinf=True, merging_subcorpora=False):
        """Work out the name to give a set of merged entries.

        :param df: DataFrame containing the entries being merged
        :param parsed_input: list of entry names being merged
        :param newname: how to name the result:

            - ``'combine'``: join up to three names with '/', adding '...'
              when there are more
            - any other str: used as the name
            - an int: the name of the column at that position in df
            - ``False``: the name of the entry with the highest total
              (treated as ``'combine'`` when *merging_subcorpora* is set)
        :param prinf: unused; kept for a uniform helper signature
        :param merging_subcorpora: whether subcorpora rather than entries
                                   are being merged
        :returns: the new name, always a str
        :raises ValueError: if *newname* is none of the supported kinds
        """
        if merging_subcorpora and newname is False:
            newname = "combine"

        if newname is False:
            # name the merged entry after its most frequent member
            entry_totals = {item: sum(list(df[item])) for item in parsed_input}
            the_newname = max(entry_totals, key=entry_totals.get)
        elif isinstance(newname, int) and not isinstance(newname, bool):
            the_newname = list(df.columns)[newname]
        elif isinstance(newname, str):
            if newname == "combine":
                the_newname = "/".join(str(item) for item in parsed_input[:3])
                if len(parsed_input) > 3:
                    the_newname += "..."
            else:
                the_newname = newname
        else:
            raise ValueError("newname must be a str, an int or False, not %r" % (newname,))

        # str(obj, errors=...) only accepts bytes in Python 3, so decode bytes
        # and convert everything else (e.g. integer column labels) plainly
        if isinstance(the_newname, bytes):
            the_newname = the_newname.decode("utf-8", errors="ignore")
        elif not isinstance(the_newname, str):
            the_newname = str(the_newname)
        return the_newname

    def merge_these_entries(df, parsed_input, the_newname, prinf=True, merging="entries"):
        """Add a new entry holding the sum of the entries in parsed_input.

        Unless ``multiple_merge`` (from the enclosing scope) is set, the merged
        entries themselves are removed. When nothing matched, a warning is
        issued and df is returned unchanged rather than gaining an all-zero
        entry.

        :param df: DataFrame (entries as columns) or Series
        :param parsed_input: list of entry names to merge
        :param the_newname: name for the merged entry
        :param prinf: print the first ten names being merged
        :param merging: noun used in messages ('entries'/'subcorpora')
        :returns: df with the merged entry added
        """
        if len(parsed_input) == 0:
            import warnings

            warnings.warn("No %s could be automatically merged.\n" % merging)
            return df

        if prinf:
            print(
                'Merging %d %s as "%s":\n    %s'
                % (len(parsed_input), merging, the_newname, "\n    ".join(str(i) for i in parsed_input[:10]))
            )
            if len(parsed_input) > 10:
                print("... and %d more ... \n" % (len(parsed_input) - 10))
            else:
                print("")

        # compute the sum before the originals are removed
        merged = sum(df[name] for name in parsed_input)
        if not multiple_merge:
            if isinstance(df, pandas.core.series.Series):
                df = df.drop(parsed_input)
            else:
                df = df.drop(parsed_input, axis=1)
        df[the_newname] = merged
        return df

    def just_these_subcorpora(df, lst_of_subcorpora, prinf=True):
        """Return df with only the rows (subcorpora) that were asked for.

        :param df: DataFrame indexed by subcorpus name
        :param lst_of_subcorpora: subcorpus names to keep; a single int or a
                                  list of ints is converted to strings before
                                  matching
        :param prinf: print the first ten subcorpora being kept
        :returns: filtered DataFrame
        """
        # accept a bare int, as skip_these_subcorpora does
        if isinstance(lst_of_subcorpora, int):
            lst_of_subcorpora = [lst_of_subcorpora]
        if len(lst_of_subcorpora) > 0 and isinstance(lst_of_subcorpora[0], int):
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:
            # str() so that non-string index labels can still be listed
            print("Keeping %d subcorpora:\n    %s" % (len(good_years), "\n    ".join(str(i) for i in good_years[:10])))
            if len(good_years) > 10:
                print("... and %d more ... \n" % (len(good_years) - 10))
            else:
                print("")
        return df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis=0)

    def skip_these_subcorpora(df, lst_of_subcorpora, prinf=True):
        """Return df without the rows (subcorpora) named in lst_of_subcorpora.

        A single int is wrapped in a list, and a list starting with an int has
        all its members converted to strings before matching against the
        index. A warning is issued when nothing matches.

        :param df: DataFrame indexed by subcorpus name
        :param lst_of_subcorpora: subcorpus names to drop
        :param prinf: print the first ten subcorpora being skipped
        :returns: filtered DataFrame
        """
        unwanted = lst_of_subcorpora
        if type(unwanted) is int:
            unwanted = [unwanted]
        if type(unwanted[0]) is int:
            unwanted = [str(item) for item in unwanted]

        # labels actually present in the index, in index order
        to_drop = [label for label in list(df.index) if label in unwanted]
        if not to_drop:
            import warnings

            warnings.warn("No subcorpora skipped.\n")
        elif prinf:
            preview = "\n    ".join([str(label) for label in to_drop[:10]])
            print("Skipping %d subcorpora:\n    %s" % (len(to_drop), preview))
            overflow = len(to_drop) - 10
            if overflow > 0:
                print("... and %d more ... \n" % overflow)
            else:
                print("")
        return df.drop(to_drop, axis=0)

    def span_these_subcorpora(df, lst_of_subcorpora, prinf=True):
        """Keep only the numerically named subcorpora inside a span.

        The span runs from the first to the last item of *lst_of_subcorpora*,
        inclusive; index labels are compared after conversion with ``int``.
        An empty span triggers a warning and returns df unchanged.

        :param df: DataFrame indexed by subcorpus name
        :param lst_of_subcorpora: sequence whose first and last items bound
                                  the span
        :param prinf: print the span being kept
        :returns: filtered DataFrame
        """
        # check for an empty span before indexing into it
        if len(lst_of_subcorpora) == 0:
            import warnings

            warnings.warn("Span not identified.\n")
            return df

        first, last = int(lst_of_subcorpora[0]), int(lst_of_subcorpora[-1])
        good_years = [subcorpus for subcorpus in list(df.index) if first <= int(subcorpus) <= last]
        if prinf:
            print("Keeping subcorpora:\n    %d--%d\n" % (first, last))
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis=0)
        # retotal needed here
        return df

    def projector(df, list_of_tuples, prinf=True):
        """Multiply the values of selected subcorpora by a projection factor.

        :param df: DataFrame or Series indexed by subcorpus name (modified in
                   place)
        :param list_of_tuples: ``(subcorpus, factor)`` pairs, or a dict mapping
                               subcorpus -> factor; int subcorpus names are
                               converted to strings
        :param prinf: print each projection applied
        :returns: df with the projected rows
        """
        # accept any iterable of pairs, not just a list
        if not isinstance(list_of_tuples, dict):
            list_of_tuples = dict(list_of_tuples)
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if isinstance(subcorpus, int):
                subcorpus = str(subcorpus)
            # label-based lookup; .ix was removed in pandas 1.0
            df.loc[subcorpus] = df.loc[subcorpus] * projection_value
            if prinf:
                print("Projection: %s * %s" % (subcorpus, projection_value))
        if prinf:
            print("")
        return df

    def do_stats(df):
        """Run a linear regression per column and append the results as rows.

        Each column is regressed against the index: index labels are converted
        to ints and offset by the first label, or, if they are not numeric,
        replaced by 0..n-1. Five rows ('slope', 'intercept', 'r', 'p',
        'stderr') are appended. Unless ``operation`` (from the enclosing scope)
        is 'd', infinities and NaNs are then replaced with 0.0.

        :param df: DataFrame with subcorpora as rows and entries as columns
        :returns: df with the five stat rows added, or ``False`` when scipy
                  cannot be imported
        """
        try:
            from scipy.stats import linregress
        except ImportError:
            from time import localtime, strftime

            thetime = strftime("%H:%M:%S", localtime())
            print("%s: sort type not available in this verion of corpkit." % thetime)
            return False
        # from stats.stats import linregress

        statfields = ["slope", "intercept", "r", "p", "stderr"]
        indices = list(df.index)
        first_year = indices[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            # non-numeric subcorpus names: use their position instead
            x = list(range(len(indices)))

        # one list per statistic, filled column by column
        stats = {field: [] for field in statfields}
        for entry in list(df.columns):
            slope, intercept, r, p, stderr = linregress(x, list(df[entry]))
            for field, value in zip(statfields, (slope, intercept, r, p, stderr)):
                stats[field].append(value)
        sl = pd.DataFrame([stats[field] for field in statfields], index=statfields, columns=list(df.columns))
        # DataFrame.append was removed in pandas 2.0; concat works everywhere
        df = pd.concat([df, sl])
        # drop infinites and nans
        if operation != "d":
            df = df.replace([np.inf, -np.inf], np.nan)
            df = df.fillna(0.0)
        return df

    def recalc(df, operation="%"):
        """Add a 'temp-Total' row and column holding totals to the dataframe.

        Rows and columns for the regression statistics ('slope', 'intercept',
        'r', 'p', 'stderr') are left out of both sets of totals.

        :param df: DataFrame to total; the 'temp-Total' column is added to it
                   in place
        :param operation: unused; kept so existing callers keep working
        :returns: DataFrame with a 'temp-Total' row and column
        """
        statfields = ["slope", "intercept", "r", "p", "stderr"]

        # df.drop('Total', axis = 0, inplace = True)
        # df.drop('Total', axis = 1, inplace = True)
        # row totals, ignoring any stat rows
        df["temp-Total"] = df.drop(statfields, axis=0, errors="ignore").sum(axis=1)
        df = df.T
        # after transposing, the stat fields are columns, so they have to be
        # dropped along axis 1 to stay out of the per-entry totals
        df["temp-Total"] = df.drop(statfields, axis=1, errors="ignore").sum(axis=1)
        df = df.T
        return df

    def resort(df, sort_by=False, keep_stats=False):
        """Sort results, potentially using scipy's linregress.

        Reads ``operation``, ``just_totals``, ``df1_istotals``,
        ``remove_above_p`` and ``p`` from the enclosing scope.

        :param df: data to sort
        :param sort_by: True/'total'/'most', 'infreq'/'least', 'name', 'p',
                        'increase', 'decrease', 'static', 'turbulent', 'none'
                        or False
        :param keep_stats: keep the regression rows added by ``do_stats`` in
                           the output
        :returns: the sorted data, or ``False`` when ``do_stats`` returned
                  False
        :raises ValueError: if *sort_by* is truthy and not a known option
        """

        # translate options and make sure they are parseable
        options = [
            "total",
            "name",
            "infreq",
            "increase",
            "turbulent",
            "decrease",
            "static",
            "most",
            "least",
            "none",
            "p",
        ]

        # True, 'most' and 'least' are aliases
        if sort_by is True:
            sort_by = "total"
        if sort_by == "most":
            sort_by = "total"
        if sort_by == "least":
            sort_by = "infreq"
        if sort_by not in options and sort_by:
            raise ValueError(
                "sort_by parameter error: '%s' not recognised. Must be True, False, %s" % (sort_by, ", ".join(options))
            )

        # operations starting with 'k' on a Series are sorted directly
        # NOTE(review): Series.order comes from older pandas releases; confirm
        # it exists in the pandas version this file targets
        if operation.startswith("k"):
            if type(df) == pandas.core.series.Series:
                if sort_by == "total":
                    df = df.order(ascending=False)

                elif sort_by == "infreq":
                    df = df.order(ascending=True)

                elif sort_by == "name":
                    df = df.sort_index()
                return df

        # with just_totals, sort on the 'Combined total' row and return early
        if just_totals:
            if sort_by == "infreq":
                df = df.sort_values(by="Combined total", ascending=True, axis=1)
            elif sort_by == "total":
                df = df.sort_values(by="Combined total", ascending=False, axis=1)
            elif sort_by == "name":
                df = df.sort_index()
            return df

        # the sorting below predates the sort_values approach used above
        if keep_stats:
            df = do_stats(df)
            # do_stats returns False when scipy is unavailable
            if type(df) == bool:
                if df is False:
                    return False
        if sort_by == "total":
            if df1_istotals:
                df = df.T
            # add temporary totals, order columns by them (largest first),
            # then remove the temporary row and column again
            df = recalc(df, operation=operation)
            tot = df.ix["temp-Total"]
            df = df[tot.argsort()[::-1]]
            df = df.drop("temp-Total", axis=0)
            df = df.drop("temp-Total", axis=1)
            if df1_istotals:
                df = df.T
        elif sort_by == "infreq":
            if df1_istotals:
                df = df.T
            # same as 'total', but smallest first
            df = recalc(df, operation=operation)
            tot = df.ix["temp-Total"]
            df = df[tot.argsort()]
            df = df.drop("temp-Total", axis=0)
            df = df.drop("temp-Total", axis=1)
            if df1_istotals:
                df = df.T
        elif sort_by == "name":
            # currently case sensitive...
            df = df.reindex_axis(sorted(df.columns), axis=1)
        elif sort_by == "p":
            # NOTE(review): relies on a 'p' row being present, which this
            # function only adds itself when keep_stats is set
            df = df.T.sort_values(by="p").T
        else:
            # remaining options ('increase', 'decrease', 'static', 'turbulent',
            # 'none', or a falsy sort_by) go through the regression slopes
            statfields = ["slope", "intercept", "r", "p", "stderr"]

            # stats are needed for slope-based sorting even if not kept
            if not keep_stats:
                df = do_stats(df)
                if type(df) == bool:
                    if df is False:
                        return False

            slopes = df.ix["slope"]
            if sort_by == "increase":
                df = df[slopes.argsort()[::-1]]
            elif sort_by == "decrease":
                df = df[slopes.argsort()]
            elif sort_by == "static":
                # smallest absolute slope first
                df = df[slopes.abs().argsort()]
            elif sort_by == "turbulent":
                # largest absolute slope first
                df = df[slopes.abs().argsort()[::-1]]
            if remove_above_p:
                # the easy way to do it!
                # keep only entries whose p value is at most p
                df = df.T
                df = df[df["p"] <= p]
                df = df.T

            # remove stats field by default
            if not keep_stats:
                df = df.drop(statfields, axis=0)

        return df

    def set_threshold(big_list, threshold, prinf=True, for_keywords=False):
        """Turn a named threshold into a number based on the size of the data.

        A string threshold is resolved by its first letter: 'l' (low) divides
        the grand total of *big_list* by 10000, 'm' (medium) by 5000 and 'h'
        (high) by 2500. Any other value is returned as given.

        :param big_list: DataFrame or Series whose total the threshold is
                         scaled against
        :param threshold: 'low'/'medium'/'high' (or anything starting with
                          l/m/h), or an explicit number
        :param prinf: print the resulting threshold
        :param for_keywords: unused; kept for backward compatibility
        :returns: the numeric threshold
        :raises ValueError: if a string threshold does not start with l/m/h
        :raises TypeError: if a string threshold is given but *big_list* is
                           neither a DataFrame nor a Series
        """
        if isinstance(threshold, str):
            divisors = {"l": 10000, "m": 5000, "h": 2500}
            divisor = divisors.get(threshold[:1].lower())
            if divisor is None:
                raise ValueError("threshold '%s' not recognised: use low, medium, high or a number" % threshold)

            if isinstance(big_list, pandas.core.frame.DataFrame):
                tot = big_list.sum().sum()
            elif isinstance(big_list, pandas.core.series.Series):
                tot = big_list.sum()
            else:
                raise TypeError("cannot compute a threshold for %s" % type(big_list).__name__)
            the_threshold = float(tot) / float(divisor)
            # if for_keywords:
            # the_threshold = the_threshold / 2
        else:
            the_threshold = threshold
        if prinf:
            print("Threshold: %d\n" % the_threshold)
        return the_threshold

    # copy dataframe to be very safe
    df = dataframe1.copy()
    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = "None"

    # do concordance work
    if return_conc:
        if just_entries:
            if type(just_entries) == int:
                just_entries = [just_entries]
            if type(just_entries) == str:
                df = df[df["m"].str.contains(just_entries)]
            if type(just_entries) == list:
                if all(type(e) == str for e in just_entries):
                    mp = df["m"].map(lambda x: x in just_entries)
                    df = df[mp]
                else:
                    df = df.ix[just_entries]

        if skip_entries:
            if type(skip_entries) == int:
                skip_entries = [skip_entries]
            if type(skip_entries) == str:
                df = df[~df["m"].str.contains(skip_entries)]
            if type(skip_entries) == list:
                if all(type(e) == str for e in skip_entries):
                    mp = df["m"].map(lambda x: x not in skip_entries)
                    df = df[mp]
                else:
                    df = df.drop(skip_entries, axis=0)

        if just_subcorpora:
            if type(just_subcorpora) == int:
                just_subcorpora = [just_subcorpora]
            if type(just_subcorpora) == str:
                df = df[df["c"].str.contains(just_subcorpora)]
            if type(just_subcorpora) == list:
                if all(type(e) == str for e in just_subcorpora):
                    mp = df["c"].map(lambda x: x in just_subcorpora)
                    df = df[mp]
                else:
                    df = df.ix[just_subcorpora]

        if skip_subcorpora:
            if type(skip_subcorpora) == int:
                skip_subcorpora = [skip_subcorpora]
            if type(skip_subcorpora) == str:
                df = df[~df["c"].str.contains(skip_subcorpora)]
            if type(skip_subcorpora) == list:
                if all(type(e) == str for e in skip_subcorpora):
                    mp = df["c"].map(lambda x: x not in skip_subcorpora)
                    df = df[mp]
                else:
                    df = df.drop(skip_subcorpora, axis=0)

        return Concordance(df)

    if print_info:
        print("\n***Processing results***\n========================\n")

    df1_istotals = False
    if type(df) == pandas.core.series.Series:
        df1_istotals = True
        df = pandas.DataFrame(df)
        # if just a single result
    else:
        df = pandas.DataFrame(df)
    if operation.startswith("k"):
        if sort_by is False:
            if not df1_istotals:
                sort_by = "turbulent"
        if df1_istotals:
            df = df.T

    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    if denominator is not False and type(denominator) != str:
        df2 = denominator.copy()
        using_totals = True
        if type(df2) == pandas.core.frame.DataFrame:
            if len(df2.columns) > 1:
                single_totals = False
            else:
                df2 = pandas.Series(df2)
            if operation == "d":
                df2 = df2.sum(axis=1)
                single_totals = True
        elif type(df2) == pandas.core.series.Series:
            single_totals = True
            # if operation == 'k':
            # raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?')
        else:
            raise ValueError("Denominator not recognised.")
    else:
        if operation in ["k", "d", "a", "%", "/", "*", "-", "+"]:
            denominator = "self"
        if denominator == "self":
            outputmode = True

    if operation.startswith("a") or operation.startswith("A"):
        if list(df.columns)[0] != "0" and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to=spelling)
        df = merge_duplicates(df, print_info=False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to=spelling, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not df1_istotals:
            sort_by = "total"

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not sort_by:
            sort_by = "total"

    # remove old stats if they're there:
    statfields = ["slope", "intercept", "r", "p", "stderr"]
    try:
        df = df.drop(statfields, axis=0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis=0)
        except:
            pass

    # remove totals and tkinter order
    for name, ax in zip(["Total"] * 2 + ["tkintertable-order"] * 2, [0, 1, 0, 1]):
        if name == "Total" and df1_istotals:
            continue
        try:
            df = df.drop(name, axis=ax, errors="ignore")
        except:
            pass
    for name, ax in zip(["Total"] * 2 + ["tkintertable-order"] * 2, [0, 1, 0, 1]):
        if name == "Total" and single_totals:
            continue

        try:

            df2 = df2.drop(name, axis=ax, errors="ignore")
        except:
            pass

    # merging: make dicts if they aren't already, so we can iterate
    if merge_entries:
        if type(merge_entries) != list:
            if type(merge_entries) == str or type(merge_entries) == str:
                merge_entries = {newname: merge_entries}
            # for newname, criteria
            for name, the_input in sorted(merge_entries.items()):
                the_newname = newname_getter(df, parse_input(df, the_input), newname=name, prinf=print_info)
                df = merge_these_entries(df, parse_input(df, the_input), the_newname, prinf=print_info)
                if not single_totals:
                    df2 = merge_these_entries(df2, parse_input(df2, the_input), the_newname, prinf=False)
        else:
            for i in merge_entries:
                the_newname = newname_getter(df, parse_input(df, merge_entries), newname=newname, prinf=print_info)
                df = merge_these_entries(df, parse_input(df, merge_entries), the_newname, prinf=print_info)
                if not single_totals:
                    df2 = merge_these_entries(df2, parse_input(df2, merge_entries), the_newname, prinf=False)

    if merge_subcorpora:
        if type(merge_subcorpora) != dict:
            if type(merge_subcorpora) == list:
                if type(merge_subcorpora[0]) == tuple:
                    merge_subcorpora = {x: y for x, y in merge_subcorpora}
                elif type(merge_subcorpora[0]) == str or type(merge_subcorpora[0]) == str:
                    merge_subcorpora = {new_subcorpus_name: [x for x in merge_subcorpora]}
                elif type(merge_subcorpora[0]) == int:
                    merge_subcorpora = {new_subcorpus_name: [str(x) for x in merge_subcorpora]}
            else:
                merge_subcorpora = {new_subcorpus_name: merge_subcorpora}
        for name, the_input in sorted(merge_subcorpora.items()):
            the_newname = newname_getter(
                df.T, parse_input(df.T, the_input), newname=name, merging_subcorpora=True, prinf=print_info
            )
            df = merge_these_entries(
                df.T, parse_input(df.T, the_input), the_newname, merging="subcorpora", prinf=print_info
            ).T
            if using_totals:
                df2 = merge_these_entries(
                    df2.T, parse_input(df2.T, the_input), the_newname, merging="subcorpora", prinf=False
                ).T

    if just_subcorpora:
        df = just_these_subcorpora(df, just_subcorpora, prinf=print_info)
        if using_totals:
            df2 = just_these_subcorpora(df2, just_subcorpora, prinf=False)

    if skip_subcorpora:
        df = skip_these_subcorpora(df, skip_subcorpora, prinf=print_info)
        if using_totals:
            df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf=False)

    if span_subcorpora:
        df = span_these_subcorpora(df, span_subcorpora, prinf=print_info)
        if using_totals:
            df2 = span_these_subcorpora(df2, span_subcorpora, prinf=False)

    if just_entries:
        df = just_these_entries(df, parse_input(df, just_entries), prinf=print_info)
        if not single_totals:
            df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf=False)
    if skip_entries:
        df = skip_these_entries(df, parse_input(df, skip_entries), prinf=print_info)
        if not single_totals:
            df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf=False)

    # drop infinites and nans
    if operation != "d":
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # make just_totals as dataframe
    just_one_total_number = False
    if just_totals:
        df = pd.DataFrame(df.sum(), columns=["Combined total"])
        if using_totals:
            if not single_totals:
                df2 = pd.DataFrame(df2.sum(), columns=["Combined total"])
            else:
                just_one_total_number = True
                df2 = df2.sum()

    tots = df.sum(axis=1)

    if using_totals or outputmode:
        if not operation.startswith("k"):
            the_threshold = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = "Total"
                else:
                    df2.name = "Combined total"
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    the_threshold = set_threshold(df2, threshold, prinf=print_info)
            if operation == "d":
                the_threshold = set_threshold(df2, threshold, prinf=print_info)
            df, tots = combiney(df, df2, operation=operation, threshold=the_threshold, prinf=print_info)

    # if doing keywording...
    if operation.startswith("k"):
        from keys import keywords

        # allow saved dicts to be df2, etc
        try:
            if denominator == "self":
                df2 = df.copy()
        except TypeError:
            pass
        if type(denominator) == str:
            if denominator != "self":
                df2 = denominator

        else:
            the_threshold = False

        df = keywords(
            df,
            df2,
            selfdrop=selfdrop,
            threshold=threshold,
            printstatus=print_info,
            editing=True,
            calc_all=calc_all,
            **kwargs
        )

        # eh?
        df = df.T

    # drop infinites and nans
    if operation != "d":
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
        if type(df) == bool:
            if df is False:
                return "linregress"

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = pd.Series(df["Combined total"], name="Combined total")

    if df1_istotals:
        if operation.startswith("k"):
            try:
                df = pd.Series(df.ix[dataframe1.name])
                df.name = "%s: keyness" % df.name
            except:
                df = df.iloc[0, :]
                df.name = "keyness" % df.name

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith("k"):
        if not just_totals:
            try:
                total = pd.Series(df["Total"], name="Total")
            except:
                pass
                total = "none"
            # total = df.copy()
        else:
            total = "none"
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis=1)
        except:
            total = "none"

    if type(tots) != pandas.core.frame.DataFrame and type(tots) != pandas.core.series.Series:
        total = df.sum(axis=1)
    else:
        total = tots

    if type(df) == pandas.core.frame.DataFrame:
        datatype = df.ix[0].dtype
    else:
        datatype = df.dtype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except:
        pass

    def add_tkt_index(df):
        """Append a 'tkintertable-order' row numbering the columns of *df*.

        Any existing 'tkintertable-order' row or column is discarded first.
        The new row holds 0, 1, 2, ... in the current column order.

        :param df: Data to tag; a Series is handed back untouched
        :type df: pandas.core.frame.DataFrame/pandas.core.series.Series
        :returns: *df* with the extra row, or the unchanged Series
        """
        if type(df) == pandas.core.series.Series:
            return df

        # work on the transpose so that the ordering can be added as a column
        flipped = df.T
        for axis in (0, 1):
            flipped = flipped.drop("tkintertable-order", errors="ignore", axis=axis)

        labels = list(flipped.index)
        flipped["tkintertable-order"] = pd.Series(list(range(len(labels))), index=labels)
        return flipped.T

    # while tkintertable can't sort rows
    from tests import check_t_kinter

    tk = check_t_kinter()
    if tk:
        df = add_tkt_index(df)

    if kwargs.get("df1_always_df"):
        if type(df) == pandas.core.series.Series:
            df = pandas.DataFrame(df)

    # outputnames = collections.namedtuple('edited_interrogation', ['query', 'results', 'totals'])
    # output = outputnames(the_options, df, total)

    # delete non-appearing conc lines
    if interrogation.__dict__.get("concordance", None) is None:
        lns = None
    else:
        col_crit = interrogation.concordance["m"].map(lambda x: x in list(df.columns))
        ind_crit = interrogation.concordance["c"].map(lambda x: x in list(df.index))
        lns = interrogation.concordance[col_crit]
        lns = lns.loc[ind_crit]
        lns = Concordance(lns)

    output = Interrogation(results=df, totals=total, query=locs, concordance=lns)

    # print '\nResult (sample)\n'
    if print_info:
        # if merge_entries or merge_subcorpora or span_subcorpora or just_subcorpora or \
        # just_entries or skip_entries or skip_subcorpora or printed_th or projection:
        print("***Done!***\n========================\n")
    # print df.head().T
    # print ''
    if operation.startswith("k") or just_totals or df1_istotals:
        pd.set_option("display.max_rows", 30)
    else:
        pd.set_option("display.max_rows", 15)
    pd.set_option("display.max_columns", 8)
    pd.set_option("max_colwidth", 70)
    pd.set_option("display.width", 800)
    pd.set_option("expand_frame_repr", False)
    pd.set_option("display.float_format", lambda x: "%.2f" % x)

    return output
Esempio n. 4
0
def plotter(df,
            title=False,
            kind='line',
            x_label=None,
            y_label=None,
            style='ggplot',
            figsize=(8, 4),
            save=False,
            legend_pos='best',
            reverse_legend='guess',
            num_to_plot=7,
            tex='try',
            colours='Accent',
            cumulative=False,
            pie_legend=True,
            partial_pie=False,
            show_totals=False,
            transparent=False,
            output_format='png',
            interactive=False,
            black_and_white=False,
            show_p_val=False,
            indices=False,
            transpose=False,
            **kwargs):
    """Visualise corpus interrogations.

    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: pandas.core.frame.DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: matplotlib figure
    """
    import corpkit
    import os
    try:
        from IPython.utils.shimmodule import ShimWarning
        import warnings
        warnings.simplefilter('ignore', ShimWarning)
    except:
        pass

    import matplotlib as mpl
    from matplotlib import rc

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except:
        pass

    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt

    import pandas
    from pandas import DataFrame

    import numpy
    from time import localtime, strftime
    from tests import check_pytex, check_spider, check_t_kinter

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from plugins import InteractiveLegendPlugin, HighlightLines

    # check what environment we're in
    tk = check_t_kinter()
    running_python_tex = check_pytex()
    running_spider = check_spider()

    if not title:
        title = ''

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """Build a colourmap from the *minval*-*maxval* slice of *cmap*.

        Used to cut the extremes off a colourmap (e.g. to avoid pure white).

        :param cmap: Colourmap to sample; must be callable and have a ``name``
        :param minval: Lower end of the sampled range
        :type minval: float
        :param maxval: Upper end of the sampled range
        :type maxval: float
        :param n: Number of evenly spaced samples taken from *cmap*
        :type n: int
        :returns: matplotlib LinearSegmentedColormap built from the samples
        """
        import numpy as np
        from matplotlib.colors import LinearSegmentedColormap

        label = 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval)
        sampled_colours = cmap(np.linspace(minval, maxval, n))
        return LinearSegmentedColormap.from_list(label, sampled_colours)

    def get_savename(imagefolder, save=False, title=False, ext='png'):
        """Build the path an image should be saved to.

        The file name comes from *save* when it is a string, otherwise from
        *title*. The name is lower-cased, stripped of punctuation and has its
        whitespace turned into hyphens before *ext* is appended.

        :param imagefolder: Directory the image will be written to
        :type imagefolder: str
        :param save: Explicit file name; any non-str value means "use *title*"
        :type save: bool/str
        :param title: Plot title, used as the name when *save* is not a str
        :type title: str
        :param ext: File extension, with or without the leading dot
        :type ext: str
        :returns: *imagefolder* joined with the sanitised name plus *ext*
        :rtype: str
        :raises ValueError: if neither *save* nor *title* supplies a name
        """
        import os

        def urlify(s):
            "Turn title into filename"
            import re
            s = s.lower()
            s = re.sub(r"[^\w\s-]", '', s)
            s = re.sub(r"\s+", '-', s)
            s = re.sub(r"-(textbf|emph|textsc|textit)", '-', s)
            return s

        if not ext.startswith('.'):
            ext = '.' + ext

        if isinstance(save, str):
            name = save
        elif title:
            name = title
        else:
            # previously this fell through and died with UnboundLocalError
            raise ValueError('Cannot work out a file name: pass a string as '
                             '*save* or give the plot a title.')

        # urlify() deletes dots, so an extension the user already typed has to
        # come off first; otherwise 'plot.png' would end up as 'plotpng.png'.
        if name.lower().endswith(ext.lower()):
            name = name[:-len(ext)]

        return os.path.join(imagefolder, urlify(name) + ext)

    def rename_data_with_total(dataframe,
                               was_series=False,
                               using_tex=False,
                               absolutes=True):
        """Add each entry's total or percentage to its label.

        With *absolutes*, every entry becomes ``'<entry> (n=<sum>)'``.
        Without it, entries of a single-column frame (*was_series*) become
        ``'<entry> (<value> %)'`` and entries of a multi-column frame keep
        their names.

        :param dataframe: Data whose entry labels should be extended
        :type dataframe: pandas.core.frame.DataFrame
        :param was_series: Entries are the index (single-column data) rather
                           than the columns
        :type was_series: bool
        :param using_tex: Escape the percent sign for TeX rendering
        :type using_tex: bool
        :param absolutes: Values are absolute frequencies, not percentages
        :type absolutes: bool
        :returns: if not *was_series*, the same frame with its columns
                  renamed in place; otherwise a new one-column frame called
                  'Total', indexed by the new labels
        """
        if was_series:
            entries = list(dataframe.index)
            # entries are rows here: transpose once so each is a column lookup
            by_entry = dataframe.T
        else:
            entries = list(dataframe.columns)
            by_entry = dataframe

        # raw string: TeX needs a literal backslash before the percent sign
        if using_tex:
            percent_template = r'%s (%.2f\%%)'
        else:
            percent_template = '%s (%.2f %%)'

        the_labs = []
        for entry in entries:
            if absolutes:
                the_labs.append('%s (n=%d)' % (entry, by_entry[entry].sum()))
            elif was_series:
                # .iloc: take the first value by position, whatever the
                # single column happens to be called
                perc = by_entry[entry].iloc[0]
                the_labs.append(percent_template % (entry, perc))
            else:
                the_labs.append(entry)

        if not was_series:
            dataframe.columns = the_labs
            return dataframe

        vals = list(dataframe[list(dataframe.columns)[0]].values)
        renamed = pandas.DataFrame(vals, index=the_labs)
        renamed.columns = ['Total']
        return renamed

    def auto_explode(dataframe, input, was_series=False, num_to_plot=7):
        """Turn entry names/positions into a pie 'explode' list.

        :param dataframe: Data being plotted
        :type dataframe: pandas.core.frame.DataFrame
        :param input: Entry name, entry position, or a list mixing both;
                      anything else leaves every slice unexploded
        :type input: str/int/list
        :param was_series: Look names up in the index instead of the columns
        :type was_series: bool
        :param num_to_plot: Length of the list to produce
        :type num_to_plot: int
        :returns: list of *num_to_plot* offsets, 0.1 for each requested slice
                  and 0 for the rest
        """
        offsets = [0 for _ in range(num_to_plot)]
        labels = list(dataframe.index) if was_series else list(dataframe.columns)

        # a lone name or position is treated as a one-item list
        wanted = [input] if type(input) in (str, int) else input
        if type(wanted) != list:
            return offsets

        for item in wanted:
            # names are translated to their position; anything else is
            # taken to be a position already
            position = labels.index(item) if type(item) == str else item
            offsets[position] = 0.1
        return offsets

    # check if we're doing subplots
    sbplt = False
    if 'subplots' in kwargs:
        if kwargs['subplots'] is True:
            sbplt = True
    kwargs['subplots'] = sbplt

    if colours is True:
        colours = 'Paired'

    # todo: get this dynamically instead.
    styles = [
        'dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight',
        'matplotlib', False, 'mpl-white'
    ]
    #if style not in styles:
    #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = 'matplotlib'

    if style is not False and style.startswith('seaborn'):
        colours = False

    # use 'draggable = True' to make a draggable legend
    dragmode = kwargs.pop('draggable', False)

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True

    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except:
            matplotlib.rc('font', family='sans-serif')
            matplotlib.rc('font', serif='Helvetica Neue')
            matplotlib.rc('text', usetex='false')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind.lower()

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            interactive_types = None
            warnings.warn(
                'Interactive plotting not yet available for pie plots.')
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == 'pie':
        piemode = True
        # always the best spot for pie
        #if legend_pos == 'best':
        #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    was_series = False
    if type(dataframe) == pandas.core.series.Series:
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if transpose:
            dataframe = dataframe.T
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    try:
        dataframe.index = [int(i) for i in list(dataframe.index)]
    except:
        pass

    # remove totals and tkinter order
    if not was_series and not all(x.lower() == 'total'
                                  for x in list(dataframe.columns)):
        for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                            [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis=ax, errors='ignore')
            except:
                pass
    else:
        dataframe = dataframe.drop('tkintertable-order', errors='ignore')
        dataframe = dataframe.drop('tkintertable-order',
                                   axis=1,
                                   errors='ignore')

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':

            def isint(x):
                """Say whether *x* represents a whole number.

                :param x: Value to test, e.g. a column label
                :returns: True if ``float(x)`` succeeds and has no fractional
                          part; False otherwise, including for non-numeric,
                          infinite or NaN input
                :rtype: bool
                """
                try:
                    a = float(x)
                    b = int(a)
                # A tuple is required to catch more than one exception type:
                # 'ValueError or OverflowError' is just ValueError, which let
                # the OverflowError from int(inf) escape. float() raises
                # TypeError for labels such as None or tuples.
                except (TypeError, ValueError, OverflowError):
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = [
        'svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf'
    ]
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' %
                         (output_format, ', '.join(output_formats)))

    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format)
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            if not piemode:
                num_to_plot = len(dataframe)
            else:
                num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe,
                                         kwargs['explode'],
                                         was_series=was_series,
                                         num_to_plot=num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', True)

    #cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == 'Total':
            plotting_a_totals_column = True
            if not 'legend' in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis=1, errors='ignore')
    except:
        pass
    try:
        dataframe.ix['p']
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]['p']

                def p_string_formatter(val):
                    """Format a p value for display in a legend entry.

                    Reads ``using_tex`` from the enclosing scope to decide
                    whether the less-than sign is wrapped in TeX math mode.

                    :param val: The p value
                    :type val: float
                    :returns: 'p = x.xxx', or a 'p < 0.001' style string for
                              values under 0.001
                    :rtype: str
                    """
                    if not val < 0.001:
                        return 'p = ' + format(val, '.3f')
                    return r'p $<$ 0.001' if using_tex else 'p < 0.001'

                pstr = p_string_formatter(pval)
                newname = '%s (%s)' % (col, pstr)
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')
        else:
            warnings.warn(
                'No p-values calculated to show.\n\nUse sort_by and keep_stats in editor() to generate these values.'
            )
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')

    # make and set y label
    absolutes = True
    if type(dataframe) == pandas.core.frame.DataFrame:
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    #  use colormap if need be:
    if num_to_plot > 0:
        if not was_series:
            if kind in ['pie', 'line', 'area']:
                if colours:
                    if not plotting_a_totals_column:
                        if colours == 'Default':
                            colours = 'Paired'
                        kwargs['colormap'] = colours
        #else:
            if colours:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours

    if piemode:
        if num_to_plot > 0:
            if colours == 'Default':
                colours = 'Paired'
            kwargs['colormap'] = colours
        else:
            if num_to_plot > 0:
                if colours == 'Default':
                    colours = 'Paired'
                kwargs['colormap'] = colours

    # multicoloured bar charts
    if colours:
        if kind.startswith('bar'):
            if len(list(dataframe.columns)) == 1:
                if not black_and_white:
                    import numpy as np
                    the_range = np.linspace(0, 1, num_to_plot)
                    middle = len(the_range) / 2
                    cmap = plt.get_cmap(colours)
                    kwargs['color'] = [cmap(n) for n in the_range][middle]
                # make a bar width ... ? ...
                #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5

    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ['barh', 'area']:
        if reverse_legend == 'guess':
            rev_leg = True
    if not 'rev_leg' in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except:
        pass

    # rotate automatically
    if 'rot' not in kwargs:
        if not was_series:
            xvals = [str(i) for i in list(dataframe.index)[:num_to_plot]]
            #if 'kind' in kwargs:
            #if kwargs['kind'] in ['barh', 'area']:
            #xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        else:
            xvals = [str(i) for i in list(dataframe.columns)[:num_to_plot]]
        if len(max(xvals, key=len)) > 6:
            if not piemode:
                kwargs['rot'] = 45
        else:
            kwargs['rot'] = False

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title

    # no interactive subplots yet:
    if sbplt and interactive:
        import warnings
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return

    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot / 7
        # kwarg options go in leg_options
        leg_options = {
            'framealpha': .8,
            'shadow': kwargs.get('shadow', False),
            'ncol': kwargs.pop('ncol', 1)
        }

        # determine legend position based on this dict
        if legend_pos:
            possible = {
                'best': 0,
                'upper right': 1,
                'upper left': 2,
                'lower left': 3,
                'lower right': 4,
                'right': 5,
                'center left': 6,
                'center right': 7,
                'lower center': 8,
                'upper center': 9,
                'center': 10,
                'o r': 2,
                'outside right': 2,
                'outside upper right': 2,
                'outside center right': 'center left',
                'outside lower right': 'lower left'
            }

            if type(legend_pos) == int:
                the_loc = legend_pos
            elif type(legend_pos) == str:
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError(
                        'legend_pos value must be one of:\n%s\n or an int between 0-10.'
                        % ', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['loc'] == 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if type(legend_pos) == str:
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe,
                                               was_series=was_series,
                                               using_tex=using_tex,
                                               absolutes=absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe,
                                                   was_series=was_series,
                                                   using_tex=using_tex,
                                                   absolutes=absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.columns)
        else:
            if pie_legend:
                kwargs['legend'] = False
                if was_series:
                    leg_options['labels'] = list(dataframe.index)
                else:
                    leg_options['labels'] = list(dataframe.index)

    def filler(df):
        """Rescale every row of *df* to percentages of that row's total.

        Works on a transposed copy, so the frame passed in is not modified.

        :param df: data whose rows should each sum to 100
        :type df: pandas.core.frame.DataFrame
        :returns: a new frame with the same orientation as *df*
        """
        transposed = df.T.copy()
        # each column of the transposed copy is one row of the original frame
        for label in transposed.columns:
            column = transposed[label]
            transposed[label] = column * 100.0 / column.sum()
        return transposed.T

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1

    # convert dates --- works only in my current case!
    if plotting_a_totals_column or not was_series:
        try:
            can_it_be_int = int(list(dataframe.index)[0])
            can_be_int = True
        except:
            can_be_int = False
        if can_be_int:
            if 1500 < int(list(dataframe.index)[0]):
                if 2050 > int(list(dataframe.index)[0]):
                    n = pandas.PeriodIndex([d for d in list(dataframe.index)],
                                           freq='A')
                    dataframe = dataframe.set_index(n)

        if kwargs.get('filled'):
            if areamode or kind.startswith('bar'):
                dataframe = filler(dataframe)
            kwargs.pop('filled', None)

    MARKERSIZE = 4
    COLORMAP = {
        0: {
            'marker': None,
            'dash': (None, None)
        },
        1: {
            'marker': None,
            'dash': [5, 5]
        },
        2: {
            'marker': "o",
            'dash': (None, None)
        },
        3: {
            'marker': None,
            'dash': [1, 3]
        },
        4: {
            'marker': "s",
            'dash': [5, 2, 5, 2, 5, 10]
        },
        5: {
            'marker': None,
            'dash': [5, 3, 1, 2, 1, 10]
        },
        6: {
            'marker': 'o',
            'dash': (None, None)
        },
        7: {
            'marker': None,
            'dash': [5, 3, 1, 3]
        },
        8: {
            'marker': "1",
            'dash': [1, 3]
        },
        9: {
            'marker': "*",
            'dash': [5, 5]
        },
        10: {
            'marker': "2",
            'dash': [5, 2, 5, 2, 5, 10]
        },
        11: {
            'marker': "s",
            'dash': (None, None)
        }
    }

    HATCHES = {
        0: {
            'color': '#dfdfdf',
            'hatch': "/"
        },
        1: {
            'color': '#6f6f6f',
            'hatch': "\\"
        },
        2: {
            'color': 'b',
            'hatch': "|"
        },
        3: {
            'color': '#dfdfdf',
            'hatch': "-"
        },
        4: {
            'color': '#6f6f6f',
            'hatch': "+"
        },
        5: {
            'color': 'b',
            'hatch': "x"
        }
    }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs['colormap'] = new_cmap

    class dummy_context_mgr:
        """Do-nothing context manager used when no plot style is applied.

        Possibly made obsolete by the 'classic' style in newer matplotlib.
        """

        def __exit__(self, one, two, three):
            # a false result tells ``with`` not to suppress an exception
            # raised inside the managed block
            return False

        def __enter__(self):
            # nothing useful to bind with ``with ... as``
            return

    with plt.style.context(
        (style)) if style != 'matplotlib' else dummy_context_mgr():

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            ax = dataframe.plot(figsize=figsize, **kwargs)
            if areamode:
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels
        else:
            plt.gcf().set_tight_layout(False)
            if not piemode:
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                ax = dataframe.plot(figsize=figsize, **kwargs)
                handles, labels = plt.gca().get_legend_handles_labels()
                plt.legend(handles,
                           labels,
                           loc=leg_options['loc'],
                           bbox_to_anchor=(0, -0.1, 1, 1),
                           bbox_transform=plt.gcf().transFigure)

                # this line allows layouts with missing plots
                # i.e. layout = (5, 2) with only nine plots
                plt.gcf().set_tight_layout(False)

        if 'rot' in kwargs:
            if kwargs['rot'] != 0 and kwargs['rot'] != 90:
                labels = [item.get_text() for item in ax.get_xticklabels()]
                ax.set_xticklabels(labels, rotation=kwargs['rot'], ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color('black')
                    #line.set_width(1)
                    line.set_dashes(COLORMAP[c]['dash'])
                    line.set_marker(COLORMAP[c]['marker'])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(list(COLORMAP.keys())):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt:
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    if areamode:
                        handles = handles[-len(handles) / 2:]
                        labels = labels[-len(labels) / 2:]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    lgd = plt.legend(handles, labels, **leg_options)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(
                plt.gcf(),
                InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = [
                    '%s (%s: %d)' % (labels[i], x_val, y_val)
                    for x_val, y_val in zip(x_vals, y_vals)
                ]
            else:
                ls = [
                    '%s (%s: %.2f%%)' % (labels[i], x_val, y_val)
                    for x_val, y_val in zip(x_vals, y_vals)
                ]
            if 2 in interactive_types:
                #if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(
                    lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                #else:
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l,
                                                                    labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
        x_label = 'Year'

    if x_label is not False:
        if type(x_label) == str:
            plt.xlabel(x_label)
        else:
            check_x_axis = list(
                dataframe.index
            )[0]  # get first entry# get second entry of first entry (year, count)
            try:
                if type(dataframe.index) == pandas.tseries.period.PeriodIndex:
                    x_label = 'Year'
                check_x_axis = int(check_x_axis)
                if 1500 < check_x_axis < 2050:
                    x_label = 'Year'
                else:
                    x_label = 'Group'
            except:
                x_label = 'Group'

        if not sbplt:
            if not piemode:
                plt.xlabel(x_label)

    def is_number(s):
        """Check whether *s* can be converted to a float or a complex number.

        Never raises for unconvertible input: values that are neither
        strings nor numbers (``None``, tuples, ...) simply give ``False``.

        :param s: the value to test
        :returns: True if ``float(s)`` or ``complex(s)`` succeeds, else False
        :rtype: bool
        """
        # float covers int/long/float input; complex is the fallback
        for convert in (float, complex):
            try:
                convert(s)
            except (TypeError, ValueError):
                # ValueError: a string that this converter cannot parse
                # TypeError: not a string or number at all
                continue
            return True
        return False

    # for now, always turn off sci notation
    from matplotlib.ticker import ScalarFormatter
    if type(dataframe.index) != pandas.tseries.period.PeriodIndex:
        try:
            if all(is_number(s) for s in list(dataframe.index)):
                plt.gca().xaxis.set_major_formatter(ScalarFormatter())
        except:
            pass
    try:
        if all(is_number(s) for s in list(dataframe.columns)):
            plt.gca().yaxis.set_major_formatter(ScalarFormatter())
    except:
        pass

    # y labelling
    y_l = False
    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'

    def suplabel(axis,
                 label,
                 label_prop=None,
                 labelpad=5,
                 ha='center',
                 va='center'):
        """Write one shared x or y label onto the current figure.

        The label is centred at 0.5 along the chosen axis and pushed away
        from the smallest ``xmin``/``ymin`` found among the figure's axes by
        ``labelpad / dpi``. The text is positioned with the figure transform.

        :param axis: which label to add, "x" or "y" (case-insensitive)
        :type axis: str
        :param label: the text to draw
        :type label: str
        :param label_prop: extra keyword arguments passed on to the text call
        :type label_prop: dict
        :param labelpad: padding from the axes, divided by the figure dpi
        :param ha: horizontal alignment of the text
        :type ha: str
        :param va: vertical alignment of the text
        :type va: str
        :raises Exception: if *axis* is neither "x" nor "y"
        """
        figure = plt.gcf()
        # extents of every axes currently on the figure
        boxes = [subplot.get_position() for subplot in figure.axes]
        left_edge = min([box.xmin for box in boxes])
        bottom_edge = min([box.ymin for box in boxes])
        dots_per_inch = figure.dpi

        which = axis.lower()
        if which == "y":
            # vertical text, to the left of the left-most axes
            angle = 90.
            x_pos = left_edge - float(labelpad) / dots_per_inch
            y_pos = 0.5
        elif which == 'x':
            # horizontal text, below the lowest axes
            angle = 0.
            x_pos = 0.5
            y_pos = bottom_edge - float(labelpad) / dots_per_inch
        else:
            raise Exception("Unexpected axis: x or y")

        text_options = dict() if label_prop is None else label_prop
        figure.text(x_pos,
                    y_pos,
                    label,
                    rotation=angle,
                    transform=figure.transFigure,
                    ha=ha,
                    va=va,
                    **text_options)

    if y_label is not False:
        if not sbplt:
            if not piemode:
                if type(y_label) == str:
                    plt.ylabel(y_label)
                else:
                    plt.ylabel(y_l)
        else:
            if type(y_label) == str:
                the_y = y_label
            else:
                the_y = y_l
            #suplabel('y', the_y, labelpad = 1.5)
            plt.gcf().text(0.04, 0.5, the_y, va='center', rotation='vertical')
            #plt.subplots_adjust(left=0.5)

        #    if not piemode:
        #        if type(y_label) == str:
        #            plt.ylabel(y_label)
        #        else:
        #            plt.ylabel(y_l)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
            except:
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # show grid
            a.grid(b=kwargs.get('grid', False))
            kwargs.pop('grid', None)

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:
        if kind.startswith('bar'):
            width = ax.containers[0][0].get_width()

        # show grid
        ax.grid(b=kwargs.get('grid', False))
        kwargs.pop('grid', None)

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0, the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.ix[label]) == 1:
                    score = dataframe.ix[label][0]
                else:
                    if absolutes:
                        score = dataframe.ix[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score),
                                 ha='center',
                                 va='bottom')
                else:
                    plt.annotate(score, (i, score), ha='center', va='bottom')
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label][0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score),
                                 ha='center',
                                 va='bottom')
                else:
                    plt.annotate(score, (i, score), ha='center', va='bottom')

    plt.subplots_adjust(left=0.1)
    plt.subplots_adjust(bottom=0.18)

    if 'layout' not in kwargs:
        if not sbplt:
            plt.tight_layout()

    if save:
        import os
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder,
                                save=save,
                                title=title,
                                ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith('o'):
            plt.gcf().savefig(savename,
                              dpi=150,
                              bbox_extra_artists=(lgd, ),
                              bbox_inches='tight',
                              format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    if not interactive and not running_python_tex and not running_spider \
        and not tk:
        plt.gcf().show()
        return
    elif running_spider or tk:
        return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()