Code example #1
File: stats.py Project: tranduythanh/corpkit
def shannon(self):
    from corpkit.interrogation import Interrogation
    import numpy as np
    import pandas as pd

    def to_apply(ser):
        data = []
        for word in ser.index:
            if not ser[word]:
                probability = np.nan
                self_information = np.nan
            else:
                # nb: this normalises by the number of entries in the row,
                # not by the row's token total
                probability = ser[word] / float(len(ser))
                self_information = np.log2(1.0 / probability)
            data.append(self_information)
        return pd.Series(data, index=ser.index)

    res = getattr(self, 'results', self)
    appl = res.apply(to_apply, axis=1)
    # mean self-information per row, stored as the totals branch
    ents = appl.sum(axis=1) / appl.shape[1]
    ents.name = 'Entropy'
    return Interrogation(results=appl, totals=ents)
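A minimal usage sketch (hypothetical names; assumes `result` is an existing corpkit Interrogation object and that shannon() has been bound to the Interrogation class):

ent = result.shannon()   # results: per-entry self-information; totals: 'Entropy'
print(ent.totals)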
Code example #2
File: interrogator.py Project: ashhher3/corpkit
def interrogator(corpus, 
    search='w', 
    query='any',
    show='w',
    exclude=False,
    excludemode='any',
    searchmode='all',
    case_sensitive=False,
    save=False,
    subcorpora=False,
    just_metadata=False,
    skip_metadata=False,
    preserve_case=False,
    lemmatag=False,
    files_as_subcorpora=False,
    only_unique=False,
    only_format_match=True,
    multiprocess=False,
    spelling=False,
    regex_nonword_filter=r'[A-Za-z0-9]',
    gramsize=1,
    conc=False,
    maxconc=9999,
    window=None,
    no_closed=False,
    no_punct=True,
    discard=False,
    **kwargs):
    """
    Interrogate corpus, corpora, subcorpus and file objects.
    See corpkit.interrogation.interrogate() for docstring
    """
    
    conc = kwargs.get('do_concordancing', conc)
    quiet = kwargs.get('quiet', False)
    coref = kwargs.pop('coref', False)
    show_conc_metadata = kwargs.pop('show_conc_metadata', False)
    fsi_index = kwargs.pop('fsi_index', True)
    dep_type = kwargs.pop('dep_type', 'collapsed-ccprocessed-dependencies')

    nosubmode = subcorpora is None
    #todo: temporary
    #if getattr(corpus, '_dlist', False):
    #    subcorpora = 'file'

    # store kwargs and locs
    locs = locals().copy()
    locs.update(kwargs)
    locs.pop('kwargs', None)

    import codecs
    import signal
    import os
    from time import localtime, strftime
    from collections import Counter

    import pandas as pd
    from pandas import DataFrame, Series

    # STRINGTYPE, PYTHON_VERSION and INPUTFUNC are used throughout this function
    from corpkit.constants import STRINGTYPE, PYTHON_VERSION, INPUTFUNC
    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.corpus import Datalist, Corpora, Corpus, File, Subcorpus
    from corpkit.process import (tregex_engine, get_deps, unsplitter, sanitise_dict, 
                                 animator, filtermaker, fix_search,
                                 pat_format, auto_usecols, format_tregex,
                                 make_conc_lines_from_whole_mid)
    from corpkit.other import as_regex
    from corpkit.dictionaries.process_types import Wordlist
    from corpkit.build import check_jdk
    from corpkit.conll import pipeline
    from corpkit.process import delete_files_and_subcorpora
    
    have_java = check_jdk()

    # remake corpus without bad files and folders 
    corpus, skip_metadata, just_metadata = delete_files_and_subcorpora(corpus, skip_metadata, just_metadata)

    # so you can do corpus.interrogate('features/postags/wordclasses/lexicon')
    if search == 'features':
        search = 'v'
        query = 'any'
    if search in ['postags', 'wordclasses']:
        query = 'any'
        preserve_case = True
        show = 'p' if search == 'postags' else 'x'
        # use tregex if simple because it's faster
        # but use dependencies otherwise
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
    if search == 'lexicon':
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
        query = 'any'
        show = ['w']

    if not kwargs.get('cql') and isinstance(search, STRINGTYPE) and len(search) > 3:
        raise ValueError('search argument not recognised.')

    import re
    if regex_nonword_filter:
        is_a_word = re.compile(regex_nonword_filter)
    else:
        is_a_word = re.compile(r'.*')

    from traitlets import TraitError

    # convert cql-style queries---pop for the sake of multiprocessing
    cql = kwargs.pop('cql', None)
    if cql:
        from corpkit.cql import to_corpkit
        search, exclude = to_corpkit(search)

    def signal_handler(signal, _):
        """
        Allow pausing and restarting when not in GUI
        """
        if root:
            return  
        import signal
        import sys
        from time import localtime, strftime
        signal.signal(signal.SIGINT, original_sigint)
        thetime = strftime("%H:%M:%S", localtime())
        INPUTFUNC('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
        time = strftime("%H:%M:%S", localtime())
        print('%s: Interrogation resumed.\n' % time)
        signal.signal(signal.SIGINT, signal_handler)

    def add_adj_for_ngram(show, gramsize):
        """
        If there's a gramsize of more than 1, remake show
        for ngramming
        """
        if gramsize == 1:
            return show
        out = []
        for i in show:
            out.append(i)
        for i in range(1, gramsize):
            for bit in show:
                out.append('+%d%s' % (i, bit))
        return out

    def fix_show_bit(show_bit):
        """
        Take a single search/show_bit type, return match
        """
        ends = ['w', 'l', 'i', 'n', 'f', 'p', 'x', 's', 'a', 'e', 'c']
        starts = ['d', 'g', 'm', 'b', 'h', '+', '-', 'r', 'c']
        show_bit = show_bit.lstrip('n')
        show_bit = show_bit.lstrip('b')
        show_bit = list(show_bit)
        if show_bit[-1] not in ends:
            show_bit.append('w')
        if show_bit[0] not in starts:
            show_bit.insert(0, 'm')
        return ''.join(show_bit)
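    # illustrative behavior (assumed from the rules above):
    #   fix_show_bit('l') -> 'ml'   (no start code, so 'm' is prepended)
    #   fix_show_bit('g') -> 'gw'   (no end code, so 'w' is appended)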

    def fix_show(show, gramsize):
        """
        Lowercase anything in show and turn into list
        """
        if isinstance(show, list):
            show = [i.lower() for i in show]
        elif isinstance(show, STRINGTYPE):
            show = show.lower()
            show = [show]
        show = [fix_show_bit(i) for i in show]
        return add_adj_for_ngram(show, gramsize)
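    # illustrative behavior (assumed): fix_show('w', gramsize=2) -> ['mw', '+1mw']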

    def is_multiquery(corpus, search, query, outname):
        """
        Determine if multiprocessing is needed/possible, and
        retype some arguments if need be
        """
        is_mul = False
        from collections import OrderedDict
        from corpkit.dictionaries.process_types import Wordlist
        
        if isinstance(query, Wordlist):
            query = list(query)

        if subcorpora and multiprocess:
            is_mul = 'subcorpora'

        if isinstance(subcorpora, (list, tuple)):
            is_mul = 'subcorpora'

        if isinstance(query, (dict, OrderedDict)):
            is_mul = 'namedqueriessingle'
        
        if isinstance(search, dict):
            if all(isinstance(i, dict) for i in list(search.values())):
                is_mul = 'namedqueriesmultiple'
        return is_mul, corpus, search, query

    def ispunct(s):
        import string
        return all(c in string.punctuation for c in s)

    def uniquify(conc_lines):
        """get unique concordance lines"""
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (_, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines
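    # note (assumed): two conc lines that differ only in their first field
    # (e.g. the filename/index) count as duplicates here, since that field
    # is left out of the joined string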

    def compiler(pattern):
        """
        Compile regex or fail gracefully
        """
        if hasattr(pattern, 'pattern'):
            return pattern
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except Exception:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def determine_search_func(show):
        """Figure out what search function we're using"""

        simple_tregex_mode = False
        statsmode = False
        tree_to_text = False
        search_trees = False
            
        simp_crit = all(not i for i in [kwargs.get('tgrep'),
                                        files_as_subcorpora,
                                        subcorpora,
                                        just_metadata,
                                        skip_metadata])

        if search.get('t') and simp_crit:
            if have_java:
                simple_tregex_mode = True
            else:
                search_trees = 'tgrep'
            optiontext = 'Searching parse trees'

        elif datatype == 'conll':
        
            if any(i.endswith('t') for i in search.keys()):
                if have_java and not kwargs.get('tgrep'):
                    search_trees = 'tregex'
                else:
                    search_trees = 'tgrep'
                optiontext = 'Searching parse trees'
            elif any(i.endswith('v') for i in search.keys()):
                # either of these searchers now seems to work
                #searcher = get_stats_conll
                statsmode = True
                optiontext = 'General statistics'
            elif any(i.endswith('r') for i in search.keys()):
                optiontext = 'Distance from root'
            else:
                optiontext = 'Querying CONLL data'

        return optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees

    def get_tregex_values(show):
        """If using Tregex, set appropriate values

        - Check for valid query
        - Make 'any' query
        - Make list query
        """

        translated_option = 't'
        if isinstance(search['t'], Wordlist):
            search['t'] = list(search['t'])
        q = tregex_engine(corpus=False,
                          query=search.get('t'),
                          options=['-t'],
                          check_query=True,
                          root=root,
                          preserve_case=preserve_case
                         )

        # so many of these bad fixing loops!
        nshow = []
        for i in show:
            if i == 'm':
                nshow.append('w')
            else:
                nshow.append(i.lstrip('m'))
        show = nshow

        if q is False:
            return 'Bad query', None

        if isinstance(search['t'], list):
            regex = as_regex(search['t'], boundaries='line', case_sensitive=case_sensitive)
        else:
            regex = ''

        # listquery, anyquery, translated_option
        treg_dict = {'p': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'pl': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'x': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     't': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'o'],
                     'w': [r'/%s/ !< __' % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'c': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 'C'],
                     'l': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'u': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 'v']
                    }

        newshow = []

        listq, anyq, translated_option = treg_dict.get(show[0][-1].lower())
        newshow.append(translated_option)
        for item in show[1:]:
            _, _, noption = treg_dict.get(item.lower())
            newshow.append(noption)

        if isinstance(search['t'], list):
            search['t'] = listq
        elif search['t'] == 'any':   
            search['t'] = anyq
        return search['t'], newshow

    def correct_spelling(a_string):
        """correct spelling within a string"""
        if not spelling:
            return a_string
        from corpkit.dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r
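    # illustrative behavior (assumed direction of usa_convert; with
    # spelling='UK', 'color/risk' -> 'colour/risk')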

    def make_search_iterable(corpus):
        """determine how to structure the corpus for interrogation"""
        # skip file definitions if they are not needed
        if getattr(corpus, '_dlist', False):

            return {(i.name, i.path): [i] for i in list(corpus.files)}
            #return {('Sample', 'Sample'): list(corpus.files)}

        if simple_tregex_mode:
            if corpus.level in ['s', 'f', 'd']:
                return {(corpus.name, corpus.path): False}
            else:
                return {(os.path.basename(i), os.path.join(corpus.path, i)): False
                    for i in os.listdir(corpus.path)
                    if os.path.isdir(os.path.join(corpus.path, i))}

        if isinstance(corpus, Datalist):
            to_iterate_over = {}
            # it could be files or subcorpus objects
            if corpus[0].level in ['s', 'd']:
                if files_as_subcorpora:
                    for subc in corpus:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subc in corpus:
                        to_iterate_over[(subc.name, subc.path)] = subc.files
            elif corpus[0].level == 'f':
                for f in corpus:
                    to_iterate_over[(f.name, f.path)] = [f]
        elif corpus.singlefile:
            to_iterate_over = {(corpus.name, corpus.path): [corpus]}
        elif not hasattr(corpus, 'subcorpora') or not corpus.subcorpora:
            # just files in a directory
            if files_as_subcorpora:
                to_iterate_over = {}
                for f in corpus.files:
                    to_iterate_over[(f.name, f.path)] = [f]
            else:
                to_iterate_over = {(corpus.name, corpus.path): corpus.files}
        else:
            to_iterate_over = {}
            if files_as_subcorpora:
                # don't know if possible: has subcorpora but also .files
                if hasattr(corpus, 'files') and corpus.files is not None:
                    for f in corpus.files:
                        to_iterate_over[(f.name, f.path)] = [f]
                # has subcorpora with files in those
                elif hasattr(corpus, 'files') and corpus.files is None:
                    for subc in corpus.subcorpora:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
            else:
                if corpus[0].level == 's':
                    for subcorpus in corpus:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
                elif corpus[0].level == 'f':
                    for f in corpus:
                        to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subcorpus in corpus.subcorpora:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        return to_iterate_over

    def welcome_printer(return_it=False):
        """Print welcome message"""
        if no_conc:
            message = 'Interrogating'
        else:
            message = 'Interrogating and concordancing'
        if only_conc:
            message = 'Concordancing'
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            from corpkit.process import dictformat
            sformat = dictformat(search)
            welcome = ('\n%s: %s %s ...\n          %s\n          ' \
                        'Query: %s\n          %s corpus ... \n' % \
                      (thetime, message, cname, optiontext, sformat, message))
            if return_it:
                return welcome
            else:
                print(welcome)

    def goodbye_printer(return_it=False, only_conc=False):
        """Say goodbye before exiting"""
        if not kwargs.get('printstatus', True):
            return
        thetime = strftime("%H:%M:%S", localtime())
        if only_conc:
            finalstring = '\n\n%s: Concordancing finished! %s results.' % (thetime, format(len(conc_df), ','))
        else:
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %s matches.' % format(tot, ',')
            else:
                finalstring += ' %s unique results, %s total occurrences.' % (format(numentries, ','), format(total_total, ','))
        if return_it:
            return finalstring
        else:
            print(finalstring)

    def get_conc_colnames(corpus,
                          fsi_index=False,
                          simple_tregex_mode=False):
    
        fields = []
        base = 'c f s l m r'
        
        if simple_tregex_mode:
            base = base.replace('f ', '')

        if fsi_index and not simple_tregex_mode:
            base = 'i ' + base
        
        if PYTHON_VERSION == 2:
            base = base.encode('utf-8').split()
        else:
            base = base.split() 

        if show_conc_metadata:
            from corpkit.build import get_all_metadata_fields
            meta = get_all_metadata_fields(corpus.path)

            if isinstance(show_conc_metadata, list):
                meta = [i for i in meta if i in show_conc_metadata]
            #elif show_conc_metadata is True:
            #    pass
            for i in sorted(meta):
                if i in ['speaker', 'sent_id', 'parse']:
                    continue
                if PYTHON_VERSION == 2:
                    base.append(i.encode('utf-8'))
                else:
                    base.append(i)
        return base

    def make_conc_obj_from_conclines(conc_results, fsi_index=False):
        """
        Turn conclines into DataFrame
        """
        from corpkit.interrogation import Concordance
        #fsi_place = 2 if fsi_index else 0

        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            for lin in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                #if not subcorpora:
                #    lin[fsi_place] = lin[fsi_place]
                #lin.insert(fsi_place, sc_name)

                if len(lin) < len(conc_col_names):
                    diff = len(conc_col_names) - len(lin)
                    lin.extend(['none'] * diff)

                all_conc_lines.append(Series(lin, index=conc_col_names))

        try:
            conc_df = pd.concat(all_conc_lines, axis=1).T
        except ValueError:
            return
        
        if all(x == '' for x in list(conc_df['s'].values)) or \
           all(x == 'none' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis=1, inplace=True)

        locs['corpus'] = corpus.name

        if maxconc:
            conc_df = Concordance(conc_df[:maxconc])
        else:
            conc_df = Concordance(conc_df)
        try:
            conc_df.query = locs
        except AttributeError:
            pass
        return conc_df

    def lowercase_result(res):
        """      
        Take any result and do spelling/lowercasing if need be

        todo: remove lowercase and change name
        """
        if not res or statsmode:
            return res
        # this is likely broken, but spelling in interrogate is deprecated anyway
        if spelling:
            res = [correct_spelling(r) for r in res]
        return res

    def postprocess_concline(line, fsi_index=False, conc=False):
        # todo: are these right?
        if not conc:
            return line
        subc, star, en = 0, 2, 5
        if fsi_index:
            subc, star, en = 2, 4, 7
        if not preserve_case:
            line[star:en] = [str(x).lower() for x in line[star:en]]
        if spelling:
            line[star:en] = [correct_spelling(str(b)) for b in line[star:en]]
        return line

    def make_progress_bar():
        """generate a progress bar"""

        if simple_tregex_mode:
            total_files = len(list(to_iterate_over.keys()))
        else:
            total_files = sum(len(x) for x in list(to_iterate_over.values()))

        par_args = {'printstatus': kwargs.get('printstatus', True),
                    'root': root, 
                    'note': note,
                    'quiet': quiet,
                    'length': total_files,
                    'startnum': kwargs.get('startnum'),
                    'denom': kwargs.get('denominator', 1)}

        term = None
        if kwargs.get('paralleling', None) is not None:
            from blessings import Terminal
            term = Terminal()
            par_args['terminal'] = term
            par_args['linenum'] = kwargs.get('paralleling')

        if in_notebook:
            par_args['welcome_message'] = welcome_message

        outn = kwargs.get('outname', '')
        if outn:
            outn = getattr(outn, 'name', outn)
            outn = outn + ': '

        tstr = '%s%d/%d' % (outn, current_iter, total_files)
        p = animator(None, None, init=True, tot_string=tstr, **par_args)
        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
        animator(p, current_iter, tstr, **par_args)
        return p, outn, total_files, par_args

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')
    language_model = kwargs.get('language_model')

    # set up pause method
    original_sigint = signal.getsignal(signal.SIGINT)
    if kwargs.get('paralleling', None) is None:
        if not root:
            original_sigint = signal.getsignal(signal.SIGINT)
            signal.signal(signal.SIGINT, signal_handler)

    # find out about concordancing
    only_conc = False
    no_conc = False
    if conc is False:
        no_conc = True
    if isinstance(conc, str) and conc.lower() == 'only':
        only_conc = True
        no_conc = False
    numconc = 0

    # wipe non essential class attributes to not bloat query attrib
    if isinstance(corpus, Corpus):
        import copy
        corpus = copy.copy(corpus)
        for k, v in list(corpus.__dict__.items()):  # list() so we can pop while iterating
            if isinstance(v, (Interrogation, Interrodict)):
                corpus.__dict__.pop(k, None)

    # convert path to corpus object
    if not isinstance(corpus, (Corpus, Corpora, Subcorpus, File, Datalist)):
        if not multiprocess and not kwargs.get('outname'):
            corpus = Corpus(corpus, print_info=False)

    # figure out how the user has entered the query and show, and normalise
    from corpkit.process import searchfixer
    search = searchfixer(search, query)
    show = fix_show(show, gramsize)
    locs['show'] = show

    # instantiate lemmatiser if need be
    lem_instance = False
    if any(i.endswith('l') for i in show) and isinstance(search, dict) and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lem_instance = WordNetLemmatizer()

    # do multiprocessing if need be
    im, corpus, search, query = is_multiquery(corpus, search, query,
                                              kwargs.get('outname', False))

    # figure out if we can multiprocess the corpus
    if hasattr(corpus, '__iter__') and im:
        corpus = Corpus(corpus, print_info=False)
    if hasattr(corpus, '__iter__') and not im:
        im = 'datalist'
    if isinstance(corpus, Corpora):
        im = 'multiplecorpora'

    # split corpus if the user wants multiprocessing but no other iterable
    if not im and multiprocess:
        im = 'datalist'
        if getattr(corpus, 'subcorpora', False):
            corpus = corpus[:]
        else:
            corpus = corpus.files

    search = fix_search(search, case_sensitive=case_sensitive, root=root)
    exclude = fix_search(exclude, case_sensitive=case_sensitive, root=root)

    # if it's already been through pmultiquery, don't do it again
    locs['search'] = search
    locs['exclude'] = exclude
    locs['query'] = query
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess
    locs['print_info'] = kwargs.get('printstatus', True)
    locs['multiple'] = im
    locs['subcorpora'] = subcorpora
    locs['nosubmode'] = nosubmode

    # send to multiprocess function
    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from corpkit.multiprocess import pmultiquery
        return pmultiquery(**locs)

    # get corpus metadata
    cname = corpus.name
    if isinstance(save, STRINGTYPE):
        savename = corpus.name + '-' + save
    if save is True:
        raise ValueError('save must be str, not bool.')


    datatype = getattr(corpus, 'datatype', 'conll')
    singlefile = getattr(corpus, 'singlefile', False)
    level = getattr(corpus, 'level', 'c')
        
    # store all results in here
    from collections import defaultdict
    results = defaultdict(Counter)
    count_results = defaultdict(list)
    conc_results = defaultdict(list)

    # check if just counting, turn off conc if so
    countmode = 'c' in show or 'mc' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    # Determine the search function to be used #
    optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees = determine_search_func(show)
    
    # no conc for statsmode
    if statsmode:
        no_conc = True
        only_conc = False
        conc = False

    # Set some Tregex-related values
    translated_option = False
    if search.get('t'):
        query, translated_option = get_tregex_values(show)
        if query == 'Bad query' and translated_option is None:
            if root:
                return 'Bad query'
            else:
                return
    # more tregex options
    if tree_to_text:
        treg_q = r'ROOT << __'
        op = ['-o', '-t', '-w', '-f']
    elif simple_tregex_mode:
        treg_q = search['t']
        op = ['-%s' % i for i in translated_option] + ['-o', '-f']

    # make iterable object for corpus interrogation
    to_iterate_over = make_search_iterable(corpus)

    try:
        from ipywidgets import IntProgress
        _ = IntProgress(min=0, max=10, value=1)
        in_notebook = True
    except TraitError:
        in_notebook = False
    except ImportError:
        in_notebook = False
    # caused in newest ipython
    except AttributeError:
        in_notebook = False

    lemtag = False
    if search.get('t'):
        from corpkit.process import gettag
        lemtag = gettag(search.get('t'), lemmatag)

    usecols = auto_usecols(search, exclude, show, kwargs.pop('usecols', None), coref=coref)

    # print welcome message
    welcome_message = welcome_printer(return_it=in_notebook)

    # create a progress bar
    p, outn, total_files, par_args = make_progress_bar()

    if conc:
        conc_col_names = get_conc_colnames(corpus,
                                           fsi_index=fsi_index,
                                           simple_tregex_mode=False)

    # Iterate over data, doing interrogations
    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):
        if nosubmode:
            subcorpus_name = 'Total'

        # results for subcorpus go here
        #conc_results[subcorpus_name] = []
        #count_results[subcorpus_name] = []
        #results[subcorpus_name] = Counter()

        # get either everything (tree_to_text) or the search['t'] query
        if tree_to_text or simple_tregex_mode:
            result = tregex_engine(query=treg_q,
                                   options=op,
                                   corpus=subcorpus_path,
                                   root=root,
                                   preserve_case=preserve_case)

            # format search results with slashes etc
            if not countmode and not tree_to_text:
                result = format_tregex(result, show, translated_option=translated_option,
                            exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                            lem_instance=lem_instance, countmode=countmode, speaker_data=False)

            # if concordancing, do the query again with 'whole' sent and fname
            if not no_conc:
                ops = ['-w'] + op
                #ops = [i for i in ops if i != '-n']
                whole_result = tregex_engine(query=search['t'],
                                             options=ops,
                                             corpus=subcorpus_path,
                                             root=root,
                                             preserve_case=preserve_case
                                            )

                # format match too depending on option
                if not only_format_match:
                    whole_result = format_tregex(whole_result, show, translated_option=translated_option,
                                                 exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                                                 lem_instance=lem_instance, countmode=countmode,
                                                 speaker_data=False, whole=True)

                # make conc lines from conc results
                conc_result = make_conc_lines_from_whole_mid(whole_result, result, show=show)
                for lin in conc_result:
                    if maxconc is False or numconc < maxconc:
                        conc_results[subcorpus_name].append(lin)
                    numconc += 1

            # add matches to ongoing counts
            if countmode:
                count_results[subcorpus_name] += [result]            
            else:
                if result:
                    results[subcorpus_name] += Counter([i[-1] for i in result])
                else:
                    results[subcorpus_name] += Counter()

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)
            continue

        # todo: move this
        kwargs.pop('by_metadata', None)
        
        # conll querying goes by file, not subcorpus
        for f in files:
            slow_treg_speaker_guess = kwargs.get('outname', '') if kwargs.get('multispeaker') else ''
            filepath, corefs = f.path, coref
            res, conc_res = pipeline(filepath, search=search, show=show,
                                     dep_type=dep_type,
                                     exclude=exclude,
                                     excludemode=excludemode,
                                     searchmode=searchmode,
                                     case_sensitive=case_sensitive,
                                     conc=conc,
                                     only_format_match=only_format_match,
                                     speaker=slow_treg_speaker_guess,
                                     gramsize=gramsize,
                                     no_punct=no_punct,
                                     no_closed=no_closed,
                                     window=window,
                                     filename=f.path,
                                     coref=corefs,
                                     countmode=countmode,
                                     maxconc=(maxconc, numconc),
                                     is_a_word=is_a_word,
                                     by_metadata=subcorpora,
                                     show_conc_metadata=show_conc_metadata,
                                     just_metadata=just_metadata,
                                     skip_metadata=skip_metadata,
                                     fsi_index=fsi_index,
                                     category=subcorpus_name,
                                     translated_option=translated_option,
                                     statsmode=statsmode,
                                     preserve_case=preserve_case,
                                     usecols=usecols,
                                     search_trees=search_trees,
                                     lem_instance=lem_instance,
                                     lemtag=lemtag,
                                     **kwargs)

            if res is None and conc_res is None:
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # deal with symbolic structures---that is, rather than adding
            # results by subcorpora, add them by metadata value
            # todo: sorting?
            if subcorpora:
                for (k, v), concl in zip(res.items(), conc_res.values()):                            
                    v = lowercase_result(v)
                    results[k] += Counter(v)
                    for line in concl:
                        if maxconc is False or numconc < maxconc:
                            line = postprocess_concline(line,
                                fsi_index=fsi_index, conc=conc)
                            conc_results[k].append(line)
                            numconc += 1
                
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # garbage collection needed?
            sents = None
            corefs = None
                
            if res == 'Bad query':
                return 'Bad query'

            if countmode:
                count_results[subcorpus_name] += [res]

            else:
                # add filename and do lowercasing for conc
                if not no_conc:
                    for line in conc_res:
                        line = postprocess_concline(line,
                            fsi_index=fsi_index, conc=conc)
                        if maxconc is False or numconc < maxconc:
                            conc_results[subcorpus_name].append(line)
                            numconc += 1

                # do lowercasing and spelling
                if not only_conc:
                    res = lowercase_result(res)
                    # discard removes low results, helping with 
                    # curse of dimensionality
                    countres = Counter(res)
                    if isinstance(discard, float):
                        # keep the top (1 - discard) share of entries
                        nkeep = len(countres) - len(countres) * discard
                        countres = Counter({k: v for i, (k, v) in enumerate(countres.most_common()) if i <= nkeep})
                    elif isinstance(discard, int):
                        countres = Counter({k: v for k, v in countres.most_common() if v >= discard})
                    results[subcorpus_name] += countres
                    #else:
                    #results[subcorpus_name] += res

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

    # Get concordances into DataFrame, return if just conc
    if not no_conc:
        # fail on this line with typeerror if no results?
        conc_df = make_conc_obj_from_conclines(conc_results, fsi_index=fsi_index)
        if only_conc and conc_df is None:
            return
        elif only_conc:
            locs = sanitise_dict(locs)
            try:
                conc_df.query = locs
            except AttributeError:
                return conc_df
            if save and not kwargs.get('outname'):
                if conc_df is not None:
                    conc_df.save(savename)
            goodbye_printer(only_conc=True)
            if not root:
                signal.signal(signal.SIGINT, original_sigint)            
            return conc_df
    else:
        conc_df = None

    # Get interrogation into DataFrame
    if countmode:
        df = Series({k: sum(v) for k, v in sorted(count_results.items())})
        tot = df.sum()
    else:
        the_big_dict = {}
        unique_results = set(item for sublist in list(results.values()) for item in sublist)
        sortres = sorted(results.items(), key=lambda x: x[0])
        for word in unique_results:
            the_big_dict[word] = [subcorp_result[word] for _, subcorp_result in sortres]
        # turn master dict into dataframe, sorted
        df = DataFrame(the_big_dict, index=sorted(results.keys()))

        # for ngrams, remove hapaxes
        #if show_ngram or show_collocates:
        #    if not language_model:
        #        df = df[[i for i in list(df.columns) if df[i].sum() > 1]]

        numentries = len(df.columns)
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    # turn df into series if all conditions met
    conds = [countmode,
             files_as_subcorpora,
             subcorpora,
             kwargs.get('df1_always_df', False)]
    anyxs = [level == 's',
             singlefile,
             nosubmode]
    if all(not x for x in conds) and any(x for x in anyxs):
        df = Series(df.iloc[0])
        df.sort_values(ascending=False, inplace=True)
        tot = df.sum()
        numentries = len(df.index)
        total_total = tot

    # turn data into DF for GUI if need be
    if isinstance(df, Series) and kwargs.get('df1_always_df', False):
        total_total = df.sum()
        df = DataFrame(df)
        tot = Series(total_total, index=['Total'])

    # if we're doing files as subcorpora,  we can remove the extension etc
    if isinstance(df, DataFrame) and files_as_subcorpora:
        cname = corpus.name.replace('-stripped', '').replace('-parsed', '')
        edits = [(r'(-[0-9][0-9][0-9])?\.txt\.conllu?', ''),
                 (r'-%s(-stripped)?(-parsed)?' % cname, '')]
        from corpkit.editor import editor
        df = editor(df, replace_subcorpus_names=edits).results
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    if conc_df is not None and conc_df is not False:
        # removed 'f' from here for now
        for col in ['c']:
            # strip longer extensions first so '.conllu' is not left half-stripped
            for pat in [r'\.txt', r'\.conllu', r'\.conll']:
                conc_df[col] = conc_df[col].str.replace(pat, '')
            conc_df[col] = conc_df[col].str.replace(r'-[0-9][0-9][0-9]$', '')

        #df.index = df.index.str.replace('w', 'this')

    # make interrogation object
    locs['corpus'] = corpus.path
    locs = sanitise_dict(locs)
    if nosubmode and isinstance(df, pd.DataFrame):
        df = df.sum()
    interro = Interrogation(results=df, totals=tot, query=locs, concordance=conc_df)

    # save it
    if save and not kwargs.get('outname'):
        print('\n')
        interro.save(savename)
    
    goodbye = goodbye_printer(return_it=in_notebook)
    if in_notebook:
        try:
            p.children[2].value = goodbye.replace('\n', '')
        except AttributeError:
            pass
    if not root:
        signal.signal(signal.SIGINT, original_sigint)
    return interro
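A hedged usage sketch (paths and query values are illustrative; in corpkit this function is normally reached through a corpus object's interrogate() method):

from corpkit import Corpus                    # assumption: top-level import path
corp = Corpus('data/example-parsed')          # hypothetical parsed corpus
res = interrogator(corp, search={'w': r'^risk'}, show='l')
res.results.head()                            # DataFrame: subcorpora x results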
Code example #3
File: interrogation.py Project: tranduythanh/corpkit
    def multiindex(self):
        """Create a `pandas.MultiIndex` version of results.

        :Example:

        >>> d = corpora.interrogate({F: 'compound', GL: '^risk'}, show=L)
        >>> d.keys()
            ['CHT', 'WAP', 'WSJ']
        >>> d['CHT'].results
            ....  health  cancer  security  credit  flight  safety  heart
            1987      87      25        28      13       7       6      4
            1988      72      24        20      15       7       4      9
            1989     137      61        23      10       5       5      6
        >>> d.multiindex().results
            ...               health  cancer  credit  security  downside  
            Corpus Subcorpus                                             
            CHT    1987           87      25      13        28        20 
                   1988           72      24      15        20        12 
                   1989          137      61      10        23        10                                                      
            WAP    1987           83      44       8        44        10 
                   1988           83      27      13        40         6 
                   1989           95      77      18        25        12 
            WSJ    1987           52      27      33         4        21 
                   1988           39      11      37         9        22 
                   1989           55      47      43         9        24 

        :returns: A :class:`corpkit.interrogation.Interrogation`
        """

        import pandas as pd
        import numpy as np
        from itertools import product
        from corpkit.interrogation import Interrodict, Interrogation

        query = self.query

        # nb: the mutable default arguments are safe only because trav is
        # re-created on every call to multiindex()
        def trav(dct,
                 parents={},
                 level=0,
                 colset=set(),
                 results=list(),
                 myparname=[]):
            from collections import defaultdict

            columns = False
            if hasattr(dct, 'items'):
                parents[level] = list(dct.keys())
                level += 1
                for k, v in list(dct.items()):
                    pars = myparname + [k]
                    # the below is only for python3
                    #pars = [*myparname, k]
                    trav(v,
                         parents=parents,
                         level=level,
                         results=results,
                         myparname=pars)
            else:
                if parents.get(level):
                    parents[level] |= set(dct.results.index)
                else:
                    parents[level] = set(dct.results.index)
                if not dct.results.empty:
                    for n, ser in dct.results.iterrows():
                        ser.name = tuple(myparname + [ser.name])
                        #ser.name = (*myparname, ser.name)
                        results.append(ser)
                    for c in list(dct.results.columns):
                        colset.add(c)
                    level += 1
            return results

        data = trav(self)
        index = [i.name for i in data]

        # todo: better default for speakers?
        if isinstance(self.query, dict) and self.query.get('subcorpora'):
            nms = {'names': self.query['subcorpora']}
        else:
            nms = {}
        ix = pd.MultiIndex.from_tuples(index, **nms)
        df = pd.DataFrame(data, index=ix)
        df = df.fillna(0).astype(int)
        df = df[df.sum().sort_values(ascending=False).index]
        totals = df.sum(axis=1)
        return Interrogation(results=df, totals=totals, query=query)
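The reshaping above hinges on pd.MultiIndex.from_tuples; a self-contained sketch of that step with synthetic data (not real corpkit output):

import pandas as pd
rows = [('CHT', 1987), ('CHT', 1988), ('WAP', 1987)]
ix = pd.MultiIndex.from_tuples(rows, names=['Corpus', 'Subcorpus'])
df = pd.DataFrame({'health': [87, 72, 83], 'cancer': [25, 24, 44]}, index=ix)
df = df.fillna(0).astype(int)
# order columns by total frequency, as multiindex() does
df = df[df.sum().sort_values(ascending=False).index]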
Code example #4
File: other.py Project: tranduythanh/corpkit
def make_multi(interrogation, indexnames=None):
    """
    make pd.multiindex version of an interrogation (for pandas geeks)

    :param interrogation: a corpkit interrogation
    :type interrogation: a corpkit interrogation, pd.DataFrame or pd.Series

    :param indexnames: pass in a list of names for the multiindex;
                       leave as None to get them if possible from interrogation
                       use False to explicitly not get them
    :type indexnames: list of strings/None/False
    :returns: a :class:`corpkit.interrogation.Interrogation` whose results have a pd.MultiIndex"""

    # get proper names for index if possible
    from corpkit.constants import transshow, transobjs

    import numpy as np
    import pandas as pd

    # if it's an interrodict, we want to make it into a single df
    import corpkit
    from corpkit.interrogation import Interrodict, Interrogation

    seriesmode = False

    if isinstance(interrogation, (Interrodict, dict)):
        flat = [[], [], []]

        for name, data in list(interrogation.items()):
            for subcorpus in list(data.results.index):
                # make multiindex
                flat[0].append(name)
                flat[1].append(subcorpus)
                # add results
                flat[2].append(data.results.loc[subcorpus])

        flat[0] = np.array(flat[0])
        flat[1] = np.array(flat[1])

        df = pd.DataFrame(flat[2], index=flat[:2])
        if indexnames is None:
            indexnames = ['Corpus', 'Subcorpus']
        df.index.names = indexnames
        df = df.fillna(0)
        df = df.T
        df[('Total', 'Total')] = df.sum(axis=1)
        df = df.sort_values(by=('Total', 'Total'), ascending=False).drop(
            ('Total', 'Total'), axis=1).T
        try:
            df = df.astype(int)
        except (ValueError, TypeError):
            pass
        return Interrogation(df, df.sum(axis=1),
                             getattr(interrogation, 'query', None))
    # determine datatype, get df and cols
    rows = False
    if isinstance(interrogation, pd.core.frame.DataFrame):
        df = interrogation
        cols = list(interrogation.columns)
        rows = list(interrogation.index)
    elif isinstance(interrogation, pd.core.series.Series):
        cols = list(interrogation.index)
        seriesmode = True
        df = pd.DataFrame(interrogation).T
    elif isinstance(interrogation, Interrogation):
        df = interrogation.results
        if isinstance(df, pd.core.series.Series):
            cols = list(df.index)
            seriesmode = True
            df = pd.DataFrame(df).T
        else:
            cols = list(df.columns)
            rows = list(df.index)

        # set indexnames if we have them
        if indexnames is not False:
            if interrogation.query.get('show'):
                indexnames = []
                ends = ['w', 'l', 'i', 'n', 'f', 'p', 'x', 's']
                for showval in interrogation.query['show']:
                    if len(showval) == 1:
                        if showval in ends:
                            showval = 'm' + showval
                        else:
                            showval = showval + 'w'
                    a = transobjs.get(showval[0], showval[0])
                    b = transshow.get(showval[-1], showval[-1])
                    indexstring = '%s %s' % (a, b.lower())
                    indexnames.append(indexstring)
            else:
                indexnames = False

    # split column names on slash
    for index, i in enumerate(cols):
        cols[index] = i.split('/')

    # make numpy arrays
    arrays = []
    for i in range(len(cols[0])):
        arrays.append(np.array([x[i] for x in cols]))

    # make output df, add names if we have them
    newdf = pd.DataFrame(df.T.values, index=arrays).T
    if indexnames:
        newdf.columns.names = indexnames

    if rows:
        newdf.index = rows

    pd.set_option('display.multi_sparse', False)
    totals = newdf.sum(axis=1)
    query = interrogation.query
    conco = getattr(interrogation, 'concordance', None)
    return Interrogation(newdf, totals, query, conco)
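A sketch of the column-splitting move in isolation (synthetic slash-joined names; 'Lemma'/'POS' are hypothetical level names):

import numpy as np
import pandas as pd
data = pd.DataFrame([[3, 1], [5, 2]], index=['1987', '1988'],
                    columns=['risk/nn', 'risks/nns'])
split = [c.split('/') for c in data.columns]
arrays = [np.array([x[i] for x in split]) for i in range(len(split[0]))]
multi = pd.DataFrame(data.T.values, index=arrays).T   # same move as make_multi()
multi.columns.names = ['Lemma', 'POS']
multi.index = data.index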
Code example #5
def editor(interrogation,
           operation=None,
           denominator=False,
           sort_by=False,
           keep_stats=False,
           keep_top=False,
           just_totals=False,
           threshold='medium',
           just_entries=False,
           skip_entries=False,
           span_entries=False,
           merge_entries=False,
           just_subcorpora=False,
           skip_subcorpora=False,
           span_subcorpora=False,
           merge_subcorpora=False,
           replace_names=False,
           replace_subcorpus_names=False,
           projection=False,
           remove_above_p=False,
           p=0.05,
           print_info=False,
           spelling=False,
           selfdrop=True,
           calc_all=True,
           keyword_measure='ll',
           **kwargs):
    """
    See corpkit.interrogation.Interrogation.edit() for docstring
    """

    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import corpkit

    import re
    import collections
    import pandas as pd
    import numpy as np

    from pandas import DataFrame, Series
    from time import localtime, strftime

    try:
        get_ipython().getoutput()
        have_ipython = True
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    # new ipython error
    except AttributeError:
        have_ipython = False
        pass

    # to use if we also need to worry about concordance lines
    return_conc = False

    from corpkit.interrogation import Interrodict, Interrogation, Concordance
    if isinstance(interrogation, Interrodict):
        locs.pop('interrogation', None)
        from collections import OrderedDict
        outdict = OrderedDict()
        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i != 0:
                locs['print_info'] = False

            if isinstance(denominator,
                          STRINGTYPE) and denominator.lower() == 'self':
                denominator = interrogation

            # if df2 is also a dict, get the relevant entry

            if isinstance(denominator, (dict, Interrodict)):
                #if sorted(set([i.lower() for i in list(dataframe1.keys())])) == \
                #   sorted(set([i.lower() for i in list(denominator.keys())])):
                #   locs['denominator'] = denominator[k]

                # fix: this repeats itself for every key, when it doesn't need to
                # denominator_sum:
                if kwargs.get('denominator_sum'):
                    locs['denominator'] = denominator.collapse(axis='key')
                elif kwargs.get('denominator_totals'):
                    locs['denominator'] = denominator[k].totals
                else:
                    locs['denominator'] = denominator[k].results

            outdict[k] = v.results.edit(**locs)
        if print_info:

            thetime = strftime("%H:%M:%S", localtime())
            print(
                "\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n"
                % (thetime, "'\n         '".join(sorted(outdict.keys()))))
        return Interrodict(outdict)

    elif isinstance(interrogation, (DataFrame, Series)):
        dataframe1 = interrogation
    elif isinstance(interrogation, Interrogation):
        #if interrogation.__dict__.get('concordance', None) is not None:
        #    concordances = interrogation.concordance
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('r'):
            dataframe1 = interrogation.results
        elif branch.lower().startswith('t'):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith('c'):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results

    elif isinstance(interrogation, Concordance) or \
            all(x in list(getattr(interrogation, 'columns', [])) for x in ['l', 'm', 'r']):
        return_conc = True
        dataframe1 = interrogation
    # hope for the best
    else:
        dataframe1 = interrogation

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None

    try:
        from process import checkstack
    except ImportError:
        from corpkit.process import checkstack

    if checkstack('pythontex'):
        print_info = False

    def combiney(df, df2, operation='%', threshold='medium', prinf=True):
        """
        Mash df and df2 together in appropriate way
        """
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(
                        df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = []
                        to_show.extend(to_drop[:5])
                        if len(to_drop) > 10:
                            to_show.append('...')
                            to_show.extend(to_drop[-5:])
                        if len(to_drop) > 0:
                            print(
                                'Removing %d entries below threshold:\n    %s'
                                % (len(to_drop), '\n    '.join(to_show)))
                        if len(to_drop) > 10:
                            print('... and %d more ... \n' %
                                  (len(to_drop) - len(to_show) + 1))
                        else:
                            print('')
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '+':
                try:
                    df = df.add(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '-':
                try:
                    df = df.sub(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis=0)
                except ValueError:

                    thetime = strftime("%H:%M:%S", localtime())
                    print(
                        '%s: cannot combine DataFrame 1 and 2: different shapes'
                        % thetime)

            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2

            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pd.concat([df, df2], axis=1)
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis=1).T

                def editf(datum):
                    meth = {
                        '%': datum.div,
                        '*': datum.mul,
                        '/': datum.div,
                        '+': datum.add,
                        '-': datum.sub
                    }

                    if datum.name in list(df2.columns):

                        method = meth[operation]
                        mathed = method(df2[datum.name], fill_value=0.0)
                        if operation == '%':
                            return mathed * 100.0
                        else:
                            return mathed
                    else:
                        return datum * 0.0

                df = df.apply(editf)

            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2.T.sum()

        return df, totals

    def skip_keep_merge_span(df):
        """
        Do all skipping, keeping, merging and spanning
        """
        from corpkit.dictionaries.process_types import Wordlist
        if skip_entries:
            if isinstance(skip_entries, (list, Wordlist)):
                df = df.drop(list(skip_entries), axis=1, errors='ignore')
            else:
                df = df.loc[:, ~df.columns.str.contains(skip_entries)]
        if just_entries:
            if isinstance(just_entries, (list, Wordlist)):
                je = [i for i in list(just_entries) if i in list(df.columns)]
                df = df[je]
            else:
                df = df.loc[:, df.columns.str.contains(just_entries)]
        if merge_entries:
            for newname, crit in merge_entries.items():
                if isinstance(crit, (list, Wordlist)):
                    cr = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[cr].sum(axis=1)
                    df = df.drop(list(cr), axis=1, errors='ignore')
                else:
                    summed = df.loc[:,
                                    df.columns.str.contains(crit)].sum(axis=1)
                    df = df.loc[:, ~df.columns.str.contains(crit)]
                df.insert(0, newname, summed, allow_duplicates=True)
        if span_entries:
            df = df.iloc[:, span_entries[0]:span_entries[1]]
        if skip_subcorpora:
            if isinstance(skip_subcorpora, (list, Wordlist)):
                df = df.drop(list(skip_subcorpora), axis=0, errors='ignore')
            else:
                df = df[~df.index.str.contains(skip_subcorpora)]
        if just_subcorpora:
            if isinstance(just_subcorpora, (list, Wordlist)):
                js = [i for i in list(just_subcorpora) if i in list(df.index)]
                df = df.loc[js]
            else:
                df = df[df.index.str.contains(just_subcorpora)]
        if merge_subcorpora:
            df = df.T
            for newname, crit in merge_subcorpora.items():
                if isinstance(crit, (list, Wordlist)):
                    crit = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[list(crit)].sum(axis=1)
                    df = df.drop(list(crit), axis=1, errors='ignore')
                else:
                    summed = df.loc[:,
                                    df.columns.str.contains(crit)].sum(axis=1)
                    df = df.loc[:, ~df.columns.str.contains(crit)]
                df.insert(0, newname, summed, allow_duplicates=True)
            df = df.T
        if span_subcorpora:
            df = df.iloc[span_subcorpora[0]:span_subcorpora[1], :]
        return df

    def parse_input(df, the_input):
        """turn whatever has been passed in into list of words that can 
           be used as pandas indices---maybe a bad way to go about it"""
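        # Accepted inputs: 'all' (becomes r'.*'), a regex string, an int or
        # list of ints (positional), or a list/Wordlist of exact names.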
        parsed_input = False
        import re
        if the_input == 'all':
            the_input = r'.*'
        if isinstance(the_input, int):
            the_input = [str(the_input)]
        elif isinstance(the_input, STRINGTYPE):
            regex = re.compile(the_input)
            parsed_input = [w for w in list(df) if re.search(regex, w)]
            return parsed_input
        from corpkit.dictionaries.process_types import Wordlist
        if isinstance(the_input, Wordlist):
            the_input = list(the_input)
        if isinstance(the_input, list):
            if isinstance(the_input[0], int):
                parsed_input = [
                    word for index, word in enumerate(list(df))
                    if index in the_input
                ]
            elif isinstance(the_input[0], STRINGTYPE):
                try:
                    parsed_input = [
                        word for word in the_input if word in df.columns
                    ]
                except AttributeError:  # if series
                    parsed_input = [
                        word for word in the_input if word in df.index
                    ]
        return parsed_input

    def synonymise(df, pos='n'):
        """pass a df and a pos and convert df columns to most common synonyms"""
        from nltk.corpus import wordnet as wn
        #from dictionaries.taxonomies import taxonomies
        from collections import Counter
        fixed = []
        for w in list(df.columns):
            try:
                synonyms = []
                for synset in wn.synsets(w, pos=pos):
                    for lemma in synset.lemma_names():
                        synonyms.append(lemma)
                top_syn = Counter(synonyms).most_common(1)[0][0]
                fixed.append(top_syn)
            except:
                fixed.append(w)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to='US', print_info=print_info):
        """turn dataframes into us/uk spelling"""
        from corpkit.dictionaries.word_transforms import usa_convert
        if print_info:
            print('Converting spelling ... \n')
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info=print_info):
        if print_info:
            print('Merging duplicate entries ... \n')
        # now we have to merge all duplicates
        for dup in df.columns[df.columns.duplicated()].unique():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis=1)
            #df = df.drop([dup for d in range(num_dupes)], axis=1)
            df = df.drop(dup, axis=1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info=print_info):
        """replace entry names and merge"""
        import re
        # get input into list of tuples
        # if it's a string, we want to delete it
        if isinstance(replace_names, STRINGTYPE):
            replace_names = [(replace_names, '')]
        # this is for some malformed list
        if not isinstance(replace_names, dict):
            if isinstance(replace_names[0], STRINGTYPE):
                replace_names = [replace_names]
        # if dict, make into list of tupes
        if isinstance(replace_names, dict):
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                if replacement:
                    print('Replacing "%s" with "%s" ...\n' %
                          (to_find, replacement))
                else:
                    print('Deleting "%s" from entry names ...\n' % to_find)
            to_find = re.compile(to_find)
            if not replacement:
                replacement = ''
            df.columns = [
                re.sub(to_find, replacement, l) for l in list(df.columns)
            ]
        df = merge_duplicates(df, print_info=False)
        return df

    def newname_getter(df,
                       parsed_input,
                       newname='combine',
                       prinf=True,
                       merging_subcorpora=False):
        """makes appropriate name for merged entries"""
        if merging_subcorpora:
            if newname is False:
                newname = 'combine'
        if isinstance(newname, int):
            the_newname = list(df.columns)[newname]
        elif isinstance(newname, STRINGTYPE):
            if newname == 'combine':
                if len(parsed_input) <= 3:
                    the_newname = '/'.join(parsed_input)
                elif len(parsed_input) > 3:
                    the_newname = '/'.join(parsed_input[:3]) + '...'
            else:
                the_newname = newname
        if not newname:
            # revise this code
            import operator
            sumdict = {}
            for item in parsed_input:
                summed = sum(list(df[item]))
                sumdict[item] = summed
            the_newname = max(iter(sumdict.items()),
                              key=operator.itemgetter(1))[0]
        if not isinstance(the_newname, STRINGTYPE):
            the_newname = str(the_newname)
        return the_newname

    def projector(df, list_of_tuples, prinf=True):
        """project abs values"""
        if isinstance(list_of_tuples, list):
            tdict = {}
            for a, b in list_of_tuples:
                tdict[a] = b
            list_of_tuples = tdict
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if isinstance(subcorpus, int):
                subcorpus = str(subcorpus)
            df.loc[subcorpus] = df.loc[subcorpus] * projection_value
            if prinf:
                if isinstance(projection_value, float):
                    print('Projection: %s * %s' %
                          (subcorpus, projection_value))
                if isinstance(projection_value, int):
                    print('Projection: %s * %d' %
                          (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def lingres(ser, index):
        from scipy.stats import linregress
        from pandas import Series
        ix = ['slope', 'intercept', 'r', 'p', 'stderr']
        return Series(linregress(index, ser.values), index=ix)

    def do_stats(df):
        """do linregress and add to df"""

        try:
            from scipy.stats import linregress
        except ImportError:
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this version of corpkit.' %
                  thetime)
            return False

        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = list(range(len(indices)))

        stats = df.apply(lingres, axis=0, index=x)
        df = df.append(stats)
        df = df.replace([np.inf, -np.inf], 0.0)

        return df

    def resort(df, sort_by=False, keep_stats=False):
        """
        Sort results, potentially using scipy's linregress
        """

        # translate options and make sure they are parseable
        stat_field = ['slope', 'intercept', 'r', 'p', 'stderr']
        easy_sorts = ['total', 'infreq', 'name', 'most', 'least', 'reverse']
        stat_sorts = ['increase', 'decrease', 'static', 'turbulent']
        options = stat_field + easy_sorts + stat_sorts
        sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'}
        sort_by = sort_by_convert.get(sort_by, sort_by)

        # sort the just_totals frame by its single column
        if just_totals:
            if sort_by == 'name':
                return df.sort_index()
            else:
                return df.sort_values(by='Combined total',
                                      ascending=sort_by != 'total')

        stats_done = False
        if keep_stats or sort_by in stat_field + stat_sorts:
            df = do_stats(df)
            stats_done = True
            if isinstance(df, bool):
                if df is False:
                    return False

        if isinstance(df, Series):
            if stats_done:
                stats = df.iloc[-5:]
                df = df.drop(list(stats.index))
            if sort_by == 'name':
                df = df.sort_index()
            elif sort_by == 'reverse':
                df = df[::-1]
            else:
                df = df.sort_values(ascending=sort_by != 'total')
            if stats_done:
                df = df.append(stats)
            return df

        if sort_by == 'name':
            # currently case sensitive
            df = df.reindex(sorted(df.columns), axis=1)
        elif sort_by in ['total', 'infreq']:
            if df1_istotals:
                df = df.T
            df = df[list(
                df.sum().sort_values(ascending=sort_by != 'total').index)]

        elif sort_by == 'reverse':
            df = df.T[::-1].T
        # sort by slope etc., or search by subcorpus name
        if sort_by in stat_field or sort_by not in options:
            asc = kwargs.get('reverse', False)
            df = df.T.sort_values(by=sort_by, ascending=asc).T

        if sort_by in ['increase', 'decrease', 'static', 'turbulent']:
            slopes = df.loc['slope']
            if sort_by == 'increase':
                df = df[slopes.sort_values(ascending=False).index]
            elif sort_by == 'decrease':
                df = df[slopes.sort_values().index]
            elif sort_by == 'static':
                df = df[slopes.abs().sort_values().index]
            elif sort_by == 'turbulent':
                df = df[slopes.abs().sort_values(ascending=False).index]
            if remove_above_p:
                df = df.T
                df = df[df['p'] <= p]
                df = df.T

        # remove stats field by default
        if not keep_stats:
            df = df.drop(stat_field, axis=0, errors='ignore')
        return df

    def set_threshold(big_list, threshold, prinf=True):
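        """
        Turn a named threshold into an absolute count: 'low', 'medium' and
        'high' are 1/10000, 1/5000 and 1/2500 of the combined total of the
        denominator; numeric values are passed through unchanged.
        """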
        if isinstance(threshold, STRINGTYPE):
            if threshold.startswith('l'):
                denominator = 10000
            elif threshold.startswith('h'):
                denominator = 2500
            else:
                # 'medium' (or anything unrecognised) falls back to 1/5000
                denominator = 5000
            if isinstance(big_list, DataFrame):
                tot = big_list.sum().sum()

            if isinstance(big_list, Series):
                tot = big_list.sum()
            tshld = float(tot) / float(denominator)
        else:
            tshld = threshold
        if prinf:
            print('Threshold: %d\n' % tshld)
        return tshld

    # copy dataframe to be very safe
    df = dataframe1.copy()
    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    if isinstance(interrogation, Concordance):
        return_conc = True
    # do concordance work
    if return_conc:
        if just_entries:
            if isinstance(just_entries, int):
                just_entries = [just_entries]
            if isinstance(just_entries, STRINGTYPE):
                df = df[df['m'].str.contains(just_entries)]
            if isinstance(just_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in just_entries):
                    mp = df['m'].map(lambda x: x in just_entries)
                    df = df[mp]
                else:
                    df = df.loc[just_entries]

        if skip_entries:
            if isinstance(skip_entries, int):
                skip_entries = [skip_entries]
            if isinstance(skip_entries, STRINGTYPE):
                df = df[~df['m'].str.contains(skip_entries)]
            if isinstance(skip_entries, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_entries):
                    mp = df['m'].map(lambda x: x not in skip_entries)
                    df = df[mp]
                else:
                    df = df.drop(skip_entries, axis=0)

        if just_subcorpora:
            if isinstance(just_subcorpora, int):
                just_subcorpora = [just_subcorpora]
            if isinstance(just_subcorpora, STRINGTYPE):
                df = df[df['c'].str.contains(just_subcorpora)]
            if isinstance(just_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in just_subcorpora):
                    mp = df['c'].map(lambda x: x in just_subcorpora)
                    df = df[mp]
                else:
                    df = df.loc[just_subcorpora]

        if skip_subcorpora:
            if isinstance(skip_subcorpora, int):
                skip_subcorpora = [skip_subcorpora]
            if isinstance(skip_subcorpora, STRINGTYPE):
                df = df[~df['c'].str.contains(skip_subcorpora)]
            if isinstance(skip_subcorpora, list):
                if all(isinstance(e, STRINGTYPE) for e in skip_subcorpora):
                    mp = df['c'].map(lambda x: x not in skip_subcorpora)
                    df = df[mp]
                else:
                    df = df.drop(skip_subcorpora, axis=0)

        return Concordance(df)

    if print_info:
        print('\n***Processing results***\n========================\n')

    df1_istotals = False
    if isinstance(df, Series):
        df1_istotals = True
        df = DataFrame(df)
        # if just a single result
    else:
        df = DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T

    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    if isinstance(denominator, Interrogation):
        try:
            denominator = denominator.results
        except AttributeError:
            denominator = denominator.totals

    if denominator is not False and not isinstance(denominator, STRINGTYPE):
        df2 = denominator.copy()
        using_totals = True
        if isinstance(df2, DataFrame):
            if len(df2.columns) > 1:
                single_totals = False
            else:
                df2 = Series(df2.iloc[:, 0])
        elif isinstance(df2, Series):
            single_totals = True
            #if operation == 'k':
            #raise ValueError('Keywording requires a DataFrame for denominator. Use "self"?')
    else:
        if operation in ['k', 'a', '%', '/', '*', '-', '+']:
            denominator = 'self'
        if denominator == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to=spelling)
        df = merge_duplicates(df, print_info=False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to=spelling, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, replace_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not sort_by:
            sort_by = 'total'

    if replace_subcorpus_names:
        df = name_replacer(df.T, replace_subcorpus_names)
        df = merge_duplicates(df).T
        df = df.sort_index()
        if not single_totals:
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = name_replacer(df2, replace_subcorpus_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = df2.sort_index()
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis=0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis=0)
        except:
            pass

    # remove totals and tkinter order
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                        [0, 1, 0, 1]):
        if name == 'Total' and df1_istotals:
            continue
        try:
            df = df.drop(name, axis=ax, errors='ignore')
        except:
            pass
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                        [0, 1, 0, 1]):
        if name == 'Total' and single_totals:
            continue

        try:
            df2 = df2.drop(name, axis=ax, errors='ignore')
        except:
            pass

    df = skip_keep_merge_span(df)
    try:
        df2 = skip_keep_merge_span(df2)
    except:
        pass

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    if just_totals:
        df = DataFrame(df.sum(), columns=['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = DataFrame(df2.sum(), columns=['Combined total'])
            else:
                df2 = df2.sum()

    tots = df.sum(axis=1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            tshld = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    tshld = set_threshold(df2, threshold, prinf=print_info)
            df, tots = combiney(df,
                                df2,
                                operation=operation,
                                threshold=tshld,
                                prinf=print_info)

    # if doing keywording...
    if operation.startswith('k'):

        if isinstance(denominator, STRINGTYPE):
            if denominator == 'self':
                df2 = df.copy()
            else:
                df2 = denominator

        from corpkit.keys import keywords
        df = keywords(df,
                      df2,
                      selfdrop=selfdrop,
                      threshold=threshold,
                      print_info=print_info,
                      editing=True,
                      calc_all=calc_all,
                      sort_by=sort_by,
                      measure=keyword_measure,
                      **kwargs)

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
        if isinstance(df, bool):
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = Series(df['Combined total'], name='Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = Series(df.loc[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0, :]
                df.name = '%s: keyness' % df.name

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = Series(df['Total'], name='Total')
            except:
                total = 'none'
                pass

            #total = df.copy()
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis=1)
        except:
            total = 'none'

    if not isinstance(tots, DataFrame) and not isinstance(tots, Series):
        total = df.sum(axis=1)
    else:
        total = tots

    if isinstance(df, DataFrame):
        if df.empty:
            datatype = 'object'
        else:
            datatype = df.iloc[0].dtype
    else:
        datatype = df.dtype
    locs['datatype'] = datatype

    # TURN INT COL NAMES INTO STR
    try:
        df.columns = [str(d) for d in list(df.columns)]
    except:
        pass

    def add_tkt_index(df):
        """add an order for tkintertable if using gui"""
        if isinstance(df, Series):
            df = df.T
            df = df.drop('tkintertable-order', errors='ignore', axis=0)
            df = df.drop('tkintertable-order', errors='ignore', axis=1)
            dat = [i for i in range(len(df.index))]
            df['tkintertable-order'] = Series(dat, index=list(df.index))
            df = df.T
        return df

    # while tkintertable can't sort rows
    if checkstack('tkinter'):
        df = add_tkt_index(df)

    if kwargs.get('df1_always_df'):
        if isinstance(df, Series):
            df = DataFrame(df)

    # delete non-appearing conc lines
    lns = None
    if isinstance(getattr(interrogation, 'concordance', None), Concordance):
        try:
            col_crit = interrogation.concordance['m'].map(
                lambda x: x in list(df.columns))
            ind_crit = interrogation.concordance['c'].map(
                lambda x: x in list(df.index))
            lns = interrogation.concordance[col_crit]
            lns = lns.loc[ind_crit]
            lns = Concordance(lns)
        except ValueError:
            lns = None

    output = Interrogation(results=df,
                           totals=total,
                           query=locs,
                           concordance=lns)

    if print_info:
        print('***Done!***\n========================\n')

    return output
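
# A minimal usage sketch (names are illustrative, not from the source):
# this editor is normally reached via Interrogation.edit(), e.g.
#     res = corpus.interrogate(search='w')
#     rel = res.edit('%', res.totals)       # relativise counts
#     srt = res.edit(sort_by='increase')    # linregress-based sorting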
Code example #6
def plotter(df,
            title=False,
            kind='line',
            x_label=None,
            y_label=None,
            style='ggplot',
            figsize=(8, 4),
            save=False,
            legend_pos='best',
            reverse_legend='guess',
            num_to_plot=7,
            tex='try',
            colours='default',
            cumulative=False,
            pie_legend=True,
            partial_pie=False,
            show_totals=False,
            transparent=False,
            output_format='png',
            interactive=False,
            black_and_white=False,
            show_p_val=False,
            indices=False,
            transpose=False,
            rot=False,
            **kwargs):
    """Visualise corpus interrogations.
    :param title: A title for the plot
    :type title: str
    :param df: Data to be plotted
    :type df: Pandas DataFrame
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area')
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc)
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool
    :param stacked: When making bar chart, stack bars on top of one another
    :type stacked: str
    :param filled: For area and bar charts, make every column sum to 100
    :type filled: str
    :param legend: Show a legend
    :type legend: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int
    :param subplots: Plot each column separately
    :type subplots: bool
    :param layout: Grid shape to use when *subplots* is True
    :type layout: tuple -- (int, int)
    :param interactive: Experimental interactive options
    :type interactive: list -- [1, 2, 3]
    :returns: matplotlib figure
    """
    import corpkit
    import os
    import warnings
    try:
        from IPython.utils.shimmodule import ShimWarning
        warnings.simplefilter('ignore', ShimWarning)
    except:
        pass

    kwargs['rot'] = rot

    xtickspan = kwargs.pop('xtickspan', False)

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except (ImportError, AttributeError):
        pass

    import matplotlib as mpl
    from matplotlib import rc

    if interactive:
        import matplotlib.pyplot as plt, mpld3
    else:
        import matplotlib.pyplot as plt

    import matplotlib.ticker as ticker

    import pandas
    from pandas import DataFrame, Series, MultiIndex

    from time import localtime, strftime
    from corpkit.process import checkstack

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        from corpkit.plugins import InteractiveLegendPlugin, HighlightLines

    have_mpldc = False
    try:
        from mpldatacursor import datacursor, HighlightingDataCursor
        have_mpldc = True
    except ImportError:
        pass

    # if the data was multiindexed, the default is a little different!
    from corpkit.interrogation import Interrogation
    if isinstance(df.index, MultiIndex):
        import matplotlib.pyplot as nplt
        shape = kwargs.get('shape', 'auto')
        truncate = kwargs.get('truncate', 8)
        if shape == 'auto':
            # round up so an odd number of index levels still gets an axis each
            shape = ((len(df.index.levels[0]) + 1) // 2, 2)
        f, axes = nplt.subplots(*shape)
        for i, ((name, data),
                ax) in enumerate(zip(df.groupby(level=0), axes.flatten())):
            data = data.loc[name]
            if isinstance(truncate, int) and i > truncate:
                continue
            if kwargs.get('name_format'):
                name = kwargs.get('name_format').format(name)
            data = Interrogation(results=data,
                                 totals=data.sum(axis=1),
                                 query=None)
            data.visualise(title=name,
                           ax=ax,
                           kind=kind,
                           x_label=x_label,
                           y_label=y_label,
                           style=style,
                           figsize=figsize,
                           save=save,
                           legend_pos=legend_pos,
                           reverse_legend=reverse_legend,
                           num_to_plot=num_to_plot,
                           tex=tex,
                           colours=colours,
                           cumulative=cumulative,
                           pie_legend=pie_legend,
                           partial_pie=partial_pie,
                           show_totals=show_totals,
                           transparent=transparent,
                           output_format=output_format,
                           interactive=interactive,
                           black_and_white=black_and_white,
                           show_p_val=show_p_val,
                           indices=indices,
                           transpose=transpose,
                           rot=rot)
        return nplt

    # check what environment we're in
    tk = checkstack('tkinter')
    running_python_tex = checkstack('pythontex')
    running_spider = checkstack('spyder')

    if not title:
        title = ''

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """remove extreme values from colourmap --- no pure white"""
        import matplotlib.colors as colors
        import numpy as np
        new_cmap = colors.LinearSegmentedColormap.from_list(
            'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name,
                                                a=minval,
                                                b=maxval),
            cmap(np.linspace(minval, maxval, n)))
        return new_cmap

    def get_savename(imagefolder, save=False, title=False, ext='png'):
        """Come up with the savename for the image."""
        import os
        from corpkit.process import urlify

        # name as
        if not ext.startswith('.'):
            ext = '.' + ext
        if isinstance(save, STRINGTYPE):
            savename = os.path.join(imagefolder, (urlify(save) + ext))
        else:
            # title is always set by the time we get here
            savename = os.path.join(imagefolder, urlify(title) + ext)

        # remove duplicated ext
        if savename.endswith('%s%s' % (ext, ext)):
            savename = savename.replace('%s%s' % (ext, ext), ext, 1)
        return savename

    def rename_data_with_total(dataframe,
                               was_series=False,
                               using_tex=False,
                               absolutes=True):
        """adds totals (abs, rel, keyness) to entry name strings"""
        if was_series:
            where_the_words_are = dataframe.index
        else:
            where_the_words_are = dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if was_series:
                    perc = dataframe.T[w][0]
                else:
                    the_labs.append(w)
                    continue
                if using_tex:
                    the_labs.append('%s (%.2f\%%)' % (w, perc))
                else:
                    the_labs.append('%s (%.2f %%)' % (w, perc))
            else:
                if was_series:
                    score = dataframe.T[w].sum()
                else:
                    score = dataframe[w].sum()
                if using_tex:
                    the_labs.append('%s (n=%d)' % (w, score))
                else:
                    the_labs.append('%s (n=%d)' % (w, score))
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index=the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, tinput, was_series=False, num_to_plot=7):
        """give me a list of strings and i'll output explode option"""
        output = [0 for s in range(num_to_plot)]

        if was_series:
            l = list(dataframe.index)
        else:
            l = list(dataframe.columns)

        if isinstance(tinput, (STRINGTYPE, int)):
            tinput = [tinput]
        if isinstance(tinput, list):
            for i in tinput:
                if isinstance(i, STRINGTYPE):
                    index = l.index(i)
                else:
                    index = i
                output[index] = 0.1
        return output

    # get a few options from kwargs
    sbplt = kwargs.get('subplots', False)
    show_grid = kwargs.pop('grid', True)
    the_rotation = kwargs.get('rot', False)
    dragmode = kwargs.pop('draggable', False)
    leg_frame = kwargs.pop('legend_frame', True)
    leg_alpha = kwargs.pop('legend_alpha', 0.8)
    # auto set num to plot based on layout
    lo = kwargs.get('layout', None)
    if lo:
        num_to_plot = lo[0] * lo[1]

    # todo: get this dynamically instead.
    styles = [
        'dark_background', 'bmh', 'grayscale', 'ggplot', 'fivethirtyeight',
        'matplotlib', False, 'mpl-white'
    ]
    #if style not in styles:
    #raise ValueError('Style %s not found. Use %s' % (str(style), ', '.join(styles)))

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except:
            pass
        style = 'matplotlib'

    if kwargs.get('savepath'):
        mpl.rcParams['savefig.directory'] = kwargs.get('savepath')
        kwargs.pop('savepath', None)

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    mpl.rcParams['text.latex.unicode'] = True

    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except:
            rc('font', family='sans-serif')
            rc('font', serif='Helvetica Neue')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind.lower()

    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] == 'area':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'line':
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            interactive_types = None
            warnings.warn(
                'Interactive plotting not yet available for pie plots.')
        else:
            interactive_types = [None]
    if interactive is False:
        interactive_types = [None]

    # find out if pie mode, add autopct format
    piemode = False
    if kind == 'pie':
        piemode = True
        # always the best spot for pie
        #if legend_pos == 'best':
        #legend_pos = 'lower left'
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    if kind == 'heatmap':
        try:
            dataframe = dataframe.T
        except:
            pass
    was_series = False
    if isinstance(dataframe, Series):
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if transpose:
            dataframe = dataframe.T
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # attempt to convert x axis to ints:
    #try:
    #    dataframe.index = [int(i) for i in list(dataframe.index)]
    #except:
    #    pass

    # remove totals and tkinter order
    if not was_series:
        for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                            [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis=ax, errors='ignore')
            except:
                pass

    try:
        dataframe = dataframe.drop('tkintertable-order', errors='ignore')
    except:
        pass
    try:
        dataframe = dataframe.drop('tkintertable-order',
                                   axis=1,
                                   errors='ignore')
    except:
        pass

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':

            def isint(x):
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError):
                    return False
                else:
                    return a == b

            if all([isint(x) is True for x in list(dataframe.columns)]):
                indices = True
            else:
                indices = False

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = [
        'svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf'
    ]
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' %
                         (output_format, ', '.join(output_formats)))

    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format)
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series:
            num_to_plot = len(dataframe)
        else:
            if not piemode:
                num_to_plot = len(list(dataframe.columns))
            else:
                num_to_plot = len(dataframe.index)

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe,
                                         kwargs['explode'],
                                         was_series=was_series,
                                         num_to_plot=num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', True)

    #cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except:
                dataframe = dataframe[:num_to_plot]
        elif list(dataframe.columns)[0] == 'Total':
            plotting_a_totals_column = True
            if not 'legend' in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        if transpose:
            dataframe = dataframe.head(num_to_plot)
        else:
            dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis=1, errors='ignore')
    except:
        pass
    try:
        dataframe.loc['p']
        there_are_p_vals = True
    except:
        there_are_p_vals = False
    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pval = dataframe[col]['p']

                def p_string_formatter(val):
                    if val < 0.001:
                        if not using_tex:
                            return 'p < 0.001'
                        else:
                            return r'p $<$ 0.001'
                    else:
                        return 'p = %s' % format(val, '.3f')

                pstr = p_string_formatter(pval)
                newname = '%s (%s)' % (col, pstr)
                newnames.append(newname)
            dataframe.columns = newnames
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')
        else:
            warnings.warn(
                'No p-values calculated to show.\n\nUse keep_stats kwarg while editing to generate these values.'
            )
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')

    # make and set y label
    absolutes = True
    if isinstance(dataframe, DataFrame):
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
                absolutes = False
        except:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    ##########################################
    ################ COLOURS #################
    ##########################################

    # set defaults, with nothing for heatmap yet
    if colours is True or colours == 'default' or colours == 'Default':
        if kind != 'heatmap':
            colours = 'viridis'
        else:
            colours = 'default'

    # assume it's a single color, unless string denoting map
    cmap_or_c = 'color'
    if isinstance(colours, str):
        cmap_or_c = 'colormap'
    from matplotlib.colors import LinearSegmentedColormap
    if isinstance(colours, LinearSegmentedColormap):
        cmap_or_c = 'colormap'

    # for heatmaps, it's always a colormap
    if kind == 'heatmap':
        cmap_or_c = 'cmap'
        # if it's a defaulty string, set accordingly
        if isinstance(colours, str):
            if colours.lower().startswith('diverg'):
                colours = sns.diverging_palette(10, 133, as_cmap=True)

            # if default not set, do diverge for any df with a number < 0
            elif colours.lower() == 'default':
                mn = dataframe.min()
                if isinstance(mn, Series):
                    mn = mn.min()
                if mn < 0:
                    colours = sns.diverging_palette(10, 133, as_cmap=True)
                else:
                    colours = sns.light_palette("green", as_cmap=True)

    if not style or 'seaborn' not in style:
        kwargs[cmap_or_c] = colours
    #if not was_series:
    #    if kind in ['pie', 'line', 'area']:
    #        if colours and not plotting_a_totals_column:
    #            kwargs[cmap_or_c] = colours
    #    else:
    #        if colours:
    #            kwargs[cmap_or_c] = colours
    #if piemode:
    #    if num_to_plot > 0:
    #        kwargs[cmap_or_c] = colours
    #    else:
    #        if num_to_plot > 0:
    #            kwargs[cmap_or_c] = colours

    # multicoloured bar charts
    #if colours and cmap_or_c == 'colormap':
    #    if kind.startswith('bar'):
    #        if len(list(dataframe.columns)) == 1:
    #            if not black_and_white:
    #                import numpy as np
    #                the_range = np.linspace(0, 1, num_to_plot)
    #                middle = len(the_range) / 2
    #                try:
    #                    cmap = plt.get_cmap(colours)
    #                    kwargs[cmap_or_c] = [cmap(n) for n in the_range][middle]
    #                except ValueError:
    #                    kwargs[cmap_or_c] = colours
    #            # make a bar width ... ? ...
    #            #kwargs['width'] = (figsize[0] / float(num_to_plot)) / 1.5

    # reversing legend option
    if reverse_legend is True:
        rev_leg = True
    elif reverse_legend is False:
        rev_leg = False

    # show legend or don't, guess whether to reverse based on kind
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            if pie_legend:
                legend = True
            else:
                legend = False
    if kind in ['barh', 'area']:
        if reverse_legend == 'guess':
            rev_leg = True
    if 'rev_leg' not in locals():
        rev_leg = False

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except:
        pass

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title

    # no interactive subplots yet:
    if sbplt and interactive:
        import warnings
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return

    # not using pandas for labels or legend anymore.
    #kwargs['labels'] = None
    #kwargs['legend'] = False

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot // 7
        # kwarg options go in leg_options
        leg_options = {
            'framealpha': leg_alpha,
            'shadow': kwargs.get('shadow', False),
            'ncol': kwargs.pop('ncol', 1)
        }

        # determine legend position based on this dict
        if legend_pos:
            possible = {
                'best': 0,
                'upper right': 1,
                'upper left': 2,
                'lower left': 3,
                'lower right': 4,
                'right': 5,
                'center left': 6,
                'center right': 7,
                'lower center': 8,
                'upper center': 9,
                'center': 10,
                'o r': 2,
                'outside right': 2,
                'outside upper right': 2,
                'outside center right': 'center left',
                'outside lower right': 'lower left'
            }

            if isinstance(legend_pos, int):
                the_loc = legend_pos
            elif isinstance(legend_pos, str):
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError(
                        'legend_pos value must be one of:\n%s\n or an int between 0-10.'
                        % ', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            #weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                leg_options['loc'] = 'upper right'
                leg_options['bbox_to_anchor'] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if isinstance(legend_pos, str):
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    if not piemode:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe,
                                               was_series=was_series,
                                               using_tex=using_tex,
                                               absolutes=absolutes)
    else:
        if pie_legend:
            if show_totals.endswith('both') or show_totals.endswith('legend'):
                dataframe = rename_data_with_total(dataframe,
                                                   was_series=was_series,
                                                   using_tex=using_tex,
                                                   absolutes=absolutes)

    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0

    # some pie things
    if piemode:
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]

    def filler(df):
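        # normalise each row to percentages so every row sums to 100;
        # this backs the 'filled' option for area and bar charts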
        pby = df.T.copy()
        for i in list(pby.columns):
            tot = pby[i].sum()
            pby[i] = pby[i] * 100.0 / tot
        return pby.T

    areamode = False
    if kind == 'area':
        areamode = True

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1

    # convert dates --- works only in my current case!
    #if plotting_a_totals_column or not was_series:
    #    try:
    #        can_it_be_int = int(list(dataframe.index)[0])
    #        can_be_int = True
    #    except:
    #        can_be_int = False
    #    if can_be_int:
    #        if 1500 < int(list(dataframe.index)[0]):
    #            if 2050 > int(list(dataframe.index)[0]):
    #                n = pandas.PeriodIndex([d for d in list(dataframe.index)], freq='A')
    #                dataframe = dataframe.set_index(n)

    if kwargs.get('filled'):
        if areamode or kind.startswith('bar'):
            dataframe = filler(dataframe)
        kwargs.pop('filled', None)

    MARKERSIZE = 4
    COLORMAP = {
        0: {
            'marker': None,
            'dash': (None, None)
        },
        1: {
            'marker': None,
            'dash': [5, 5]
        },
        2: {
            'marker': "o",
            'dash': (None, None)
        },
        3: {
            'marker': None,
            'dash': [1, 3]
        },
        4: {
            'marker': "s",
            'dash': [5, 2, 5, 2, 5, 10]
        },
        5: {
            'marker': None,
            'dash': [5, 3, 1, 2, 1, 10]
        },
        6: {
            'marker': 'o',
            'dash': (None, None)
        },
        7: {
            'marker': None,
            'dash': [5, 3, 1, 3]
        },
        8: {
            'marker': "1",
            'dash': [1, 3]
        },
        9: {
            'marker': "*",
            'dash': [5, 5]
        },
        10: {
            'marker': "2",
            'dash': [5, 2, 5, 2, 5, 10]
        },
        11: {
            'marker': "s",
            'dash': (None, None)
        }
    }

    HATCHES = {
        0: {
            'color': '#dfdfdf',
            'hatch': "/"
        },
        1: {
            'color': '#6f6f6f',
            'hatch': "\\"
        },
        2: {
            'color': 'b',
            'hatch': "|"
        },
        3: {
            'color': '#dfdfdf',
            'hatch': "-"
        },
        4: {
            'color': '#6f6f6f',
            'hatch': "+"
        },
        5: {
            'color': 'b',
            'hatch': "x"
        }
    }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs[cmap_or_c] = new_cmap

    # remove things from kwargs if heatmap
    if kind == 'heatmap':
        hmargs = {
            'annot': kwargs.pop('annot', True),
            cmap_or_c: kwargs.pop(cmap_or_c, None),
            'fmt': kwargs.pop('fmt', ".2f"),
            'cbar': kwargs.pop('cbar', False)
        }

        for i in [
                'vmin', 'vmax', 'linewidths', 'linecolor', 'robust', 'center',
                'cbar_kws', 'cbar_ax', 'square', 'mask', 'norm'
        ]:
            if i in kwargs.keys():
                hmargs[i] = kwargs.pop(i, None)

    class dummy_context_mgr():
        """a fake context for plotting without style
        perhaps made obsolete by 'classic' style in new mpl"""
        def __enter__(self):
            return None

        def __exit__(self, one, two, three):
            return False

    with plt.style.context(style) if style != 'matplotlib' else dummy_context_mgr():

        kwargs.pop('filled', None)

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if not kwargs.get('ax'):
                    kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            if kind != 'heatmap':
                # turn off pie labels at the last minute
                if kind == 'pie' and pie_legend:
                    kwargs['labels'] = None
                    kwargs['autopct'] = '%.2f'
                if kind == 'pie':
                    kwargs.pop('color', None)
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                fg = plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = kwargs.get('ax', plt.axes())
                tmp = sns.heatmap(dataframe, ax=ax, **hmargs)
                ax.set_title(title)
                for item in tmp.get_yticklabels():
                    item.set_rotation(0)
                plt.close(fg)

            if areamode and not kwargs.get('ax'):
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels

            if x_label:
                ax.set_xlabel(x_label)
            if y_label:
                ax.set_ylabel(y_label)

        else:
            if not kwargs.get('layout'):
                plt.gcf().set_tight_layout(False)

            if kind != 'heatmap':
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = plt.axes()
                sns.heatmap(dataframe, ax=ax, **hmargs)
                plt.xticks(rotation=0)
                plt.yticks(rotation=0)

        def rotate_degrees(rotation, labels):
            if rotation is None:
                if labels and len(max(labels, key=len)) > 6:
                    return 45
                else:
                    return 0
            elif rotation is False:
                return 0
            elif rotation is True:
                return 45
            else:
                return rotation
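        # e.g. rotate_degrees(None, ['a long label']) -> 45 (long labels tilt);
        # rotate_degrees(False, labels) -> 0; rotate_degrees(True, labels) -> 45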

        if sbplt:
            if 'layout' not in kwargs:
                axes = [l for l in ax]
            else:
                axes = []
                cols = [l for l in ax]
                for col in cols:
                    for bit in col:
                        axes.append(bit)
            for index, a in enumerate(axes):
                if xtickspan is not False:
                    a.xaxis.set_major_locator(
                        ticker.MultipleLocator(xtickspan))
                labels = [item.get_text() for item in a.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                try:
                    # style each subplot's own axis, not the enclosing array
                    if rotation == 0:
                        a.set_xticklabels(labels,
                                          rotation=rotation,
                                          ha='center')
                    else:
                        a.set_xticklabels(labels,
                                          rotation=rotation,
                                          ha='right')
                except AttributeError:
                    pass
        else:
            if kind == 'heatmap':
                labels = [item.get_text() for item in ax.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                if rotation == 0:
                    ax.set_xticklabels(labels, rotation=rotation, ha='center')
                else:
                    ax.set_xticklabels(labels, rotation=rotation, ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting dashes and markers
                c = 0
                for line in ax.get_lines():
                    line.set_color('black')
                    #line.set_width(1)
                    line.set_dashes(COLORMAP[c]['dash'])
                    line.set_marker(COLORMAP[c]['marker'])
                    line.set_markersize(MARKERSIZE)
                    c += 1
                    if c == len(list(COLORMAP.keys())):
                        c = 0

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt and kind != 'heatmap':
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    # area doubles the handles and labels. this removes half:
                    #if areamode:
                    #    handles = handles[-len(handles) / 2:]
                    #    labels = labels[-len(labels) / 2:]
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    if kwargs.get('ax'):
                        lgd = plt.gca().legend(handles, labels, **leg_options)
                        ax.get_legend().draw_frame(leg_frame)
                    else:
                        lgd = plt.legend(handles, labels, **leg_options)
                        lgd.draw_frame(leg_frame)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(
                plt.gcf(),
                InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = [
                    '%s (%s: %d)' % (labels[i], x_val, y_val)
                    for x_val, y_val in zip(x_vals, y_vals)
                ]
            else:
                ls = [
                    '%s (%s: %.2f%%)' % (labels[i], x_val, y_val)
                    for x_val, y_val in zip(x_vals, y_vals)
                ]
            if 2 in interactive_types:
                #if 'kind' in kwargs and kwargs['kind'] == 'area':
                tooltip_line = mpld3.plugins.LineLabelTooltip(
                    lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                #else:
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l,
                                                                    labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # add x label
    # this could be revised now!
    # if time series period, it's year for now
    if isinstance(dataframe.index, pandas.PeriodIndex):
        x_label = 'Year'

    if not absolutes:
        y_l = 'Percentage'
    else:
        y_l = 'Absolute frequency'

    # hacky: turn legend into subplot titles :)
    if sbplt:
        # title the big plot
        #plt.gca().suptitle(title, fontsize = 16)
        #plt.subplots_adjust(top=0.9)
        # get all axes
        if 'layout' not in kwargs:
            axes = [l for index, l in enumerate(ax)]
        else:
            axes = []
            cols = [l for index, l in enumerate(ax)]
            for col in cols:
                for bit in col:
                    axes.append(bit)

        # set subplot titles
        for index, a in enumerate(axes):
            try:
                titletext = list(dataframe.columns)[index]
                a.set_title(titletext)
            except IndexError:
                pass
            try:
                a.legend_.remove()
            except:
                pass
            #try:
            #    from matplotlib.ticker import MaxNLocator
            #    from corpkit.process import is_number
            #    indx = list(dataframe.index)
            #    if all([is_number(qq) for qq in indx]):
            #        ax.get_xaxis().set_major_locator(MaxNLocator(integer=True))
            #except:
            #    pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            a.grid(b=show_grid)

    # add sums to bar graphs and pie graphs
    # doubled right now, no matter

    if not sbplt:

        # show grid
        ax.grid(b=show_grid)

        if kind.startswith('bar'):
            width = ax.containers[0][0].get_width()

    if was_series:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            # make plot a bit higher if putting these totals on it
            plt.ylim([0, the_y_limit * 1.05])
            for i, label in enumerate(list(dataframe.index)):
                if len(dataframe.loc[label]) == 1:
                    score = dataframe.loc[label].iloc[0]
                else:
                    if absolutes:
                        score = dataframe.loc[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score),
                                 ha='center',
                                 va='bottom')
                else:
                    plt.annotate(score, (i, score), ha='center', va='bottom')
    else:
        the_y_limit = plt.ylim()[1]
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            for i, label in enumerate(list(dataframe.columns)):
                if len(dataframe[label]) == 1:
                    score = dataframe[label].iloc[0]
                else:
                    if absolutes:
                        score = dataframe[label].sum()
                    else:
                        #import warnings
                        #warnings.warn("It's not possible to determine total percentage from individual percentages.")
                        continue
                if not absolutes:
                    plt.annotate('%.2f' % score, (i, score),
                                 ha='center',
                                 va='bottom')
                else:
                    plt.annotate(score, (i, score), ha='center', va='bottom')

    if not kwargs.get('layout') and not sbplt and not kwargs.get('ax'):
        plt.tight_layout()
    if kwargs.get('ax'):
        try:
            plt.gcf().set_tight_layout(False)
        except:
            pass
        try:
            plt.set_tight_layout(False)
        except:
            pass

    if save:
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder,
                                save=save,
                                title=title,
                                ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        if legend_pos.startswith('o') and not sbplt:
            plt.gcf().savefig(savename,
                              dpi=150,
                              bbox_extra_artists=(lgd, ),
                              bbox_inches='tight',
                              format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().set_draggable(True)

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    # add DataCursor to notebook backend if possible
    if have_mpldc:
        if kind == 'line':
            HighlightingDataCursor(
                plt.gca().get_lines(),
                highlight_width=4,
                highlight_color=False,
                formatter=lambda **kwargs: '%s: %s' %
                (kwargs['label'], "{0:.3f}".format(kwargs['y'])))
        else:
            datacursor(formatter=lambda **kwargs: '%s: %s' %
                       (kwargs['label'], "{0:.3f}".format(kwargs['height'])))

    #if not interactive and not running_python_tex and not running_spider \
    #    and not tk:
    #    plt.gcf().show()
    #    return plt
    #elif running_spider or tk:
    #    return plt

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except:
            pass
        return mpld3.display()
    else:
        return plt
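
The black_and_white branch above recolours every line to black and cycles through dash patterns and markers so that series stay distinguishable without colour. A self-contained sketch of that technique (the style table and data here are illustrative, not taken from corpkit):

import matplotlib
matplotlib.use('Agg')  # headless backend so the sketch runs anywhere
import matplotlib.pyplot as plt

# a short cycle of dash/marker combinations, in the spirit of COLORMAP above
STYLES = [
    {'marker': None, 'dash': (None, None)},  # solid
    {'marker': 'o', 'dash': (5, 5)},         # dashed, circle markers
    {'marker': 's', 'dash': (1, 3)},         # dotted, square markers
]

fig, ax = plt.subplots()
for n in range(3):
    ax.plot([0, 1, 2], [n, n + 1, n + 4], label='series %d' % n)

for c, line in enumerate(ax.get_lines()):
    style = STYLES[c % len(STYLES)]
    line.set_color('black')
    if style['dash'] != (None, None):
        line.set_dashes(style['dash'])
    if style['marker'] is not None:
        line.set_marker(style['marker'])
    line.set_markersize(4)

ax.legend()
fig.savefig('bw_lines.png', dpi=150)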
Code example #7
File: interrogator.py Project: kareem180/corpkit
def interrogator(
    corpus,
    search,
    query="any",
    show="w",
    exclude=False,
    excludemode="any",
    searchmode="all",
    dep_type="collapsed-ccprocessed-dependencies",
    case_sensitive=False,
    quicksave=False,
    just_speakers=False,
    preserve_case=False,
    lemmatag=False,
    files_as_subcorpora=False,
    conc=False,
    only_unique=False,
    random=False,
    only_format_match=False,
    multiprocess=False,
    spelling=False,
    regex_nonword_filter=r"[A-Za-z0-9:_]",
    gramsize=2,
    split_contractions=False,
    **kwargs
):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""
    # store kwargs
    locs = locals()

    from corpkit.interrogation import Interrogation
    from corpkit.process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from corpkit.other import as_regex
    from corpkit.process import get_deps
    from time import localtime, strftime

    thetime = strftime("%H:%M:%S", localtime())
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    from corpkit.dictionaries.word_transforms import wordlist, taglemma

    # find out if using gui
    root = kwargs.get("root")
    note = kwargs.get("note")

    # convert path to corpus object
    if type(corpus) == str:
        from corpkit.corpus import Corpus

        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from corpkit.process import searchfixer

    search, search_iterable = searchfixer(search, query)

    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(search.keys()) == 1:
        query = list(search.values())[0]

    if "l" in show and search.get("t"):
        from nltk.stem.wordnet import WordNetLemmatizer

        lmtzr = WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict

        if hasattr(corpus, "__iter__"):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != list(search.values())[0] or len(search.keys()) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == "each":
                im = True
                just_speakers = ["each"]
            if just_speakers == ["each"]:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in search.values()):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        import os
        from corpkit.process import tregex_engine

        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        to_write = "\n".join([sent.parse_string.strip() for sent in sents
                              if sent.parse_string is not None])
        with open(to_open, "w") as fo:
            fo.write(to_write)
        q = list(search.values())[0]
        res = tregex_engine(
            query=q, options=["-o", "-%s" % translated_option], corpus=to_open, root=root, preserve_case=True
        )
        if root:
            root.update()
        os.remove(to_open)
        if countmode:
            return len(res)
        else:
            return res

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter

        statsmode_results = Counter()
        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results["Sentences"] += 1
                sts = sent.parse_string.rstrip()
                fo.write(sts + "\n")
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith("pass")])
                statsmode_results["Passives"] += numpass
                statsmode_results["Tokens"] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results["Words"] += len(words)
                statsmode_results["Characters"] += len("".join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from corpkit.other import as_regex

        tregex_qs = {
            "Imperative": r"ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/",
            "Open interrogative": r"ROOT < SBARQ <<- (/\?/ !< __)",
            "Closed interrogative": r"ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))",
            "Unmodalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))",
            "Modalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))",
            "Open class words": r"/^(NN|JJ|VB|RB)/ < __",
            "Closed class words": r"__ !< __ !> /^(NN|JJ|VB|RB)/",
            "Clauses": r"/^S/ < __",
            "Interrogative": r"ROOT << (/\?/ !< __)",
            "Mental processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.mental, boundaries="w"),
            "Verbal processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.verbal, boundaries="w"),
            "Relational processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)"
            % as_regex(processes.relational, boundaries="w"),
        }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query=q, options=["-o", "-C"], corpus=to_open, root=root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + "/" + str(total_files)
                if kwargs.get("outname"):
                    tot_string = "%s: %s" % (kwargs["outname"], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get("note", False):
                kwargs["note"].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr=False):
        import re
        import os

        if speakr is False:
            speakr = ""
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if "-join-".join([f, whole, mid]) not in duplicates:
                duplicates.append("-join-".join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r"([^a-zA-Z0-9-]|^)(" + re.escape(mid) + r")([^a-zA-Z0-9-]|$)", re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg, whole)]
            for offstart, offend in offsets:
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines
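    # each concordance line has the shape:
    # [filename, speaker, left context, match, right context]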

    def uniquify(conc_lines):
        from collections import OrderedDict

        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = " ".join([speakr, start, "MIDDLEHERE:", middle, ":MIDDLEHERE", end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith("u"):
                if word.lower() in taglemma.keys():
                    word = taglemma[word.lower()]
                else:
                    if word == "x":
                        word = "Other"
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output
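    # e.g. with a word query, lemmatiser(['dogs'], 'n') -> ['dog']
    # (assuming translated_option is not 'u' and 'dogs' is not in wordlist)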

    def gettag(query, lemmatag=False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {"N": "n", "A": "a", "V": "v", "R": "r", "None": False, "": False, "Off": False}

        if lemmatag is False:
            tag = "n"  # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r"^[^A-Za-z]*([A-Za-z]*)")
            tagchecker = re.compile(r"^[A-Z]{1,4}$")
            qr = query.replace(r"\w", "").replace(r"\s", "").replace(r"\b", "")
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                # map the tag's first letter (e.g. 'VB' -> 'v') to a WordNet POS
                tag = tagdict.get(treebank_tag[0][0], "n")
        elif lemmatag:
            tag = lemmatag
        return tag
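    # e.g. gettag(r'/VB.?/ >># VP') -> 'v'; gettag(r'/NN.?/') -> 'n'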

    def format_tregex(results):
        """format tregex by show list"""
        if countmode:
            return results
        import re

        done = []
        if "l" in show or "pl" in show:
            lemmata = lemmatiser(results, gettag(search.get("t"), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            if exclude and excludemode == "any":
                if exclude.get("w") and re.search(exclude["w"], word):
                    continue
                if exclude.get("l") and lemma and re.search(exclude["l"], lemma):
                    continue
                if exclude.get("p") and re.search(exclude["p"], word):
                    continue
                if exclude.get("pl") and lemma and re.search(exclude["pl"], lemma):
                    continue
            if exclude and excludemode == "all":
                num_to_cause_exclude = len(exclude.keys())
                current_num = 0
                if exclude.get("w") and re.search(exclude["w"], word):
                    current_num += 1
                if exclude.get("l") and lemma and re.search(exclude["l"], lemma):
                    current_num += 1
                if exclude.get("p") and re.search(exclude["p"], word):
                    current_num += 1
                if exclude.get("pl") and lemma and re.search(exclude["pl"], lemma):
                    current_num += 1
                if current_num == num_to_cause_exclude:
                    continue

            for i in show:
                if i == "t":
                    bits.append(word)
                elif i == "l":
                    bits.append(lemma)
                elif i == "w":
                    bits.append(word)
                elif i == "p":
                    bits.append(word)
                elif i == "pl":
                    bits.append(lemma)
            joined = "/".join(bits)
            done.append(joined)
        return done

    def tok_by_list(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in plaintext corpora"""
        import re

        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = "".join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit
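    # e.g. unsplitter(['so', 'do', "n't", 'stop', 'now'])
    #      -> ['so', "don't", 'stop', 'now'] (when split_contractions is False)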

    def tok_ngrams(pattern, list_of_toks, concordancing=False, split_contractions=True):
        from collections import Counter
        import re

        ngrams = Counter()
        result = []
        # keep only word-like tokens
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        # 'any' matches everything (guard in case pattern is a compiled regex)
        if isinstance(pattern, str) and pattern.lower() == "any":
            pattern = r".*"

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)

            # list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index + x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[" ".join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results
        for k, v in ngrams.items():
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return len(result)
        else:
            return result

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re

        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime

            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print "%s: Query %s" % (thetime, error_message)
            if root:
                return "Bad query"
            else:
                raise ValueError("%s: Query %s" % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in plaintext corpora"""
        import re

        comped = compiler(pattern)
        if comped == "Bad query":
            return "Bad query"
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        import re

        if concordancing:
            pattern = r"(.{,140})\b(" + pattern + r")\b(.{,140})"
        compiled_pattern = compiler(pattern)
        if compiled_pattern == "Bad query":
            return "Bad query"
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return len(matches)
        else:
            return matches
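    # e.g. plaintext_regex_search(r'dogs?', 'the dog sat') -> ['dog']
    # (when countmode and concordancing are both off)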

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert

        if spelling.lower() == "uk":
            usa_convert = {v: k for k, v in usa_convert.items()}
        spell_out = []
        bits = a_string.split("/")
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = "/".join(bits)
        return r

    def plaintext_simple_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re

        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r"(.{0,140})\b(" + re.escape(p) + r")\b(.{0,140})"
            else:
                pat = r"\b" + re.escape(p) + r"\b"
            pat = compiler(pat)
            if pat == "Bad query":
                return "Bad query"
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:
                for m in range(len(matches)):
                    result.append(p)
        return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)

    locs["search"] = search
    locs["query"] = query
    locs["just_speakers"] = just_speakers
    locs["corpus"] = corpus
    locs["multiprocess"] = multiprocess

    if im:
        from corpkit.multiprocess import pmultiquery

        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    # check if just counting
    countmode = "c" in show
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get("denominator", 1)
    startnum = kwargs.get("startnum", 0)

    ############################################
    # Determine the search function to be used #
    ############################################

    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and "t" in search.keys():
        simple_tregex_mode = True
    else:
        if corpus.datatype == "plaintext":
            if search.get("n"):
                raise NotImplementedError("Use a tokenised corpus for n-gramming.")
                # searcher = plaintext_ngram
                optiontext = "n-grams via plaintext"
            if search.get("w"):
                if kwargs.get("regex", True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = "Searching plaintext"

        elif corpus.datatype == "tokens":
            if search.get("n"):
                searcher = tok_ngrams
                optiontext = "n-grams via tokens"
            elif search.get("w"):
                if kwargs.get("regex", True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get("w")) == list:
                    searcher = tok_by_list
                optiontext = "Searching tokens"
        only_parse = ["r", "d", "g", "dl", "gl", "df", "gf", "dp", "gp", "f"]
        if corpus.datatype != "parse" and any(i in only_parse for i in search.keys()):
            raise ValueError(
                'Need parsed corpus to search with "%s" option(s).'
                % ", ".join([i for i in search.keys() if i in only_parse])
            )

        elif corpus.datatype == "parse":
            if search.get("t"):
                searcher = slow_tregex
            elif search.get("s"):
                searcher = get_stats
                statsmode = True
                optiontext = "General statistics"
                global numdone
                numdone = 0
            else:
                from corpkit.depsearch import dep_searcher

                searcher = dep_searcher
                optiontext = "Dependency querying"

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get("t"):
        query = search.get("t")

        # check the query
        q = tregex_engine(corpus=False, query=search.get("t"), options=["-t"], check_query=True, root=root)
        if q is False:
            if root:
                return "Bad query"
            else:
                return

        optiontext = "Searching parse trees"
        if "p" in show or "pl" in show:
            translated_option = "u"
            if type(search["t"]) == list:
                search["t"] = r"__ < (/%s/ !< __)" % as_regex(
                    search["t"], boundaries="line", case_sensitive=case_sensitive
                )
            if search["t"] == "any":
                search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)"
        elif "t" in show:
            translated_option = "o"
            if type(search["t"]) == list:
                search["t"] = r"__ < (/%s/ !< __)" % as_regex(
                    search["t"], boundaries="line", case_sensitive=case_sensitive
                )
            if search["t"] == "any":
                search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)"
        elif "w" in show:
            translated_option = "t"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"
        elif "c" in show:
            count_results = {}
            only_count = True
            translated_option = "C"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"
        elif "l" in show:
            translated_option = "t"
            if type(search["t"]) == list:
                search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive)
            if search["t"] == "any":
                search["t"] = r"/.?[A-Za-z0-9].?/ !< __"

        query = search["t"]

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for k, v in sorted(corpus.structure.items()):
            to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if conc:
        message = "Concordancing"
    else:
        message = "Interrogating"
    if kwargs.get("printstatus", True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = "\n                 ".join(["%s: %s" % (k.rjust(3), v) for k, v in search.items()])
        if search == {"s": r".*"}:
            sformat = "features"
        welcome = "\n%s: %s %s ...\n          %s\n          Query: %s\n" % (
            thetime,
            message,
            corpus.name,
            optiontext,
            sformat,
        )
        print(welcome)

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(to_iterate_over.keys())
    else:
        if search.get("s"):
            total_files = sum([len(x) for x in to_iterate_over.values()]) * 12
        else:
            total_files = sum([len(x) for x in to_iterate_over.values()])

    par_args = {"printstatus": kwargs.get("printstatus", True), "root": root, "note": note, "length": total_files}

    term = None
    if kwargs.get("paralleling", None) is not None:
        from blessings import Terminal

        term = Terminal()
        par_args["terminal"] = term
        par_args["linenum"] = kwargs.get("paralleling")

    outn = kwargs.get("outname", "")
    if outn:
        outn = outn + ": "
    tstr = "%s%d/%d" % (outn, current_iter, total_files)
    p = animator(None, None, init=True, tot_string=tstr, **par_args)
    tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        if countmode or conc:
            results[subcorpus_name] = []
        else:
            results[subcorpus_name] = Counter()

        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ["-o", "-" + translated_option]
            result = tregex_engine(
                query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
            )

            if countmode:
                results[subcorpus_name].append(result)
                continue

            result = Counter(format_tregex(result))

            if conc:
                op.append("-w")
                whole_result = tregex_engine(
                    query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
                )

                if not only_format_match:
                    whole_result = format_tregex(whole_result)

                result = make_conc_lines_from_whole_mid(whole_result, result, speakr=False)

                if spelling:
                    for index, line in enumerate(result):
                        result[index] = [correct_spelling(b) for b in line]

            results[subcorpus_name] += result

            current_iter += 1
            if kwargs.get("paralleling", None) is not None:
                tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
            else:
                tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:

                if corpus.datatype == "parse":
                    with open(f.path, "r") as data:
                        data = data.read()
                        from corenlp_xml.document import Document

                        try:
                            corenlp_xml = Document(data)
                        except:
                            print "Could not read file: %s" % f.path
                            continue
                        if just_speakers:
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res = searcher(
                            sents,
                            search=search,
                            show=show,
                            dep_type=dep_type,
                            exclude=exclude,
                            excludemode=excludemode,
                            searchmode=searchmode,
                            lemmatise=False,
                            case_sensitive=case_sensitive,
                            concordancing=conc,
                            only_format_match=only_format_match,
                        )

                        if res == "Bad query":
                            return "Bad query"

                        if searcher == slow_tregex and not countmode:
                            res = format_tregex(res)

                elif corpus.datatype == "tokens":
                    import pickle

                    with open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    res = searcher(list(search.values())[0], data, split_contractions=split_contractions, concordancing=conc)
                    if conc:
                        for index, line in enumerate(res):
                            line.insert(0, "")

                elif corpus.datatype == "plaintext":
                    with open(f.path, "rb") as data:
                        data = data.read()
                        data = data.decode("utf-8", errors="ignore")
                        res = searcher(list(search.values())[0], data, concordancing=conc)
                        if conc:
                            for index, line in enumerate(res):
                                line.insert(0, "")

                if countmode:
                    results[subcorpus_name] += res
                    continue

                # add filename and do lowercasing for conc
                if conc:
                    for index, line in enumerate(res):
                        line.insert(0, f.name)
                        if not preserve_case:
                            line = [b.lower() for b in line]
                        if spelling:
                            line = [correct_spelling(b) for b in line]
                        results[subcorpus_name] += [line]

                # do lowercasing and spelling
                else:
                    if not preserve_case:
                        res = [r.lower() for r in res]
                    if spelling:
                        res = [correct_spelling(r) for r in res]
                    results[subcorpus_name] += Counter(res)

                if not statsmode:
                    current_iter += 1
                    if kwargs.get("paralleling", None) is not None:
                        tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
                    else:
                        tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os

    if os.path.isfile("tmp.txt"):
        os.remove("tmp.txt")

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if conc:
        all_conc_lines = []
        for sc_name, resu in sorted(results.items()):

            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            # make into series
            pindex = "c f s l m r".encode("utf-8").split()
            for fname, spkr, start, word, end in unique_results:
                spkr = str(spkr)
                fname = os.path.basename(fname)

                # keep everything as unicode strings; encoding issues are
                # handled in the corpkit-gui "add_conc_lines_to_window" function
                all_conc_lines.append(
                    Series(
                        [sc_name, fname, spkr, start, word, end],
                        index=pindex,
                    )
                )

        # randomise results...
        if random:
            from random import shuffle

            shuffle(all_conc_lines)

        df = pd.concat(all_conc_lines, axis=1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            df.columns = ["c", "f", "s", "l", "m", "r"]
        else:
            df.columns = ["c", "f", "s", "l", "m", "r", "link"]

        if all(x == "" for x in list(df["s"].values)):
            df.drop("s", axis=1, inplace=True)

        if kwargs.get("note"):
            kwargs["note"].progvar.set(100)

        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Concordancing finished! %d matches.\n" % (thetime, len(df.index))
            print(finalstring)

        from corpkit.interrogation import Concordance

        output = Concordance(df)
        output.query = locs
        if quicksave:
            output.save()
        return output

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    else:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in results.values() for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [results[sc][word] for sc in sorted(results.keys())]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index=sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis=1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get("df1_always_df"):
                        df = Series(df.iloc[0])
                        df = df.sort_values(ascending=False)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if isinstance(df, DataFrame):
            if not df.empty:
                df.loc["Total-tmp"] = df.sum()
                the_tot = df.loc["Total-tmp"]
                df = df[the_tot.sort_values(ascending=False).index]
                df = df.drop("Total-tmp", axis=0)

        # format final string
        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Interrogation finished!" % thetime
            if countmode:
                finalstring += " %d matches." % tot
            else:
                finalstring += " %d unique results, %d total occurrences." % (numentries, total_total)
            print(finalstring)

        interro = Interrogation(results=df, totals=tot, query=locs)

        if quicksave:
            interro.save()

        return interro
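
In corpkit this function is normally reached through a Corpus object rather than called directly. A minimal usage sketch (the corpus path is hypothetical; the search/show conventions follow corpkit's documented API):

from corpkit.corpus import Corpus

# hypothetical parsed corpus directory
corpus = Corpus('data/example-parsed')

# search constituency trees ('t') with a Tregex query, showing words ('w')
result = corpus.interrogate(search={'t': r'/NN.?/ >># NP'}, show=['w'])
print(result.results.head())
print(result.totals)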
Code example #8
File: multiprocess.py Project: mrinalgrover/corpkit
def pmultiquery(corpus, 
                search,
                show='words',
                query='any', 
                sort_by='total', 
                save=False,
                multiprocess='default', 
                root=False,
                note=False,
                print_info=True,
                subcorpora=False,
                **kwargs
               ):
    """
    - Parallel process multiple queries or corpora.
    - This function is used by corpkit.interrogator.interrogator()
    - for multiprocessing.
    - There's no reason to call this function yourself.
    """
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.process import canpickle
    try:
        from joblib import Parallel, delayed
    except ImportError:
        pass
    import multiprocessing

    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v
    in_notebook = locs.get('in_notebook')

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) \
                               if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores
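    # e.g. best_num_parallel(4, 2) -> 2; best_num_parallel(4, 16) -> 4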

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple = kwargs.get('multiple', False)
    mult_corp_are_subs = False
    if hasattr(corpus, '__iter__'):
        if all(getattr(x, 'level', False) == 's' for x in corpus):
            mult_corp_are_subs = True

    non_first_sub = None
    if subcorpora:
        non_first_sub = subcorpora[1:] if isinstance(subcorpora, list) else None
        subval = subcorpora if not non_first_sub else subcorpora[0]
        #print(subcorpora, non_first_sub, subval)
        if subcorpora is True:
            import re
            subcorpora = re.compile(r'.*')
        else:
            # strange travis error happened here
            subcorpora = corpus.metadata['fields'][subval]
            if len(subcorpora) == 0:
                print('No %s metadata found.' % str(subval))
                return

    mapcores = {'datalist': [corpus, 'corpus'],
                'multiplecorpora': [corpus, 'corpus'],
                'namedqueriessingle': [query, 'query'],
                'namedqueriesmultiple': [search, 'search'],
                'subcorpora': [subcorpora, 'subcorpora']}

    # fall back to (False, False) when 'multiple' is not a recognised mode
    toiter, itsname = mapcores.get(multiple, [False, False])
    if isinstance(toiter, dict):
        toiter = toiter.items()
    denom = len(toiter)
    num_cores = best_num_parallel(num_cores, denom)
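    # For instance (hypothetical values): with multiple='namedqueriessingle'
    # and query={'nouns': r'/NN.?/', 'verbs': r'/VB.?/'}, toiter is the dict's
    # items and itsname is 'query', so denom is 2 and each worker later
    # receives one (outname, query) pair.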

    # todo: code below makes no sense -- vals holds strings, so
    # `any(x is True for x in vals)` is always False and the branch never runs
    vals = ['eachspeaker', 'multiplespeaker', 'namedqueriesmultiple']
    if multiple == 'multiplecorpora' and any(x is True for x in vals):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    # bool is a subclass of int, so screen out True/False before this check
    if isinstance(multiprocess, int) and not isinstance(multiprocess, bool):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    locs['printstatus'] = False
    locs['multiprocess'] = False
    locs['df1_always_df'] = False
    locs['files_as_subcorpora'] = False
    locs['corpus'] = corpus

    if multiple == 'multiplespeaker':
        locs['multispeaker'] = True

    if isinstance(non_first_sub, list) and len(non_first_sub) == 1:
        non_first_sub = non_first_sub[0]

    # drop anything that can't be pickled, since it can't be sent to a worker
    locs = {k: v for k, v in locs.items() if canpickle(v)}
    # make a new dict for every iteration
    ds = [dict(**locs) for i in range(denom)]
    for index, (d, bit) in enumerate(zip(ds, toiter)):
        d['paralleling'] = index
        if multiple in ['namedqueriessingle', 'namedqueriesmultiple']:
            d[itsname] = bit[1]
            d['outname'] = bit[0]
        elif multiple in ['multiplecorpora', 'datalist']:
            d['outname'] = bit.name.replace('-parsed', '')
            d[itsname] = bit
        elif multiple in ['subcorpora']:
            d[itsname] = bit
            jmd = {subval: bit}
            # put this earlier
            j2 = kwargs.get('just_metadata', False)
            if not j2:
                j2 = {}
            jmd.update(j2)
    
            d['just_metadata'] = jmd
            d['outname'] = bit
            d['by_metadata'] = False
            d['subcorpora'] = non_first_sub
            if non_first_sub:
                d['print_info'] = False
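
    # By way of example (hypothetical 'year' metadata with values '2010' and
    # '2011'), the 'subcorpora' branch above produces one worker dict per
    # value, roughly: {'paralleling': 0, 'outname': '2010',
    #                  'just_metadata': {'year': '2010'}, 'by_metadata': False,
    #                  'subcorpora': non_first_sub, ...}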

    # message printer should be a function...
    conc_opt = kwargs.get('conc')
    if conc_opt is True:
        message = 'Interrogating and concordancing'
    elif isinstance(conc_opt, str) and conc_opt.lower() == 'only':
        message = 'Concordancing'
    else:
        # covers conc=False and conc left unset, so message is always bound
        message = 'Interrogating'

    time = strftime("%H:%M:%S", localtime())
    from corpkit.process import dictformat
    
    if print_info:

        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        sformat = dictformat(search, query)

        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple in ['multiplecorpora', 'datalist']:
            corplist = "\n              ".join([i.name for i in list(corpus)[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n              %s" \
               "\n          Query: %s\n          %s corpus ... \n"  % (time, len(corpus), num_cores, add_es, corplist, sformat, message)))

        elif multiple == 'namedqueriessingle':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(query), num_cores,  add_es, corpus.name, sformat, message) ))

        elif multiple == 'namedqueriesmultiple':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message)))

        elif multiple in ['eachspeaker', 'multiplespeaker', 'subcorpora']:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message)))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    terminal = False
    # todo: the number of blank lines to print can be way wrong
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except Exception:
                pass

    if not root and multiprocess:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        except Exception:
            print('Multiprocessing failed.')
            raise
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted([i for i in res if i])
        except Exception:
            # Interrogation objects may not be orderable; keep original order
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    if hasattr(qlocs.get('corpus', False), 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = [i.path for i in qlocs.get('corpus', [])]

    # return just a concordance
    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        concs = pd.concat(res)
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        if kwargs.get('maxconc'):
            concs = concs[:kwargs.get('maxconc')]
        lines = Concordance(concs)
        
        if save:
            lines.save(save, print_info=print_info)

        if print_info:
            print('\n\n%s: Finished! %s results.\n\n' % (thetime, format(len(concs.index), ',')))

        return lines

    # return interrodict (to become multiindex)
    if isinstance(res[0], Interrodict) or not all(isinstance(i.results, Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog

        idict = Interrodict(out)
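        # (Interrodict wraps this OrderedDict of Interrogation objects, keyed
        # by each worker's 'outname', and is reported as multi-indexed below)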
        
        if print_info:
            thetime = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is multi-indexed." % thetime)
        idict.query = qlocs

        if save:
            idict.save(save, print_info=print_info)

        return idict

    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    else:
        if multiple == 'multiplecorpora' and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = DataFrame(sers, index=[i.query['outname'] for i in res])
            out = out[sorted(out.columns)] # sort cols (reindex_axis is gone from pandas)
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T            
        else:
            # make a series from counts
            if all(len(i.results) == 1 for i in res):
                out = pd.concat([r.results for r in res])
                out = out.sort_index()
            else:
                try:
                    out = pd.concat([r.results for r in res], axis=1)
                    out = out.T
                    out.index = [i.query['outname'] for i in res]
                except ValueError:
                    return None
                # format like normal
                # this sorts subcorpora, which are cls
                out = out[sorted(list(out.columns))]
                # puts subcorpora in the right place
                if not mult_corp_are_subs and multiple != 'subcorpora':
                    out = out.T
                if multiple == 'subcorpora':
                    out = out.sort_index()
                out = out.fillna(0) # nan to zero
                out = out.astype(int)
                if 'c' in show and mult_corp_are_subs:
                    out = out.sum()
                    out.index = sorted(list(out.index))

        # sort by total
        if isinstance(out, DataFrame):

            out = out[list(out.sum().sort_values(ascending=False).index)]

            # really need to figure out the deal with transposing!
            if all(x.endswith('.xml') for x in list(out.columns)) \
            or all(x.endswith('.txt') for x in list(out.columns)) \
            or all(x.endswith('.conll') for x in list(out.columns)):
                out = out.T
                
            if kwargs.get('nosubmode'):
                out = out.sum()
    
        from corpkit.interrogation import Interrogation
        tt = out.sum(axis=1) if isinstance(out, DataFrame) else out.sum()
        out = Interrogation(results=out, totals=tt, query=qlocs)

        # the Interrogation wrapper has no .columns, so inspect .results here
        if hasattr(out.results, 'columns') and len(out.results.columns) == 1:
            out.results = out.results.sort_index()

        if kwargs.get('conc') is True:
            try:
                concs = pd.concat([x.concordance for x in res], ignore_index=True)
                concs = concs.sort_values(by='c')
                concs = concs.reset_index(drop=True)
                if kwargs.get('maxconc'):
                    concs = concs[:kwargs.get('maxconc')]
                out.concordance = Concordance(concs)
            except ValueError:
                out.concordance = None

        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            print(terminal.move(terminal.height-1, 0))
        if print_info:
            if hasattr(out.results, 'columns'):
                print('%s: Interrogation finished! %s unique results, %s total.' % (thetime, format(len(out.results.columns), ','), format(out.totals.sum(), ',')))
            else:
                print('%s: Interrogation finished! %s matches.' % (thetime, format(tt, ',')))
        if save:
            out.save(save, print_info=print_info)

        if list(out.results.index) == ['0'] and not kwargs.get('df1_always_df'):
            # .ix was removed from pandas; use positional .iloc instead
            out.results = out.results.iloc[0].sort_index()
        return out
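
# A minimal usage sketch (not part of the source above). pmultiquery is
# normally reached indirectly, when an interrogation is given something to
# parallelise, such as a dict of named queries. The corpus path and the
# Tregex patterns below are hypothetical placeholders.
from corpkit.corpus import Corpus

corpus = Corpus('data/example-parsed')   # hypothetical parsed corpus
queries = {'nouns': r'/NN.?/ < __',      # hypothetical Tregex patterns
           'verbs': r'/VB.?/ < __'}
# a dict of named queries should fan out to one worker per query
res = corpus.interrogate(search='t', query=queries, multiprocess=2)
print(res.results)   # results combined across the named queries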