Example #1
0
    def edit(self, *args, **kwargs):
        """
        Delete or keep rows by subcorpus or by middle column text.

        Every positional and keyword argument is handed on unchanged to
        ``corpkit.editor.editor``, with this object as its first argument.

        >>> skipped = conc.edit(skip_entries=r'to_?match')

        :returns: whatever ``corpkit.editor.editor`` returns
        """
        # imported when the method is called, not when the module is loaded
        from corpkit.editor import editor as run_editor

        edited = run_editor(self, *args, **kwargs)
        return edited
Example #2
0
    def edit(self, *args, **kwargs):
        """
        Delete or keep rows by subcorpus or by middle column text.

        This is a thin wrapper: the work is done by
        ``corpkit.editor.editor``, called with this object followed by
        the arguments given here.

        >>> skipped = conc.edit(skip_entries=r'to_?match')

        :returns: the result of the ``editor`` call
        """
        import corpkit.editor

        return corpkit.editor.editor(self, *args, **kwargs)
Example #3
0
    def edit(self, *args, **kwargs):
        """Apply :func:`~corpkit.interrogation.Interrogation.edit` to each value.

        The arguments are the same as those accepted by
        :func:`~corpkit.interrogation.Interrogation.edit`; they are passed
        on untouched to ``corpkit.editor.editor`` together with this object.

        :returns: A :class:`corpkit.interrogation.Interrodict`
        """
        # imported when the method is called, not when the module is loaded
        from corpkit.editor import editor as run_editor

        edited = run_editor(self, *args, **kwargs)
        return edited
Example #4
0
    def edit(self, *args, **kwargs):
        """Edit every value via :func:`~corpkit.interrogation.Interrogation.edit`.

        Accepts the arguments documented for
        :func:`~corpkit.interrogation.Interrogation.edit` and forwards them,
        along with this object, to ``corpkit.editor.editor``.

        :returns: A :class:`corpkit.interrogation.Interrodict`
        """
        import corpkit.editor

        return corpkit.editor.editor(self, *args, **kwargs)
Example #5
0
def multiquery(corpus, query, sort_by = 'total', quicksave = False):
    """Count several named queries in a corpus and combine the totals.

    Each pattern is run through ``interrogator(corpus, 'count', pattern)``.
    The ``totals`` of every run are renamed to the query's name, joined
    column-wise and passed through ``editor`` for sorting.

    :param corpus: corpus to search, passed straight to ``interrogator``
    :param query: iterable of ``[name, pattern]`` pairs, for example
                  ``[[u'NPs in corpus', r'NP'], [u'VPs in corpus', r'VP']]``
    :param sort_by: sort order handed to ``editor``
    :param quicksave: name to save the result under (``.p`` is appended
                      when missing), or a false value to skip saving
    :returns: the object returned by ``editor``
    """

    import os
    import pandas as pd
    from time import strftime, localtime
    from corpkit.interrogator import interrogator
    from corpkit.editor import editor

    if quicksave:
        # raw_input only exists on Python 2; use input on Python 3
        try:
            ask = raw_input
        except NameError:
            ask = input
        savedir = 'data/saved_interrogations'
        if not quicksave.endswith('.p'):
            quicksave = quicksave + '.p'
        fullpath = os.path.join(savedir, quicksave)
        # keep asking until the chosen name does not clash with a saved file
        while os.path.isfile(fullpath):
            selection = ask("\nSave error: %s already exists in %s.\n\nPick a new name: " % (quicksave, savedir))
            if not selection.endswith('.p'):
                selection = selection + '.p'
            # adopt the new name unconditionally: it must be re-checked by the
            # loop condition and is the name the result is saved under
            quicksave = selection
            fullpath = os.path.join(savedir, quicksave)

    totals = []
    for name, pattern in query:
        result = interrogator(corpus, 'count', pattern)
        result.totals.name = name # rename count
        totals.append(result.totals)
    results = pd.concat(totals, axis = 1)

    results = editor(results, sort_by = sort_by, print_info = False, keep_stats = False)
    thetime = strftime("%H:%M:%S", localtime())
    # a single parenthesised argument prints identically on Python 2 and 3
    print('%s: Finished! %d unique results, %d total.' % (thetime, len(results.results.columns), results.totals.sum()))
    if quicksave:
        from corpkit.other import save_result
        save_result(results, quicksave)
    return results
Example #6
0
def interrogation_from_conclines(newdata):
    """
    Rebuild an interrogation result from a table of concordance lines.

    Values in the ``m`` column are counted separately for each distinct
    value of the ``c`` column. The counts become a DataFrame with one row
    per ``c`` value (sorted) and one column per distinct ``m`` value, which
    is then passed through ``editor`` with ``sort_by='total'``.

    :param newdata: concordance lines with ``c`` and ``m`` columns
    :returns: the ``editor`` output, with ``newdata`` attached as its
              ``concordance`` attribute
    """
    from collections import Counter
    from pandas import DataFrame
    from corpkit.editor import editor

    conc = newdata

    # one Counter of middle-column values per subcorpus
    results = {}
    for subcorpus in list(set(conc['c'])):
        in_subcorpus = conc[conc['c'] == subcorpus]
        results[subcorpus] = Counter(list(in_subcorpus['m']))

    row_names = sorted(results.keys())
    unique_results = set(item for counts in list(results.values()) for item in counts)

    # a Counter gives 0 for a word that never occurred in a subcorpus
    the_big_dict = {word: [results[name][word] for name in row_names]
                    for word in unique_results}

    # turn master dict into dataframe, then let editor sort it
    df = DataFrame(the_big_dict, index=row_names)
    df = editor(df, sort_by='total', print_info=False)
    df.concordance = conc
    return df
Example #7
0
def interrogator(corpus, 
    search='w', 
    query='any',
    show='w',
    exclude=False,
    excludemode='any',
    searchmode='all',
    case_sensitive=False,
    save=False,
    subcorpora=False,
    just_metadata=False,
    skip_metadata=False,
    preserve_case=False,
    lemmatag=False,
    files_as_subcorpora=False,
    only_unique=False,
    only_format_match=True,
    multiprocess=False,
    spelling=False,
    regex_nonword_filter=r'[A-Za-z0-9]',
    gramsize=1,
    conc=False,
    maxconc=9999,
    window=None,
    no_closed=False,
    no_punct=True,
    discard=False,
    **kwargs):
    """
    Interrogate corpus, corpora, subcorpus and file objects.
    See corpkit.interrogation.interrogate() for docstring
    """
    
    conc = kwargs.get('do_concordancing', conc)
    quiet = kwargs.get('quiet', False)
    coref = kwargs.pop('coref', False)
    show_conc_metadata = kwargs.pop('show_conc_metadata', False)
    fsi_index = kwargs.pop('fsi_index', True)
    dep_type = kwargs.pop('dep_type', 'collapsed-ccprocessed-dependencies')

    nosubmode = subcorpora is None
    #todo: temporary
    #if getattr(corpus, '_dlist', False):
    #    subcorpora = 'file'

    # store kwargs and locs
    locs = locals().copy()
    locs.update(kwargs)
    locs.pop('kwargs', None)

    import codecs
    import signal
    import os
    from time import localtime, strftime
    from collections import Counter

    import pandas as pd
    from pandas import DataFrame, Series

    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.corpus import Datalist, Corpora, Corpus, File, Subcorpus
    from corpkit.process import (tregex_engine, get_deps, unsplitter, sanitise_dict, 
                                 animator, filtermaker, fix_search,
                                 pat_format, auto_usecols, format_tregex,
                                 make_conc_lines_from_whole_mid)
    from corpkit.other import as_regex
    from corpkit.dictionaries.process_types import Wordlist
    from corpkit.build import check_jdk
    from corpkit.conll import pipeline
    from corpkit.process import delete_files_and_subcorpora
    
    have_java = check_jdk()

    # remake corpus without bad files and folders 
    corpus, skip_metadata, just_metadata = delete_files_and_subcorpora(corpus, skip_metadata, just_metadata)

    # so you can do corpus.interrogate('features/postags/wordclasses/lexicon')
    if search == 'features':
        search = 'v'
        query = 'any'
    if search in ['postags', 'wordclasses']:
        query = 'any'
        preserve_case = True
        show = 'p' if search == 'postags' else 'x'
        # use tregex if simple because it's faster
        # but use dependencies otherwise
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
    if search == 'lexicon':
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
        query = 'any'
        show = ['w']

    if not kwargs.get('cql') and isinstance(search, STRINGTYPE) and len(search) > 3:
        raise ValueError('search argument not recognised.')

    import re
    if regex_nonword_filter:
        is_a_word = re.compile(regex_nonword_filter)
    else:
        is_a_word = re.compile(r'.*')

    from traitlets import TraitError

    # convert cql-style queries---pop for the sake of multiprocessing
    cql = kwargs.pop('cql', None)
    if cql:
        from corpkit.cql import to_corpkit
        search, exclude = to_corpkit(search)

    def signal_handler(signal, _):
        """
        Allow pausing and restarting when not in GUI

        Installed as the SIGINT handler. Does nothing when ``root`` (read
        from the enclosing scope) is set. Otherwise ``original_sigint`` is
        put back as the SIGINT handler while the user is prompted, and
        once the prompt returns this function is installed again.

        :param signal: first handler argument; the local ``import signal``
                       below rebinds this name, so the value is never used
        :param _: second handler argument, ignored
        """
        if root:
            return  
        # this import rebinds the ``signal`` parameter to the signal module
        import signal
        import sys
        from time import localtime, strftime
        # restore the previous handler for the duration of the prompt
        signal.signal(signal.SIGINT, original_sigint)
        thetime = strftime("%H:%M:%S", localtime())
        INPUTFUNC('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
        time = strftime("%H:%M:%S", localtime())
        print('%s: Interrogation resumed.\n' % time)
        # re-arm the pause handler for the next ctrl+c
        signal.signal(signal.SIGINT, signal_handler)

    def add_adj_for_ngram(show, gramsize):
        """
        Extend show with offset entries when gramsize is not 1.

        With a gramsize of 1 the show object is returned as it is.
        Otherwise a new list is built: the original entries first, then,
        for every offset from 1 up to gramsize - 1, each entry prefixed
        with ``+<offset>``.
        """
        if gramsize == 1:
            return show
        expanded = list(show)
        expanded.extend('+%d%s' % (offset, bit)
                        for offset in range(1, gramsize)
                        for bit in show)
        return expanded

    def fix_show_bit(show_bit):
        """
        Normalise a single show value into its full form.

        Leading ``n`` characters and then leading ``b`` characters are
        dropped. If what remains does not end in one of the recognised
        final characters, ``w`` is appended; if it does not start with one
        of the recognised initial characters, ``m`` is prepended. So
        ``'l'`` becomes ``'ml'`` and ``'g'`` becomes ``'gw'``.

        A value that is empty once the prefixes are stripped (``''``,
        ``'n'``, ``'b'``) becomes ``'mw'`` rather than raising IndexError.

        :param show_bit: one show string
        :returns: the normalised show string
        """
        ends = ['w', 'l', 'i', 'n', 'f', 'p', 'x', 's', 'a', 'e', 'c']
        starts = ['d', 'g', 'm', 'b', 'h', '+', '-', 'r', 'c']
        chars = list(show_bit.lstrip('n').lstrip('b'))
        # guard the indexing: stripping the prefixes can leave nothing behind
        if not chars or chars[-1] not in ends:
            chars.append('w')
        if chars[0] not in starts:
            chars.insert(0, 'm')
        return ''.join(chars)

    def fix_show(show, gramsize):
        """
        Normalise the show argument into a list of full show values.

        A list has each item lowercased; a single string is lowercased and
        wrapped in a list; anything else is iterated as it is. Every item
        then goes through ``fix_show_bit`` and the result is expanded for
        ngrams by ``add_adj_for_ngram``.
        """
        if isinstance(show, list):
            show = [bit.lower() for bit in show]
        elif isinstance(show, STRINGTYPE):
            show = [show.lower()]
        normalised = [fix_show_bit(bit) for bit in show]
        return add_adj_for_ngram(normalised, gramsize)

    def is_multiquery(corpus, search, query, outname):
        """
        Determine if multiprocessing is needed/possible, and 
        do some retyping if need be as well

        ``subcorpora`` and ``multiprocess`` are read from the enclosing
        scope. The checks run in order and a later match overwrites an
        earlier one.

        :param corpus: returned unchanged
        :param search: returned unchanged; only inspected
        :param query: converted to a list when it is a Wordlist
        :param outname: accepted but not used here
        :returns: tuple of (is_mul, corpus, search, query), where is_mul is
                  False, 'subcorpora', 'namedqueriessingle' or
                  'namedqueriesmultiple'
        """
        is_mul = False
        from collections import OrderedDict
        from corpkit.dictionaries.process_types import Wordlist
        
        if isinstance(query, Wordlist):
            query = list(query)

        if subcorpora and multiprocess:
            is_mul = 'subcorpora'

        if isinstance(subcorpora, (list, tuple)):
            is_mul = 'subcorpora'

        # a dict of queries: one named query per key
        if isinstance(query, (dict, OrderedDict)):
            is_mul = 'namedqueriessingle'
        
        # a dict whose values are all dicts: several named searches
        if isinstance(search, dict):
            if all(isinstance(i, dict) for i in list(search.values())):
                is_mul = 'namedqueriesmultiple'
        return is_mul, corpus, search, query

    def ispunct(s):
        """
        Return True when every character of s is in string.punctuation.

        An empty s also gives True, since there is no character to fail.
        """
        from string import punctuation
        for char in s:
            if char not in punctuation:
                return False
        return True

    def uniquify(conc_lines):
        """
        Drop repeated concordance lines, keeping the first of each.

        Two lines count as the same when their speaker, start, middle and
        end fields are equal; the first field of each line is ignored in
        the comparison.

        :param conc_lines: iterable of five-item lines of strings
        :returns: list of the original line objects, in their original order
        """
        # NOTE(review): lines with more than five fields fail to unpack
        # here - confirm against the column layout the caller produces
        unique_lines = []
        # a set gives constant-time membership tests; testing against a
        # list made this quadratic in the number of lines
        seen = set()
        for line in conc_lines:
            _, speakr, start, middle, end = line
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in seen:
                seen.add(joined)
                unique_lines.append(line)
        return unique_lines

    def compiler(pattern):
        """
        Compile regex or fail gracefully

        Objects that already have a ``pattern`` attribute are returned
        untouched. Anything else is compiled, ignoring case unless
        ``case_sensitive`` (from the enclosing scope) is set.

        :param pattern: regex source, or an already compiled regex
        :returns: the compiled regex, or the string 'Bad query' when
                  compiling fails and ``root`` is set
        :raises ValueError: when compiling fails and ``root`` is not set
        """
        if hasattr(pattern, 'pattern'):
            return pattern
        import re
        flags = 0 if case_sensitive else re.IGNORECASE
        try:
            return re.compile(pattern, flags)
        # Exception, not a bare except: KeyboardInterrupt and SystemExit
        # must not be reported as a bad query
        except Exception as err:
            import traceback
            from time import localtime, strftime
            # last line of the formatted exception, i.e. "<type>: <message>"
            error_message = traceback.format_exception_only(type(err), err)[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def determine_search_func(show):
        """
        Figure out what search function we're using

        Reads ``search``, ``kwargs``, ``datatype``, ``have_java``,
        ``files_as_subcorpora``, ``subcorpora``, ``just_metadata`` and
        ``skip_metadata`` from the enclosing scope. ``show`` is accepted
        but not consulted.

        :returns: tuple of (optiontext, simple_tregex_mode, statsmode,
                  tree_to_text, search_trees). search_trees is False,
                  'tregex' or 'tgrep'; tree_to_text is always False.
        """

        simple_tregex_mode = False
        statsmode = False
        tree_to_text = False
        search_trees = False
        # fallback label: when neither branch below applies (no simple 't'
        # search and a datatype other than 'conll') the name used to be
        # unbound at the return statement, raising UnboundLocalError
        optiontext = 'Querying corpus'

        # the simple tregex path is only possible when none of these are set
        simp_crit = all(not i for i in [kwargs.get('tgrep'),
                                        files_as_subcorpora,
                                        subcorpora,
                                        just_metadata,
                                        skip_metadata])

        if search.get('t') and simp_crit:
            if have_java:
                simple_tregex_mode = True
            else:
                search_trees = 'tgrep'
            optiontext = 'Searching parse trees'

        elif datatype == 'conll':

            if any(i.endswith('t') for i in search.keys()):
                # tregex needs java; tgrep is used without it or on request
                if have_java and not kwargs.get('tgrep'):
                    search_trees = 'tregex'
                else:
                    search_trees = 'tgrep'
                optiontext = 'Searching parse trees'
            elif any(i.endswith('v') for i in search.keys()):
                statsmode = True
                optiontext = 'General statistics'
            elif any(i.endswith('r') for i in search.keys()):
                optiontext = 'Distance from root'
            else:
                optiontext = 'Querying CONLL data'

        return optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees

    def get_tregex_values(show):
        """If using Tregex, set appropriate values

        - Check for valid query
        - Make 'any' query
        - Make list query

        ``search``, ``root``, ``preserve_case`` and ``case_sensitive`` are
        read from the enclosing scope, and ``search['t']`` is rewritten in
        place when it is a Wordlist, a list or the string 'any'.

        :param show: list of show values
        :returns: ('Bad query', None) when ``tregex_engine`` returns False
                  for the query check; otherwise a tuple of
                  (``search['t']``, list of translated options for show)
        """

        translated_option = 't'
        if isinstance(search['t'], Wordlist):
            search['t'] = list(search['t'])
        # check_query=True: only the query's validity is being tested here
        q = tregex_engine(corpus=False,
                          query=search.get('t'),
                          options=['-t'],
                          check_query=True,
                          root=root,
                          preserve_case=preserve_case
                         )

        # drop the leading 'm' from each show value; a bare 'm' becomes 'w'
        nshow = []
        for i in show:
            if i == 'm':
                nshow.append('w')
            else:
                nshow.append(i.lstrip('m'))
        show = nshow

        # NOTE(review): both branches below return the same value
        if q is False:
            if root:
                return 'Bad query', None
            else:
                return 'Bad query', None

        # a list query is turned into a single regex; otherwise the
        # list-query patterns below are built but not used
        if isinstance(search['t'], list):
            regex = as_regex(search['t'], boundaries='line', case_sensitive=case_sensitive)
        else:
            regex = ''

        # listquery, anyquery, translated_option
        treg_dict = {'p': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'pl': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'x': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     't': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'o'],
                     'w': [r'/%s/ !< __' % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'c': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 'C'],
                     'l': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'u': [r'/%s/ !< __'  % regex, r'/.?[A-Za-z0-9].?/ !< __', 'v']
                    }

        newshow = []

        # NOTE(review): the first show value is looked up by its last
        # character, the remaining ones by their whole string; a key that
        # is missing from treg_dict makes the unpacking fail on None
        listq, anyq, translated_option = treg_dict.get(show[0][-1].lower())
        newshow.append(translated_option)
        for item in show[1:]:
            _, _, noption = treg_dict.get(item.lower())
            newshow.append(noption)

        # any other query string is left as the user wrote it
        if isinstance(search['t'], list):
            search['t'] = listq
        elif search['t'] == 'any':   
            search['t'] = anyq
        return search['t'], newshow

    def correct_spelling(a_string):
        """
        Convert the spelling of each slash-separated part of a string.

        Returns the string untouched when ``spelling`` (from the enclosing
        scope) is not set. Otherwise each part is looked up, lowercased,
        in ``usa_convert`` - or in its inverse when spelling is 'uk' - and
        the case of the replacement follows the case of the original part
        when ``preserve_case`` is set.
        """
        if not spelling:
            return a_string
        from corpkit.dictionaries.word_transforms import usa_convert
        mapping = usa_convert
        if spelling.lower() == 'uk':
            # invert the mapping so it converts in the other direction
            mapping = {v: k for k, v in list(usa_convert.items())}
        fixed = []
        for part in a_string.split('/'):
            replacement = mapping.get(part.lower(), part)
            if part.islower() or preserve_case is False:
                replacement = replacement.lower()
            elif part.isupper() and preserve_case:
                replacement = replacement.upper()
            elif part.istitle() and preserve_case:
                replacement = replacement.title()
            fixed.append(replacement)
        return '/'.join(fixed)

    def make_search_iterable(corpus):
        """
        Determine how to structure the corpus for interrogation

        ``simple_tregex_mode`` and ``files_as_subcorpora`` are read from
        the enclosing scope.

        :param corpus: corpus-like object to be searched
        :returns: dict keyed by (name, path) tuples. Each value is a list
                  of file objects, or False in simple tregex mode, where
                  only the path is needed.
        """
        # skip file definitions if they are not needed
        if getattr(corpus, '_dlist', False):

            # one entry per file
            return {(i.name, i.path): [i] for i in list(corpus.files)}

        if simple_tregex_mode:
            if corpus.level in ['s', 'f', 'd']:
                return {(corpus.name, corpus.path): False}
            else:
                # one entry per directory directly under the corpus path
                return {(os.path.basename(i), os.path.join(corpus.path, i)): False
                    for i in os.listdir(corpus.path)
                    if os.path.isdir(os.path.join(corpus.path, i))}

        if isinstance(corpus, Datalist):
            to_iterate_over = {}
            # it could be files or subcorpus objects
            # NOTE(review): a first item with any other level leaves the
            # dict empty - confirm that no other level can occur here
            if corpus[0].level in ['s', 'd']:
                if files_as_subcorpora:
                    for subc in corpus:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subc in corpus:
                        to_iterate_over[(subc.name, subc.path)] = subc.files
            elif corpus[0].level == 'f':
                for f in corpus:
                    to_iterate_over[(f.name, f.path)] = [f]
        elif corpus.singlefile:
            to_iterate_over = {(corpus.name, corpus.path): [corpus]}
        elif not hasattr(corpus, 'subcorpora') or not corpus.subcorpora:
            # just files in a directory
            if files_as_subcorpora:
                to_iterate_over = {}
                for f in corpus.files:
                    to_iterate_over[(f.name, f.path)] = [f]
            else:
                to_iterate_over = {(corpus.name, corpus.path): corpus.files}
        else:
            to_iterate_over = {}
            if files_as_subcorpora:
                # don't know if possible: has subcorpora but also .files
                if hasattr(corpus, 'files') and corpus.files is not None:
                    for f in corpus.files:
                        to_iterate_over[(f.name, f.path)] = [f]
                # has subcorpora with files in those
                elif hasattr(corpus, 'files') and corpus.files is None:
                    for subc in corpus.subcorpora:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
            else:
                # the level of the first item decides how entries are grouped
                if corpus[0].level == 's':
                    for subcorpus in corpus:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
                elif corpus[0].level == 'f':
                    for f in corpus:
                        to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subcorpus in corpus.subcorpora:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        return to_iterate_over

    def welcome_printer(return_it=False):
        """
        Build the welcome message, then print or return it.

        Nothing is printed or returned when the 'printstatus' keyword
        argument is false. The wording depends on ``only_conc`` and
        ``no_conc`` from the enclosing scope.

        :param return_it: return the message instead of printing it
        :returns: the message when return_it is set, otherwise None
        """
        if only_conc:
            message = 'Concordancing'
        elif no_conc:
            message = 'Interrogating'
        else:
            message = 'Interrogating and concordancing'

        if not kwargs.get('printstatus', True):
            return

        thetime = strftime("%H:%M:%S", localtime())
        from corpkit.process import dictformat
        sformat = dictformat(search)
        template = ('\n%s: %s %s ...\n          %s\n          '
                    'Query: %s\n          %s corpus ... \n')
        welcome = template % (thetime, message, cname, optiontext, sformat, message)
        if return_it:
            return welcome
        print(welcome)

    def goodbye_printer(return_it=False, only_conc=False):
        """
        Build the closing message, then print or return it.

        Does nothing when the 'printstatus' keyword argument is false.
        The counts reported come from the enclosing scope (``conc_df``,
        ``countmode``, ``tot``, ``numentries``, ``total_total``).

        :param return_it: return the message instead of printing it
        :param only_conc: report the number of concordance lines only
        :returns: the message when return_it is set, otherwise None
        """
        if not kwargs.get('printstatus', True):
            return
        thetime = strftime("%H:%M:%S", localtime())
        if only_conc:
            finalstring = '\n\n%s: Concordancing finished! %s results.' % (thetime, format(len(conc_df), ','))
        else:
            if countmode:
                detail = ' %s matches.' % format(tot, ',')
            else:
                detail = ' %s unique results, %s total occurrences.' % (format(numentries, ','), format(total_total, ','))
            finalstring = ('\n\n%s: Interrogation finished!' % thetime) + detail
        if return_it:
            return finalstring
        print(finalstring)

    def get_conc_colnames(corpus,
                          fsi_index=False,
                          simple_tregex_mode=False):
        """
        Work out the column names for concordance lines.

        The base columns are 'c f s l m r'; 'f' is left out in simple
        tregex mode, and 'i' is put first when fsi_index is set outside
        that mode. When ``show_conc_metadata`` (from the enclosing scope)
        is set, metadata field names found for the corpus path are added
        in sorted order, apart from 'speaker', 'sent_id' and 'parse'.
        On Python 2 every name is encoded as UTF-8.

        :returns: list of column names
        """
        layout = 'c f s l m r'
        if simple_tregex_mode:
            layout = layout.replace('f ', '')
        elif fsi_index:
            layout = 'i ' + layout

        on_py2 = PYTHON_VERSION == 2
        columns = layout.encode('utf-8').split() if on_py2 else layout.split()

        if not show_conc_metadata:
            return columns

        from corpkit.build import get_all_metadata_fields
        meta = get_all_metadata_fields(corpus.path)
        # a list restricts the metadata columns to the fields it names
        if isinstance(show_conc_metadata, list):
            meta = [field for field in meta if field in show_conc_metadata]
        for field in sorted(meta):
            if field in ['speaker', 'sent_id', 'parse']:
                continue
            columns.append(field.encode('utf-8') if on_py2 else field)
        return columns

    def make_conc_obj_from_conclines(conc_results, fsi_index=False):
        """
        Turn conclines into DataFrame

        ``only_unique``, ``conc_col_names``, ``maxconc``, ``corpus`` and
        ``locs`` are read from the enclosing scope; ``locs['corpus']`` is
        overwritten with the corpus name.

        :param conc_results: dict mapping a subcorpus name to its list of
                             concordance lines
        :param fsi_index: accepted but not used in the body
        :returns: a Concordance cut to the first ``maxconc`` rows (all
                  rows when maxconc is false), or None when there are no
                  lines to concatenate
        """
        from corpkit.interrogation import Concordance
        #fsi_place = 2 if fsi_index else 0

        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            for lin in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                #if not subcorpora:
                #    lin[fsi_place] = lin[fsi_place]
                #lin.insert(fsi_place, sc_name)

                # pad short lines in place with 'none' to fill every column
                if len(lin) < len(conc_col_names):
                    diff = len(conc_col_names) - len(lin)
                    lin.extend(['none'] * diff)

                all_conc_lines.append(Series(lin, index=conc_col_names))

        # pd.concat raises ValueError when given an empty list
        try:
            conc_df = pd.concat(all_conc_lines, axis=1).T
        except ValueError:
            return
        
        # drop the 's' column when it carries no information at all
        if all(x == '' for x in list(conc_df['s'].values)) or \
           all(x == 'none' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis=1, inplace=True)

        locs['corpus'] = corpus.name

        if maxconc:
            conc_df = Concordance(conc_df[:maxconc])
        else:
            conc_df = Concordance(conc_df)
        # attach the query arguments; skipped if the attribute cannot be set
        try:
            conc_df.query = locs
        except AttributeError:
            pass
        return conc_df

    def lowercase_result(res):
        """
        Apply spelling conversion to a result, where that is wanted.

        The result comes back untouched when it is empty, when
        ``statsmode`` is set, or when ``spelling`` is not set (both read
        from the enclosing scope). Otherwise every item is passed through
        ``correct_spelling`` and a new list is returned.

        todo: remove lowercase and change name
        """
        # this is likely broken, but spelling in interrogate is deprecated anyway
        if (not res or statsmode) or not spelling:
            return res
        return [correct_spelling(item) for item in res]

    def postprocess_concline(line, fsi_index=False, conc=False):
        """
        Lowercase and/or spelling-convert part of a concordance line.

        The line is returned untouched when conc is false. Otherwise a
        slice of it - items 2 to 4, or 4 to 6 when fsi_index is set - is
        replaced in place: lowercased unless ``preserve_case`` is set, and
        passed through ``correct_spelling`` when ``spelling`` is set
        (both read from the enclosing scope).

        :returns: the same line object
        """
        # todo: are these right?
        if not conc:
            return line
        first, stop = (4, 7) if fsi_index else (2, 5)
        if not preserve_case:
            line[first:stop] = [str(item).lower() for item in line[first:stop]]
        if spelling:
            line[first:stop] = [correct_spelling(str(item)) for item in line[first:stop]]
        return line

    def make_progress_bar():
        """
        Generate a progress bar

        Everything is read from the enclosing scope: ``to_iterate_over``,
        ``simple_tregex_mode``, ``kwargs``, ``root``, ``note``, ``quiet``,
        ``in_notebook``, ``welcome_message`` and ``current_iter``.
        ``animator`` is called once with init=True and then once more with
        the bar it returned.

        :returns: tuple of (object returned by the first ``animator``
                  call, outname prefix string, total length of the bar,
                  keyword arguments passed to ``animator``)
        """

        # simple tregex mode counts dict entries; otherwise count the files
        if simple_tregex_mode:
            total_files = len(list(to_iterate_over.keys()))
        else:
            total_files = sum(len(x) for x in list(to_iterate_over.values()))

        par_args = {'printstatus': kwargs.get('printstatus', True),
                    'root': root, 
                    'note': note,
                    'quiet': quiet,
                    'length': total_files,
                    'startnum': kwargs.get('startnum'),
                    'denom': kwargs.get('denominator', 1)}

        # a 'paralleling' value is passed on as the bar's line number,
        # together with a blessings Terminal
        term = None
        if kwargs.get('paralleling', None) is not None:
            from blessings import Terminal
            term = Terminal()
            par_args['terminal'] = term
            par_args['linenum'] = kwargs.get('paralleling')

        if in_notebook:
            par_args['welcome_message'] = welcome_message

        # prefix the counter with the outname (or its .name), if any
        outn = kwargs.get('outname', '')
        if outn:
            outn = getattr(outn, 'name', outn)
            outn = outn + ': '

        tstr = '%s%d/%d' % (outn, current_iter, total_files)
        p = animator(None, None, init=True, tot_string=tstr, **par_args)
        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
        animator(p, current_iter, tstr, **par_args)
        return p, outn, total_files, par_args

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')
    language_model = kwargs.get('language_model')

    # set up pause method
    original_sigint = signal.getsignal(signal.SIGINT)
    if kwargs.get('paralleling', None) is None:
        if not root:
            original_sigint = signal.getsignal(signal.SIGINT)
            signal.signal(signal.SIGINT, signal_handler)

    # find out about concordancing
    only_conc = False
    no_conc = False
    if conc is False:
        no_conc = True
    if isinstance(conc, str) and conc.lower() == 'only':
        only_conc = True
        no_conc = False
    numconc = 0

    # wipe non essential class attributes to not bloat query attrib
    if isinstance(corpus, Corpus):
        import copy
        corpus = copy.copy(corpus)
        for k, v in corpus.__dict__.items():
            if isinstance(v, (Interrogation, Interrodict)):
                corpus.__dict__.pop(k, None)

    # convert path to corpus object
    if not isinstance(corpus, (Corpus, Corpora, Subcorpus, File, Datalist)):
        if not multiprocess and not kwargs.get('outname'):
            corpus = Corpus(corpus, print_info=False)

    # figure out how the user has entered the query and show, and normalise
    from corpkit.process import searchfixer
    search = searchfixer(search, query)
    show = fix_show(show, gramsize)
    locs['show'] = show

    # instantiate lemmatiser if need be
    lem_instance = False
    if any(i.endswith('l') for i in show) and isinstance(search, dict) and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lem_instance = WordNetLemmatizer()

    # do multiprocessing if need be
    im, corpus, search, query, = is_multiquery(corpus, search, query, 
                                                             kwargs.get('outname', False))

    # figure out if we can multiprocess the corpus
    if hasattr(corpus, '__iter__') and im:
        corpus = Corpus(corpus, print_info=False)
    if hasattr(corpus, '__iter__') and not im:
        im = 'datalist'
    if isinstance(corpus, Corpora):
        im = 'multiplecorpora'

    # split corpus if the user wants multiprocessing but no other iterable
    if not im and multiprocess:
        im = 'datalist'
        if getattr(corpus, 'subcorpora', False):
            corpus = corpus[:]
        else:
            corpus = corpus.files

    search = fix_search(search, case_sensitive=case_sensitive, root=root)
    exclude = fix_search(exclude, case_sensitive=case_sensitive, root=root)

    # if it's already been through pmultiquery, don't do it again
    locs['search'] = search
    locs['exclude'] = exclude
    locs['query'] = query
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess
    locs['print_info'] = kwargs.get('printstatus', True)
    locs['multiple'] = im
    locs['subcorpora'] = subcorpora
    locs['nosubmode'] = nosubmode

    # send to multiprocess function
    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from corpkit.multiprocess import pmultiquery
        return pmultiquery(**locs)

    # get corpus metadata
    cname = corpus.name
    if isinstance(save, STRINGTYPE):
        savename = corpus.name + '-' + save
    if save is True:
        raise ValueError('save must be str, not bool.')


    datatype = getattr(corpus, 'datatype', 'conll')
    singlefile = getattr(corpus, 'singlefile', False)
    level = getattr(corpus, 'level', 'c')
        
    # store all results in here
    from collections import defaultdict
    results = defaultdict(Counter)
    count_results = defaultdict(list)
    conc_results = defaultdict(list)

    # check if just counting, turn off conc if so
    countmode = 'c' in show or 'mc' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    # Determine the search function to be used #
    optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees = determine_search_func(show)
    
    # no conc for statsmode
    if statsmode:
        no_conc = True
        only_conc = False
        conc = False

    # Set some Tregex-related values
    translated_option = False
    if search.get('t'):
        query, translated_option = get_tregex_values(show)
        if query == 'Bad query' and translated_option is None:
            if root:
                return 'Bad query'
            else:
                return
    # more tregex options
    if tree_to_text:
        treg_q = r'ROOT << __'
        op = ['-o', '-t', '-w', '-f']
    elif simple_tregex_mode:
        treg_q = search['t']
        op = ['-%s' % i for i in translated_option] + ['-o', '-f']

    # make iterable object for corpus interrogation
    to_iterate_over = make_search_iterable(corpus)

    try:
        from ipywidgets import IntProgress
        _ = IntProgress(min=0, max=10, value=1)
        in_notebook = True
    except TraitError:
        in_notebook = False
    except ImportError:
        in_notebook = False
    # caused in newest ipython
    except AttributeError:
        in_notebook = False

    lemtag = False
    if search.get('t'):
        from corpkit.process import gettag
        lemtag = gettag(search.get('t'), lemmatag)

    usecols = auto_usecols(search, exclude, show, kwargs.pop('usecols', None), coref=coref)

    # print welcome message
    welcome_message = welcome_printer(return_it=in_notebook)

    # create a progress bar
    p, outn, total_files, par_args = make_progress_bar()

    if conc:
        conc_col_names = get_conc_colnames(corpus,
                                           fsi_index=fsi_index,
                                           simple_tregex_mode=False)

 

    # Iterate over data, doing interrogations
    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):
        if nosubmode:
            subcorpus_name = 'Total'

        # results for subcorpus go here
        #conc_results[subcorpus_name] = []
        #count_results[subcorpus_name] = []
        #results[subcorpus_name] = Counter()

        # get either everything (tree_to_text) or the search['t'] query
        if tree_to_text or simple_tregex_mode:
            result = tregex_engine(query=treg_q,
                                   options=op,
                                   corpus=subcorpus_path,
                                   root=root,
                                   preserve_case=preserve_case)

            # format search results with slashes etc
            if not countmode and not tree_to_text:
                result = format_tregex(result, show, translated_option=translated_option,
                            exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                            lem_instance=lem_instance, countmode=countmode, speaker_data=False)

            # if concordancing, do the query again with 'whole' sent and fname
            if not no_conc:
                ops = ['-w'] + op
                #ops = [i for i in ops if i != '-n']
                whole_result = tregex_engine(query=search['t'],
                                             options=ops,
                                             corpus=subcorpus_path,
                                             root=root,
                                             preserve_case=preserve_case
                                            )

                # format match too depending on option
                if not only_format_match:
                    wholeresult = format_tregex(whole_result, show, translated_option=translated_option,
                                exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                            lem_instance=lem_instance, countmode=countmode, speaker_data=False, whole=True)

                # make conc lines from conc results
                conc_result = make_conc_lines_from_whole_mid(whole_result, result, show=show)
                for lin in conc_result:
                    if maxconc is False or numconc < maxconc:
                        conc_results[subcorpus_name].append(lin)
                    numconc += 1

            # add matches to ongoing counts
            if countmode:
                count_results[subcorpus_name] += [result]            
            else:
                if result:
                    results[subcorpus_name] += Counter([i[-1] for i in result])
                else:
                    results[subcorpus_name] += Counter()

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)
            continue

        # todo: move this
        kwargs.pop('by_metadata', None)
        
        # conll querying goes by file, not subcorpus
        for f in files:
            slow_treg_speaker_guess = kwargs.get('outname', '') if kwargs.get('multispeaker') else ''
            filepath, corefs = f.path, coref
            res, conc_res = pipeline(filepath, search=search, show=show,
                                     dep_type=dep_type,
                                     exclude=exclude,
                                     excludemode=excludemode,
                                     searchmode=searchmode,
                                     case_sensitive=case_sensitive,
                                     conc=conc,
                                     only_format_match=only_format_match,
                                     speaker=slow_treg_speaker_guess,
                                     gramsize=gramsize,
                                     no_punct=no_punct,
                                     no_closed=no_closed,
                                     window=window,
                                     filename=f.path,
                                     coref=corefs,
                                     countmode=countmode,
                                     maxconc=(maxconc, numconc),
                                     is_a_word=is_a_word,
                                     by_metadata=subcorpora,
                                     show_conc_metadata=show_conc_metadata,
                                     just_metadata=just_metadata,
                                     skip_metadata=skip_metadata,
                                     fsi_index=fsi_index,
                                     category=subcorpus_name,
                                     translated_option=translated_option,
                                     statsmode=statsmode,
                                     preserve_case=preserve_case,
                                     usecols=usecols,
                                     search_trees=search_trees,
                                     lem_instance=lem_instance,
                                     lemtag=lemtag,
                                     **kwargs)

            if res is None and conc_res is None:
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # deal with symbolic structures---that is, rather than adding
            # results by subcorpora, add them by metadata value
            # todo: sorting?
            if subcorpora:
                for (k, v), concl in zip(res.items(), conc_res.values()):                            
                    v = lowercase_result(v)
                    results[k] += Counter(v)
                    for line in concl:
                        if maxconc is False or numconc < maxconc:
                            line = postprocess_concline(line,
                                fsi_index=fsi_index, conc=conc)
                            conc_results[k].append(line)
                            numconc += 1
                
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # garbage collection needed?
            sents = None
            corefs = None
                
            if res == 'Bad query':
                return 'Bad query'

            if countmode:
                count_results[subcorpus_name] += [res]

            else:
                # add filename and do lowercasing for conc
                if not no_conc:
                    for line in conc_res:
                        line = postprocess_concline(line,
                            fsi_index=fsi_index, conc=conc)
                        if maxconc is False or numconc < maxconc:
                            conc_results[subcorpus_name].append(line)
                            numconc += 1

                # do lowercasing and spelling
                if not only_conc:
                    res = lowercase_result(res)
                    # discard removes low results, helping with 
                    # curse of dimensionality
                    countres = Counter(res)
                    if isinstance(discard, float):
                        countres.most_common()
                        nkeep = len(counter) - len(counter) * discard
                        countres = Counter({k: v for i, (k, v) in enumerate(countres.most_common()) if i <= nkeep})
                    elif isinstance(discard, int):
                        countres = Counter({k: v for k, v in countres.most_common() if v >= discard})
                    results[subcorpus_name] += countres
                    #else:
                    #results[subcorpus_name] += res

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

    # Get concordances into DataFrame, return if just conc
    if not no_conc:
        # fail on this line with typeerror if no results?
        conc_df = make_conc_obj_from_conclines(conc_results, fsi_index=fsi_index)
        if only_conc and conc_df is None:
            return
        elif only_conc:
            locs = sanitise_dict(locs)
            try:
                conc_df.query = locs
            except AttributeError:
                return conc_df
            if save and not kwargs.get('outname'):
                if conc_df is not None:
                    conc_df.save(savename)
            goodbye_printer(only_conc=True)
            if not root:
                signal.signal(signal.SIGINT, original_sigint)            
            return conc_df
    else:
        conc_df = None

    # Get interrogation into DataFrame
    if countmode:
        df = Series({k: sum(v) for k, v in sorted(count_results.items())})
        tot = df.sum()
    else:
        the_big_dict = {}
        unique_results = set(item for sublist in list(results.values()) for item in sublist)
        sortres = sorted(results.items(), key=lambda x: x[0])
        for word in unique_results:
            the_big_dict[word] = [subcorp_result[word] for _, subcorp_result in sortres]
        # turn master dict into dataframe, sorted
        df = DataFrame(the_big_dict, index=sorted(results.keys()))

        # for ngrams, remove hapaxes
        #if show_ngram or show_collocates:
        #    if not language_model:
        #        df = df[[i for i in list(df.columns) if df[i].sum() > 1]]

        numentries = len(df.columns)
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    # turn df into series if all conditions met
    conds = [countmode,
             files_as_subcorpora,
             subcorpora,
             kwargs.get('df1_always_df', False)]
    anyxs = [level == 's',
             singlefile,
             nosubmode]
    if all(not x for x in conds) and any(x for x in anyxs):
        df = Series(df.ix[0])
        df.sort_values(ascending=False, inplace=True)
        tot = df.sum()
        numentries = len(df.index)
        total_total = tot

    # turn data into DF for GUI if need be
    if isinstance(df, Series) and kwargs.get('df1_always_df', False):
        total_total = df.sum()
        df = DataFrame(df)
        tot = Series(total_total, index=['Total'])

    # if we're doing files as subcorpora,  we can remove the extension etc
    if isinstance(df, DataFrame) and files_as_subcorpora:
        cname = corpus.name.replace('-stripped', '').replace('-parsed', '')
        edits = [(r'(-[0-9][0-9][0-9])?\.txt\.conllu?', ''),
                 (r'-%s(-stripped)?(-parsed)?' % cname, '')]
        from corpkit.editor import editor
        df = editor(df, replace_subcorpus_names=edits).results
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    if conc_df is not None and conc_df is not False:
        # removed 'f' from here for now
        for col in ['c']:
            for pat in ['.txt', '.conll', '.conllu']:
                conc_df[col] = conc_df[col].str.replace(pat, '')
            conc_df[col] = conc_df[col].str.replace(r'-[0-9][0-9][0-9]$', '')

        #df.index = df.index.str.replace('w', 'this')

    # make interrogation object
    locs['corpus'] = corpus.path
    locs = sanitise_dict(locs)
    if nosubmode and isinstance(df, pd.DataFrame):
        df = df.sum()
    interro = Interrogation(results=df, totals=tot, query=locs, concordance=conc_df)

    # save it
    if save and not kwargs.get('outname'):
        print('\n')
        interro.save(savename)
    
    goodbye = goodbye_printer(return_it=in_notebook)
    if in_notebook:
        try:
            p.children[2].value = goodbye.replace('\n', '')
        except AttributeError:
            pass
    if not root:
        signal.signal(signal.SIGINT, original_sigint)
    return interro
Example #8
0
    def edit(self, *args, **kwargs):
        """
        Edit the results of an interrogation.

        All positional and keyword arguments are forwarded unchanged to
        :func:`corpkit.editor.editor`, with this object passed as the data
        to edit.

        Several broad kinds of edit exist, and most can be combined within 
        one call. Many are simple wrappers around `pandas` operations, so 
        if you are at home with `pandas` syntax, using it directly may 
        sometimes be quicker.

        :Basic mathematical operations:

        Simple maths can be performed on results, optionally with some 
        other data acting as the denominator. The most common case is 
        calculating relative frequencies:

        :Example: 

        >>> data = corpus.interrogate({W: r'^t'})
        >>> rel = data.edit('%', SELF)
        >>> rel.results
            ..    to  that   the  then ...   toilet  tolerant  tolerate  ton
            01 18.50 14.65 14.44  6.20 ...     0.00      0.00      0.11 0.00
            02 24.10 14.34 13.73  8.80 ...     0.00      0.00      0.00 0.00
            03 17.31 18.01  9.97  7.62 ...     0.00      0.00      0.00 0.00

        The operation is given as a `str`, and may be any of the following:

           `+`, `-`, `/`, `*`, `%`: self explanatory

           `k`: calculate keywords

           `a`: get distance metric
        
        `SELF` is a handy shorthand for the denominator. With it, every 
        edit is first applied to the data; totals are then taken from the 
        edited data and used as the denominator. When that is not what you 
        want, pass a more specific `interrogation.results` or 
        `interrogation.totals` attribute instead.

        In the example above, `SELF` (or `'self'`) is equivalent to:

        :Example:

        >>> rel = data.edit('%', data.totals)

        :Keeping and skipping data:

        Four keyword arguments keep or skip rows or columns of the data:

        * `just_entries`
        * `just_subcorpora`
        * `skip_entries`
        * `skip_subcorpora`

        Each of them accepts several input types:

        * `str`: treated as regular expression to match
        * `list`: 

          * of integers: indices to match
          * of strings: entries/subcorpora to match

        :Example:

        >>> data.edit(just_entries=r'^fr', 
        ...           skip_entries=['free','freedom'],
        ...           skip_subcorpora=r'[0-9]')

        :Merging data:

        Entries and subcorpora can be merged with two further keyword 
        arguments:

        * `merge_entries`
        * `merge_subcorpora`

        Both take a `dict` whose keys are the new names and whose values 
        are the criteria. A criterion may be a str (regex) or a wordlist.

        :Example:
        
        >>> from dictionaries.wordlists import wordlists
        >>> mer = {'Articles': ['the', 'an', 'a'], 'Modals': wordlists.modals}
        >>> data.edit(merge_entries=mer)

        :Sorting:

        The `sort_by` keyword argument takes a `str` naming how the result 
        columns should be ordered.

        * `increase`: highest to lowest slope value
        * `decrease`: lowest to highest slope value
        * `turbulent`: most change in y axis values
        * `static`: least change in y axis values
        * `total/most`: largest number first
        * `infreq/least`: smallest number first
        * `name`: alphabetically

        :Example:

        >>> data.edit(sort_by='increase')

        :Editing text:
        
        Column labels, which correspond to individual interrogation 
        results, can be edited as well, using `replace_names`.

        :param replace_names: Edit result names, then merge duplicate entries
        :type replace_names: `str`/`list of tuples`/`dict`

        A string `replace_names` is treated as a regex to delete from each 
        name. For a dict, the value is the regex and the key is the 
        replacement text. A list of tuples in the form 
        `(find, replacement)` allows duplicate substitution values.

        :Example:

        >>> data.edit(replace_names={r'object': r'[di]obj'})

        :param replace_subcorpus_names: Edit subcorpus names, then merge duplicates.
                                        The same as `replace_names`, but on the other axis.
        :type replace_subcorpus_names: `str`/`list of tuples`/`dict`

        :Other options:

        A number of other miscellaneous options are available.

        :param keep_stats: Keep/drop stats values from dataframe after sorting
        :type keep_stats: `bool`
        
        :param keep_top: After sorting, remove all but the top *keep_top* results
        :type keep_top: `int`
        
        :param just_totals: Sum each column and work with sums
        :type just_totals: `bool`
        
        :param threshold: When using results list as dataframe 2, drop values 
                          occurring fewer than n times. If not keywording, you 
                          can use:
                                
           `'high'`: `denominator total / 2500`
           
           `'medium'`: `denominator total / 5000`
           
           `'low'`: `denominator total / 10000`
                            
           If keywording, there are smaller default thresholds

        :type threshold: `int`/`bool`/`str`

        :param span_subcorpora: If subcorpora are numerically named, span all 
                                from *int* to *int2*, inclusive
        :type span_subcorpora: `tuple` -- `(int, int2)`

        :param projection: multiply results in subcorpus by n
        :type projection: tuple -- `(subcorpus_name, n)`

        :param remove_above_p: Delete any result over `p`
        :type remove_above_p: `bool`

        :param p: set the p value
        :type p: `float`
        
        :param revert_year: When doing linear regression on years, turn annual 
                            subcorpora into 1, 2 ...
        :type revert_year: `bool`
        
        :param print_info: Print stuff to console showing what's being edited
        :type print_info: `bool`
        
        :param spelling: Convert/normalise spelling:
        :type spelling: `str` -- `'US'`/`'UK'`

        :Keywording options:

        When the operation is `k`, keywords are being calculated. In that 
        case, some further keyword arguments take effect:

        :param keyword_measure: what measure to use to calculate keywords:

           `ll`: log-likelihood

           `pd`: percentage difference

        :type keyword_measure: `str`
        
        :param selfdrop: When keywording, try to remove target corpus from 
                         reference corpus
        :type selfdrop: `bool`
        
        :param calc_all: When keywording, calculate words that appear in either 
                         corpus
        :type calc_all: `bool`

        :returns: :class:`corpkit.interrogation.Interrogation`
        """
        # imported at call time, exactly as the original did
        from corpkit.editor import editor as run_editor
        edited = run_editor(self, *args, **kwargs)
        return edited
Example #9
0
 def sort(self, way, **kwargs):
     """
     Sort this object's data via :func:`corpkit.editor.editor`.

     :param way: Value handed to the editor as its `sort_by` argument
     :param kwargs: Any further keyword arguments, forwarded unchanged
     :returns: Whatever :func:`corpkit.editor.editor` returns
     """
     # imported at call time, exactly as the original did
     from corpkit.editor import editor as run_editor
     ordered = run_editor(self, sort_by=way, **kwargs)
     return ordered
Example #10
0
def editor(dataframe1, 
            operation = None,
            dataframe2 = False,
            sort_by = False,
            keep_stats = False,
            keep_top = False,
            just_totals = False,
            threshold = 'medium',
            just_entries = False,
            skip_entries = False,
            merge_entries = False,
            newname = 'combine',
            multiple_merge = False,
            just_subcorpora = False,
            skip_subcorpora = False,
            span_subcorpora = False,
            merge_subcorpora = False,
            new_subcorpus_name = False,
            replace_names = False,
            projection = False,
            remove_above_p = False,
            p = 0.05, 
            revert_year = True,
            print_info = True,
            spelling = False,
            selfdrop = True,
            calc_all = True,
            **kwargs
            ):
    """Edit results of interrogations, do keywording, sort, etc.

    ``just/skip_entries`` and ``just/skip_subcorpora`` can take a few different kinds of input:

    * str: treated as regular expression to match
    * list: 

      * of integers: indices to match
      * of strings: entries/subcorpora to match

    ``merge_entries`` and ``merge_subcorpora``, however, are best entered as dicts:

    ``{newname: criteria, newname2: criteria2}```

    where criteria is a string, list, etc.

    :param dataframe1: Results to edit
    :type dataframe1: pandas.core.frame.DataFrame
    
    :param operation: Kind of maths to do on inputted lists:
                            '+', '-', '/', '*', '%': self explanatory
                            'k': log likelihood (keywords)
                            'a': get distance metric (for use with interrogator 'a' option)
                            'd': get percent difference (alternative approach to keywording)
    :type operation: str
    
    :param dataframe2: List of results or totals.
                            If list of results, for each entry in dataframe 1, locate
                            entry with same name in dataframe 2, and do maths there
                            if 'self', do all merging/keeping operations, then use
                            edited dataframe1 as dataframe2
    :type dataframe2: pandas.core.series.Series/pandas.core.frame.DataFrame/dict/'self'
    
    :param sort_by: Calculate slope, stderr, r, p values, then sort by:
                            increase: highest to lowest slope value
                            decrease: lowest to highest slope value
                            turbulent: most change in y axis values
                            static: least change in y axis values
                            total/most: largest number first
                            infreq/least: smallest number first
                            name: alphabetically
    :type sort_by: str

    :param keep_stats: Keep/drop stats values from dataframe after sorting
    :type keep_stats: bool
    
    :param keep_top: After sorting, remove all but the top *keep_top* results
    :type keep_top: int
    
    :param just_totals: Sum each column and work with sums
    :type just_totals: bool
    
    :param threshold: When using results list as dataframe 2, drop values occurring
                        fewer than n times. If not keywording, you can use:
                            ``'high'``: dataframe2 total / 2500
                            ``'medium'``: dataframe2 total / 5000
                            ``'low'``: dataframe2 total / 10000
                        Note: if keywording, there are smaller default thresholds
    :type threshold: int/bool
    :param just_entries: Keep matching entries
    :type just_entries: see above
    :param skip_entries: Skip matching entries
    :type skip_entries: see above
    :param merge_entries: Merge matching entries
    :type merge_entries: see above
    :param newname: New name for merged entries
    :type newname: str/'combine'
    :param just_subcorpora: Keep matching subcorpora
    :type just_subcorpora: see above
    :param skip_subcorpora: Skip matching subcorpora
    :type skip_subcorpora: see above
    :param span_subcorpora: If subcorpora are numerically named, span all from *int* to *int2*, inclusive
    :type span_subcorpora: tuple -- ``(int, int2)``
    :param merge_subcorpora: Merge matching subcorpora
    :type merge_subcorpora: see above
    :param new_subcorpus_name: Name for merged subcorpora
    :type new_subcorpus_name: str/``'combine'``

    :param replace_names: Edit result names and then merge duplicate names.
    :type replace_names: dict -- ``{criteria: replacement_text}``; str -- a regex to delete from names
    :param projection:         a  to multiply results in subcorpus by n
    :type projection: tuple -- ``(subcorpus_name, n)``
    :param remove_above_p: Delete any result over p
    :type remove_above_p: bool
    :param p:                  set the p value
    :type p: float
    
    :param revert_year:        when doing linear regression on years, turn annual subcorpora into 1, 2 ...
    :type revert_year: bool
    
    :param print_info: Print stuff to console showing what's being edited
    :type print_info: bool
    
    :param spelling: Convert/normalise spelling:
    :type spelling: str -- ``'US'``/``'UK'``
    
    :param selfdrop: When keywording, try to remove target corpus from reference corpus
    :type selfdrop: bool
    
    :param calc_all: When keywording, calculate words that appear in either corpus
    :type calc_all: bool

    :returns: Edited interrogation
    """

    # grab arguments, in case we get dict input and have to iterate
    saved_args = locals()

    import corpkit
    import pandas
    import signal
    import re
    import collections
    import pandas as pd
    import numpy as np

    from pandas import DataFrame, Series
    from time import localtime, strftime
    
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        have_ipython = False
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    # if passing a multiquery, do each result separately and return
    if type(dataframe1) == dict:
        outdict = {}
        from corpkit.editor import editor
        del saved_args['dataframe1']
        for i, (k, v) in enumerate(dataframe1.items()):
            # only print the first time around
            if i == 0:
                pass
                #saved_args['print_info'] = True
            else:
                saved_args['print_info'] = False
            # if df2 is also a dict, get the relevant entry
            if type(dataframe2) == dict:
                if sorted(set([i.lower() for i in dataframe1.keys()])) == \
                   sorted(set([i.lower() for i in dataframe2.keys()])):
                   saved_args['dataframe2'] = dataframe2[k]
                   if 'use_df2_totals' in kwargs.keys():
                       if kwargs['use_df2_totals'] is True:
                            saved_args['dataframe2'] = dataframe2[k].totals
            outdict[k] = editor(v.results, **saved_args)
        if print_info:
            from time import localtime, strftime
            thetime = strftime("%H:%M:%S", localtime())
            print "\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (thetime, "'\n         '".join(sorted(outdict.keys())))
        return outdict

    the_time_started = strftime("%Y-%m-%d %H:%M:%S")

    pd.options.mode.chained_assignment = None
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    from corpkit.tests import check_pytex
    if check_pytex():
        print_info = False

    def combiney(df, df2, operation = '%', threshold = 'medium', prinf = True):
        """mash df and df2 together in appropriate way"""
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = []
                        [to_show.append(w) for w in to_drop[:5]]
                        if len(to_drop) > 10:
                            to_show.append('...')
                            [to_show.append(w) for w in to_drop[-5:]]
                        if len(to_drop) > 0:
                            print 'Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show))
                        if len(to_drop) > 10:
                            print '... and %d more ... \n' % (len(to_drop) - len(to_show) + 1)
                        else:
                            print ''
                else:
                    denom = df2
        else:
            denom = list(df2)
        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print '%s: cannot combine DataFrame 1 and 2: different shapes' % thetime
            elif operation == '+':
                try:
                    df = df.add(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print '%s: cannot combine DataFrame 1 and 2: different shapes' % thetime
            elif operation == '-':
                try:
                    df = df.sub(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print '%s: cannot combine DataFrame 1 and 2: different shapes' % thetime
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print '%s: cannot combine DataFrame 1 and 2: different shapes' % thetime
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis = 0)
                except ValueError:
                    from time import localtime, strftime
                    thetime = strftime("%H:%M:%S", localtime())
                    print '%s: cannot combine DataFrame 1 and 2: different shapes' % thetime
            elif operation == 'd':
                #df.ix['Combined total'] = df.sum()
                #to_drop = to_drop = list(df.T[df.T['Combined total'] < threshold].index)
                to_drop = [n for n in list(df.columns) if df[n].sum() < threshold]
                df = df.drop([e for e in to_drop if e in list(df.columns)], axis = 1)
                #df.drop('Combined total')
                if prinf:
                    to_show = []
                    [to_show.append(w) for w in to_drop[:5]]
                    if len(to_drop) > 10:
                        to_show.append('...')
                        [to_show.append(w) for w in to_drop[-5:]]
                    if len(to_drop) > 0:
                        print 'Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show))
                    if len(to_drop) > 10:
                        print '... and %d more ... \n' % (len(to_drop) - len(to_show) + 1)
                    else:
                        print ''

                # get normalised num in target corpus
                norm_in_target = df.div(denom, axis = 0)
                # get normalised num in reference corpus, with or without selfdrop
                tot_in_ref = df.copy()
                for c in list(tot_in_ref.index):
                    if selfdrop:
                        tot_in_ref.ix[c] = df.sum() - tot_in_ref.ix[c]
                    else:
                        tot_in_ref.ix[c] = df.sum()
                norm_in_ref = tot_in_ref.div(df.sum().sum())
                df = (norm_in_target - norm_in_ref) / norm_in_ref * 100.0
                df = df.replace(float(-100.00), np.nan)

            elif operation == 'a':
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis = 1) / df2
            
            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pandas.concat([df, df2], axis = 1)
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work 
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T]).sort()
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis = 1).T

                for index, entry in enumerate(list(df.columns)):
                    #p.animate(index)
                    if operation == '%':
                        try:
                            df[entry] = df[entry] * 100.0 / df2[entry]
                        except:
                            continue
                        #df.drop(entry, axis = 1, inplace = True)
                        #df[entry] = maths_done
                    elif operation == '+':
                        try:
                            df[entry] = df[entry] + df2[entry]
                        except:
                            continue
                    elif operation == '-':
                        try:
                            df[entry] = df[entry] - df2[entry]
                        except:
                            continue
                    elif operation == '*':
                        try:
                            df[entry] = df[entry] * df2[entry]
                        except:
                            continue
                    elif operation == '/':
                        try:
                            df[entry] = df[entry] / df2[entry]
                        except:
                            continue

            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis = 1) / df2.T.sum()

        return df, totals

    def parse_input(df, the_input):
        """Turn a user-supplied selector into a list of entry names.

        :param df: the data whose entry names are searched; iterating it must
                   yield the names (as iterating a DataFrame yields its columns)
        :param the_input: what to select:

                          - ``'all'``: shorthand for the regex ``.*``
                          - a string: compiled as a regular expression and
                            searched for in every entry name
                          - an int: converted to its string form and looked up
                            as an entry *name* (not a position)
                          - a list of ints: positions of the wanted entries
                          - a list of strings: names; only those present in
                            ``df.columns`` (or ``df.index`` for objects that
                            have no ``columns``) are kept
        :returns: list of matching names; an empty list for an empty list of
                  input; ``False`` when the input type is not recognised
        """
        import re

        # `unicode` only exists on Python 2
        try:
            string_types = (str, unicode)
        except NameError:
            string_types = (str,)

        if the_input == 'all':
            the_input = r'.*'

        if type(the_input) == int:
            the_input = [str(the_input)]
        elif type(the_input) in string_types:
            try:
                regex = re.compile(the_input)
                return [w for w in list(df) if re.search(regex, w)]
            except Exception:
                # invalid pattern, or entry names that are not strings:
                # fall back to treating the string as one literal name
                the_input = [the_input]

        if type(the_input) != list:
            return False
        if not the_input:
            # nothing requested, nothing selected
            return []

        # the type of the first item decides how the whole list is read
        first = the_input[0]
        if type(first) == int:
            return [word for index, word in enumerate(list(df)) if index in the_input]
        if type(first) in string_types:
            try:
                return [word for word in the_input if word in df.columns]
            except AttributeError:  # if series
                return [word for word in the_input if word in df.index]
        return False

    def synonymise(df, pos = 'n'):
        """Rename each column of df to its most common WordNet synonym.

        For every column name, the lemma names of all WordNet synsets with the
        given part of speech are pooled and the most frequent one becomes the
        new column name. Names with no synsets, or whose lookup fails, are
        kept unchanged.

        :param df: DataFrame whose columns are words
        :param pos: WordNet part-of-speech tag passed to ``wn.synsets``
        :returns: df, with its columns renamed in place
        """
        from nltk.corpus import wordnet as wn
        from collections import Counter
        fixed = []
        for word in list(df.columns):
            try:
                lemmas = []
                for synset in wn.synsets(word, pos=pos):
                    names = synset.lemma_names
                    # lemma_names is a method in NLTK 3 but was a plain
                    # attribute in earlier releases
                    if callable(names):
                        names = names()
                    lemmas.extend(names)
            except Exception:
                # best effort: a failed lookup leaves the word as it was
                lemmas = []
            if lemmas:
                fixed.append(Counter(lemmas).most_common(1)[0][0])
            else:
                fixed.append(word)
        df.columns = fixed
        return df

    def convert_spell(df, convert_to = 'US', print_info = print_info):
        """turn dataframes into us/uk spelling"""
        from dictionaries.word_transforms import usa_convert
        if print_info:
            print 'Converting spelling ... \n'
        if convert_to == 'UK':
            usa_convert = {v: k for k, v in usa_convert.items()}
        fixed = []
        for val in list(df.columns):
            try:
                fixed.append(usa_convert[val])
            except:
                fixed.append(val)
        df.columns = fixed
        return df

    def merge_duplicates(df, print_info = print_info):
        if print_info:
            print 'Merging duplicate entries ... \n'
        # now we have to merge all duplicates
        for dup in df.columns.get_duplicates():
            #num_dupes = len(list(df[dup].columns))
            temp = df[dup].sum(axis = 1)
            #df = df.drop([dup for d in range(num_dupes)], axis = 1)
            df = df.drop(dup, axis = 1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info = print_info):
        """Rewrite entry names with regular expressions, then merge duplicates.

        :param df: DataFrame whose column names are rewritten in place
        :param replace_names: one of

                              - a string pattern: every match is deleted
                              - a ``(pattern, replacement)`` pair; a pair with
                                no replacement (or ``None``) deletes matches
                              - a list of such pairs, applied in order
                              - a dict mapping replacement -> pattern
        :param print_info: print a line describing each substitution
        :returns: the DataFrame after renaming, with any columns that ended up
                  sharing a name merged by ``merge_duplicates``
        """
        import re

        # `unicode` only exists on Python 2
        try:
            string_types = (str, unicode)
        except NameError:
            string_types = (str,)

        # normalise every accepted form to a list of (pattern, replacement)
        if type(replace_names) == dict:
            pairs = [(pattern, new) for new, pattern in replace_names.items()]
        elif type(replace_names) in string_types:
            pairs = [(replace_names, '')]
        elif type(replace_names[0]) in string_types:
            # a single flat pair rather than a list of pairs
            pairs = [replace_names]
        else:
            pairs = replace_names

        for pair in pairs:
            to_find = pair[0]
            replacement = ''
            if len(pair) > 1 and pair[1] is not None:
                replacement = pair[1]
            if print_info:
                if replacement == '':
                    print('Deleting "%s" from entry names ...\n' % (to_find,))
                else:
                    print('Replacing "%s" with "%s" ...\n' % (to_find, replacement))
            regex = re.compile(to_find)
            df.columns = [re.sub(regex, replacement, l) for l in list(df.columns)]
        df = merge_duplicates(df, print_info = False)
        return df

    def just_these_entries(df, parsed_input, prinf = True):
        """Keep only the columns of df that are named in parsed_input.

        :param df: DataFrame with entries as columns
        :param parsed_input: list of column names to keep
        :param prinf: print how many entries are kept, listing the first ten
        :returns: a new DataFrame without the other columns
        """
        # set lookup keeps the scan linear however many entries are kept
        wanted = set(parsed_input)
        entries = [word for word in list(df) if word not in wanted]
        if prinf:
            # %-format each label so non-string names cannot break the join
            shown = '\n    '.join(['%s' % (w,) for w in parsed_input[:10]])
            print('Keeping %d entries:\n    %s' % (len(parsed_input), shown))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(entries, axis = 1)
        return df

    def skip_these_entries(df, parsed_input, prinf = True):
        """Remove the columns of df that are named in parsed_input.

        :param df: DataFrame with entries as columns
        :param parsed_input: list of column names to remove; a falsy value
                             (such as the ``False`` that ``parse_input`` can
                             return) means there is nothing to remove
        :param prinf: print how many entries are skipped, listing the first ten
        :returns: a new DataFrame without the named columns
        """
        if not parsed_input:
            parsed_input = []
        if prinf:
            # %-format each label so non-string names cannot break the join
            shown = '\n    '.join(['%s' % (w,) for w in parsed_input[:10]])
            print('Skipping %d entries:\n    %s' % (len(parsed_input), shown))
            if len(parsed_input) > 10:
                print('... and %d more ... \n' % (len(parsed_input) - 10))
            else:
                print('')
        df = df.drop(parsed_input, axis = 1)
        return df

    def newname_getter(df, parsed_input, newname = 'combine', prinf = True, merging_subcorpora = False):
        """Work out the name to give a merged entry.

        :param df: DataFrame with the entries as columns
        :param parsed_input: list of the entry names being merged
        :param newname: how to name the result:

                        - ``'combine'``: join up to the first three names with
                          ``/``, adding ``...`` when there are more than three
                        - any other string: used as is
                        - an int: the name of the column at that position
                        - ``False``: the name of the merged entry with the
                          largest sum (or ``'combine'`` behaviour when
                          ``merging_subcorpora`` is set)
        :param prinf: accepted for signature compatibility; not used here
        :param merging_subcorpora: treat ``newname=False`` as ``'combine'``
        :returns: the new name as unicode text
        :raises ValueError: if newname is of an unsupported type
        """
        # `unicode` only exists on Python 2; on Python 3 str is the text type
        try:
            text_type = unicode
        except NameError:
            text_type = str

        if merging_subcorpora and newname is False:
            newname = 'combine'

        if newname is False:
            # first of the entries with the highest total wins
            the_newname = max(parsed_input, key=lambda item: sum(list(df[item])))
        elif type(newname) == int:
            the_newname = list(df.columns)[newname]
        elif type(newname) == str or type(newname) == text_type:
            if newname == 'combine':
                the_newname = '/'.join(parsed_input[:3])
                if len(parsed_input) > 3:
                    the_newname += '...'
            else:
                the_newname = newname
        else:
            raise ValueError("newname must be a string, an int or False, not '%s'."
                             % type(newname).__name__)

        if isinstance(the_newname, text_type):
            return the_newname
        if isinstance(the_newname, bytes):
            # byte strings are decoded leniently, dropping undecodable bytes
            return text_type(the_newname, errors = 'ignore')
        # e.g. a numeric column label
        return text_type(the_newname)

    def merge_these_entries(df, parsed_input, the_newname, prinf = True, merging = 'entries'):
        # make new entry with sum of parsed input
        if len(parsed_input) == 0:
            import warnings
            warnings.warn('No %s could be automatically merged.\n' % merging)
        else:
            if prinf:
                print 'Merging %d %s as "%s":\n    %s' % (len(parsed_input), merging, the_newname, '\n    '.join(parsed_input[:10]))
                if len(parsed_input) > 10:
                    print '... and %d more ... \n' % (len(parsed_input) - 10)
                else:
                    print ''
        # remove old entries
        temp = sum([df[i] for i in parsed_input])
        if not multiple_merge:
            if type(df) == pandas.core.series.Series:
                df = df.drop(parsed_input)
            else:
                df = df.drop(parsed_input, axis = 1)
        df[the_newname] = temp
        return df

    def just_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        """Keep only the rows (subcorpora) of df named in lst_of_subcorpora.

        :param df: DataFrame with subcorpora as the index
        :param lst_of_subcorpora: names of the subcorpora to keep; a single
                                  int is accepted, and when the first item is
                                  an int every item is converted to its
                                  string form before matching
        :param prinf: print how many subcorpora are kept, listing the first ten
        :returns: a new DataFrame containing only the matching rows
        """
        # accept a bare int, as skip_these_subcorpora does
        if type(lst_of_subcorpora) == int:
            lst_of_subcorpora = [lst_of_subcorpora]
        if len(lst_of_subcorpora) > 0 and type(lst_of_subcorpora[0]) == int:
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        good_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if prinf:
            # %-format each label so non-string names cannot break the join
            shown = '\n    '.join(['%s' % (y,) for y in good_years[:10]])
            print('Keeping %d subcorpora:\n    %s' % (len(good_years), shown))
            if len(good_years) > 10:
                print('... and %d more ... \n' % (len(good_years) - 10))
            else:
                print('')
        df = df.drop([subcorpus for subcorpus in list(df.index) if subcorpus not in good_years], axis = 0)
        return df

    def skip_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        """Remove the rows (subcorpora) of df named in lst_of_subcorpora.

        :param df: DataFrame with subcorpora as the index
        :param lst_of_subcorpora: names of the subcorpora to remove; a single
                                  int is accepted, and when the first item is
                                  an int every item is converted to its
                                  string form before matching
        :param prinf: print how many subcorpora are skipped, listing the
                      first ten
        :returns: a new DataFrame without the matching rows; a warning is
                  issued when nothing matched
        """
        if type(lst_of_subcorpora) == int:
            lst_of_subcorpora = [lst_of_subcorpora]
        # guard the peek at the first item so an empty list is not an error
        if len(lst_of_subcorpora) > 0 and type(lst_of_subcorpora[0]) == int:
            lst_of_subcorpora = [str(l) for l in lst_of_subcorpora]
        bad_years = [subcorpus for subcorpus in list(df.index) if subcorpus in lst_of_subcorpora]
        if len(bad_years) == 0:
            import warnings
            warnings.warn('No subcorpora skipped.\n')
        elif prinf:
            # %-formatting copes with unicode labels that str() would reject
            shown = '\n    '.join(['%s' % (i,) for i in bad_years[:10]])
            print('Skipping %d subcorpora:\n    %s' % (len(bad_years), shown))
            if len(bad_years) > 10:
                print('... and %d more ... \n' % (len(bad_years) - 10))
            else:
                print('')
        df = df.drop(bad_years, axis = 0)
        return df

    def span_these_subcorpora(df, lst_of_subcorpora, prinf = True):
        """Keep only the numerically named subcorpora inside a span.

        :param df: DataFrame whose index labels can all be converted with
                   ``int()``
        :param lst_of_subcorpora: sequence whose first and last items are the
                                  inclusive lower and upper bounds
        :param prinf: print the span being kept
        :returns: a new DataFrame with the rows outside the span removed; df
                  itself, after a warning, when no span was given
        """
        # check for an empty span before its ends are read
        if len(lst_of_subcorpora) == 0:
            import warnings
            warnings.warn('Span not identified.\n')
            return df
        first = int(lst_of_subcorpora[0])
        last = int(lst_of_subcorpora[-1])
        outside = [subcorpus for subcorpus in list(df.index)
                   if not first <= int(subcorpus) <= last]
        if prinf:
            print('Keeping subcorpora:\n    %d--%d\n' % (first, last))
        df = df.drop(outside, axis = 0)
        # retotal needed here
        return df

    def projector(df, list_of_tuples, prinf = True):
        """Multiply the rows of chosen subcorpora by projection factors.

        :param df: DataFrame with subcorpora as the index; modified in place
        :param list_of_tuples: a list of ``(subcorpus, factor)`` pairs, or a
                               dict mapping subcorpus -> factor; int subcorpus
                               names are converted to their string form
        :param prinf: print each projection applied
        :returns: df, with the projected rows rescaled
        """
        if type(list_of_tuples) == list:
            list_of_tuples = dict(list_of_tuples)
        for subcorpus, projection_value in list_of_tuples.items():
            if type(subcorpus) == int:
                subcorpus = str(subcorpus)
            # label-based row access; .loc works where .ix is no longer available
            df.loc[subcorpus] = df.loc[subcorpus] * projection_value
            if prinf:
                print('Projection: %s * %s' % (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def do_stats(df):
        """do linregress and add to df"""
        try: 
            from scipy.stats import linregress
        except ImportError:
            from time import localtime, strftime
            thetime = strftime("%H:%M:%S", localtime())
            print '%s: sort type not available in this verion of corpkit.' % thetime
            return False
        #from stats.stats import linregress

        entries = []
        slopes = []
        intercepts = []
        rs = []
        ps = []
        stderrs = []
        indices = list(df.index)
        first_year = list(df.index)[0]
        try:
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = range(len(indices))
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
        for entry in list(df.columns):
            entries.append(entry)
            y = list(df[entry])
            slope, intercept, r, p, stderr = linregress(x, y)
            slopes.append(slope)
            intercepts.append(intercept)
            rs.append(r)
            ps.append(p)
            stderrs.append(stderr)
        sl = pd.DataFrame([slopes, intercepts, rs, ps, stderrs], 
                           index = statfields, 
                           columns = list(df.columns))
        df = df.append(sl)
        # drop infinites and nans
        if operation != 'd':
            df = df.replace([np.inf, -np.inf], np.nan)
            df = df.fillna(0.0)
        return df

    def recalc(df, operation = '%'):
        """Add a 'temp-Total' row and column holding the marginal sums.

        The 'temp-Total' column holds each row's sum and the 'temp-Total' row
        holds each column's sum. Regression rows ('slope', 'intercept', 'r',
        'p', 'stderr') are left out of both sums.

        :param df: DataFrame to total; the 'temp-Total' column is also added
                   to the object passed in
        :param operation: accepted for signature compatibility; not used here
        :returns: a DataFrame with the extra 'temp-Total' row and column
        """
        statfields = ['slope', 'intercept', 'r', 'p', 'stderr']

        #df.drop('Total', axis = 0, inplace = True)
        #df.drop('Total', axis = 1, inplace = True)
        # row sums: the stat fields are rows here
        df['temp-Total'] = df.drop(statfields, axis = 0, errors = 'ignore').sum(axis = 1)
        df = df.T
        # column sums: after the transpose the stat fields are columns, so
        # they have to be dropped along axis 1 to stay out of the totals
        df['temp-Total'] = df.drop(statfields, axis = 1, errors = 'ignore').sum(axis = 1)
        df = df.T
        return df

    def resort(df, sort_by = False, keep_stats = False):
        """Reorder results by total, name, or regression slope.

        :param df: the results to sort
        :param sort_by: True/'most'/'total' (largest total first),
                        'least'/'infreq' (smallest total first), 'name',
                        'increase'/'decrease' (by slope), 'static'/'turbulent'
                        (by absolute slope), or 'none'
        :param keep_stats: leave the regression rows from ``do_stats`` in the
                           output
        :returns: the sorted data, or ``False`` if ``do_stats`` reported that
                  regression is unavailable
        :raises ValueError: if sort_by is not a recognised option
        """
        # translate options and make sure they are parseable
        options = ['total', 'name', 'infreq', 'increase', 'turbulent',
                   'decrease', 'static', 'most', 'least', 'none']

        if sort_by is True or sort_by == 'most':
            sort_by = 'total'
        elif sort_by == 'least':
            sort_by = 'infreq'
        if sort_by not in options:
            raise ValueError("sort_by parameter error: '%s' not recognised. Must be True, False, %s" % (sort_by, ', '.join(options)))

        by_total = sort_by in ('total', 'infreq')
        smallest_first = sort_by == 'infreq'

        # keyness scores held in a Series are ordered directly
        if operation.startswith('k') and type(df) == pandas.core.series.Series:
            if by_total:
                df = df.order(ascending = smallest_first)
            elif sort_by == 'name':
                df = df.sort_index()
            return df

        # a just-totals frame has the single 'Combined total' column
        if just_totals:
            if by_total:
                df = df.sort(columns = 'Combined total', ascending = smallest_first)
            elif sort_by == 'name':
                df = df.sort_index()
            return df

        if keep_stats:
            df = do_stats(df)
            if df is False:
                return False

        if by_total:
            if df1_istotals:
                df = df.T
            df = recalc(df, operation = operation)
            column_order = df.ix['temp-Total'].argsort()
            if not smallest_first:
                column_order = column_order[::-1]
            df = df[column_order]
            # the temporary marginals were only needed for ordering
            for axis in (0, 1):
                df = df.drop('temp-Total', axis = axis)
            if df1_istotals:
                df = df.T
        elif sort_by == 'name':
            # currently case sensitive...
            df = df.reindex_axis(sorted(df.columns), axis=1)
        else:
            statfields = ['slope', 'intercept', 'r', 'p', 'stderr']

            # slopes are needed here even if they will not be kept
            if not keep_stats:
                df = do_stats(df)
                if df is False:
                    return False

            slopes = df.ix['slope']
            if sort_by in ('increase', 'decrease', 'static', 'turbulent'):
                if sort_by in ('increase', 'decrease'):
                    ranking = slopes
                else:
                    ranking = slopes.abs()
                column_order = ranking.argsort()
                if sort_by in ('increase', 'turbulent'):
                    column_order = column_order[::-1]
                df = df[column_order]

            if remove_above_p:
                # filter entries on their p value via the transposed frame
                flipped = df.T
                df = flipped[flipped['p'] <= p].T

            # remove stats field by default
            if not keep_stats:
                df = df.drop(statfields, axis = 0)

        return df

    def set_threshold(big_list, threshold, prinf = True, for_keywords = False):
        """Turn a threshold setting into a number.

        A string threshold is resolved from its first letter: 'l' (low)
        divides the grand total of big_list by 10000, 'm' (medium) by 5000
        and 'h' (high) by 2500. Any non-string threshold is returned as given.

        :param big_list: DataFrame or Series whose values are totalled
        :param threshold: a string starting with 'l', 'm' or 'h', or a number
        :param prinf: print the resulting threshold
        :param for_keywords: accepted for signature compatibility; not used
        :returns: the numeric threshold
        :raises ValueError: if a string threshold starts with another letter
        """
        # duck-typed so that unicode strings are handled like str
        if hasattr(threshold, 'startswith'):
            for prefix, denominator in (('l', 10000), ('m', 5000), ('h', 2500)):
                if threshold.startswith(prefix):
                    break
            else:
                raise ValueError("threshold '%s' not recognised. Must be a number, "
                                 "or a string starting with 'l', 'm' or 'h'." % threshold)

            tot = big_list.sum()
            if type(big_list) == pandas.core.frame.DataFrame:
                # a frame's first sum is per column; add those up as well
                tot = tot.sum()
            the_threshold = float(tot) / float(denominator)
            #if for_keywords:
                #the_threshold = the_threshold / 2
        else:
            the_threshold = threshold
        if prinf:
            print('Threshold: %d\n' % the_threshold)
        return the_threshold

    # check if we're in concordance mode
    try:
        if list(dataframe1.columns) == ['l', 'm', 'r']:
            conc_lines = True
        else:
            conc_lines = False
    except:
        conc_lines = False

    # copy dataframe to be very safe
    try:
        df = dataframe1.copy()
    except AttributeError:
        no_good_dataframe1 = True
        while no_good_dataframe1:
            if 'interrogation' in str(type(dataframe1)):
                sel = raw_input("\nIt looks like you're trying to edit an interrogation, " \
                                  "rather than an interrogation's .results or .totals branch. You can:\n\n    a) select .results branch\n    b) select .totals branch\n    c) exit\n\nYour choice: ")
                if sel.startswith('a'):
                    try:
                        dataframe1 = dataframe1.results
                        no_good_dataframe1 = False
                    except:
                        pass
                elif sel.startswith('b'):
                    try:
                        dataframe1 = dataframe1.totals
                        no_good_dataframe1 = False
                    except:
                        pass
                else:
                    return
            else:
                raise ValueError("Thing to be edited (dataframe1) needs to be a Pandas DataFrame or Series. " \
                                  "Right now, its type is: '%s'." % type(dataframe1).__name__)

        df = dataframe1.copy()

    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except:
        pass

    if operation is None:
        operation = 'None'

    # do concordance work
    if conc_lines:
        df = dataframe1.copy()

        if just_entries:
            if type(just_entries) == int:
                just_entries = [just_entries]
            if type(just_entries) == str:
                df = df[df['m'].str.contains(just_entries)]
            if type(just_entries) == list:
                if type(just_entries[0]) == str:
                    regex = re.compile(r'(?i)^(' + r'|'.join(just_entries) + r')$')
                    df = df[df['m'].str.contains(regex)]
                else:
                    df = df.ix[just_entries].reset_index(drop = True)

        if skip_entries:
            if type(skip_entries) == int:
                skip_entries = [skip_entries]
            if type(skip_entries) == str:
                df = df[~df['m'].str.contains(skip_entries)]
            if type(skip_entries) == list:
                if type(skip_entries[0]) == str:
                    regex = re.compile(r'(?i)^(' + r'|'.join(skip_entries) + r')$')
                    df = df[~df['m'].str.contains(regex)]
                else:
                    df = df.ix[[e for e in list(df.index) if e not in skip_entries]].reset_index(drop = True)

        return df

    if print_info:
        print '\n***Processing results***\n========================\n'

    df1_istotals = False
    if type(df) == pandas.core.series.Series:
        df1_istotals = True
        df = pandas.DataFrame(df)
        # if just a single result
    else:
        df = pandas.DataFrame(df)
    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T
    
    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False

    try:
        if dataframe2.empty is False:            
            df2 = dataframe2.copy()
            using_totals = True

            if type(df2) == pandas.core.frame.DataFrame:
                if len(df2.columns) > 1:
                    single_totals = False
                else:
                    df2 = pd.Series(df2)
                if operation == 'd':
                    df2 = df2.sum(axis = 1)
                    single_totals = True
            elif type(df2) == pandas.core.series.Series:
                single_totals = True
                #if operation == 'k':
                    #raise ValueError('Keywording requires a DataFrame for dataframe2. Use "self"?')
            else:
                raise ValueError('dataframe2 not recognised.')
    except AttributeError:
        if operation in ['k', 'd', 'a', '%', '/', '*', '-', '+']:
            dataframe2 = 'self'         
        if dataframe2 == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to = spelling)
        df = merge_duplicates(df, print_info = False)

        if not single_totals:
            df2 = convert_spell(df2, convert_to = spelling, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, print_info = False)
            df2 = merge_duplicates(df2, print_info = False)
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        df = df.drop(statfields, axis = 0)
    except:
        pass
    if using_totals:
        try:
            df2 = df2.drop(statfields, axis = 0)
        except:
            pass

    # remove totals and tkinter order
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        try:
            df = df.drop(name, axis = ax, errors = 'ignore')
        except:
            pass
    for name, ax in zip(['Total'] * 2 + ['tkintertable-order'] * 2, [0, 1, 0, 1]):
        try:
            df2 = df2.drop(name, axis = ax, errors = 'ignore')
        except:
            pass

    # merging: make dicts if they aren't already, so we can iterate
    if merge_entries:
        if type(merge_entries) != list:
            if type(merge_entries) == str or type(merge_entries) == unicode:
                merge_entries = {newname: merge_entries}
            # for newname, criteria    
            for name, the_input in sorted(merge_entries.items()):
                the_newname = newname_getter(df, parse_input(df, the_input), newname = name, prinf = print_info)
                df = merge_these_entries(df, parse_input(df, the_input), the_newname, prinf = print_info)
                if not single_totals:
                    df2 = merge_these_entries(df2, parse_input(df2, the_input), the_newname, prinf = False)
        else:
            for i in merge_entries:
                the_newname = newname_getter(df, parse_input(df, merge_entries), newname = newname, prinf = print_info)
                df = merge_these_entries(df, parse_input(df, merge_entries), the_newname, prinf = print_info)
                if not single_totals:
                    df2 = merge_these_entries(df2, parse_input(df2, merge_entries), the_newname, prinf = False)
    
    if merge_subcorpora:
        if type(merge_subcorpora) != dict:
            if type(merge_subcorpora) == list:
                if type(merge_subcorpora[0]) == tuple:
                    merge_subcorpora = {x: y for x, y in merge_subcorpora}
                elif type(merge_subcorpora[0]) == str or type(merge_subcorpora[0]) == unicode:
                    merge_subcorpora = {new_subcorpus_name: [x for x in merge_subcorpora]}
                elif type(merge_subcorpora[0]) == int:
                    merge_subcorpora = {new_subcorpus_name: [str(x) for x in merge_subcorpora]}
            else:
                merge_subcorpora = {new_subcorpus_name: merge_subcorpora}
        for name, the_input in sorted(merge_subcorpora.items()):
            the_newname = newname_getter(df.T, parse_input(df.T, the_input), 
                                     newname = name, 
                                     merging_subcorpora = True,
                                     prinf = print_info)
            df = merge_these_entries(df.T, parse_input(df.T, the_input), the_newname, merging = 'subcorpora', prinf = print_info).T
            if using_totals:
                df2 = merge_these_entries(df2.T, parse_input(df2.T, the_input), the_newname, merging = 'subcorpora', prinf = False).T
    

    if just_subcorpora:
        df = just_these_subcorpora(df, just_subcorpora, prinf = print_info)
        if using_totals:
            df2 = just_these_subcorpora(df2, just_subcorpora, prinf = False)
    
    if skip_subcorpora:
        df = skip_these_subcorpora(df, skip_subcorpora, prinf = print_info)
        if using_totals:
            df2 = skip_these_subcorpora(df2, skip_subcorpora, prinf = False)
    
    if span_subcorpora:
        df = span_these_subcorpora(df, span_subcorpora, prinf = print_info)
        if using_totals:
            df2 = span_these_subcorpora(df2, span_subcorpora, prinf = False)

    if just_entries:
        df = just_these_entries(df, parse_input(df, just_entries), prinf = print_info)
        if not single_totals:
            df2 = just_these_entries(df2, parse_input(df2, just_entries), prinf = False)
    if skip_entries:
        df = skip_these_entries(df, parse_input(df, skip_entries), prinf = print_info)
        if not single_totals:
            df2 = skip_these_entries(df2, parse_input(df2, skip_entries), prinf = False)

    # drop infinites and nans
    if operation != 'd':
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # make just_totals as dataframe
    just_one_total_number = False
    if just_totals:
        df = pd.DataFrame(df.sum(), columns = ['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = pd.DataFrame(df2.sum(), columns = ['Combined total'])
            else:
                just_one_total_number = True
                df2 = df2.sum()

    tots = df.sum(axis = 1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            the_threshold = 0
            # set a threshold if just_totals
            if outputmode is True:
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            if just_totals:
                if not single_totals:
                    the_threshold = set_threshold(df2, threshold, prinf = print_info)
            if operation == 'd':
                the_threshold = set_threshold(df2, threshold, prinf = print_info) 
            df, tots = combiney(df, df2, operation = operation, threshold = the_threshold, prinf = print_info)
    
    # if doing keywording...
    if operation.startswith('k'):
        from keys import keywords

        # allow saved dicts to be df2, etc
        try:
            if dataframe2 == 'self':
                df2 = df.copy()
        except TypeError:
            pass
        if type(dataframe2) == str:
            if dataframe2 != 'self':
                df2 = dataframe2
    
        else:
            the_threshold = False

        df = keywords(df, df2, 
                      selfdrop = selfdrop, 
                      threshold = threshold, 
                      printstatus = print_info,
                      editing = True,
                      calc_all = calc_all,
                      **kwargs)

        # eh?
        df = df.T
    
    # drop infinites and nans
    if operation != 'd':
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(0.0)

    # resort data
    if sort_by:
        df = resort(df, keep_stats = keep_stats, sort_by = sort_by)
        if type(df) == bool:
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = pd.Series(df['Combined total'], name = 'Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = pd.Series(df.ix[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except:
                df = df.iloc[0,:]
                df.name = 'keyness' % df.name

    # generate totals branch if not percentage results:
    # fix me
    if df1_istotals or operation.startswith('k'):
        if not just_totals:
            try:
                total = pd.Series(df['Total'], name = 'Total')
            except:
                pass
                total = 'none'
            #total = df.copy()
        else:
            total = 'none'
    else:
        # might be wrong if using division or something...
        try:
            total = df.T.sum(axis = 1)
        except:
            total = 'none'
    
    if type(tots) != pandas.core.frame.DataFrame and type(tots) != pandas.core.series.Series:
        total = df.sum(axis = 1)
    else:
        total = tots

    if type(df) == pandas.core.frame.DataFrame:
        datatype = df.ix[0].dtype
    else:
        datatype = df.dtype

    # TURN INT COL NAMES INTO STR
    try:
        df.results.columns = [str(d) for d in list(df.results.columns)]
    except:
        pass

    def add_tkt_index(df):
        if type(df) != pandas.core.series.Series:
            df = df.T
            df = df.drop('tkintertable-order', errors = 'ignore', axis = 0)
            df = df.drop('tkintertable-order', errors = 'ignore', axis = 1)
            df['tkintertable-order'] = pd.Series([index for index, data in enumerate(list(df.index))], index = list(df.index))
            df = df.T
        return df

    # while tkintertable can't sort rows
    from corpkit.tests import check_t_kinter
    tk = check_t_kinter()
    if tk:
        df = add_tkt_index(df)

    if 'df1_always_df' in kwargs.keys():
        if kwargs['df1_always_df'] is True:
            if type(df) == pandas.core.series.Series:
                df = pandas.DataFrame(df)

    #make named_tuple
    the_operation = 'none'
    if using_totals:
        the_operation = operation

    the_options = {}
    the_options['time_started'] = the_time_started
    the_options['function'] = 'editor'
    the_options['dataframe1'] = dataframe1
    the_options['operation'] = the_operation
    the_options['dataframe2'] = dataframe2
    the_options['datatype'] = datatype
    the_options['sort_by'] = sort_by
    the_options['keep_stats'] = keep_stats
    the_options['just_totals'] = just_totals
    the_options['threshold'] = threshold # can be wrong!
    the_options['just_entries'] = just_entries
    the_options['just_entries'] = just_entries
    the_options['skip_entries'] = skip_entries
    the_options['merge_entries'] = merge_entries
    the_options['newname'] = newname
    the_options['just_subcorpora'] = just_subcorpora
    the_options['skip_subcorpora'] = skip_subcorpora
    the_options['span_subcorpora'] = span_subcorpora
    the_options['merge_subcorpora'] = merge_subcorpora
    the_options['new_subcorpus_name'] = new_subcorpus_name
    the_options['projection'] = projection
    the_options['remove_above_p'] = remove_above_p
    the_options['p'] = p
    the_options['revert_year'] = revert_year
    the_options['print_info'] = print_info

    outputnames = collections.namedtuple('edited_interrogation', ['query', 'results', 'totals'])
    output = outputnames(the_options, df, total)

    #print '\nResult (sample)\n'
    if print_info:
        #if merge_entries or merge_subcorpora or span_subcorpora or just_subcorpora or \
           #just_entries or skip_entries or skip_subcorpora or printed_th or projection:
        print '***Done!***\n========================\n'
    #print df.head().T
    #print ''
    if operation.startswith('k') or just_totals or df1_istotals:
        pd.set_option('display.max_rows', 30)
    else:
        pd.set_option('display.max_rows', 15)
    pd.set_option('display.max_columns', 8)
    pd.set_option('max_colwidth',70)
    pd.set_option('display.width', 800)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('display.float_format', lambda x: '%.2f' % x)

    return output
Example #11
0
def _edit(self, *args, **kwargs):
    """Hand this object to corpkit's ``editor`` and return whatever it produces.

    All positional and keyword arguments are forwarded to ``editor``
    untouched, after ``self``.
    """
    # Imported at call time, as in the rest of this module.
    from corpkit.editor import editor as run_editor
    edited = run_editor(self, *args, **kwargs)
    return edited
Example #12
0
def _keyness(self, measure='ll', denominator='self', **kwargs):
    """Calculate keywords by running corpkit's ``editor`` with operation ``'k'``.

    :param measure: keyness measure to use; ``'ll'`` is the default
    :type measure: `str`
    :param denominator: passed to ``editor`` as its second positional
                        argument after the operation (default ``'self'``)
    :param kwargs: any further keyword arguments, forwarded to ``editor``

    :returns: whatever ``editor`` returns
    """
    from corpkit.editor import editor
    # `measure` used to be accepted and then silently dropped. Forward it
    # under the `keyword_measure` option name, but only when the caller
    # asked for a non-default measure and did not already supply
    # `keyword_measure` explicitly, so default calls reach editor() exactly
    # as they did before.
    # NOTE(review): confirm editor() accepts `keyword_measure` in this version.
    if measure != 'll' and 'keyword_measure' not in kwargs:
        kwargs['keyword_measure'] = measure
    return editor(self, 'k', denominator, **kwargs)
Example #13
0
def _rel(self, denominator='self', **kwargs):
    """Run corpkit's ``editor`` on this object with the ``'%'`` operation.

    :param denominator: forwarded to ``editor`` directly after the
                        operation (default ``'self'``)
    :param kwargs: any further keyword arguments, forwarded to ``editor``

    :returns: whatever ``editor`` returns
    """
    from corpkit.editor import editor as run_editor
    operation = '%'
    return run_editor(self, operation, denominator, **kwargs)
Example #14
0
    def edit(self, *args, **kwargs):
        """Manipulate results of interrogations.

        There are a few overall kinds of edit, most of which can be combined
        into a single function call. It's useful to keep in mind that many are
        basic wrappers around `pandas` operations---if you're comfortable with
        `pandas` syntax, it may be faster at times to use its syntax instead.

        :Basic mathematical operations:

        First, you can do basic maths on results, optionally passing in some
        data to serve as the denominator. Very commonly, you'll want to get
        relative frequencies:

        :Example:

        >>> data = corpus.interrogate({W: r'^t'})
        >>> rel = data.edit('%', SELF)
        >>> rel.results
            ..    to  that   the  then ...   toilet  tolerant  tolerate  ton
            01 18.50 14.65 14.44  6.20 ...     0.00      0.00      0.11 0.00
            02 24.10 14.34 13.73  8.80 ...     0.00      0.00      0.00 0.00
            03 17.31 18.01  9.97  7.62 ...     0.00      0.00      0.00 0.00

        For the operation, there are a number of possible values, each of
        which is to be passed in as a `str`:

           `+`, `-`, `/`, `*`, `%`: self explanatory

           `k`: calculate keywords

           `a`: get distance metric

        `SELF` is a very useful shorthand denominator. When used, all editing
        is performed on the data. The totals are then extracted from the edited
        data, and used as denominator. If this is not the desired behaviour,
        however, a more specific `interrogation.results` or
        `interrogation.totals` attribute can be used.

        In the example above, `SELF` (or `'self'`) is equivalent to:

        :Example:

        >>> rel = data.edit('%', data.totals)

        :Keeping and skipping data:

        There are four keyword arguments that can be used to keep or skip rows
        or columns in the data:

        * `just_entries`
        * `just_subcorpora`
        * `skip_entries`
        * `skip_subcorpora`

        Each can accept different input types:

        * `str`: treated as regular expression to match
        * `list`:

          * of integers: indices to match
          * of strings: entries/subcorpora to match

        :Example:

        >>> data.edit(just_entries=r'^fr',
        ...           skip_entries=['free','freedom'],
        ...           skip_subcorpora=r'[0-9]')

        :Merging data:

        There are also keyword arguments for merging entries and subcorpora:

        * `merge_entries`
        * `merge_subcorpora`

        These take a `dict`, with the new name as key and the criteria as
        value. The criteria can be a str (regex) or wordlist.

        :Example:

        >>> from dictionaries.wordlists import wordlists
        >>> mer = {'Articles': ['the', 'an', 'a'], 'Modals': wordlists.modals}
        >>> data.edit(merge_entries=mer)

        :Sorting:

        The `sort_by` keyword argument takes a `str`, which represents the way
        the result columns should be ordered.

        * `increase`: highest to lowest slope value
        * `decrease`: lowest to highest slope value
        * `turbulent`: most change in y axis values
        * `static`: least change in y axis values
        * `total/most`: largest number first
        * `infreq/least`: smallest number first
        * `name`: alphabetically

        :Example:

        >>> data.edit(sort_by='increase')

        :Editing text:

        Column labels, corresponding to individual interrogation results, can
        also be edited with `replace_names`.

        :param replace_names: Edit result names, then merge duplicate entries
        :type replace_names: `str`/`list of tuples`/`dict`

        If `replace_names` is a string, it is treated as a regex to delete from
        each name. If `replace_names` is a dict, the value is the regex, and
        the key is the replacement text. Using a list of tuples in the form
        `(find, replacement)` allows duplicate substitution values.

        :Example:

        >>> data.edit(replace_names={r'object': r'[di]obj'})

        :param replace_subcorpus_names: Edit subcorpus names, then merge duplicates.
                                        The same as `replace_names`, but on the other axis.
        :type replace_subcorpus_names: `str`/`list of tuples`/`dict`

        :Other options:

        There are many other miscellaneous options.

        :param keep_stats: Keep/drop stats values from dataframe after sorting
        :type keep_stats: `bool`

        :param keep_top: After sorting, remove all but the top *keep_top* results
        :type keep_top: `int`

        :param just_totals: Sum each column and work with sums
        :type just_totals: `bool`

        :param threshold: When using results list as dataframe 2, drop values
                          occurring fewer than n times. If not keywording, you
                          can use:

           `'high'`: `denominator total / 2500`

           `'medium'`: `denominator total / 5000`

           `'low'`: `denominator total / 10000`

           If keywording, there are smaller default thresholds

        :type threshold: `int`/`bool`

        :param span_subcorpora: If subcorpora are numerically named, span all
                                from *int* to *int2*, inclusive
        :type span_subcorpora: `tuple` -- `(int, int2)`

        :param projection: multiply results in subcorpus by n
        :type projection: tuple -- `(subcorpus_name, n)`

        :param remove_above_p: Delete any result over `p`
        :type remove_above_p: `bool`

        :param p: set the p value
        :type p: `float`

        :param revert_year: When doing linear regression on years, turn annual
                            subcorpora into 1, 2 ...
        :type revert_year: `bool`

        :param print_info: Print stuff to console showing what's being edited
        :type print_info: `bool`

        :param spelling: Convert/normalise spelling:
        :type spelling: `str` -- `'US'`/`'UK'`

        :Keywording options:

        If the operation is `k`, you're calculating keywords. In this case,
        some other keyword arguments have an effect:

        :param keyword_measure: what measure to use to calculate keywords:

           `ll`: log-likelihood

           `pd`: percentage difference

        :type keyword_measure: `str`

        :param selfdrop: When keywording, try to remove target corpus from
                         reference corpus
        :type selfdrop: `bool`

        :param calc_all: When keywording, calculate words that appear in either
                         corpus
        :type calc_all: `bool`

        :returns: :class:`corpkit.interrogation.Interrogation`
        """
        # All the work happens in corpkit's editor; this method only forwards
        # the instance and the caller's arguments to it.
        from corpkit.editor import editor as run_editor
        edited = run_editor(self, *args, **kwargs)
        return edited
Example #15
0
def pmultiquery(path,
    option = 'c',
    query = 'any',
    sort_by = 'total',
    quicksave = False,
    num_proc = 'default',
    function_filter = False,
    **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator if:

        a) path is a list of paths
        b) query is a dict of named queries.

    A dict of named function filters (with a single path and a single query)
    is handled the same way: one interrogation per entry.

    This function needs joblib 0.8.4 or above in order to run properly.

    :param path: one corpus path (string) or a collection of paths
    :param option: forwarded to interrogator; results are post-processed
                   differently when it starts with 'c' or 'k'
    :param query: one query (string) or a dict of ``{name: query}``
    :param sort_by: passed to editor when option starts with 'c'
    :param quicksave: name to save results under; must not be ``True``
    :param num_proc: number of parallel jobs, or 'default' to work it out
                     from the CPU count and the number of interrogations
    :param function_filter: dict of ``{name: filter}``, or a non-dict value
                            when unused
    :param kwargs: extra options forwarded to every interrogator call

    :returns: if option starts with 'c', the output of editor run over the
              concatenated results; otherwise a dict mapping each result
              name to a namedtuple (``query``, ``results`` and, unless
              option starts with 'k', ``totals``)
    :raises ValueError: if joblib is missing, if quicksave is ``True``, or if
                        none of path/query/function_filter is a collection
    """

    import collections
    import math
    import multiprocessing
    import os
    import pandas as pd
    from time import strftime, localtime
    from corpkit.interrogator import interrogator
    from corpkit.editor import editor
    from corpkit.other import save_result
    try:
        from joblib import Parallel, delayed
    except ImportError:
        raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \
                         'Install with:\n\n        pip install joblib')

    # Python 2 has both str and unicode; treat either as "a single string".
    try:
        string_types = (basestring,)
    except NameError:
        string_types = (str,)

    def best_num_parallel(num_cores, num_queries):
        """Decide how many parallel processes to run.

        Returns num_queries when there is a core for each one; otherwise a
        value derived from how num_queries divides by num_cores. The result
        is always at least 1.
        """
        if num_queries <= num_cores:
            return num_queries
        # `//` keeps the integer division the Python 2 original relied on.
        if num_queries // num_cores == num_cores:
            return int(num_cores)
        if num_queries % num_cores == 0:
            candidates = [num_queries // n for n in range(2, num_cores)
                          if num_queries // n <= num_cores]
            # The candidate list is often empty (e.g. 20 queries, 4 cores);
            # max() of it used to raise. Fall through to the general rule.
            if candidates:
                return max(candidates)
        else:
            square_root = math.sqrt(num_queries)
            if square_root.is_integer() and square_root <= num_queries // num_cores:
                return int(square_root)
        # Never hand joblib a job count of zero.
        return max(1, num_queries // ((num_queries // num_cores) + 1))

    # are we processing multiple queries or corpora?
    # find out optimal number of cores to use.
    num_cores = multiprocessing.cpu_count()
    multiple_option = False
    multiple_corpora = False

    if not isinstance(path, string_types):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(path))
    elif not isinstance(query, string_types):
        num_cores = best_num_parallel(num_cores, len(query))
    elif hasattr(function_filter, 'items'):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(function_filter))
    else:
        # Previously this fell through to an AttributeError on a bool/str.
        raise ValueError('pmultiquery needs a list of paths, a dict of queries ' \
                         'or a dict of function filters.')

    if num_proc != 'default':
        num_cores = num_proc

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')

    # the options that don't change, plus whatever the caller passed through
    base_options = {'option': option,
                    'paralleling': True,
                    'function': 'interrogator'}
    base_options.update(kwargs)

    def make_job(job_path, job_query, outname):
        """Build the keyword arguments for one interrogator call."""
        job = dict(base_options)
        job['path'] = job_path
        job['query'] = job_query
        job['outname'] = outname
        job['printstatus'] = False
        return job

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        path = sorted(path)
        for p in path:
            ds.append(make_job(p, query, os.path.basename(p)))
    elif multiple_option:
        for name, filt in function_filter.items():
            job = make_job(path, query, name)
            job['function_filter'] = filt
            ds.append(job)
    else:
        for name, q in query.items():
            ds.append(make_job(path, q, name))

    timestamp = strftime("%H:%M:%S", localtime())
    if multiple_corpora:
        print ("\n%s: Beginning %d parallel corpus interrogations:\n              %s" \
           "\n          Query: '%s'" \
           "\n          Interrogating corpus ... \n" % (timestamp, num_cores, "\n              ".join(path), query) )

    elif multiple_option:
        print ("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
           "\n          Query: '%s'" \
           "\n          Interrogating corpus ... \n" % (timestamp, num_cores, path, query) )

    else:
        print ("\n%s: Beginning %d parallel corpus interrogations: %s" \
           "\n          Queries: '%s'" \
           "\n          Interrogating corpus ... \n" % (timestamp, num_cores, path, "', '".join(query.values())) )

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
    # NOTE(review): sorting assumes the results are orderable (e.g. tuples
    # starting with a unique name) -- confirm for the 'c' option.
    res = sorted(res)

    # turn list into dict of results, make query and total branches,
    # save and return
    if not option.startswith('c'):
        out = {}
        print('')
        keywording = option.startswith('k')
        if keywording:
            outputnames = collections.namedtuple('interrogation', ['query', 'results'])
        else:
            outputnames = collections.namedtuple('interrogation', ['query', 'results', 'totals'])

        # `res` has just been re-ordered by name while `ds` has not, so pair
        # each result with its job via the name; presumably interrogator
        # echoes back `outname` -- fall back to position if it does not.
        jobs_by_name = dict((job['outname'], job) for job in ds)
        for position, (name, data) in enumerate(res):
            job = jobs_by_name.get(name, ds[position])
            if keywording:
                output = outputnames(job, data)
            else:
                stotal = data.sum(axis = 1)
                stotal.name = u'Total'
                output = outputnames(job, data, stotal)
            out[name] = output

        timestamp = strftime("%H:%M:%S", localtime())
        print("\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (timestamp, "'\n         '".join(sorted(out.keys()))))
        if quicksave:
            for k, v in out.items():
                save_result(v, k, savedir = 'data/saved_interrogations/%s' % quicksave)
        return out

    # make query and total branch, save, return
    out = pd.concat(res, axis = 1)
    out = editor(out, sort_by = sort_by, print_info = False, keep_stats = False)
    timestamp = strftime("%H:%M:%S", localtime())
    print('\n%s: Finished! %d unique results, %d total.' % (timestamp, len(out.results.columns), out.totals.sum()))
    if quicksave:
        save_result(out, quicksave)
    return out
Example #16
0
def interroplot(path, query):
    """Interrogate *path* for words matching *query*, convert to percentages, and plot.

    The corpus at *path* is interrogated with the ``'words'`` option, the
    results are run through editor with the ``'%'`` operation against the
    interrogation totals, and the edited results are handed to plotter
    with ``str(path)`` as its first argument. Nothing is returned.
    """
    from corpkit import interrogator, editor, plotter
    found = interrogator(path, 'words', query)
    as_percentages = editor(found.results, '%', found.totals, print_info = False)
    plot_title = str(path)
    plotter(plot_title, as_percentages.results)