コード例 #1
0
ファイル: corpus.py プロジェクト: javelir/corpkit
    def features(self):
        """
        Generate and show basic stats from the corpus, including number of
        sentences, clauses, process types, etc.

        A previously saved interrogation is reused from the
        ``saved_interrogations`` directory when available; otherwise the
        corpus is interrogated now and the result cached there.

        :Example:

        >>> corpus.features
            SB  Characters  Tokens  Words  Closed class words  Open class words  Clauses
            01       26873    8513   7308                4809              3704     2212
            02       25844    7933   6920                4313              3620     2270
            03       18376    5683   4877                3067              2616     1640
            04       20066    6354   5366                3587              2767     1775

        """
        import os
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags

        savedir = 'saved_interrogations'
        cache_name = self.name + '-features'
        # reuse the cached interrogation if one was saved earlier
        if isfile(join(savedir, cache_name + '.p')):
            try:
                return load(cache_name).results
            except AttributeError:
                # older saved objects may not expose .results
                return load(cache_name)
        # no cache: interrogate now, and cache if the save directory exists
        stats = interrogator(self, 's', 'any').results
        if isdir(savedir):
            stats.save(cache_name)
        return stats
コード例 #2
0
ファイル: other.py プロジェクト: agogear/corpkit
def multiquery(corpus, query, sort_by = 'total', quicksave = False):
    """Creates a named tuple for a list of named queries to count.

    Pass in something like:

    [[u'NPs in corpus', r'NP'], [u'VPs in corpus', r'VP']]"""

    import collections
    import os
    import pandas
    import pandas as pd
    from time import strftime, localtime
    from corpkit.interrogator import interrogator
    from corpkit.editor import editor

    if quicksave:
        savedir = 'data/saved_interrogations'
        if not quicksave.endswith('.p'):
            quicksave = quicksave + '.p'
        fullpath = os.path.join(savedir, quicksave)
        while os.path.isfile(fullpath):
            selection = raw_input("\nSave error: %s already exists in %s.\n\nPick a new name: " % (savename, savedir))
            if not selection.endswith('.p'):
                selection = selection + '.p'
                fullpath = os.path.join(savedir, selection)

    results = []
    for name, pattern in query:
        result = interrogator(corpus, 'count', pattern)
        result.totals.name = name # rename count
        results.append(result.totals)
    results = pd.concat(results, axis = 1)

    results = editor(results, sort_by = sort_by, print_info = False, keep_stats = False)
    time = strftime("%H:%M:%S", localtime())
    print '%s: Finished! %d unique results, %d total.' % (time, len(results.results.columns), results.totals.sum())
    if quicksave:
        from corpkit.other import save_result
        save_result(results, quicksave)
    return results
コード例 #3
0
ファイル: corpus.py プロジェクト: javelir/corpkit
    def wordclasses(self):
        """
        Generate and show the frequency of each word class (verb, noun,
        preposition, determiner, etc.) in the corpus.

        :Example:

        >>> corpus.wordclasses
            SB   Verb  Noun  Preposition   Determiner ...
            01  26873  8513         7308         5508 ...
            02  25844  7933         6920         3323 ...
            03  18376  5683         4877         3137 ...
            04  20066  6354         5366         4336 ...

        """
        import os
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags

        savedir = 'saved_interrogations'
        # 1) reuse a saved wordclasses interrogation if one exists
        if isfile(join(savedir, self.name + '-wordclasses.p')):
            try:
                return load(self.name + '-wordclasses').results
            except AttributeError:
                # older saved objects may not expose .results
                return load(self.name + '-wordclasses')
        # 2) otherwise derive word classes from a saved POS-tag interrogation
        elif isfile(join(savedir, self.name + '-postags.p')):
            try:
                posdata = load(self.name + '-postags').results
            except AttributeError:
                posdata = load(self.name + '-postags')
            # collapse individual POS tags into broad word classes
            return posdata.edit(
                merge_entries=mergetags,
                sort_by='total').results
        # 3) no cache at all: interrogate the corpus directly and cache
        else:
            feat = interrogator(self, 't', 'any', show='pl').results
            if isdir(savedir):
                feat.save(self.name + '-wordclasses')
            return feat
コード例 #4
0
ファイル: corpus.py プロジェクト: javelir/corpkit
    def postags(self):
        """
        Generate and show the frequency of each part-of-speech tag (NN, VB,
        JJ, etc.) in the corpus.

        :Example:

        >>> corpus.postags
            SB      NN     VB     JJ     IN     DT 
            01   26873   8513   7308   4809   3704  ...
            02   25844   7933   6920   4313   3620  ...
            03   18376   5683   4877   3067   2616  ...
            04   20066   6354   5366   3587   2767  ...

        """
        import os
        from os.path import isfile, isdir, join
        from corpkit.interrogator import interrogator
        from corpkit.other import load
        from corpkit.dictionaries import mergetags

        savedir = 'saved_interrogations'
        # reuse a saved POS-tag interrogation if one exists
        if isfile(join(savedir, self.name + '-postags.p')):
            try:
                return load(self.name + '-postags').results
            except AttributeError:
                # older saved objects may not expose .results
                return load(self.name + '-postags')
        else:
            feat = interrogator(self, 't', 'any', show='p').results
            if isdir(savedir):
                feat.save(self.name + '-postags')
                # also derive and cache the wordclass view, since it is
                # computed from these POS tags anyway
                wordclss = feat.edit(
                    merge_entries=mergetags,
                    sort_by='total').results
                wordclss.save(self.name + '-wordclasses')
            return feat
コード例 #5
0
ファイル: multiprocess.py プロジェクト: kareem180/corpkit
def pmultiquery(corpus, 
    search,
    show = 'words',
    query = 'any', 
    sort_by = 'total', 
    quicksave = False,
    multiprocess = 'default', 
    function_filter = False,
    just_speakers = False,
    root = False,
    note = False,
    print_info = True,
    **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) just speakers == 'each', or a list of speakers with len(list) > 1
    
    This function needs joblib 0.8.4 or above in order to run properly.
    There's no reason to call it yourself."""
    
    import collections
    import os
    import pandas as pd
    import collections
    from collections import namedtuple
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.editor import editor
    from corpkit.other import save
    from corpkit.interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except:
        pass
        #raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \
        #                 'Install with:\n\n        pip install joblib')
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        import corpkit
        # NOTE(review): the string below is not a real docstring (it follows
        # the import statement), so it is evaluated and discarded.
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores

    # start from the machine's core count; trimmed by best_num_parallel below
    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif hasattr(query, '__iter__'):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    # NOTE(review): this branch excludes dicts, yet calls search.keys() —
    # confirm that non-dict iterables passed here really have .keys()
    elif hasattr(search, '__iter__') and type(search) != dict:
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(search.keys()))
        denom = len(search.keys())
    elif hasattr(function_filter, '__iter__'):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(function_filter.keys()))
        denom = len(function_filter.keys())
    elif just_speakers:
        from corpkit.build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print 'No speaker name data found.'
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)
        
    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')
    
    # the options that don't change
    d = {
         #'paralleling': True,
         'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}
    
    # add kwargs to query
    for k, v in kwargs.items():
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['function_filter'] = q
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        # NOTE(review): 'name' is never assigned in this loop, so the
        # 'outname' line will raise NameError (or silently reuse a stale
        # 'name' from an earlier branch) — it probably should come from val.
        for index, val in enumerate(search):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    time = strftime("%H:%M:%S", localtime())
    # NOTE(review): assumes search is a dict here — confirm for the
    # multiple_search (non-dict) case, which reaches this line too.
    sformat = '\n                 '.join(['%s: %s' % (k.rjust(3), v) for k, v in search.items()])
    if multiple_corpora and not multiple_option:
        print ("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n              %s" \
           "\n          Query: '%s'\n"  % (time, len(corpus), num_cores, "\n              ".join([i.name for i in corpus]), sformat))

    elif multiple_queries:
        print ("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n          Queries: '%s'\n" % (time, len(search), num_cores, corpus.name, "', '".join(search.values())) )

    elif multiple_search:
        print ("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n          Queries: '%s'\n" % (time, len(search.keys()), num_cores, corpus.name, str(search.values())))

    elif multiple_option:
        print ("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
           "\n          Query: '%s'\n" % (time, num_cores, corpus.name, sformat) )

    elif multiple_speakers:
        print ("\n%s: Beginning %d parallel corpus interrogations: %s" \
           "\n          Query: '%s'\n" % (time, num_cores, corpus.name, sformat) )

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    if not root:
        from blessings import Terminal
        terminal = Terminal()
        print '\n' * (len(ds) - 2)
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print '%s: QUEUED: %s' % (thetime, dobj['outname'])

            except:
                pass

    if not root and multiprocess:
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True) 
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print 'Multiprocessing failed.'
            raise
        if not res:
            failed = True
    else:
        # serial fallback (no joblib, GUI root present, or multiprocess off)
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted(res)
        except:
            pass

    # multiprocessing way
    #from multiprocessing import Process
    #from corpkit.interrogator import interrogator
    #jobs = []
    ##for d in ds:
    ##    p = multiprocessing.Process(target=interrogator, kwargs=(**d,))
    ##    jobs.append(p)
    ##    p.start()
    ##    while p.is_alive():
    ##        import time
    ##        time.sleep(2)
    ##        if root:
    ##            root.update()
    #result_queue = multiprocessing.Queue()
    #
    #for d in ds:
    #funs = [interrogator(result_queue, **kwargs) for kwargs in ds]
    #jobs = [multiprocessing.Process(mc) for mc in funs]
    #for job in jobs: job.start()
    #for job in jobs: job.join()
    #results = [result_queue.get() for mc in funs]

    # turn list into dict of results, make query and total branches,
    # save and return
    if not all(type(i.results) == pd.core.series.Series for i in res):
        out = {}
        for interrog, d in zip(res, ds):
            interrog.query = d
            # note/root hold GUI handles and cannot be pickled for saving
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            out[interrog.query['outname']] = interrog
    
        # could be wrong for unstructured corpora?
        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            while os.path.isdir(fullpath):
                selection = raw_input("\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: " % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    import os
                    fullpath = os.path.join('saved_interrogations', selection)

            for k, v in out.items():
                save(v, k, savedir = fullpath, print_info = False)
        
            time = strftime("%H:%M:%S", localtime())
            print "\n%s: %d files saved to %s" % ( time, len(out.keys()), fullpath)

        time = strftime("%H:%M:%S", localtime())
        print "\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (time, "'\n         '".join(sorted(out.keys())))
        from corpkit.interrogation import Interrodict
        return Interrodict(out)
    # make query and total branch, save, return
    else:
        #print sers
        #print ds
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index = [d['outname'] for d in ds])
            # NOTE(review): reindex_axis was removed in newer pandas —
            # confirm against the pandas version this project pins.
            out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T
        else:
            out = pd.concat([r.results for r in res], axis = 1)
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)

        # sort by total
        if type(out) == pd.core.frame.DataFrame:
            # temporary 'Total-tmp' row orders the columns by column sum,
            # then is dropped again
            out.ix['Total-tmp'] = out.sum()
            tot = out.ix['Total-tmp']
            out = out[tot.argsort()[::-1]]
            out = out.drop('Total-tmp', axis = 0)

        out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            with terminal.location(0, terminal.height):
                print '\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n')
        else:
            print '\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n')
        if used_joblib:
            print '\n' * (len(ds) - 3)
        if quicksave:
            from corpkit.other import save
            save(out, quicksave)
        return out
コード例 #6
0
ファイル: multiprocess.py プロジェクト: mrinalgrover/corpkit
def pmultiquery(corpus, 
                search,
                show='words',
                query='any', 
                sort_by='total', 
                save=False,
                multiprocess='default', 
                root=False,
                note=False,
                print_info=True,
                subcorpora=False,
                **kwargs
               ):
    """
    - Parallel process multiple queries or corpora.
    - This function is used by corpkit.interrogator.interrogator()
    - for multiprocessing.
    - There's no reason to call this function yourself.
    """
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.process import canpickle
    try:
        from joblib import Parallel, delayed
    except ImportError:
        pass
    import multiprocessing

    # snapshot of the call's arguments; kwargs are merged in so this dict
    # can double as the query record attached to the output
    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v
    in_notebook = locs.get('in_notebook')

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) \
                               if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores

    # start from the machine's core count; trimmed by best_num_parallel below
    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple = kwargs.get('multiple', False)
    mult_corp_are_subs = False
    if hasattr(corpus, '__iter__'):
        if all(getattr(x, 'level', False) == 's' for x in corpus):
            mult_corp_are_subs = True

    non_first_sub = None
    if subcorpora:
        # when subcorpora is a list, the first field is parallelised here and
        # the remainder are handled recursively inside each job
        non_first_sub = subcorpora[1:] if isinstance(subcorpora, list) else None
        subval = subcorpora if not non_first_sub else subcorpora[0]
        #print(subcorpora, non_first_sub, subval)
        if subcorpora is True:
            import re
            subcorpora = re.compile(r'.*')
        else:
            # strange travis error happened here
            subcorpora = corpus.metadata['fields'][subval]
            if len(subcorpora) == 0:
                print('No %s metadata found.' % str(subval))
                return

    # map the 'multiple' mode name to (iterable, kwarg-name-to-vary)
    mapcores = {'datalist': [corpus, 'corpus'],
                'multiplecorpora': [corpus, 'corpus'],
                'namedqueriessingle': [query, 'query'],
                'namedqueriesmultiple': [search, 'search'],
                'subcorpora': [subcorpora, 'subcorpora']}

    # a is a dummy, just to produce default one
    toiter, itsname = mapcores.get(multiple, [False, False])
    if isinstance(toiter, dict):
        toiter = toiter.items()
    denom = len(toiter)
    num_cores = best_num_parallel(num_cores, denom)

    # todo: code below makes no sense
    vals = ['eachspeaker', 'multiplespeaker', 'namedqueriesmultiple']
    if multiple == 'multiplecorpora' and any(x is True for x in vals):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    if isinstance(multiprocess, int):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    locs['printstatus'] = False
    locs['multiprocess'] = False
    locs['df1_always_df'] = False
    locs['files_as_subcorpora'] = False
    locs['corpus'] = corpus

    if multiple == 'multiplespeaker':
        locs['multispeaker'] = True

    if isinstance(non_first_sub, list) and len(non_first_sub) == 1:
        non_first_sub = non_first_sub[0]

    # make the default query
    # drop anything that cannot be pickled (GUI handles, modules, etc.)
    locs = {k: v for k, v in locs.items() if canpickle(v)}
    # make a new dict for every iteration
    ds = [dict(**locs) for i in range(denom)]
    for index, (d, bit) in enumerate(zip(ds, toiter)):
        d['paralleling'] = index
        if multiple in ['namedqueriessingle', 'namedqueriesmultiple']:
            d[itsname] = bit[1]
            d['outname'] = bit[0]
        elif multiple in ['multiplecorpora', 'datalist']:
            d['outname'] = bit.name.replace('-parsed', '')
            d[itsname] = bit
        elif multiple in ['subcorpora']:
            d[itsname] = bit
            jmd = {subval: bit}
            # put this earlier
            j2 = kwargs.get('just_metadata', False)
            if not j2:
                j2 = {}
            jmd.update(j2)
    
            d['just_metadata'] = jmd
            d['outname'] = bit
            d['by_metadata'] = False
            d['subcorpora'] = non_first_sub
            if non_first_sub:
                d['print_info'] = False

    # message printer should be a function...
    # NOTE(review): kwargs.get('conc') returns None when absent, and
    # None.lower() raises AttributeError — confirm callers always pass conc.
    if kwargs.get('conc') is False:
        message = 'Interrogating'
    elif kwargs.get('conc') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('conc').lower() == 'only':
        message = 'Concordancing'

    time = strftime("%H:%M:%S", localtime())
    from corpkit.process import dictformat
    
    if print_info:

        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        sformat = dictformat(search, query)

        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple in ['multiplecorpora', 'datalist']:
            corplist = "\n              ".join([i.name for i in list(corpus)[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n              %s" \
               "\n          Query: %s\n          %s corpus ... \n"  % (time, len(corpus), num_cores, add_es, corplist, sformat, message)))

        elif multiple == 'namedqueriessingle':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(query), num_cores,  add_es, corpus.name, sformat, message) ))

        elif multiple == 'namedqueriesmultiple':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message)))

        elif multiple in ['eachspeaker', 'multiplespeaker']:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) ))
        elif multiple in ['subcorpora']:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    #todo: the number of blank lines to print can be way wrong
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except:
                pass

    if not root and multiprocess:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        # serial fallback (no joblib, GUI root present, or multiprocess off)
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted([i for i in res if i])
        except:
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    # store corpus paths, not corpus objects, in the query record
    if hasattr(qlocs.get('corpus', False), 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = list([i.path for i in qlocs.get('corpus', [])])

    # return just a concordance
    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        if kwargs.get('maxconc'):
            concs = concs[:kwargs.get('maxconc')]
        lines = Concordance(concs)
        
        if save:
            lines.save(save, print_info=print_info)

        if print_info:
            print('\n\n%s: Finished! %d results.\n\n' % (thetime, format(len(concs.index), ',')))

        return lines

    # return interrodict (to become multiindex)
    if isinstance(res[0], Interrodict) or not all(isinstance(i.results, Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            # note/root hold GUI handles and cannot be pickled for saving
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog

        idict = Interrodict(out)
        
        if print_info:
            thetime = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is multi-indexed." % thetime)
        idict.query = qlocs

        if save:
            idict.save(save, print_info=print_info)

        return idict

    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    # 
    else:
        if multiple == 'multiplecorpora' and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = DataFrame(sers, index=[i.query['outname'] for i in res])
            # NOTE(review): reindex_axis was removed in newer pandas —
            # confirm against the pandas version this project pins.
            out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T            
        else:
            # make a series from counts
            if all(len(i.results) == 1 for i in res):
                out = pd.concat([r.results for r in res])
                out = out.sort_index()
            else:
                try:
                    out = pd.concat([r.results for r in res], axis=1)
                    out = out.T
                    out.index = [i.query['outname'] for i in res]
                except ValueError:
                    return None
                # format like normal
                # this sorts subcorpora, which are cls
                out = out[sorted(list(out.columns))]
                # puts subcorpora in the right place
                if not mult_corp_are_subs and multiple != 'subcorpora':
                    out = out.T
                if multiple == 'subcorpora':
                    out = out.sort_index()
                out = out.fillna(0) # nan to zero
                out = out.astype(int)
                if 'c' in show and mult_corp_are_subs:
                    out = out.sum()
                    out.index = sorted(list(out.index))

        # sort by total
        if isinstance(out, DataFrame):

            out = out[list(out.sum().sort_values(ascending=False).index)]

            # really need to figure out the deal with tranposing!
            if all(x.endswith('.xml') for x in list(out.columns)) \
            or all(x.endswith('.txt') for x in list(out.columns)) \
            or all(x.endswith('.conll') for x in list(out.columns)):
                out = out.T
                
            if kwargs.get('nosubmode'):
                out = out.sum()
    
        from corpkit.interrogation import Interrogation
        tt = out.sum(axis=1) if isinstance(out, DataFrame) else out.sum()
        out = Interrogation(results=out, totals=tt, query=qlocs)

        if hasattr(out, 'columns') and len(out.columns) == 1:
            out = out.sort_index()   

        if kwargs.get('conc') is True:
            try:
                concs = pd.concat([x.concordance for x in res], ignore_index=True)
                concs = concs.sort_values(by='c')
                concs = concs.reset_index(drop=True)
                if kwargs.get('maxconc'):
                    concs = concs[:kwargs.get('maxconc')]
                out.concordance = Concordance(concs)
            except ValueError:
                out.concordance = None

        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            print(terminal.move(terminal.height-1, 0))
        if print_info:
            if terminal:
                print(terminal.move(terminal.height-1, 0))
            if hasattr(out.results, 'columns'):
                print('%s: Interrogation finished! %s unique results, %s total.' % (thetime, format(len(out.results.columns), ','), format(out.totals.sum(), ',')))
            else:
                print('%s: Interrogation finished! %s matches.' % (thetime, format(tt, ',')))
        if save:
            out.save(save, print_info = print_info)

        if list(out.results.index) == ['0'] and not kwargs.get('df1_always_df'):
            out.results = out.results.ix[0].sort_index()
        return out
コード例 #7
0
ファイル: other.py プロジェクト: agogear/corpkit
def interroplot(path, query):
    """Interrogate *path* with a Tregex *query*, convert the counts to
    relative frequencies, and plot the most frequent results."""
    from corpkit import interrogator, editor, plotter
    interrogated = interrogator(path, 'words', query)
    as_percent = editor(interrogated.results, '%',
                        interrogated.totals, print_info = False)
    plotter(str(path), as_percent.results)
コード例 #8
0
ファイル: corpus.py プロジェクト: javelir/corpkit
 def concordance(self, *args, **kwargs):
     """
     Run :func:`corpkit.interrogator.interrogator` over this corpus in
     concordance-only mode, returning the concordance lines.
     """
     from corpkit.interrogator import interrogator
     return interrogator(self, *args, conc='only', **kwargs)
コード例 #9
0
ファイル: corpus.py プロジェクト: javelir/corpkit
 def interrogate(self, *args, **kwargs):
     """
     Search this corpus by delegating to
     :func:`corpkit.interrogator.interrogator`.
     """
     from corpkit.interrogator import interrogator as run_search
     return run_search(self, *args, **kwargs)
コード例 #10
0
ファイル: corpus.py プロジェクト: javelir/corpkit
    def interrogate(self, search, *args, **kwargs):
        """
        Interrogate a corpus of texts for a lexicogrammatical phenomenon.

        This method iterates over the files/folders in a corpus, searching the
        texts, and returning a :class:`corpkit.interrogation.Interrogation`
        object containing the results. The main options are `search`, where you
        specify search criteria, and `show`, where you specify what you want to
        appear in the output.

        :Example:

        >>> corpus = Corpus('data/conversations-parsed')
        ### show lemma form of nouns ending in 'ing'
        >>> q = {W: r'ing$', P: r'^N'}
        >>> data = corpus.interrogate(q, show=L)
        >>> data.results
            ..  something  anything  thing  feeling  everything  nothing  morning
            01         14        11     12        1           6        0        1
            02         10        20      4        4           8        3        0
            03         14         5      5        3           1        0        0
            ...                                                               ...

        :param search: What part of the lexicogrammar to search, and what 
                       criteria to match. The `keys` are the thing to be 
                       searched, and values are the criteria. To search parse 
                       trees, use the `T` key, and a Tregex query as the value.
                       When searching dependencies, you can use any of:

                       +--------------------+-------+----------+-----------+-----------+
                       |                    | Match | Governor | Dependent | Head      |
                       +====================+=======+==========+===========+===========+
                       | Word               | `W`   | `G`      | `D`       | `H`       |
                       +--------------------+-------+----------+-----------+-----------+
                       | Lemma              | `L`   | `GL`     | `DL`      | `HL`      |
                       +--------------------+-------+----------+-----------+-----------+
                       | Function           | `F`   | `GF`     | `DF`      | `HF`      |
                       +--------------------+-------+----------+-----------+-----------+
                       | POS tag            | `P`   | `GP`     | `DP`      | `HP`      |
                       +--------------------+-------+----------+-----------+-----------+
                       | Word class         | `X`   | `GX`     | `DX`      | `HX`      |
                       +--------------------+-------+----------+-----------+-----------+
                       | Distance from root | `R`   | `GR`     | `DR`      | `HR`      |
                       +--------------------+-------+----------+-----------+-----------+
                       | Index              | `I`   | `GI`     | `DI`      | `HI`      |
                       +--------------------+-------+----------+-----------+-----------+
                       | Sentence index     | `S`   | `SI`     | `SI`      | `SI`      |
                       +--------------------+-------+----------+-----------+-----------+

                       Values should be regular expressions or wordlists to 
                       match.

        :type search: `dict`

        :Example:

        >>> corpus.interrogate({T: r'/NN.?/ < /^t/'}) # nouns starting with 't', via trees
        >>> corpus.interrogate({W: r'^t', P: r'^N'}) # nouns starting with 't', via dependencies

        :param searchmode: Return results matching any/all criteria
        :type searchmode: `str` -- `'any'`/`'all'`

        :param exclude: The inverse of `search`, removing results from search
        :type exclude: `dict` -- `{L: 'be'}`

        :param excludemode: Exclude results matching any/all criteria
        :type excludemode: `str` -- `'any'`/`'all'`

        :param query: A search query for the interrogation. This is only used
                      when `search` is a `str`, or when multiprocessing. If 
                      `search` is a `str`, the search criteria can be passed in 
                      as `query`, in order to allow the simpler syntax:

                         >>> corpus.interrogate(GL, '(think|want|feel)')

                      When multiprocessing, the following is possible:

                         >>> {'Nouns': r'/NN.?/', 'Verbs': r'/VB.?/'}
                         ### return an :class:`corpkit.interrogation.Interrodict` object:
                         >>> corpus.interrogate(T, q)
                         ### return an :class:`corpkit.interrogation.Interrogation` object:
                         >>> corpus.interrogate(T, q, show=C)

        :type query: `str`, `dict` or `list`

        :param show: What to output. If multiple strings are passed in as a `list`, 
                     results will be colon-separated, in the supplied order. Possible 
                     values are the same as those for `search`, plus options 
                     n-gramming and getting collocates:

                     +------+-----------------------+------------------------+
                     | Show | Gloss                 | Example                |
                     +======+=======================+========================+
                     | N    |  N-gram word          | `The women were`       |
                     +------+-----------------------+------------------------+
                     | NL   |  N-gram lemma         | `The woman be`         |
                     +------+-----------------------+------------------------+
                     | NF   |  N-gram function      | `det nsubj root`       |
                     +------+-----------------------+------------------------+
                     | NP   |  N-gram POS tag       | `DT NNS VBN`           |
                     +------+-----------------------+------------------------+
                     | NX   |  N-gram word class    | `determiner noun verb` |
                     +------+-----------------------+------------------------+
                     | B    |  Collocate word       | `The_were`             |
                     +------+-----------------------+------------------------+
                     | BL   |  Collocate lemma      | `The_be`               |
                     +------+-----------------------+------------------------+
                     | BF   |  Collocate function   | `det_root`             |
                     +------+-----------------------+------------------------+
                     | BP   |  Collocate POS tag    | `DT_VBN`               |
                     +------+-----------------------+------------------------+
                     | BX   |  Collocate word class | `determiner_verb`      |
                     +------+-----------------------+------------------------+

        :type show: `str`/`list` of strings

        :param lemmatise: Force lemmatisation on results. **Deprecated:
                          instead, output a lemma form with the `show` argument**
        :type lemmatise: `bool`

        :param lemmatag: Explicitly pass a POS to lemmatiser (generally when data
                         is unparsed, or when tag cannot be recovered from Tregex query)
        :type lemmatag: `'n'`/`'v'`/`'a'`/`'r'`/`False`

        :param spelling: Convert all to U.S. or U.K. English
        :type spelling: `False`/`'US'`/`'UK'`

        :param dep_type: The kind of Stanford CoreNLP dependency parses you want
                         to use: `'basic-dependencies'`, `'collapsed-dependencies'`,
                         or `'collapsed-ccprocessed-dependencies'`.

        :param save: Save result as pickle to `saved_interrogations/<save>` on 
                     completion
        :type save: `str`

        :param gramsize: Size of n-grams (default 2)
        :type gramsize: `int`

        :param split_contractions: Make `"don't"` et al into two tokens
        :type split_contractions: `bool`

        :param multiprocess: How many parallel processes to run
        :type multiprocess: `int`/`bool` (`bool` determines automatically)

        :param files_as_subcorpora: Treat each file as a subcorpus, ignoring 
                                    actual subcorpora if present
        :type files_as_subcorpora: `bool`

        :param conc: Generate a concordance while interrogating, 
                                 store as `.concordance` attribute
        :type conc: `bool`/`'only'`

        :param coref: Allow counting of pronominal referents
        :type coref: `bool`

        :param representative: Allow copula coreference matching
        :type representative: `bool`

        :param representative: Allow non-copula coreference matching
                               (NOTE(review): duplicate keyword in the original
                               docs -- presumably a different parameter name was
                               intended; confirm against interrogator())
        :type representative: `bool`        

        :param tgrep: Use `TGrep` for tree querying. TGrep is less expressive 
                      than Tregex, and is slower, but can work without Java.
        :type tgrep: `bool`

        :param just_speakers: Limit search to particular speakers. If 'each',
                              generate :class:`corpkit.interrogation.Interrodict`
                              for each speaker. If a `list` of speaker names, 
                              generate :class:`corpkit.interrogation.Interrodict`
                              for each named speaker. If compiled regular expression,
                              generate :class:`corpkit.interrogation.Interrogation`
                              with each speaker matching the regex conflated.
        :type just_speakers: `str`/`each`/`list`/`regex`

        :returns: A :class:`corpkit.interrogation.Interrogation` object, with 
                  `.query`, `.results`, `.totals` attributes. If multiprocessing is 
                  invoked, result may be a :class:`corpkit.interrogation.Interrodict` 
                  containing corpus names, queries or speakers as keys.
        """
        from corpkit.interrogator import interrogator
        # pop 'multiprocess' so it is only reinstated when appropriate below
        par = kwargs.pop('multiprocess', None)
        kwargs.pop('corpus', None)
        if par and self.subcorpora:
            # parallelise over subcorpora; a bool value lets interrogator
            # pick the process count itself
            if isinstance(par, int):
                kwargs['multiprocess'] = par
            return interrogator(self.subcorpora, search, *args, **kwargs)
        else:
            kwargs['multiprocess'] = par
            return interrogator(self, search, *args, **kwargs)
コード例 #11
0
ファイル: multiprocess.py プロジェクト: javelir/corpkit
def pmultiquery(corpus, 
                search,
                show='words',
                query='any', 
                sort_by='total', 
                save=False,
                multiprocess='default', 
                just_speakers=False,
                root=False,
                note=False,
                print_info=True,
                **kwargs
               ):
    """
    Parallel-process multiple queries or corpora.

    This function is used by corpkit.interrogator.interrogator() for
    multiprocessing -- there's no reason to call it yourself.

    :param corpus: a Corpus/Corpora object, or an iterable of corpora
    :param search: search criteria, or a dict of named search dicts
    :param show: what to output
    :param query: query, or a dict of named queries
    :param sort_by: how to sort the combined results
    :param save: name to save output under (must be a `str`, not `True`)
    :param multiprocess: number of parallel processes; `False` disables
    :param just_speakers: speaker restriction; `'each'` expands to every
                          speaker found in the corpus
    :param root, note: GUI hooks
    :param print_info: print progress information
    :returns: an Interrogation, an Interrodict, or (when ``conc='only'``)
              a Concordance; `None` if combining the results fails
    """
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except ImportError:
        # joblib is optional; without it we fall back to serial processing
        pass
    import multiprocessing

    # snapshot of the call's options, later stored as the .query attribute
    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v
    in_notebook = locs.get('in_notebook')

    def best_num_parallel(num_cores, num_queries):
        """Decide how many parallel processes to run.

        The idea, more or less, is to balance the load when possible."""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) \
                               if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # work out what is being iterated over: corpora, queries, speakers
    # or named searches; exactly one of these flags ends up True
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (isinstance(query, (list, dict)) and not hasattr(search, '__iter__')):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(search, '__iter__') and all(isinstance(i, dict) for i in list(search.values())):
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    # when both corpora and another axis vary, avoid double multiprocessing
    if multiple_corpora and any(x is True for x in [multiple_speakers, multiple_queries, 
                                                    multiple_search, multiple_option]):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    if isinstance(multiprocess, int):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are the right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')

    # the options that don't change between processes
    d = {'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}

    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    def _job(index, outname, **overrides):
        """Build one per-process options dict: the shared options plus
        whatever is unique to this process."""
        a_dict = dict(d)
        a_dict.update({'corpus': corpus,
                       'search': search,
                       'query': query,
                       'show': show,
                       'outname': outname,
                       'just_speakers': just_speakers,
                       'paralleling': index,
                       'printstatus': False})
        a_dict.update(overrides)
        return a_dict

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            ds.append(_job(index, p.name.replace('-parsed', ''), corpus=p))
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            ds.append(_job(index, name, query=q))
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            ds.append(_job(index, name, just_speakers=[name]))
    elif multiple_search:
        for index, (name, val) in enumerate(search.items()):
            ds.append(_job(index, name, search=val))

    # pick a progress message; 'conc' may be missing from kwargs, in which
    # case we are simply interrogating (previously this crashed on
    # None.lower() when 'conc' was not passed)
    conc_opt = kwargs.get('conc')
    if conc_opt is True:
        message = 'Interrogating and concordancing'
    elif isinstance(conc_opt, str) and conc_opt.lower() == 'only':
        message = 'Concordancing'
    else:
        message = 'Interrogating'
    time = strftime("%H:%M:%S", localtime())

    # build a human-readable summary of the search/query criteria
    sformat = ''
    if multiple_queries:
        to_it_over = query
    else:
        to_it_over = search
    for i, (k, v) in enumerate(list(to_it_over.items())):
        if isinstance(v, list):
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        elif isinstance(v, dict):
            vformat = ''
            for kk, vv in v.items():
                # record the item count *before* joining: the original
                # measured len() of the joined string, not the list
                n_items = len(vv) if isinstance(vv, list) else 0
                if isinstance(vv, list):
                    vv = ', '.join(vv[:5])
                vformat += '\n                     %s: %s' % (kk, vv)
                if n_items > 5:
                    vformat += ' ...'
        else:
            try:
                vformat = v.pattern
            except AttributeError:
                vformat = v
        sformat += '%s: %s' %(k, vformat)
        if i < len(to_it_over) - 1:
            sformat += '\n                   '

    if print_info:
        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple_corpora and not multiple_option:
            corplist = "\n              ".join([i.name for i in corpus[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n              %s" \
               "\n          Query: %s\n          %s corpus ... \n"  % (time, len(corpus), num_cores, add_es, corplist, sformat, message)))

        elif multiple_queries:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(query), num_cores,  add_es, corpus.name, sformat, message) ))

        elif multiple_search:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message)))

        elif multiple_option:
            print(("\n%s: Beginning %d parallel corpus interrogation%s (multiple options): %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat,  message) ))

        elif multiple_speakers:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) ))

    # run in parallel, getting either a list of tuples (non-conc option)
    # or a dataframe (conc option)
    failed = False
    terminal = False
    used_joblib = False
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except Exception:
                pass

    if not root and multiprocess:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except Exception:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        # serial fallback (GUI root present, or multiprocessing disabled)
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted([i for i in res if i])
        except Exception:
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    if hasattr(qlocs['corpus'], 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = list([i.path for i in qlocs['corpus']])

    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        # concordance-only run: just concatenate the per-process lines
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        lines = Concordance(concs)
        
        if save:
            lines.save(save, print_info=print_info)

        if print_info:
            print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))

        return lines

    if not all(isinstance(i.results, Series) for i in res):
        # heterogeneous results: return a dict-like of Interrogations
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog

        from corpkit.interrogation import Interrodict
        idict = Interrodict(out)
        
        if print_info:
            time = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % \
                (time, "'\n         '".join(sorted(out.keys()))))

        idict.query = qlocs

        if save:
            idict.save(save, print_info=print_info)

        return idict

    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    else:
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = DataFrame(sers, index=[i.query['outname'] for i in res])
            # sort cols (DataFrame.reindex_axis is deprecated/removed)
            out = out[sorted(out.columns)]
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T            
        else:
            try:
                out = pd.concat([r.results for r in res], axis=1)
                out = out.T
                out.index = [i.query['outname'] for i in res]
            except ValueError:
                return None
            # format like normal
            # this sorts subcorpora, which are cls
            out = out[sorted(list(out.columns))]
            # puts subcorpora in the right place
            if not mult_corp_are_subs:
                out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        if isinstance(out, DataFrame):
            out = out[list(out.sum().sort_values(ascending=False).index)]

            # really need to figure out the deal with tranposing!
            if all(x.endswith('.xml') for x in list(out.columns)) \
            or all(x.endswith('.txt') for x in list(out.columns)):
                out = out.T
        out = out.edit(sort_by=sort_by, print_info=False, keep_stats=False, \
                      df1_always_df=kwargs.get('df1_always_df'))
        out.query = qlocs

        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()   
        if kwargs.get('conc') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal and print_info:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            if print_info:
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        if save:
            out.save(save, print_info = print_info)
        return out