Beispiel #1
0
def check_interpreter_saved_interro():
    """
    Check the pickled result that the interpreter saved.

    Loads the saved interrogation ``test-speak-parsed-anylemma``, deletes the
    ``saved_interrogations`` directory, and then asserts that the loaded
    object has ``results``, ``totals`` and ``query`` attributes, that
    ``'concordancing'`` is in ``results``, and that the first row of
    ``results.T / totals`` sums to 0.19 when rounded to two places.
    """
    import shutil
    from corpkit import load

    dat = load('test-speak-parsed-anylemma')
    # The data is already in memory, so the on-disk copy is removed before
    # the assertions run; a failing assertion cannot leave it behind.
    shutil.rmtree('saved_interrogations')

    assert hasattr(dat, 'results')
    assert hasattr(dat, 'totals')
    assert hasattr(dat, 'query')
    assert('concordancing' in dat.results)

    rel = dat.results.T / dat.totals
    # ``.ix`` was removed in pandas 1.0; ``.iloc[0]`` takes the first row by
    # position. NOTE(review): assumes the transposed index holds string
    # labels (so ``.ix[0]`` was positional as well) -- confirm.
    assert_equals(rel.iloc[0].sum().round(2), 0.19)
Beispiel #2
0
def load_all_results(data_dir='saved_interrogations', **kwargs):
    """
    Load every saved interrogation in data_dir into a dict-like object:

        >>> r = load_all_results()

    Only regular files ending in ``.p`` are considered. A file that fails to
    load is reported on stdout and skipped; a summary line with the number
    of successfully loaded files is printed at the end.

    :param data_dir: path to saved data
    :type data_dir: str
    :param root: optional keyword; when truthy, ``root.update()`` is called
                 after each file
    :param note: optional keyword; when truthy and more than three files
                 were found, ``note.progvar.set()`` receives the percentage
                 of files processed so far

    :returns: an ``Interrodict`` with filenames (minus ``.p``) as keys
    """
    import os
    from time import localtime, strftime
    from other import load
    from process import makesafe

    suffix = '.p'
    root = kwargs.get('root', False)
    note = kwargs.get('note', False)

    def timestamp():
        """Return the current local time formatted for log lines."""
        return strftime("%H:%M:%S", localtime())

    datafiles = [f for f in os.listdir(data_dir)
                 if os.path.isfile(os.path.join(data_dir, f))
                 and f.endswith(suffix)]
    show_progress = note and len(datafiles) > 3
    output = {}

    loaded = 0
    for index, f in enumerate(datafiles):
        # Strip only the trailing extension: str.replace would also delete
        # any earlier '.p' in the name (e.g. 'a.parsed.p' -> 'aarsed').
        loadname = f[:-len(suffix)]
        try:
            output[loadname] = load(f, loaddir=data_dir)
            print('%s: %s loaded as %s.' %
                  (timestamp(), f, makesafe(loadname)))
            loaded += 1
        except Exception:
            # Best effort: report and carry on with the remaining files.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print(
                '%s: %s failed to load. Try using load to find out the matter.'
                % (timestamp(), f))
        if show_progress:
            note.progvar.set((index + 1) * 100.0 / len(datafiles))
        if root:
            root.update()

    print('%s: %d interrogations loaded from %s.' %
          (timestamp(), loaded, os.path.basename(data_dir)))
    from interrogation import Interrodict
    return Interrodict(output)
Beispiel #3
0
def load_all_results(data_dir = 'saved_interrogations', **kwargs):
    """
    Load every saved interrogation in data_dir into a dict-like object:

        >>> r = load_all_results()

    Regular files in ``data_dir`` whose names end in ``.p`` are loaded one
    by one. Each success or failure is printed with a timestamp, followed by
    a final count of the files that loaded successfully.

    :param data_dir: path to saved data
    :type data_dir: str
    :param root: optional keyword; when truthy, ``root.update()`` is called
                 after each file
    :param note: optional keyword; when truthy and more than three files
                 were found, ``note.progvar.set()`` receives the percentage
                 of files processed so far

    :returns: an ``Interrodict`` with filenames (minus ``.p``) as keys
    """
    import os
    from time import localtime, strftime
    from other import load
    from process import makesafe

    extension = '.p'
    root = kwargs.get('root', False)
    note = kwargs.get('note', False)

    def now():
        """Return the current local time formatted for log lines."""
        return strftime("%H:%M:%S", localtime())

    datafiles = [f for f in os.listdir(data_dir)
                 if os.path.isfile(os.path.join(data_dir, f))
                 and f.endswith(extension)]
    total = len(datafiles)
    output = {}

    n_loaded = 0
    for index, f in enumerate(datafiles):
        # Remove just the final extension. str.replace('.p', '') would also
        # eat a '.p' in the middle of the name ('a.parsed.p' -> 'aarsed').
        loadname = f[:-len(extension)]
        try:
            output[loadname] = load(f, loaddir = data_dir)
            print('%s: %s loaded as %s.' % (now(), f, makesafe(loadname)))
            n_loaded += 1
        except Exception:
            # Report and continue; unlike a bare ``except`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            print('%s: %s failed to load. Try using load to find out the matter.' % (now(), f))
        if note and total > 3:
            note.progvar.set((index + 1) * 100.0 / total)
        if root:
            root.update()

    print('%s: %d interrogations loaded from %s.' % (now(), n_loaded, os.path.basename(data_dir)))
    from interrogation import Interrodict
    return Interrodict(output)
Beispiel #4
0
def quickview(results, n = 25):
    """View top n results as painlessly as possible.

    ``results`` may be a ``corpkit.interrogation`` ``Interrogation``,
    ``Results`` or ``Totals`` object, or a string. A string is first looked
    up in the ``dictionaries`` directory (the file is unpickled and its
    ``most_common(n)`` entries are printed), then in ``saved_interrogations``
    (the file is loaded with ``corpkit.load`` and displayed like any other
    object).

    :param results: Interrogation data
    :type results: :class:``corpkit.interrogation.Interrogation``
    :param n: Show top *n* results; ``'all'`` is treated as 9999
    :type n: int
    :returns: None
    :raises ValueError: if a string names no file in either directory, or
                        if the object is not one of the recognised classes
    """

    import os
    import corpkit
    # NOTE(review): these names are not used below; the import is kept as in
    # the original in case it is what makes ``corpkit.interrogation``
    # available -- confirm before removing.
    from interrogation import Interrogation, Results, Totals

    # handle dictionaries too:
    dictpath = 'dictionaries'
    savedpath = 'saved_interrogations'

    # too lazy to code this properly for every possible data type:
    if n == 'all':
        n = 9999

    interro = corpkit.interrogation
    dtype = interro.Interrogation

    if isinstance(results, str):
        dict_file = os.path.join(dictpath, results)
        if os.path.isfile(dict_file):
            import pickle
            # NOTE: unpickling can execute arbitrary code; only point this
            # at dictionary files you trust.
            with open(dict_file, 'rb') as handle:
                unpickled = pickle.load(handle)
            print('\nTop %d entries in %s:\n' % (n, dict_file))
            for index, (w, f) in enumerate(unpickled.most_common(n)):
                fildex = '% 3d' % index
                print('%s: %s (n=%d)' %(fildex, w, f))
            return

        elif os.path.isfile(os.path.join(savedpath, results)):
            from corpkit import load
            print('\n%s loaded temporarily from file:\n' % results)
            results = load(results)
        else:
            # The original never filled in the %s placeholder.
            raise ValueError('File %s not found in saved_interrogations or dictionaries' % results)

    # Exact class comparison (not isinstance) is kept so that subclass
    # relationships between these types cannot change which branch runs.
    kind = results.__class__

    if kind == interro.Results:
        option = 't' if results.iloc[0,0].dtype == 'int64' else '%'
        the_list = list(results.columns)[:n]
        dtype = interro.Results

    elif kind == interro.Totals:
        option = 't' if results.iloc[0].dtype == 'int64' else '%'
        the_list = list(results.index)[:n]
        dtype = interro.Totals

    elif kind == interro.Interrogation:
        if 'results' not in results.__dict__:
            # Nothing but totals to show.
            print(results.totals)
            return
        option = 't' if results.results.iloc[0,0].dtype == 'int64' else '%'
        if 'operation' in results.query:
            # The first character of the recorded operation overrides the
            # dtype-based guess.
            operation = results.query['operation'].lower()
            for flag in ('k', '%', '/'):
                if operation.startswith(flag):
                    option = flag
        try:
            the_list = list(results.results.columns)[:n]
        except Exception:
            # Fall back to the index when there are no columns to list.
            the_list = list(results.results.index)[:n]
    else:
        raise ValueError('Results not recognised.')

    if not the_list:
        # Nothing to print (e.g. n == 0); max() below would raise otherwise.
        return

    # get longest word length for justification
    longest = max(len(i) for i in the_list)

    for index, entry in enumerate(the_list):
        label = '%s: %s' % (str(index).rjust(3), entry.ljust(longest))
        if option == 't':
            to_get_from = results.results if dtype == interro.Interrogation else results
            print('%s (n=%d)' % (label, to_get_from[entry].sum()))
        elif option == '%' or option == '/':
            if dtype == interro.Results:
                print('%s (%s)' % (label, option))
            else:
                # NOTE(review): for an Interrogation the entries come from
                # ``results.results`` but are looked up in ``results.totals``
                # -- confirm both share these labels.
                to_get_from = results.totals if dtype == interro.Interrogation else results
                print('%s (%.3f%%)' % (label, to_get_from[entry]))
        elif option == 'k':
            print('%s (l/l)' % label)
        else:
            print(label)
Beispiel #5
0
def quickview(results, n=25):
    """View top n results as painlessly as possible.

    ``results`` may be a ``corpkit.interrogation`` ``Interrogation``,
    ``Results`` or ``Totals`` object, or a string. A string is first looked
    up in the ``dictionaries`` directory (the file is unpickled and its
    ``most_common(n)`` entries are printed), then in ``saved_interrogations``
    (the file is loaded with ``corpkit.load`` and displayed like any other
    object).

    :param results: Interrogation data
    :type results: :class:``corpkit.interrogation.Interrogation``
    :param n: Show top *n* results; ``'all'`` is treated as 9999
    :type n: int
    :returns: None
    :raises ValueError: if a string names no file in either directory, or
                        if the object is not one of the recognised classes
    """

    import os
    import corpkit
    # NOTE(review): these names are not used below; the import is kept as in
    # the original in case it is what makes ``corpkit.interrogation``
    # available -- confirm before removing.
    from interrogation import Interrogation, Results, Totals

    # handle dictionaries too:
    dictpath = 'dictionaries'
    savedpath = 'saved_interrogations'

    # too lazy to code this properly for every possible data type:
    if n == 'all':
        n = 9999

    interro = corpkit.interrogation
    dtype = interro.Interrogation

    if isinstance(results, str):
        dict_file = os.path.join(dictpath, results)
        if os.path.isfile(dict_file):
            try:
                import cPickle as pickle
            except ImportError:
                import pickle as pickle
            # NOTE: unpickling can execute arbitrary code; only point this
            # at dictionary files you trust.
            with open(dict_file, 'rb') as handle:
                unpickled = pickle.load(handle)
            print('\nTop %d entries in %s:\n' % (n, dict_file))
            for index, (w, f) in enumerate(unpickled.most_common(n)):
                fildex = '% 3d' % index
                print('%s: %s (n=%d)' % (fildex, w, f))
            return

        elif os.path.isfile(os.path.join(savedpath, results)):
            from corpkit import load
            print('\n%s loaded temporarily from file:\n' % results)
            results = load(results)
        else:
            # The original never filled in the %s placeholder.
            raise ValueError(
                'File %s not found in saved_interrogations or dictionaries'
                % results)

    # Exact class comparison (not isinstance) is kept so that subclass
    # relationships between these types cannot change which branch runs.
    kind = results.__class__

    if kind == interro.Results:
        option = 't' if results.iloc[0, 0].dtype == 'int64' else '%'
        the_list = list(results.columns)[:n]
        dtype = interro.Results

    elif kind == interro.Totals:
        option = 't' if results.iloc[0].dtype == 'int64' else '%'
        the_list = list(results.index)[:n]
        dtype = interro.Totals

    elif kind == interro.Interrogation:
        if 'results' not in results.__dict__:
            # Nothing but totals to show.
            print(results.totals)
            return
        option = 't' if results.results.iloc[0, 0].dtype == 'int64' else '%'
        if 'operation' in results.query:
            # The first character of the recorded operation overrides the
            # dtype-based guess.
            operation = results.query['operation'].lower()
            for flag in ('k', '%', '/'):
                if operation.startswith(flag):
                    option = flag
        try:
            the_list = list(results.results.columns)[:n]
        except Exception:
            # Fall back to the index when there are no columns to list.
            the_list = list(results.results.index)[:n]
    else:
        raise ValueError('Results not recognised.')

    if not the_list:
        # Nothing to print (e.g. n == 0); max() below would raise otherwise.
        return

    # get longest word length for justification
    longest = max(len(i) for i in the_list)

    for index, entry in enumerate(the_list):
        label = '%s: %s' % (str(index).rjust(3), entry.ljust(longest))
        if option == 't':
            if dtype == interro.Interrogation:
                to_get_from = results.results
            else:
                to_get_from = results
            print('%s (n=%d)' % (label, to_get_from[entry].sum()))
        elif option == '%' or option == '/':
            if dtype == interro.Results:
                print('%s (%s)' % (label, option))
            else:
                # NOTE(review): for an Interrogation the entries come from
                # ``results.results`` but are looked up in ``results.totals``
                # -- confirm both share these labels.
                if dtype == interro.Interrogation:
                    to_get_from = results.totals
                else:
                    to_get_from = results
                print('%s (%.3f%%)' % (label, to_get_from[entry]))
        elif option == 'k':
            print('%s (l/l)' % label)
        else:
            print(label)