Ejemplo n.º 1
0
def turn_input_into_counter(data, **kwargs):
    """From a string (filepath or corpus dir) or variable, return a Counter.

    Accepted inputs:
      * a directory path — tokenised with the project's datareader()
      * an interrogation-like object — its ``.results`` attribute is used
      * a pandas DataFrame (summed) or Series (zeros dropped)
      * a dict or collections.Counter
      * a path to a pickled dict ('.p' appended if missing); if not found,
        the user is prompted to pick a pickle from the 'dictionaries' folder

    :returns: collections.Counter (or whatever the chosen pickle contains)
    :raises ValueError: if an interrogation object has no .results
    :raises TypeError: for input types none of the branches recognise
    """
    import sys
    import os
    import re
    import collections
    import pickle
    import pandas

    if isinstance(data, str) and os.path.isdir(data):
        # a corpus directory: get its word list via the project's reader.
        # import deferred so non-directory inputs don't need the module.
        from process import datareader
        good = datareader(data, **kwargs)
        # keep only tokens containing at least one ASCII letter
        regex_word_filter = re.compile("[A-Za-z]")
        data = [i for i in good if re.search(regex_word_filter, i)]
        return collections.Counter(data)

    while True:
        # interrogation-like objects carry their counts in .results
        if 'interrogation' in str(type(data)):
            try:
                data = data.results
            except AttributeError:
                raise ValueError("Can't find .results branch of input.")

        # if passing in results, sum them into a Series
        if isinstance(data, pandas.core.frame.DataFrame):
            data = data.sum()

        # a Series of counts: drop zeros and turn into a Counter
        if isinstance(data, pandas.core.series.Series):
            data = data[data != 0]
            return collections.Counter(data.to_dict())

        # the best case scenario — checked BEFORE dict, since Counter
        # is a dict subclass and must be returned untouched
        if isinstance(data, collections.Counter):
            return data

        # turn normal dicts into a Counter
        if isinstance(data, dict):
            return collections.Counter(data)

        if not isinstance(data, str):
            # previously this fell through and the while loop spun forever
            raise TypeError("Unrecognised input type: %s" % type(data))

        # filepath stuff: try the path as given, then under 'dictionaries'
        if not data.endswith('.p'):
            data = data + '.p'
        try:
            with open(data, "rb") as f:
                return pickle.load(f)
        except IOError:
            try:
                with open(os.path.join('dictionaries', data), "rb") as f:
                    return pickle.load(f)
            except IOError:
                pass

        # nothing found: offer the pickles in 'dictionaries' as choices
        dict_of_dicts = {}
        d_for_print = []
        dicts = [f for f in os.listdir('dictionaries') if f.endswith('.p')]
        for index, d in enumerate(dicts):
            dict_of_dicts[index] = d
            d_for_print.append('    % 2d) %s' % (index, d))
        d_for_print = '\n'.join(d_for_print)

        selection = input("\nReference corpus not found. Select an existing reference corpus or exit or type 'exit' to quit.\n\n%s\n\nYour selection: " % d_for_print)
        if selection.startswith('e'):
            sys.exit()
        else:
            try:
                # loop back around and try to load the chosen filename
                data = dict_of_dicts[int(selection)]
            except (KeyError, ValueError):
                # bug fix: report the rejected selection, not the old path
                print('\nInput "%s" not recognised.' % selection)
Ejemplo n.º 2
0
def turn_input_into_counter(data, **kwargs):
    """From a string (filepath or corpus dir) or variable, return a Counter.

    Handles, in order: a corpus directory (read via datareader), objects
    with an ``.results`` attribute ('interrogation' types), pandas
    DataFrames/Series, Counters and dicts, and finally a path to a pickled
    dict, falling back to an interactive menu of the 'dictionaries' folder.

    :returns: collections.Counter (or the unpickled object from disk)
    :raises ValueError: when an interrogation object lacks .results
    :raises TypeError: for any input type no branch recognises
    """
    import sys
    import os
    import re
    import collections
    import pickle
    import pandas

    if isinstance(data, str) and os.path.isdir(data):
        # corpus directory: tokenise it, keeping only word-like tokens.
        # datareader imported lazily so other input kinds don't need it.
        from process import datareader
        tokens = datareader(data, **kwargs)
        word_filter = re.compile("[A-Za-z]")
        return collections.Counter(
            t for t in tokens if re.search(word_filter, t))

    while True:
        # interrogation-like objects hold their counts in .results
        if 'interrogation' in str(type(data)):
            try:
                data = data.results
            except AttributeError:
                raise ValueError("Can't find .results branch of input.")

        # a DataFrame of results gets summed down to a Series first
        if isinstance(data, pandas.core.frame.DataFrame):
            data = data.sum()

        # Series: strip zero counts and convert
        if isinstance(data, pandas.core.series.Series):
            nonzero = data[data != 0]
            return collections.Counter(nonzero.to_dict())

        # Counter must be tested before dict (it IS a dict subclass)
        if isinstance(data, collections.Counter):
            return data

        if isinstance(data, dict):
            return collections.Counter(data)

        if not isinstance(data, str):
            # the original looped forever on unhandled types; fail loudly
            raise TypeError("Unrecognised input type: %s" % type(data))

        # treat the string as a pickle path, trying 'dictionaries/' too
        if not data.endswith('.p'):
            data = data + '.p'
        for candidate in (data, os.path.join('dictionaries', data)):
            try:
                with open(candidate, "rb") as handle:
                    return pickle.load(handle)
            except IOError:
                continue

        # not found anywhere: show a numbered menu of available pickles
        available = [f for f in os.listdir('dictionaries')
                     if f.endswith('.p')]
        dict_of_dicts = dict(enumerate(available))
        menu = '\n'.join('    % 2d) %s' % (i, name)
                         for i, name in enumerate(available))

        selection = input(
            "\nReference corpus not found. Select an existing reference corpus or exit or type 'exit' to quit.\n\n%s\n\nYour selection: "
            % menu)
        if selection.startswith('e'):
            sys.exit()
        try:
            # loop back and attempt to load the chosen filename
            data = dict_of_dicts[int(selection)]
        except (KeyError, ValueError):
            # bug fix: echo the bad selection, not the stale path
            print('\nInput "%s" not recognised.' % selection)
Ejemplo n.º 3
0
def ngrams(data,
           clear=True,
           printstatus=True,
           n='all',
           calc_all=True,
           blacklist=False,
           split_contractions=True,
           gramsize=2,
           **kwargs):
    """Feed this function some data and get ngrams.

    You can use dictmaker() to build a new reference_corpus
    to serve as reference corpus, or use bnc.p

    A list of what counts as data is available in the
    docstring of datareader().

    :param n: number of results to return, or 'all' for every result
    :param blacklist: True for the stock stopword list, or an iterable
        of words to exclude; False disables filtering
    :param gramsize: length of each n-gram
    :returns: pandas.Series of n-gram scores, named 'ngrams'
    """
    import re
    from time import localtime, strftime

    try:
        from IPython.display import clear_output
    except ImportError:
        pass

    # build a lowercased blacklist; True selects the bundled stopwords
    stopwords = False
    if blacklist is not False:
        if blacklist is True:
            from dictionaries.stopwords import stopwords as my_stopwords
            stopwords = [i.lower() for i in my_stopwords]
        else:
            stopwords = [i.lower() for i in blacklist]

    from keys import keywords_and_ngrams
    from process import datareader

    # renamed from 'time' — the original shadowed the time module
    thetime = strftime("%H:%M:%S", localtime())
    if printstatus:
        print("\n%s: Generating ngrams... \n" % thetime)

    good = datareader(data, **kwargs)

    # keep tokens containing a digit, letter, hyphen or apostrophe
    regex_nonword_filter = re.compile(r"[0-9A-Za-z-\']")
    good = [i for i in good if re.search(regex_nonword_filter, i)]

    def unsplitter(lst):
        """Rejoin tokens split at apostrophes: 'do', "n't" -> "don't".

        Bug fix: the original appended the first and last tokens
        unconditionally, duplicating a word before a contraction at
        index 1 and never joining a contraction in final position.
        """
        unsplit = []
        last = len(lst) - 1
        for index, t in enumerate(lst):
            # a continuation piece like "n't": glue onto previous token
            if index > 0 and "'" in t and not t.endswith("'"):
                unsplit.append(lst[index - 1] + t)
            # otherwise keep the token unless the NEXT one will absorb it
            elif index == last or "'" not in lst[index + 1]:
                unsplit.append(t)
        return unsplit

    if not split_contractions:
        good = unsplitter(good)

    # renamed from 'ngrams' — the original shadowed this function's name
    scored = keywords_and_ngrams(good, stopwords=stopwords,
                                 calc_all=calc_all, show='ngrams',
                                 gramsize=gramsize, **kwargs)

    import pandas as pd
    out = pd.Series([s for k, s in scored], index=[k for k, s in scored])
    out.name = 'ngrams'

    # print and return
    if clear:
        try:
            clear_output()
        except Exception:
            # best-effort: clear_output may be undefined outside IPython
            pass
    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Done! %d results.\n' % (thetime, len(list(out.index))))

    # bug fix: the original replaced 'all' with 99999 up front, making
    # this check dead code and silently truncating >99999 results
    if n == 'all':
        n = len(out)

    return out[:n]
Ejemplo n.º 4
0
def ngrams(data,
           clear=True,
           printstatus=True,
           n='all',
           calc_all=True,
           blacklist=False,
           split_contractions=True,
           gramsize=2,
           **kwargs):
    """Feed this function some data and get ngrams.

    You can use dictmaker() to build a new reference_corpus
    to serve as reference corpus, or use bnc.p

    A list of what counts as data is available in the
    docstring of datareader().

    :param n: how many results to return; 'all' returns everything
    :param blacklist: True for the bundled stopword list, an iterable of
        words, or False for no filtering
    :param gramsize: size of each n-gram
    :returns: pandas.Series named 'ngrams' mapping n-gram -> score
    """
    import re
    from time import localtime, strftime

    try:
        from IPython.display import clear_output
    except ImportError:
        pass

    # resolve the blacklist option into a lowercased word list (or False)
    stopwords = False
    if blacklist is not False:
        if blacklist is True:
            from dictionaries.stopwords import stopwords as my_stopwords
            stopwords = [w.lower() for w in my_stopwords]
        else:
            stopwords = [w.lower() for w in blacklist]

    from keys import keywords_and_ngrams
    from process import datareader

    # 'thetime' rather than 'time': the original shadowed the time module
    thetime = strftime("%H:%M:%S", localtime())
    if printstatus:
        print("\n%s: Generating ngrams... \n" % thetime)

    good = datareader(data, **kwargs)

    # discard tokens with no digit, letter, hyphen or apostrophe
    word_like = re.compile(r"[0-9A-Za-z-\']")
    good = [tok for tok in good if re.search(word_like, tok)]

    def unsplitter(lst):
        """Rejoin apostrophe-split tokens, e.g. 'do' + "n't" -> "don't".

        Bug fix: the original special-cased the first and last indices,
        which duplicated a word preceding a contraction at position 1
        and left a trailing contraction piece unjoined.
        """
        rejoined = []
        for idx, tok in enumerate(lst):
            is_tail = idx > 0 and "'" in tok and not tok.endswith("'")
            if is_tail:
                # continuation piece: merge with the preceding token
                rejoined.append(lst[idx - 1] + tok)
            elif idx == len(lst) - 1 or "'" not in lst[idx + 1]:
                # keep, unless the next token will absorb this one
                rejoined.append(tok)
        return rejoined

    if not split_contractions:
        good = unsplitter(good)

    # 'results' rather than 'ngrams': don't shadow this function's name
    results = keywords_and_ngrams(good,
                                  stopwords=stopwords,
                                  calc_all=calc_all,
                                  show='ngrams',
                                  gramsize=gramsize,
                                  **kwargs)

    import pandas as pd
    out = pd.Series([s for k, s in results], index=[k for k, s in results])
    out.name = 'ngrams'

    # print and return
    if clear:
        try:
            clear_output()
        except Exception:
            # clear_output is undefined outside IPython; ignore
            pass
    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Done! %d results.\n' % (thetime, len(list(out.index))))

    # bug fix: previously 'all' became 99999 earlier, so this branch was
    # unreachable and results past 99999 were silently dropped
    if n == 'all':
        n = len(out)

    return out[:n]