Exemple #1
0
def _load_pickled_dictionary(filename):
    """Try to unpickle *filename*, then ``dictionaries/<filename>``.

    Returns a ``(found, obj)`` pair: ``(True, <unpickled object>)`` for the
    first location that can be opened, or ``(False, None)`` when neither
    location can be opened.
    """
    import os
    import pickle

    # NOTE(security): unpickling can execute arbitrary code. Only point this
    # at dictionary files that come from a trusted source.
    for candidate in (filename, os.path.join("dictionaries", filename)):
        try:
            # The context manager closes the handle even if unpickling fails.
            with open(candidate, "rb") as handle:
                return True, pickle.load(handle)
        except IOError:
            continue
    return False, None


def turn_input_into_counter(data, **kwargs):
    """Return a counter built from a variable or from a path given as a string.

    Accepted inputs, checked in this order:

    * a string naming an existing directory: its words are read with
      ``corpkit.process.datareader`` (``**kwargs`` are forwarded to it) and
      only items containing at least one ASCII letter are counted;
    * an object whose type name contains ``"interrogation"``: its
      ``.results`` attribute is used in its place;
    * a pandas ``DataFrame``: summed with ``.sum()`` into a ``Series``;
    * a pandas ``Series``: zero entries are dropped, the rest are counted;
    * a ``collections.Counter``: returned unchanged;
    * any other ``dict``: wrapped in a ``Counter``;
    * any other string: treated as the name of a pickled dictionary
      (``.p`` is appended when missing), looked up as given and then inside
      ``dictionaries/``. If it is found in neither place, the user is asked
      interactively to pick one of the ``.p`` files in ``dictionaries/``.

    Returns:
        A ``collections.Counter``. For pickled dictionaries the unpickled
        object is returned as-is (presumably a Counter; it is not checked).

    Raises:
        ValueError: an "interrogation" object has no ``.results`` attribute.
        TypeError: ``data`` is of a type not listed above. Previously such
            input made the function loop forever.
        SystemExit: the user answers the prompt with something starting
            with ``"e"``.
    """
    import collections
    import os
    import re
    import sys

    import pandas

    # Resolve the Python 2 names when they exist so the function runs
    # unchanged on Python 2 and Python 3.
    try:
        string_types = (basestring,)  # Python 2: str and unicode
    except NameError:
        string_types = (str,)
    try:
        read_line = raw_input  # Python 2: input() would eval the answer
    except NameError:
        read_line = input

    if isinstance(data, string_types) and os.path.isdir(data):
        # Imported here because only this branch needs it.
        from corpkit.process import datareader

        has_letter = re.compile("[A-Za-z]")
        words = datareader(data, **kwargs)
        return collections.Counter(w for w in words if has_letter.search(w))

    # Loops only while the user keeps choosing a dictionary name at the
    # prompt below; every other path returns or raises.
    while True:
        if "interrogation" in str(type(data)):
            try:
                data = data.results
            except AttributeError:
                raise ValueError("Can't find .results branch of input.")

        # if passing in results, sum them
        if isinstance(data, pandas.DataFrame):
            data = data.sum()

        if isinstance(data, pandas.Series):
            return collections.Counter(data[data != 0].to_dict())

        # Counter is a dict subclass, so it must be tested before dict to be
        # handed back untouched rather than copied.
        if isinstance(data, collections.Counter):
            return data

        if isinstance(data, dict):
            return collections.Counter(data)

        if not isinstance(data, string_types):
            raise TypeError(
                "Cannot turn input of type %s into a counter." % type(data).__name__
            )

        # filepath stuff
        if not data.endswith(".p"):
            data += ".p"
        found, loaded = _load_pickled_dictionary(data)
        if found:
            return loaded

        available = [f for f in os.listdir("dictionaries") if f.endswith(".p")]
        menu = "\n".join(
            "    % 2d) %s" % (index, name) for index, name in enumerate(available)
        )
        selection = read_line(
            "\nReference corpus not found. Select an existing reference corpus or exit or type 'exit' to quit.\n\n%s\n\nYour selection: "
            % menu
        )
        if selection.startswith("e"):
            sys.exit()

        try:
            index = int(selection)
        except ValueError:
            index = -1
        # Explicit range check: a negative number must not silently pick an
        # entry from the end of the list.
        if 0 <= index < len(available):
            data = available[index]
        else:
            # Report what the user typed, not the file name being looked up.
            print('\nInput "%s" not recognised.' % selection)
Exemple #2
0
def _rejoin_contractions(tokens):
    """Glue contraction suffixes (e.g. ``n't``, ``'s``) back onto the previous token.

    A token counts as a suffix when it contains an apostrophe that is not its
    last character. Every input token ends up in the output exactly once,
    either on its own or merged into its predecessor.
    """
    rejoined = []
    for token in tokens:
        is_suffix = "'" in token and not token.endswith("'")
        if is_suffix and rejoined:
            rejoined[-1] += token
        else:
            # Also covers a suffix-like token in first position, which has
            # nothing to attach to.
            rejoined.append(token)
    return rejoined


def ngrams(
    data,
    clear=True,
    printstatus=True,
    n="all",
    calc_all=True,
    blacklist=False,
    split_contractions=True,
    gramsize=2,
    **kwargs
):
    """Feed this function some data and get ngrams.

    A list of what counts as data is available in the docstring of
    datareader().

    Args:
        data: passed straight to ``corpkit.process.datareader``.
        clear: when true, call IPython's ``clear_output()`` before the final
            status line (skipped if IPython is not installed).
        printstatus: when true, print timestamped start and finish messages.
        n: number of leading results to return, or ``"all"`` for every
            result.
        calc_all: forwarded to ``keywords_and_ngrams``.
        blacklist: ``False``/``None`` for no stopword list, ``True`` to use
            ``dictionaries.stopwords.stopwords``, or an iterable of words.
            Words are lower-cased before being forwarded as ``stopwords``.
        split_contractions: when false, contraction suffixes are glued back
            onto the preceding token before ngrams are generated.
        gramsize: forwarded to ``keywords_and_ngrams``.
        **kwargs: forwarded to both ``datareader`` and
            ``keywords_and_ngrams``.

    Returns:
        A pandas ``Series`` named ``"ngrams"``, indexed by the first element
        of each pair yielded by ``keywords_and_ngrams`` and holding the
        second element, cut to the first ``n`` entries.
    """
    import re
    from time import localtime, strftime

    import pandas as pd

    from corpkit.keys import keywords_and_ngrams
    from corpkit.process import datareader

    # IPython is optional; without it the output is simply not cleared.
    try:
        from IPython.display import clear_output
    except ImportError:
        clear_output = None

    stopwords = False
    if blacklist is True:
        from dictionaries.stopwords import stopwords as my_stopwords

        stopwords = [word.lower() for word in my_stopwords]
    elif blacklist is not False and blacklist is not None:
        stopwords = [word.lower() for word in blacklist]

    if printstatus:
        print("\n%s: Generating ngrams... \n" % strftime("%H:%M:%S", localtime()))

    # Keep only tokens with at least one digit, ASCII letter, hyphen or
    # apostrophe.
    keep_token = re.compile("[0-9A-Za-z-']")
    tokens = [t for t in datareader(data, **kwargs) if keep_token.search(t)]

    if not split_contractions:
        tokens = _rejoin_contractions(tokens)

    # Materialise once: the pairs are walked twice below, which would leave
    # the index empty if a generator were returned.
    scored = list(
        keywords_and_ngrams(
            tokens,
            stopwords=stopwords,
            calc_all=calc_all,
            show="ngrams",
            gramsize=gramsize,
            **kwargs
        )
    )

    out = pd.Series([value for _, value in scored], index=[gram for gram, _ in scored])
    out.name = "ngrams"

    # print and return
    if clear and clear_output is not None:
        clear_output()
    if printstatus:
        print("%s: Done! %d results.\n" % (strftime("%H:%M:%S", localtime()), len(out)))

    # "all" means every result, not an arbitrary cap of 99999.
    limit = len(out) if n == "all" else n
    return out[:limit]