Example #1
0
def turn_input_into_counter(data, **kwargs):
    """From string (filepath/dirpath) or variable, return a Counter.

    Accepted inputs:
      * a directory path: tokenised with corpkit's datareader(), keeping
        only tokens that contain at least one alphabetic character
      * a corpkit interrogation object (its ``.results`` branch is used)
      * a pandas DataFrame (summed column-wise to a Series) or Series
        (zero entries dropped)
      * a dict or an existing collections.Counter
      * a path to a pickled dict (``.p`` appended if missing), looked for
        directly and then inside a local ``dictionaries`` folder; if not
        found, the user is asked interactively to pick an available pickle

    :param data: the object or path to convert
    :param kwargs: passed through to datareader() for directory input
    :returns: a collections.Counter (or the unpickled object for .p files,
              as in the original implementation)
    :raises ValueError: if an interrogation object has no .results branch
    :raises TypeError: if the input type cannot be converted at all
    """
    import os
    import re
    import sys
    import collections
    import pickle
    import pandas

    # Python 2/3 compatible interactive prompt
    try:
        prompt = raw_input
    except NameError:
        prompt = input

    if isinstance(data, str) and os.path.isdir(data):
        # a corpus directory: import lazily so non-directory inputs
        # do not require corpkit to be installed
        from corpkit.process import datareader
        good = datareader(data, **kwargs)
        # remove tokens with no alphabetic character
        regex_nonword_filter = re.compile("[A-Za-z]")
        tokens = [i for i in good if re.search(regex_nonword_filter, i)]
        return collections.Counter(tokens)

    while True:
        # corpkit interrogation objects carry their data in .results
        if "interrogation" in str(type(data)):
            try:
                data = data.results
            except AttributeError:
                raise ValueError("Can't find .results branch of input.")

        # if passing in results, sum them
        if isinstance(data, pandas.core.frame.DataFrame):
            data = data.sum()

        # count sum: drop zero entries, then counterise
        if isinstance(data, pandas.core.series.Series):
            data = data[data != 0]
            return collections.Counter(data.to_dict())

        # the best case scenario: already a Counter (checked before dict,
        # since Counter is a dict subclass and must be returned unchanged)
        if isinstance(data, collections.Counter):
            return data

        # turn normal dicts into counter
        if isinstance(data, dict):
            return collections.Counter(data)

        # filepath stuff
        if isinstance(data, str):
            if not data.endswith(".p"):
                data = data + ".p"
            # look for the pickle directly, then inside 'dictionaries'
            for path in (data, os.path.join("dictionaries", data)):
                try:
                    with open(path, "rb") as fo:
                        return pickle.load(fo)
                except IOError:
                    continue

            # not found: let the user choose one of the available pickles
            dict_of_dicts = {}
            d_for_print = []
            dicts = [f for f in os.listdir("dictionaries") if f.endswith(".p")]
            for index, d in enumerate(dicts):
                dict_of_dicts[index] = d
                d_for_print.append("    % 2d) %s" % (index, d))
            d_for_print = "\n".join(d_for_print)

            selection = prompt(
                "\nReference corpus not found. Select an existing reference corpus or exit or type 'exit' to quit.\n\n%s\n\nYour selection: "
                % d_for_print
            )
            if selection.startswith("e"):
                sys.exit()
            try:
                data = dict_of_dicts[int(selection)]
            except (KeyError, ValueError):
                print('\nInput "%s" not recognised.' % data)
            # loop again: the chosen filename goes back through the
            # filepath branch above
            continue

        # nothing matched: the original looped forever here; fail loudly
        raise TypeError("Cannot turn %s into a Counter." % type(data))
Example #2
0
def ngrams(
    data,
    clear=True,
    printstatus=True,
    n="all",
    calc_all=True,
    blacklist=False,
    split_contractions=True,
    gramsize=2,
    **kwargs
):
    """Feed this function some data and get ngrams.

    You can use dictmaker() to build a new reference_corpus
    to serve as reference corpus, or use bnc.p

    A list of what counts as data is available in the
    docstring of datareader().

    :param data: anything datareader() accepts
    :param clear: clear IPython output before printing results
    :param printstatus: print start/finish status messages
    :param n: how many results to return, or 'all'
    :param calc_all: passed through to keywords_and_ngrams()
    :param blacklist: True for the default stopword list, a list of words
        to exclude, or False for no filtering
    :param split_contractions: if False, rejoin tokens that were split
        at an apostrophe
    :param gramsize: size of each ngram
    :returns: a pandas Series of ngram scores, named 'ngrams'
    """
    import re
    from time import localtime, strftime

    # clear_output only exists inside IPython; fall through silently
    try:
        from IPython.display import clear_output
    except ImportError:
        pass

    # build the lowercased stopword list, if any was requested
    stopwords = False
    if blacklist is not False:
        if blacklist is True:
            from dictionaries.stopwords import stopwords as my_stopwords
            stopwords = [i.lower() for i in my_stopwords]
        else:
            stopwords = [i.lower() for i in blacklist]

    from corpkit.keys import keywords_and_ngrams
    from corpkit.process import datareader

    if n == "all":
        n = 99999

    # NOTE: the original bound this string to the name 'time', shadowing
    # an unused 'import time'; renamed to avoid the shadow
    thetime = strftime("%H:%M:%S", localtime())
    if printstatus:
        print("\n%s: Generating ngrams... \n" % thetime)

    good = datareader(data, **kwargs)

    # keep only tokens containing at least one word-like character
    regex_nonword_filter = re.compile("[0-9A-Za-z-']")
    good = [i for i in good if re.search(regex_nonword_filter, i)]

    def unsplitter(lst):
        """Rejoin tokens that a tokeniser split at an apostrophe."""
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = "".join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if "'" not in lst[index + 1]:
                    unsplit.append(t)
        return unsplit

    if not split_contractions:
        good = unsplitter(good)

    ngrams = keywords_and_ngrams(
        good, stopwords=stopwords, calc_all=calc_all, show="ngrams", gramsize=gramsize, **kwargs
    )

    import pandas as pd

    out = pd.Series([s for k, s in ngrams], index=[k for k, s in ngrams])
    out.name = "ngrams"

    # print and return
    if clear:
        try:
            clear_output()
        except Exception:
            # clear_output is undefined outside IPython; best-effort only
            pass
    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print("%s: Done! %d results.\n" % (thetime, len(list(out.index))))

    # (the original re-checked n == "all" here, which was unreachable:
    # n was already rebound to 99999 above)
    return out[:n]