def turn_input_into_counter(data, **kwargs): """from string (filepath) or variable, return a counter.""" import sys import os import re import collections import pickle import pandas from corpkit.process import datareader dict_found = False if type(data) == str: if os.path.isdir(data): # get list of words good = datareader(data, **kwargs) # remove bad stuff from result regex_nonword_filter = re.compile("[A-Za-z]") data = [i for i in good if re.search(regex_nonword_filter, i)] return collections.Counter(data) while not dict_found: if "interrogation" in str(type(data)): try: data = data.results except: raise ValueError("Can't find .results branch of input.") # if passing in results, sum them if type(data) == pandas.core.frame.DataFrame: data = data.sum() # count sum if type(data) == pandas.core.series.Series: data = data[data != 0] data = collections.Counter(data.to_dict()) dict_found = True return data # turn notmal dicts into counter if type(data) == dict: dict_found = True return collections.Counter(data) # the best case scenario: if type(data) == collections.Counter: dict_found = True return data # filepath stuff if type(data) == str: if not data.endswith(".p"): data = data + ".p" try: ref_corp_dict = pickle.load(open(data, "rb")) dict_found = True return ref_corp_dict except IOError: try: ref_corp_dict = pickle.load(open(os.path.join("dictionaries", data), "rb")) dict_found = True return ref_corp_dict except IOError: # try: # import corpkit # path_to_corpkit = os.path.dirname(corpkit.__file__) # thepath, corpkitname = os.path.split(path_to_corpkit) # dictionaries_path = os.path.join(thepath, 'dictionaries') # ref_corp_dict = pickle.load( open( os.path.join(dictionaries_path, data), "rb" ) ) # dict_found = True # return ref_corp_dict # except: pass dict_of_dicts = {} d_for_print = [] dicts = [f for f in os.listdir("dictionaries") if f.endswith(".p")] for index, d in enumerate(dicts): dict_of_dicts[index] = d d_for_print.append(" % 2d) %s" % (index, d)) 
d_for_print = "\n".join(d_for_print) selection = raw_input( "\nReference corpus not found. Select an existing reference corpus or exit or type 'exit' to quit.\n\n%s\n\nYour selection: " % d_for_print ) if selection.startswith("e"): import sys sys.exit() else: try: data = dict_of_dicts[int(selection)] except: print '\nInput "%s" not recognised.' % data
def ngrams(
    data,
    clear=True,
    printstatus=True,
    n="all",
    calc_all=True,
    blacklist=False,
    split_contractions=True,
    gramsize=2,
    **kwargs
):
    """Feed this function some data and get ngrams.

    You can use dictmaker() to build a new reference_corpus to serve as
    reference corpus, or use bnc.p. A list of what counts as data is
    available in the docstring of datareader().

    :param data: anything accepted by corpkit.process.datareader
    :param clear: clear IPython cell output when done (best effort)
    :param printstatus: print start/finish status lines
    :param n: number of results to return, or 'all' for every result
    :param calc_all: passed through to keywords_and_ngrams
    :param blacklist: True to use the bundled stopwords list, a custom
        iterable of words to exclude, or False for no filtering
    :param split_contractions: if False, re-join tokenised contractions
    :param gramsize: length of each n-gram
    :returns: pandas.Series of ngram scores, indexed by ngram
    """
    import re
    from time import localtime, strftime
    try:
        from IPython.display import clear_output
    except ImportError:
        # Not in IPython: remember that so we never hit a NameError later.
        clear_output = None

    stopwords = False
    if blacklist is not False:
        if blacklist is True:
            from dictionaries.stopwords import stopwords as my_stopwords
            stopwords = [i.lower() for i in my_stopwords]
        else:
            stopwords = [i.lower() for i in blacklist]

    from corpkit.keys import keywords_and_ngrams
    from corpkit.process import datareader

    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print("\n%s: Generating ngrams... \n" % thetime)

    good = datareader(data, **kwargs)
    # Keep only tokens containing at least one word-ish character.
    regex_nonword_filter = re.compile("[0-9A-Za-z-']")
    good = [i for i in good if re.search(regex_nonword_filter, i)]

    def _unsplitter(lst):
        """Re-join apostrophe clitics (e.g. n't, 's) onto the previous token."""
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                unsplit.append("".join([lst[index - 1], t]))
            elif "'" not in lst[index + 1]:
                # Drop the bare stem here; it is re-emitted joined with
                # its clitic on the next iteration.
                unsplit.append(t)
        return unsplit

    if not split_contractions:
        good = _unsplitter(good)

    # Renamed from 'ngrams', which shadowed this function's own name.
    ngram_scores = keywords_and_ngrams(
        good,
        stopwords=stopwords,
        calc_all=calc_all,
        show="ngrams",
        gramsize=gramsize,
        **kwargs
    )
    import pandas as pd
    out = pd.Series([s for k, s in ngram_scores], index=[k for k, s in ngram_scores])
    out.name = "ngrams"

    # Print and return.
    if clear and clear_output is not None:
        try:
            clear_output()
        except Exception:
            pass
    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print("%s: Done! %d results.\n" % (thetime, len(list(out.index))))
    # 'all' really means every result. The old code substituted the
    # sentinel 99999 up front, which capped output and made this branch
    # unreachable.
    if n == "all":
        n = len(out)
    return out[:n]