def dictbuilder(input_path, output_file, remove_numbers=False):
    """
    Reads the files in input_path and counts word frequencies.
    input_path needs to have subfolders.
    If remove_numbers is True, numbers (as in "\d+") are not counted.
    This was used for IDing leetspeak.
    """
    worddict = defaultdict(int)
    for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
        print pati
        for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
            with codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") as fili:
                inputad = ct.adtextextractor(fili.read(), fil)
            inputad = inputad.lower()
            tokenized = ct.tokenizer(inputad)
            tokenized = [re.sub(r"\W", "", i) for i in tokenized]
            if remove_numbers:
                tokenized = [i for i in tokenized if not re.match(r"\d+", i)]
            for token in [i for i in tokenized if i]:
                worddict[token] = worddict[token] + 1
    print "\n".join([":".join((k, unicode(worddict[k]))) for k in sorted(worddict, key=worddict.get, reverse=True) if worddict[k] > 50])
    print "We created a dictionary of {} total words with {} types".format(sum(worddict.values()), len(worddict))
    if output_file:
        with codecs.open(output_file, "w", "utf-8") as outputfile:
            json.dump(worddict, outputfile)
        print "Dict written to", output_file
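
# A minimal, self-contained sketch of the counting logic in dictbuilder(),
# runnable without the ct module. The sample string is invented; it shows how
# remove_numbers drops pure digit tokens ("1337") while keeping leetspeak
# tokens that merely contain digits ("l33t").
def _demo_dictbuilder_counting():
    import re
    from collections import defaultdict
    counts = defaultdict(int)
    for token in u"l33t l33t hax0r 1337 :-)".lower().split():
        token = re.sub(r"\W", "", token)           # strip non-word characters
        if token and not re.match(r"\d+", token):  # the remove_numbers filter
            counts[token] = counts[token] + 1
    print dict(counts)  # counts: l33t -> 2, hax0r -> 1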
def collofinder(main_term, regex, dir):
    """
    Finds tokens matching regex in the corpus given in "dir" and prints
    them with context whenever main_term occurs right next to the match.
    dir needs to have subfolders.
    """
    tagregex = re.compile("<.*?>")  # tag-stripping regex, as in spellingcounter()
    for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
        print pati
        for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
            with codecs.open(os.path.join(dir, pati, fil), "r", "utf-8") as fili:
                inputad = ct.adtextextractor(fili.read(), fil)
            inputad = tagregex.sub(" ", inputad)
            words = [w.lower() for w in ct.tokenizer(inputad)]
            hits = [w for w in words if regex.match(w)]
            # determines length of context extracted
            context = [-3, -2, -1, 0, 1, 2, 3]
            for matched in hits:
                pos = words.index(matched)
                if [i for i in context if pos + i > len(words) - 1] and main_term in words:
                    print "too long"
                    print [words[pos + t] for t in context if t < 1]
                elif not [i for i in context if pos + i > len(words) - 1] and main_term in [words[pos + t] for t in [-1, 1]]:
                    print fil
                    print [words[pos + t] for t in context]
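
# Simplified, self-contained sketch of the context-window extraction in
# collofinder(); the token list and the number pattern are invented. It only
# shows the in-bounds case, where the full -3..+3 window can be printed.
def _demo_collofinder_context():
    import re
    words = "i wanna meet u 2 nite for drinks".split()
    regex = re.compile(r"\d+$")
    context = [-3, -2, -1, 0, 1, 2, 3]
    for matched in [w for w in words if regex.match(w)]:
        pos = words.index(matched)
        if all(0 <= pos + t <= len(words) - 1 for t in context):
            print [words[pos + t] for t in context]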
def emoticonfinder(dir):
    """
    The emoticonfinder takes a directory with corpus files as input.
    We might consider making the file with emoticons an argument as well.
    The emoticonfinder creates a list of relevant emoticons from a text file,
    then counts how often they occur in the files in dir.
    ---
    Source file is /Users/ps22344/Downloads/chapter2/current/emoticoncounter.py
    ---
    """
    starttime = time.time()
    # creating a featuredict from file
    featuredict = {}
    with codecs.open('/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt', "r", "utf-8") as inputtext:
        for line in inputtext.readlines():
            featuredict[line.rstrip("\n")] = 0
    # test formatting: flag emoticons with stray leading whitespace
    for k in featuredict:
        if k.startswith(" "):
            print "WARNING: emoticon with leading whitespace:", repr(k)
    for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
        print pati
        for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
            with codecs.open(os.path.join(dir, pati, fil), "r", "utf-8") as fili:
                inputad = ct.adtextextractor(fili.read(), fil)
            words = ct.tokenizer(inputad)
            for item in words:
                if item in featuredict:
                    featuredict[item] = featuredict[item] + 1
    print featuredict
    endtime = time.time()
    print "This took us {} minutes".format((endtime - starttime) / 60)
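
# Self-contained sketch of the counting step in emoticonfinder(). The emoticon
# dict and sample tokens are invented; the real list is read from
# emolist_final.txt above.
def _demo_emoticonfinder_counting():
    featuredict = {u":)": 0, u":(": 0, u"<3": 0}
    for item in u"hey there :) see u soon <3 :)".split():
        if item in featuredict:
            featuredict[item] = featuredict[item] + 1
    print featuredict  # :) -> 2, <3 -> 1, :( -> 0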
def acronymfinder(dir, length, output_json):
    """
    This finds acronyms.
    dir is the directory of corpus files, length is the length of the
    desired acronym. If output_json is True, the counts are written to a
    JSON file; otherwise frequent hits are printed with context via
    tk.tokenfinder().
    """
    start = time.time()
    capitals = re.compile("^[A-Z]+$")
    featuredict = defaultdict(int)
    for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
        print "working on", pati
        for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
            with codecs.open(os.path.join(dir, pati, fil), "r", "utf-8") as fili:
                inputad = ct.adtextextractor(fili.read(), fil)
            words = [w.strip(string.punctuation) for w in ct.tokenizer(inputad)]
            for item in words:
                if capitals.match(item) and len(item) == length:
                    if not spell.spellchecker(item.lower()):
                        featuredict[item] = featuredict[item] + 1
    print sorted(featuredict.keys())
    print "So many entries:", len(featuredict)
    print "\n".join([":".join((i, str(featuredict[i]))) for i in sorted(featuredict, key=featuredict.get, reverse=True)])
    mid = time.time()
    print "this took us {} minutes".format((mid - start) / 60)
    if output_json:
        with codecs.open("output_acronyms" + str(length) + "letters.json", "w", "utf-8") as outputi:
            json.dump(featuredict, outputi)
    else:
        for entry in sorted(featuredict):
            if featuredict[entry] > 5:
                print "\n\n\n***", entry, "\n\n"
                tk.tokenfinder([r"\s" + entry + r"\s"], input_path='/Users/ps22344/Downloads/craig_0208/', length=20, lower_case=False)
    end = time.time()
    print "this took us {} minutes".format((end - start) / 60)
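
# Sketch of the filter inside acronymfinder(): all-caps tokens of the desired
# length that fail a spellcheck. The token list is invented and a small set of
# known words stands in for spell.spellchecker().
def _demo_acronymfinder_filter():
    import re
    known_words = set(["the", "cat"])  # stand-in for the spellchecker
    capitals = re.compile("^[A-Z]+$")
    tokens = ["LOL", "THE", "Hi", "BRB", "A1"]
    hits = [t for t in tokens
            if capitals.match(t) and len(t) == 3 and t.lower() not in known_words]
    print hits  # ['LOL', 'BRB']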
def wordcounter(input_dir, category_tag, category_dict):
    """
    Counts the words per category in the files in input_dir.

    Parameters
    ----------
    input_dir is the corpus directory.
    category_tag is the name of the tag to be extracted with tagextractor.
    category_dict is a dictionary of categories to be computed over
    (category names as keys), e.g. <location="X"> would be input with
    "location" as the category_tag and a dict with {"Austin":0, "Dallas":0, ...}.

    Returns
    -------
    Nothing; the per-category word counts are written to
    wordcounter_<category_tag>.json.
    """
    print "Running the wordcounter"
    # note: this shares (and mutates) the caller's category_dict
    resultdict = category_dict
    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print pati
        for fili in [i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".")]:
            with codecs.open(os.path.join(input_dir, pati, fili), "r", "utf-8") as inputfili:
                inputtext = inputfili.read()
            wordcount = len(ct.tokenizer(ct.adtextextractor(inputtext, fili), remove_punctuation=True))
            category = ct.tagextractor(inputtext, category_tag, fili)
            if category in resultdict:
                resultdict[category] = resultdict[category] + wordcount
            else:
                print "\n\nWARNING:\n{} is not in the category_dict. What do we do now?\n\n".format(category)
    print "Wordcounter done"
    with codecs.open("wordcounter_" + category_tag + ".json", "w", "utf-8") as jsonout:
        json.dump(resultdict, jsonout)
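
# Self-contained sketch of the tallying in wordcounter(): per-file word counts
# are summed into their category, unknown categories only trigger a warning.
# The (category, wordcount) pairs are invented stand-ins for real files.
def _demo_wordcounter_tally():
    resultdict = {"Austin": 0, "Dallas": 0}
    fake_files = [("Austin", 120), ("Dallas", 80), ("Austin", 40), ("Waco", 10)]
    for category, wordcount in fake_files:
        if category in resultdict:
            resultdict[category] = resultdict[category] + wordcount
        else:
            print "WARNING: {} is not in the category_dict".format(category)
    print resultdict  # {'Austin': 160, 'Dallas': 80}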
def dictbuilder(input_dir, output_name, lowercase=False, print_dict=False):
    """
    This dictbuilder puts all words in the corpus (input_dir) into a
    dictionary and outputs it as JSON; the name of the output file is
    determined by output_name. If print_dict is set to True, prints the
    sorted dictionary.
    Format of the dict returned: {word: count, word: count, ...}
    """
    dicti = defaultdict(float)
    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print pati
        for fili in [i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".")]:
            with codecs.open(os.path.join(input_dir, pati, fili), "r", "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili)
            inputad = [w.strip(string.punctuation) for w in ct.tokenizer(inputad)]
            inputad = [w for w in inputad if w]
            if lowercase:
                for word in inputad:
                    dicti[word.lower()] = dicti[word.lower()] + 1
            else:
                for word in inputad:
                    dicti[word] = dicti[word] + 1
    if print_dict:
        print "\n".join([":".join((i, str(dicti[i]))) for i in sorted(dicti, key=dicti.get, reverse=True)])
    with codecs.open(output_name + ".json", "w", "utf-8") as outputi:
        json.dump(dicti, outputi, encoding="utf8")
    print "Written dictionary with {} items to {}.json".format(len(dicti), output_name)
    return dicti
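
# Self-contained sketch of the token cleanup used in this dictbuilder variant:
# punctuation is stripped from both ends and empty strings are dropped.
# The sample tokens are invented.
def _demo_dictbuilder_cleanup():
    import string
    tokens = [u"Hey!!", u"--", u"(me)", u"u", u"..."]
    cleaned = [w.strip(string.punctuation) for w in tokens]
    print [w for w in cleaned if w]  # [u'Hey', u'me', u'u']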
def spellingcounter(input_dir):
    """
    The spellingcounter counts the number of misspelled words.
    It uses the PyEnchant library for spellchecking.
    It iterates over the files in input_dir.
    It returns two lists of lists, one with raw counts and one with
    relative (per-word) counts, one inner list per file.
    """
    start = time.time()
    americandict = enchant.Dict("en_US")
    goodwords = set(["wo", "'ve", "'m", "n't", "'s", "'ll", "'re", "'d", "non-"] + list(string.punctuation))
    htmlregex = re.compile("<.*?>")
    results = []
    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print pati
        for fili in [i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".")]:
            with codecs.open(os.path.join(input_dir, pati, fili), "r", "utf-8") as inputfili:
                inputad = ct.adtextextractor(inputfili.read(), fili)
            inputad = htmlregex.sub(" ", inputad)
            words = ct.tokenizer(inputad)
            wordcount = float(len(words))
            mistakes = [w for w in words if not americandict.check(w) and w not in goodwords]
            if wordcount - len(mistakes) < 0:
                print "WARNING: negative count-mistakes", wordcount, len(mistakes), os.path.join(input_dir, pati, fili)
            results.append([(len(mistakes), len(mistakes) / wordcount)])
    end = time.time()
    print "len results", len(results)
    print "this took us {} minutes".format((end - start) / 60)
    print "shape of results, number of lists:", len(results), "-- length of lists", set([len(i) for i in results])
    return [[x[0] for x in i] for i in results], [[x[1] for x in i] for i in results]
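
# Sketch of the per-file arithmetic in spellingcounter(): a raw mistake count
# and a mistakes-per-word ratio. The word list and the stand-in checker are
# invented; the real code uses enchant.Dict("en_US").check().
def _demo_spellingcounter_ratio():
    known = set(["hi", "there", "i", "am", "new", "here"])  # stand-in checker
    words = ["hi", "there", "i", "am", "noo", "heere"]
    mistakes = [w for w in words if w not in known]
    print len(mistakes), len(mistakes) / float(len(words))  # 2 0.333...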
def matrixmachine(folderlist, featuredict, testmode, *args):
    """
    The matrixmachine creates matrices of word frequencies.

    It returns
    wordmatrix_without_cat, a matrix of word frequencies only; this is fed into clustering.
    wordmatrix_with_cat, a matrix of word frequencies with the external categories (defined in *args) added, for later comparison of clusterings.
    catdicti, a dictionary that maps categories to the numbers used in wordmatrix_with_cat; created by categorymachine(), cf. there for details.
    filedict, a dictionary that maps file names to rows in the matrix, for later comparison of clusterings.

    It takes
    folderlist, a collection of folders to iterate over.
    featuredict, a dictionary containing the words to count.
    testmode: if set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. (Note that the testmode comes all the way from main().)
    args, a number of external categories, each defined in the catdicti created by categorymachine(); here, usually a gender category. Args will be added to the matrix_with_cat.
    """
    print "Starting the matrixmachine"
    print "external categories: ", len(args)
    print args
    # the plus one in here is for the file id
    wordmatrix = np.empty(shape=(1, (len(featuredict) + len(args) + 1)))
    print "Matrix initial shape: ", np.shape(wordmatrix)
    # making a dictionary for the categories
    # we need the zero cause the machine returns 2 items
    count = 0
    catdicti = categorymachine(folderlist)[0]
    filedict = {}
    featuredict = {k: featuredict[k]["words"] for k in featuredict.keys()}
    featuredict = {k: set([i for i in featuredict[k] if not i in cluster_stopwords]) for k in featuredict.keys()}
    for folder in folderlist:
        filis = [i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
        if testmode:
            print "\n\nRUNNING\nIN\nTEST\nMODE\n"
            filis = filis[:200]
        print "Building matrices: we have {} files in folder {}".format(len(filis), folder)
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
            inputad = ct.adtextextractor(inputfile, fili)
            # establish category
            for external_cat in args:
                cat = catdicti[ct.tagextractor(inputfile, external_cat, fili)]
            count = count + 1
            filedict[count] = os.path.join(pathi, folder, fili)
            splittext = ct.tokenizer(inputad)
            splittext = [s for s in splittext if s not in exclude]
            splittextlo = [s.lower() for s in splittext if s and not excluderegex.match(s)]
            wordcount = float(len(splittextlo))
            # controlling for cluster size; for the version that does not
            # control for cluster size, drop the "/ len(v)"
            addict = {k: (sum([float(splittextlo.count(i)) for i in v])) / len(v) for k, v in featuredict.items()}
            addict = {k: v / wordcount for k, v in addict.items()}
            wordvector = np.array([float(cat)] + [float(count)] + addict.values())
            # we append it to the matrix
            wordmatrix = np.append(wordmatrix, [wordvector], axis=0)
    print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype)
    print "---------------\nEnd of public service announcements\n\n"
    # "In 2D, the first dimension corresponds to rows, the second to columns."
    # we don't look at the first row cause that was just for initialization
    # the one without cats we put into the clustering algorithm
    wordmatrix_without_cat = wordmatrix[1:, len(args) + 1:]
    print "without", np.shape(wordmatrix_without_cat)
    wordmatrix_with_cat = wordmatrix[1:, ]
    print "with", np.shape(wordmatrix_with_cat)
    return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
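
# Self-contained sketch of the slicing at the end of matrixmachine(): drop the
# junk initialization row, and for the clustering matrix also drop the
# category and file-id columns. The toy matrix is invented; one *arg assumed.
def _demo_matrix_slicing():
    import numpy as np
    m = np.arange(12.0).reshape(3, 4)  # row 0 stands in for the init row
    n_args = 1                         # one external category column
    without_cat = m[1:, n_args + 1:]   # word-frequency columns only
    with_cat = m[1:, ]                 # keep category and file-id columns
    print without_cat.shape, with_cat.shape  # (2, 2) (2, 4)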
def prosodycounter(input_dir):
    """
    Counts prosodic spellings ("haha", "ummm", "sooo", ...) in the files in input_dir.
    Returns two lists of lists, one with raw counts and one with per-word
    counts; one inner list per file, one entry per prosody item.
    """
    start = time.time()
    # creating the search terms
    prosodyitems = [
        r"\s(\*(?:laugh|cough|smack|giggle)\*)\s",
        r"\W([Ee][Rr])\W",
        r"\W((?:[Hh][Aa]){1,}[Hh]?)\W",
        r"\W((?:[Hh][Uu]){1,}[Hh]?)\W",
        r"\W((?:[Hh][Ee]){2,}[Hh]?)\W",
        r"\W([Hh][Oo]{2,})\W",
        r"\W([Hh][Mm]{1,})\W",
        r"\W([Hh]e+y{2,})\W",
        r"\W([Hh]e{2,}[Yy]+)\W",
        r"\W" + anyoftheseregex("[Hh]+[Ee]+[Ll][Ll]+[Oo]+") + r"\W",
        r"\W([Mm]{2,}[Hh]?)\W",
        r"\W((?:[Mm][Hh]){1,})\W",
        r"\W([Ss][Oo]{2,})\W",
        r"\W([Uu][Hh]+)\W",
        r"\W([Uu][Mm]+)\W",
        r"\W([Yy][Aa]+[Yy]+)\W",
        r"\W([Yy]+[Aa]+[Hh]?)\W"
    ]
    excludelist = []
    # dicts to store results
    dicti = defaultdict(float)
    matchesdicti = defaultdict(list)
    results = []
    prosody_list = [re.compile(i) for i in prosodyitems]
    print "{} items in the prosody_list, {} unique".format(len(prosody_list), len(set(prosodyitems)))
    print [i.pattern for i in prosody_list]
    # iterate and match
    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print pati
        for fili in [i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".")]:
            with codecs.open(os.path.join(input_dir, pati, fili), "r", "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili).lower()
            wordcount = float(len(ct.tokenizer(inputad)))
            # result is a list of (matches, pattern) tuples, one per regex/item
            result = [([m for m in i.findall(inputad) if not m in excludelist], i.pattern) for i in prosody_list]
            results.append([(len(matches), len(matches) / wordcount) for matches, pattern in result])
            for matches, pattern in result:
                # the dicti is {pattern: count, pattern: count, ...}
                dicti[pattern] = dicti[pattern] + len(matches)
                matchesdicti[pattern] = matchesdicti[pattern] + matches
    print "\n".join([":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i])))) for i in sorted(dicti, key=dicti.get, reverse=True)])
    end = time.time()
    print "This took us {} minutes".format((end - start) / 60)
    print "shape of results, number of lists:", len(results), "-- length of lists", set([len(i) for i in results])
    return [[x[0] for x in i] for i in results], [[x[1] for x in i] for i in results]
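
# Sketch of how one of the prosody patterns above matches; the sample ad text
# is invented. The laughter pattern catches "ha" repetitions between
# non-word characters.
def _demo_prosody_match():
    import re
    laughter = re.compile(r"\W((?:[Hh][Aa]){1,}[Hh]?)\W")
    sample = u" omg hahah that was funny ha ok "
    print laughter.findall(sample)  # [u'hahah', u'ha']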
# check if we find items
# This looks over the keys of a dictionary whose keys are regex patterns
# (numbersdict) and outputs findings in the corpus given in "dir" with
# context; dir needs to have subfolders. The twodict counts words at a
# distance of 2, the onedict counts words at a distance of 1.
starttime = time.time()
for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
    print pati
    for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
        with codecs.open(os.path.join(dir, pati, fil), "r", "utf-8") as fili:
            inputad = ct.adtextextractor(fili.read(), fil)
        words = [w.lower() for w in ct.tokenizer(inputad)]
        # specific word processing for numbers: tally the neighbors of any
        # token that matches one of the number patterns in numbersdict
        matches = [w for w in words if any(k.match(w) for k in numbersdict.keys())]
        for w in matches:
            pos = words.index(w)
            if pos not in [0, 1, len(words) - 1, len(words) - 2]:
                twodict[words[pos - 2]] = twodict[words[pos - 2]] + 1
                twodict[words[pos + 2]] = twodict[words[pos + 2]] + 1
            if pos not in [0, len(words) - 1]:
                onedict[words[pos - 1]] = onedict[words[pos - 1]] + 1
                onedict[words[pos + 1]] = onedict[words[pos + 1]] + 1
        if matches:
            outifile.write("\n".join([" ".join([words[words.index(w) - 2], words[words.index(w) - 1], w, words[words.index(w) + 1], words[words.index(w) + 2]]) for w in matches if words.index(w) not in [0, 1, len(words) - 1, len(words) - 2]]))
outifile.close()
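
# Self-contained sketch of the collocate tally above: for one matched token,
# the neighbors at distance 1 go into onedict and those at distance 2 into
# twodict. The sample tokens are invented.
def _demo_collocate_tally():
    from collections import defaultdict
    words = "call me at 555 tonight ok".split()
    onedict, twodict = defaultdict(int), defaultdict(int)
    pos = words.index("555")
    if pos not in [0, 1, len(words) - 1, len(words) - 2]:
        twodict[words[pos - 2]] = twodict[words[pos - 2]] + 1
        twodict[words[pos + 2]] = twodict[words[pos + 2]] + 1
    if pos not in [0, len(words) - 1]:
        onedict[words[pos - 1]] = onedict[words[pos - 1]] + 1
        onedict[words[pos + 1]] = onedict[words[pos + 1]] + 1
    print dict(onedict), dict(twodict)  # {'at': 1, 'tonight': 1} {'me': 1, 'ok': 1}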