def wordcounter(folder, list_of_clusters): with codecs.open( "/Users/ps22344/Downloads/chapter2/current/clusterskmeans_54_19_10_07_30.json", "r", "utf-8" ) as jsoninput: wordtovecclusters = json.load(jsoninput) wordtovecclusters = {int(k): set(v["words"]) for k, v in wordtovecclusters.items() if int(k) in list_of_clusters} for key in wordtovecclusters: start = time.time() print key wordcount = {i: 0 for i in wordtovecclusters[key]} filis = folderreader(folder) print "we have {} files to work with".format(len(filis)) for fili in filis: with codecs.open(fili, "r", "utf-8") as inputfile: inputad = ct.adtextextractor(inputfile.read(), fili) addspace = stopregex.sub(r"\g<1> \g<2>", inputad) splittext = nltk.word_tokenize(addspace) splittextlo = [s.lower() for s in splittext if s] splittextlo = [s for s in splittextlo if not s in cluster_stopwords] if "wan" in splittextlo: print splittextlo print inputad for w in wordcount.keys(): wordcount[w] = wordcount[w] + splittextlo.count(w) end = time.time() print "Aha this took us {} minutes".format((end - start) / 60) print "\n", key print [(k, wordcount[k]) for k in sorted(wordcount, key=wordcount.get, reverse=True)]
def tokenfinder(input_list, input_path, lower_case): """ the tokenfinder looks over the items in an input_list. the regex pattern is ".{,40}ITEM.{,40}". it outputs findings in the corpus given in "dir". filename and total number of matches are printed. dir needs to have subfolders. """ starttime=time.time() print "search term is ", input_list #construct the regexes typedict={} for item in input_list: typedict[re.compile(r".{,40}"+unicode(item)+".{,40}")]=0 for typi in typedict: print typi.pattern totalhits=[] #iterate over files for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]: #print pati for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]: fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") inputad=ct.adtextextractor(fili.read(), fili) if lower_case: inputad=inputad.lower() matches=[k.findall(inputad) for k in typedict.keys()] if sum([len(i) for i in matches]) > 0: print "{} hits in file {}".format(sum([len(i) for i in matches]), os.path.join(input_path, pati, fil)) print matches, "\n" totalhits.append(sum([len(i) for i in matches])) if sum(totalhits) == 0: print "\n---\nNO MATCHES IN TOKENFINDER\n---\n" else: print "{} matches total".format(sum(totalhits)) endtime=time.time()
def dictbuilder(input_path, output_file): """ reads files in input_path input_path needs to have subfolders. if "remove_numbers", does not count numbers (as in "\d+"). This was used for IDing leetspeak. """ worddict=defaultdict(int) for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]: print pati for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]: fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") inputad=ct.adtextextractor(fili.read(), fil) inputad=inputad.lower() tokenized=ct.tokenizer(inputad) tokenized=[re.sub("\W","", i) for i in tokenized] if remove_numbers: tokenized=[i for i in tokenized if not re.match("\d+", i)] for token in [i for i in tokenized if i]: worddict[token]=worddict[token]+1 print ("\n".join([":".join((k, unicode(worddict[k]))) for k in sorted(worddict, key=worddict.get, reverse=True) if worddict[k] > 50])) print "We created a dictionary of {} total words with {} types".format(sum(worddict.values()), len(worddict.keys())) if output_file: with codecs.open(output_file, "w", "utf-8") as outputfile: json.dump(worddict, outputfile) print "Dict written to ", outputfile
def categorymachine(folderlist): print "starting category machine" catdicti = {} catnumber = 0 for folder in folderlist: filis = [ i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".") ] for fili in filis: inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() inputtext = ct.adtextextractor(inputfile, fili) #lets establish the category #we need to make it numeric, so the numpy won't screw up category = ct.tagextractor(inputfile, "category1", fili) try: cat = catdicti[category] except: print "We added {} to the category dictionary, coded as {}".format( category, catnumber) catdicti[ct.tagextractor(inputfile, "category1", fili)] = catnumber catnumber = catnumber + 1 cat = catdicti[ct.tagextractor(inputfile, "category1", fili)] return (catdicti, catnumber)
def emoticonfinder(dir): """ The emoticonfinder takes a directory with corpus files as input. We might consider making the file with emoticons an argument as well. The emoticonfinder creates a list of relevant emoticons from a text file. Then counts how often they occur in files in dir. --- Source file is /Users/ps22344/Downloads/chapter2/current/emoticoncounter.py --- """ starttime=time.time() #creating a featuredict from file featuredict={} with codecs.open('/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt', "r", "utf-8") as inputtext: for line in inputtext.readlines(): featuredict[line.rstrip("\n")]=0 #test formatting for k in featuredict: if k.startswith(" "): for pati in [i for i in os.listdir(dir) if not i.startswith(".")]: print pati for fili in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]: fili=codecs.open(os.path.join(dir, pati, fili), "r", "utf-8") inputad=ct.adtextextractor(fili.read(), fili) words=ct.tokenizer(inputad) for item in words: if item in featuredict: featuredict[item] = featuredict[item]+1 print featuredict endtime=time.time() print "This took us {} minutes".format((endtime-starttime)/60)
def nonstandardcounter(filelist): """ The nonstandardcounter takes a list of files, then iterates over them. Splits according to the same rules as the matrixmachine. Checks status of each word in PyEnchant (en_US plus mydictwords.txt) and counts how many are "False". Counts theses, returns dictionary of counts per word. It outputs the results as a JSON w/ the file name including year, month, and day. """ count=0 filedict={} typodict=defaultdict(float) for fili in filelist: #print fili #print os.path.join(pathi, fili) inputfile=codecs.open(os.path.join(pathi, fili), "r", "utf-8").read() inputad=ct.adtextextractor(inputfile, fili) count=count+1 filedict[count]=os.path.join(pathi, fili) addspace=stopregex.sub(r"\g<1> \g<2>", inputad) addspace=re.sub("<.*?>", " ", addspace) splittext=nltk.word_tokenize(addspace) #splittext=[s for s in splittext if s not in exclude] splittextlo=[s for s in splittext if s] for word in [w for w in splittextlo if not spelldicti.check(w) and w not in list(punctuation)]: if word == "nofollow": print splittextlo typodict[word]=typodict[word]+1 return (typodict)
def dictbuilder(input_path, output_file): """ reads files in input_path input_path needs to have subfolders. if "remove_numbers", does not count numbers (as in "\d+"). This was used for IDing leetspeak. """ worddict = defaultdict(int) for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]: print pati for fil in [ i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".") ]: fili = codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") inputad = ct.adtextextractor(fili.read(), fil) inputad = inputad.lower() tokenized = ct.tokenizer(inputad) tokenized = [re.sub("\W", "", i) for i in tokenized] if remove_numbers: tokenized = [i for i in tokenized if not re.match("\d+", i)] for token in [i for i in tokenized if i]: worddict[token] = worddict[token] + 1 print("\n".join([ ":".join((k, unicode(worddict[k]))) for k in sorted(worddict, key=worddict.get, reverse=True) if worddict[k] > 50 ])) print "We created a dictionary of {} total words with {} types".format( sum(worddict.values()), len(worddict.keys())) if output_file: with codecs.open(output_file, "w", "utf-8") as outputfile: json.dump(worddict, outputfile) print "Dict written to ", outputfile
def dictmaker(folderlist, threshold=1000): #this is our general vocab vocab={} #collecting words for folder in folderlist: filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith(".")] print "Building vocab: we have {} files in folder {}".format(len(filis), folder) #collect a dictionary with all words #lowercase them for fili in filis: inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() inputtext=ct.adtextextractor(inputfile, fili) splittext=nltk.word_tokenize(inputtext) splittextlo=[i.lower() for i in splittext] #do we want to lemmatize or things like that for word in splittextlo: if word not in vocab: vocab[word]=1 else: vocab[word]=vocab[word]+1 print "Our vocab dictionary has {} entries".format(len(vocab)) #here we set the threshold featuredict= {key:value for key, value in vocab.items() if value > float(threshold) } print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict)) return featuredict
def collofinder(main_term,regex): for pati in [i for i in os.listdir(dir) if not i.startswith(".")]: """ this looks over the keys of a dictionary that are regex patterns. it outputs findings in the corpus given in "dir" with context. dir needs to have subfolders. the twodict counts words with a distance of 2, the onedict counts words with a distance of 1. """ print pati for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]: fili=codecs.open(os.path.join(dir, pati, fil), "r", "utf-8") inputad=ct.adtextextractor(fili.read(), fili) inputad=tagregex.sub(" ", inputad) words=ct.tokenizer(inputad) words=[w.lower() for w in words] #specific words processing for numbers: introduce space between number immediately followed by word-character hits=[w for w in words if regex.match(w) ] #determines length of context extracted context=[-3,-2,-1,0, 1,2, 3] for matched in hits: if [i for i in context if words.index(matched) + i > len(words) -1 ] and search_term in words: print "too long" print [words[words.index(matched)+t] for t in [c for c in context if c <1 ]] elif hits and not [i for i in context if words.index(matched) + i > len(words) -1 ] and search_term in [words[words.index(matched)+t] for t in [-1,1]] : print fil print [words[words.index(matched)+t] for t in context]
def wordcounter(folder, list_of_clusters): with codecs.open( '/Users/ps22344/Downloads/chapter2/current/clusterskmeans_54_19_10_07_30.json', 'r', 'utf-8') as jsoninput: wordtovecclusters = json.load(jsoninput) wordtovecclusters = { int(k): set(v['words']) for k, v in wordtovecclusters.items() if int(k) in list_of_clusters } for key in wordtovecclusters: start = time.time() print key wordcount = {i: 0 for i in wordtovecclusters[key]} filis = folderreader(folder) print "we have {} files to work with".format(len(filis)) for fili in filis: with codecs.open(fili, "r", "utf-8") as inputfile: inputad = ct.adtextextractor(inputfile.read(), fili) addspace = stopregex.sub(r"\g<1> \g<2>", inputad) splittext = nltk.word_tokenize(addspace) splittextlo = [s.lower() for s in splittext if s] splittextlo = [ s for s in splittextlo if not s in cluster_stopwords ] if "wan" in splittextlo: print splittextlo print inputad for w in wordcount.keys(): wordcount[w] = wordcount[w] + splittextlo.count(w) end = time.time() print "Aha this took us {} minutes".format((end - start) / 60) print "\n", key print[(k, wordcount[k]) for k in sorted(wordcount, key=wordcount.get, reverse=True)]
def dictmaker(folderlist, threshold, remove_stopwords=True, remove_punct=True): """ The dictmaker counts the words / items contained in the files found in the folders of folderlist. It returns a dictionary of all words that occur more often than the number threshold. remove_stopwords used the stopword list defined above to ignore words. remove_punct works with string.punctuation, cf above. """ #threshold sets how many times a word needs to occur to be included in the featuredict vocab = {} for folder in folderlist: filis = [ i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".") ] print "Building vocab: we have {} files in folder {}".format( len(filis), folder) #collect a dictionary with all words #lowercase them for fili in filis: inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() inputtext = ct.adtextextractor(inputfile, fili) splittext = nltk.word_tokenize(inputtext) splittextlo = [i.lower() for i in splittext] #do we want to lemmatize or things like that for word in splittextlo: if word not in vocab: vocab[word] = 1 else: vocab[word] = vocab[word] + 1 print "Our vocab dictionary has {} entries".format(len(vocab)) ct.dictwriter( os.path.join("~/", chapterdir[0], "outputfiles", "fulldict_" + time.strftime("%H_%M_%m_%d")), vocab) if remove_stopwords: vocab = { key: value for key, value in vocab.items() if key not in stopwords } print "After stop word removal, dict is {} long".format(len(vocab)) if remove_punct: vocab = { key: value for key, value in vocab.items() if key not in punctuation } print "After punctuation removal, dict is {} long".format(len(vocab)) featuredict = { key: value for key, value in vocab.items() if value > float(threshold) } print "Our feature dictionary has {} entries\n---------------\n".format( len(featuredict)) print "This is our featuredict", featuredict ct.dictwriter( os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_" + time.strftime("%H_%M_%m_%d")), featuredict) return 
featuredict
def clippingcounter(clipping_list, input_dir): """ The clipping uses the clipping_list to count instances of the clippings listed in there. Here, we make that list out of the shorteningdict jsons created earlier. The regex is designed to find lowercase and uppercase versions of each, plus plurals. The input_dir contains the text files to be iterated over. It returns a list of match counts. e.g. clipping_list=['LOL', 'ROFL', 'ASL', 'BRB'] result=[0,0,2,0] """ excludelist = [] #dicts to store results dicti = defaultdict(float) matchesdicti = defaultdict(list) results = [] clipping_list = [ re.compile("[^web|i]\W(" + i + ")\W") if i in ["cams?", "sites?"] else re.compile("\W(" + i + ")\W") for i in clipping_list ] #clipping_list=[re.compile("\W("+i+")\W") for i in clipping_list] clipping_list = set(clipping_list) print[i.pattern for i in clipping_list] #iterate and match for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]: print dir for fili in [ i for i in os.listdir(os.path.join(input_dir, dir)) if not i.startswith(".") ]: with codecs.open(os.path.join(input_dir, dir, fili), "r", "utf-8") as inputtext: inputad = ct.adtextextractor(inputtext.read(), fili).lower() #result is a list of lists which contain matches for each regex/acronym result = [([m for m in i.findall(inputad) if not m in excludelist], i.pattern) for i in clipping_list] # o=[(r,os.path.join(input_dir, dir, fili)) for r in result if len(r[0]) > 2] # if o: # print o results.append([len(matches) for matches, pattern in result]) for matches, pattern in result: #the dicti is {pattern:count, pattern: count, ...} dicti[pattern] = dicti[pattern] + len(matches) matchesdicti[pattern] = matchesdicti[pattern] + matches print "\n".join([ ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i])))) for i in sorted(dicti, key=dicti.get, reverse=True) ]) for entry in {k: v for k, v in matchesdicti.items() if v > 10}: print entry tk.tokenfinder([re.sub("[\(\)]", "", entry)], 
"/Users/ps22344/Downloads/craig_0208") return results
def vec2wordclustercounter(folderlist, cluster_dictionary): """ This is stolen from the cluster_analysis dictmaker. The dictmaker counts the words / items contained in the files found in the folders of folderlist. remove_stopwords uses the stopword list defined above to ignore words. remove_punct works with string.punctuation, cf above. This was mainly used to test how well the counting in the word2vec analysis works. """ with codecs.open(cluster_dictionary, "r", "utf-8") as inputjson: clusterdict = json.load(inputjson) result = defaultdict(int) #this is just for qc misses = [] for folder in folderlist: filis = [ i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".") ] print "Building vocab: we have {} files in folder {}".format( len(filis), folder) for fili in filis: inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() inputtext = ct.adtextextractor(inputfile, fili) #pre-processing here inputtext = ct.adcleaner(inputtext, replace_linebreak=True, remove_html=False) splittext = word_tokenize(inputtext) splittextlo = [i.lower() for i in splittext] finaltext = [punctuationregex.sub("", i) for i in splittextlo] finaltext = [i for i in finaltext if i and i not in ['br']] #do we want to lemmatize or things like that for word in finaltext: cluster = [ k for k, v in clusterdict.items() if word in v['words'] ] if len(cluster) > 1: print "Warning: The item {} was found in more than one clusters".format( word) if len(cluster) < 1: #print "Warning: The item could not be found in a cluster" misses.append(word) else: result[cluster[0]] = result[cluster[0]] + 1 print "Our vocab dictionary has {} entries".format(len(result)) ct.dictwriter( os.path.join("~/", chapterdir[0], "outputfiles", "fulldict_" + time.strftime("%H_%M_%m_%d")), result) # featuredict= {key:value for key, value in vocab.items() if value > float(threshold) } # print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict)) # print "This is 
our featuredict", featuredict # ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_"+time.strftime("%H_%M_%m_%d")), featuredict) print "misses", len(misses), set(misses) print result return result
def acronymcounter(acronym_list, input_dir): """ The acronymcounter uses the acronym_list to count instances of the abbreviations listed in there. Here, we make that list out of the shorteningdict jsons created earlier. The regex is designed to find lowercase and uppercase versions of each, plus plurals. The input_dir contains the text files to be iterated over. It returns a list of match counts. e.g. acronym_list=['LOL', 'ROFL', 'ASL', 'BRB'] result=[0,0,2,0] NOTE:we can consider running location and schools over a different regex that does not include plural s. """ excludelist = set([ "oks", "fbs", "PSS", "VAS", "vas", "BCS", "bcs", "NES", "nes", "SMS", "sms", "SAS", "SSS", "sss", "nsas", "mias" ]) #dicts to store results dicti = defaultdict(float) matchesdicti = defaultdict(list) results = [] #regex, lower and pluralize acronym_list = [ re.compile("\W((?:" + i + "|" + i.lower() + ")[sS]?)\W") for i in acronym_list ] acronym_list = set(acronym_list) print[i.pattern for i in acronym_list] #iterate and match for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]: print dir for fili in [ i for i in os.listdir(os.path.join(input_dir, dir)) if not i.startswith(".") ]: with codecs.open(os.path.join(input_dir, dir, fili), "r", "utf-8") as inputtext: inputad = ct.adtextextractor(inputtext.read(), fili) #result is a list of lists which contain matches for each regex/acronym result = [([m for m in i.findall(inputad) if not m in excludelist], i.pattern) for i in acronym_list] results.append([len(matches) for matches, pattern in result]) for matches, pattern in result: #the dicti is {pattern:count, pattern: count, ...} dicti[pattern] = dicti[pattern] + len(matches) matchesdicti[pattern] = matchesdicti[pattern] + matches print "\n".join([ ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i])))) for i in sorted(dicti, key=dicti.get, reverse=True) ]) return results
def acronymfinder(dir, length, output_json): """ This finds acronyms. Dir is directory of files. Length is length of desired acronym. """ start = time.time() capitals = re.compile("^[A-Z]+$") featuredict = defaultdict(int) # { #'lol':0 # } for pati in [i for i in os.listdir(dir) if not i.startswith(".")]: print "working on", pati for fili in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]: fili = codecs.open(os.path.join(dir, pati, fili), "r", "utf-8") inputad = ct.adtextextractor(fili.read(), fili) words = [w.rstrip(string.punctuation).lstrip(string.punctuation) for w in ct.tokenizer(inputad)] for item in words: if (capitals.match(item)) and (len(item) == length): if not spell.spellchecker(item.lower()): featuredict[item] = featuredict[item] + 1 print sorted(featuredict.keys()) print "SO many entries: ", len(featuredict) # sorted(d.items(), key=lambda x: x[1]) # [":".join((i, str(y))) for i, y in sorted(featuredict, key=featuredict.get)] print "\n".join( [":".join((i, str(featuredict[i]))) for i in sorted(featuredict, key=featuredict.get, reverse=True)] ) mid = time.time() print "this took us {} minutes".format((mid - start) / 60) if output_json: with codecs.open("output_acronyms" + str(length) + "letters.json", "w", "utf-8") as outputi: json.dump(featuredict, outputi) else: for entry in sorted(featuredict): if featuredict[entry] > 5: print "\n\n\n***", entry, "\n\n" tk.tokenfinder( [r"\s" + entry + "\s"], input_path="/Users/ps22344/Downloads/craig_0208/", length=20, lower_case=False, ) end = time.time() print "this took us {} minutes".format((end - start) / 60)
def capsfinder(input_dir, input_dict): results = [] #dicti is results by word/item dicti = defaultdict(float) #matchesdicti is results by Regexpattern matchesdicti = defaultdict(list) search_terms = [i for i in input_dict.keys()] print "search terms", [i.pattern for i in search_terms] for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]: print dir for fili in [ i for i in os.listdir(os.path.join(input_dir, dir)) if not i.startswith(".") ]: with codecs.open(os.path.join(input_dir, dir, fili), "r", "utf-8") as inputtext: inputad = ct.adtextextractor(inputtext.read(), fili) #we exclude anything we have in our abbreviations dict #no, we cover this by subtracting the results later result = [ ([t for t in i.findall(inputad) if not t in abbreviations], i.pattern) for i in search_terms ] #print result if len(result) > 1: print "warning result > 1", len(result), result #this is the count we returs results.append([len(matches) for matches, pattern in result]) #here we inspect findings. note resultS vs result for matches, pattern in result: if len(matches) > 100: print "matches", len(matches), os.path.join( input_dir, dir, fili) #the dicti is {pattern:count, pattern: count, ...} for res in matches: dicti[res] = dicti[res] + 1 #print len(matches[0]), 'total', len(matches) #matchesdicti collects the matches per regex, dicti per feature matchesdicti[pattern] = matchesdicti[pattern] + matches print "\n".join([ ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i])))) for i in sorted(dicti, key=dicti.get, reverse=True) ]) # for entry in {k:v for k,v in matchesdicti.items()}: # print "\n", entry, matchesdicti[entry] # for entry in dicti: # print entry, dicti[entry] return results
def tokenfinder(input_list, input_path, length=40, lower_case=True): """ the tokenfinder looks over the items in an input_list. the regex pattern is ".{,40}ITEM.{,40}". it outputs findings in the corpus given in "dir". filename and total number of matches are printed. dir needs to have subfolders. """ starttime = time.time() allhits = [] print "search term is ", input_list #construct the regexes typedict = {} for item in input_list: typedict[re.compile(r".{," + str(length) + "}" + unicode(item) + ".{," + str(length) + "}")] = 0 for typi in typedict: print "***", typi.pattern totalhits = [] #iterate over files for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]: #print pati for fil in [ i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".") ]: fili = codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") inputad = ct.adtextextractor(fili.read(), fili) if lower_case: #print "were lowercasing" inputad = inputad.lower() matches = [k.findall(inputad) for k in typedict.keys()] if sum([len(i) for i in matches]) > 0: print "{} hits in file {}".format( sum([len(i) for i in matches]), os.path.join(input_path, pati, fil)) print matches, "\n" totalhits.append(sum([len(i) for i in matches])) allhits.append(matches) if sum(totalhits) == 0: print "\n---\nNO MATCHES IN TOKENFINDER\n---\n" else: print "{} matches total".format(sum(totalhits)) endtime = time.time() return allhits
def vec2wordclustercounter(folderlist, cluster_dictionary):
    """
    This is stolen from the cluster_analysis dictmaker.
    The dictmaker counts the words / items contained in the files found in the folders of folderlist.
    remove_stopwords uses the stopword list defined above to ignore words.
    remove_punct works with string.punctuation, cf above.
    This was mainly used to test how well the counting in the word2vec analysis works.
    """
    # cluster_dictionary is a JSON file mapping cluster id -> {'words': [...]}
    with codecs.open(cluster_dictionary, "r", "utf-8") as inputjson:
        clusterdict=json.load(inputjson)
    # result maps cluster id -> token count
    result=defaultdict(int)
    # this is just for qc: tokens not found in any cluster
    misses=[]
    for folder in folderlist:
        # skip hidden files; pathi is a module-level base path
        filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith(".")]
        print "Building vocab: we have {} files in folder {}".format(len(filis), folder)
        for fili in filis:
            # NOTE(review): this handle is never closed
            inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
            inputtext=ct.adtextextractor(inputfile, fili)
            # pre-processing here
            inputtext=ct.adcleaner(inputtext ,replace_linebreak=True, remove_html=False)
            splittext=word_tokenize(inputtext)
            splittextlo=[i.lower() for i in splittext]
            # strip punctuation, drop empties and leftover '<br>' artifacts
            finaltext=[punctuationregex.sub("",i) for i in splittextlo]
            finaltext=[i for i in finaltext if i and i not in ['br']]
            #do we want to lemmatize or things like that
            for word in finaltext:
                # linear scan over all clusters for every token;
                # NOTE(review): O(tokens * clusters) -- slow on big corpora
                cluster= [k for k,v in clusterdict.items() if word in v['words']]
                if len(cluster) > 1:
                    print "Warning: The item {} was found in more than one clusters".format(word)
                if len(cluster) < 1:
                    #print "Warning: The item could not be found in a cluster"
                    misses.append(word)
                else:
                    result[cluster[0]]=result[cluster[0]]+1
    print "Our vocab dictionary has {} entries".format(len(result))
    ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "fulldict_"+time.strftime("%H_%M_%m_%d")), result)
    # featuredict= {key:value for key, value in vocab.items() if value > float(threshold) }
    # print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict))
    # print "This is our featuredict", featuredict
    # ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_"+time.strftime("%H_%M_%m_%d")), featuredict)
    print "misses", len(misses), set(misses)
    print result
    return result
def rebusfinder(input_path, word_dictionary, number_dictionary, excluded_words): """ This finds words that are represented as numbers. All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified. The lists exclude_pre and exclude_post word for negative contexts in 4. It print the results and give type and token counts. """ #with codecs.open(word_dictionary, "r", "utf-8") as worddictionary: # worddictionary=json.load(worddictionary) #worddictionary={k:v for k,v in worddictionary.items() if not k in excluded_words and worddictionary[k] > 1} for number in number_dictionary.keys(): numberregex=re.compile("\W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W") #just for now h0dict=defaultdict(int) h2dict=defaultdict(int) print numberregex.pattern for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]: for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]: fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") inputad=ct.adtextextractor(fili.read(), fil) inputad=inputad.lower() hits=numberregex.findall(inputad) #this weeds out all the phonenumbers. 
hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict] for h in hits: if h[0] in include_pre_context or h[2] in include_post_context: print h h0dict[h[0]]=h0dict[h[0]]+1 h2dict[h[2]]=h2dict[h[2]]+1 elif h[0] not in exclude_pre_context and h[2] not in exclude_post_context: if h[2]:#:=="days": print h h0dict[h[0]]=h0dict[h[0]]+1 h2dict[h[2]]=h2dict[h[2]]+1 print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values())) h0dict={k:v for k,v in h0dict.items() if v > 0} print "\n\n", number, "\n\posttext here be the results\n\n" #print "\n".join([": ".join([k, unicode(h0dict[k])]) for k in sorted(h0dict, key=h0dict.get, reverse=True)]) print "\n".join([": ".join([k, unicode(h2dict[k])]) for k in sorted(h2dict, key=h2dict.get, reverse=True)]) print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values())) print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
def acronymfinder(dir, length, output_json):
    """
    This finds acronyms.
    Dir is directory of files.
    Length is length of desired acronym.
    """
    start=time.time()
    # candidate acronyms must be all-capitals
    capitals=re.compile("^[A-Z]+$")
    featuredict=defaultdict(int)
    #{
    #'lol':0
    #}
    for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
        print "working on", pati
        for fili in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
            # NOTE(review): loop variable is rebound to a file handle
            # that is never closed
            fili=codecs.open(os.path.join(dir, pati, fili), "r", "utf-8")
            inputad=ct.adtextextractor(fili.read(), fili)
            # strip leading/trailing punctuation from each token
            words=[w.rstrip(string.punctuation).lstrip(string.punctuation) for w in ct.tokenizer(inputad)]
            for item in words:
                # all-caps token of the requested length whose lowercase
                # form fails the spell check
                if (capitals.match(item)) and (len(item) == length):
                    if not spell.spellchecker(item.lower()):
                        featuredict[item] = featuredict[item]+1
    print sorted(featuredict.keys())
    print "SO many entries: ", len(featuredict)
    #sorted(d.items(), key=lambda x: x[1])
    #[":".join((i, str(y))) for i, y in sorted(featuredict, key=featuredict.get)]
    print "\n".join([":".join((i, str(featuredict[i]))) for i in sorted(featuredict, key=featuredict.get, reverse=True)])
    mid=time.time()
    print "this took us {} minutes".format((mid-start)/60)
    if output_json:
        # dump counts keyed by acronym
        with codecs.open("output_acronyms"+str(length)+"letters.json", "w", "utf-8") as outputi:
            json.dump(featuredict, outputi)
    else:
        # re-examine frequent candidates in context
        for entry in sorted(featuredict):
            if featuredict[entry] > 5:
                print "\n\n\n***",entry,"\n\n"
                tk.tokenfinder([r"\s"+entry+"\s"], input_path='/Users/ps22344/Downloads/craig_0208/', length=20, lower_case=False)
    end=time.time()
    print "this took us {} minutes".format((end-start)/60)
def dictmaker(folderlist, threshold, remove_stopwords=True, remove_punct=True): """ The dictmaker counts the words / items contained in the files found in the folders of folderlist. It returns a dictionary of all words that occur more often than the number threshold. remove_stopwords used the stopword list defined above to ignore words. remove_punct works with string.punctuation, cf above. """ #threshold sets how many times a word needs to occur to be included in the featuredict vocab={} for folder in folderlist: filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith(".")] print "Building vocab: we have {} files in folder {}".format(len(filis), folder) #collect a dictionary with all words #lowercase them for fili in filis: inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() inputtext=ct.adtextextractor(inputfile, fili) #pre-processing here inputtext=adcleaner(inputtext ,replace_linebreak=True, remove_html=False) splittext=word_tokenize(inputtext) splittextlo=[i.lower() for i in splittext] finaltext=[punctuationregex.sub("",i) for i in splittextlo] finaltext=[i for i in finaltext if i and i not in ['br']] #do we want to lemmatize or things like that for word in finaltext: if word not in vocab: vocab[word]=1 else: vocab[word]=vocab[word]+1 print "Our vocab dictionary has {} entries".format(len(vocab)) ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "fulldict_"+time.strftime("%H_%M_%m_%d")), vocab) if remove_stopwords: vocab= {key:value for key, value in vocab.items() if key not in stopwords } print "After stop word removal, dict is {} long".format(len(vocab)) if remove_punct: vocab= {key:value for key, value in vocab.items() if key not in punctuation } print "After punctuation removal, dict is {} long".format(len(vocab)) featuredict= {key:value for key, value in vocab.items() if value > float(threshold) } print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict)) print "This 
is our featuredict", featuredict ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_"+time.strftime("%H_%M_%m_%d")), featuredict) return featuredict
def categorymachine(folderlist): print "starting category machine" catdicti={} catnumber=0 for folder in folderlist: filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith (".")] for fili in filis: inputfile=codecs.open(os.path.join(pathi, folder,fili), "r", "utf-8").read() inputtext=ct.adtextextractor(inputfile, fili) # lets establish the category # we need to make it numeric, so the numpy won't screw up category=ct.tagextractor(inputfile, "category1", fili) try: cat=catdicti[category] except: print "We added {} to the category dictionary, coded as {}".format(category, catnumber) catdicti[ct.tagextractor(inputfile, "category1", fili)]=catnumber catnumber=catnumber+1 cat=catdicti[ct.tagextractor(inputfile, "category1", fili)] return (catdicti, catnumber)
def collofinder(main_term, regex): for pati in [i for i in os.listdir(dir) if not i.startswith(".")]: """ this looks over the keys of a dictionary that are regex patterns. it outputs findings in the corpus given in "dir" with context. dir needs to have subfolders. the twodict counts words with a distance of 2, the onedict counts words with a distance of 1. """ print pati for fil in [ i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".") ]: fili = codecs.open(os.path.join(dir, pati, fil), "r", "utf-8") inputad = ct.adtextextractor(fili.read(), fili) inputad = tagregex.sub(" ", inputad) words = ct.tokenizer(inputad) words = [w.lower() for w in words] #specific words processing for numbers: introduce space between number immediately followed by word-character hits = [w for w in words if regex.match(w)] #determines length of context extracted context = [-3, -2, -1, 0, 1, 2, 3] for matched in hits: if [ i for i in context if words.index(matched) + i > len(words) - 1 ] and search_term in words: print "too long" print[ words[words.index(matched) + t] for t in [c for c in context if c < 1] ] elif hits and not [ i for i in context if words.index(matched) + i > len(words) - 1 ] and search_term in [ words[words.index(matched) + t] for t in [-1, 1] ]: print fil print[words[words.index(matched) + t] for t in context]
def wordcounter(input_dir, category_tag, category_dict): """ counts the words per category in the files in input_dir. Parameters ---------- input_dir is the corpus directoty category_tag is the name of the tag to be extracted with tagextractor. category_dict is a dictionary of categories to be computed over (category names as keys) e.g. <location="X"> would be input with "location" as the category_tag and a dict with {"Austin":0, "Dallas":0, ...} Returns ------- something """ print "Running the wordcounter" resultdict = category_dict for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]: print pati for fili in [ i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".") ]: with codecs.open(os.path.join(input_dir, pati, fili), "r", "utf-8") as inputfili: inputfili = inputfili.read() wordcount = len( ct.tokenizer(ct.adtextextractor(inputfili, fili), remove_punctuation=True)) category = ct.tagextractor(inputfili, category_tag, fili) if category in resultdict: resultdict[category] = resultdict[category] + wordcount else: print "\n\nWARNING:\n{} is not in the category_dict. What do we do now?\n\n".format( category) print "Wordcounter done" with codecs.open("wordcounter_" + category_tag + ".json", "w", "utf-8") as jsonout: json.dump(resultdict, jsonout)
def dictbuilder(input_dir, output_name, lowercase=False, print_dict=False): """ The dictbuilder puts all words in the corpus (input_dir) into a dictionary and outputs as json. Name of output file determined by output_name. If print_dict is set to True, prints our sorted dictionary. Format of the dict returned: {word:count, word:count, } """ dicti = defaultdict(float) for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]: print dir for fili in [ i for i in os.listdir(os.path.join(input_dir, dir)) if not i.startswith(".") ]: with codecs.open(os.path.join(input_dir, dir, fili), "r", "utf-8") as inputtext: inputad = ct.adtextextractor(inputtext.read(), fili) inputad = [ w.rstrip(string.punctuation).lstrip(string.punctuation) for w in ct.tokenizer(inputad) ] inputad = [w for w in inputad if w] if lowercase: for word in inputad: dicti[word.lower()] = dicti[word.lower()] + 1 else: for word in inputad: dicti[word] = dicti[word] + 1 if print_dict: print "\n".join([ ":".join((i, str(dicti[i]))) for i in sorted(dicti, key=dicti.get, reverse=True) ]) with codecs.open(output_name + ".json", "w", "utf-8") as outputi: json.dump(dicti, outputi, encoding="utf8") print "Written dictionary with {} items to ".format( len(dicti)), output_name return dicti
def spellingcounter(input_dir): """ The spellingcounter counts the number of mis-spelled words. It uses the PyEnchange library for spellchecking. It iterates over the files in input_dir. It returns a lists of lists with (raw count, relative count) tuples. """ start=time.time() americandict = enchant.Dict("en_US") goodwords=set(["wo", "'ve", "'m", "n't", "'s", "'ll", "'re", "'d", "non-"]+list(string.punctuation)) htmlregex=re.compile("<.*?>") results=[] for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]: print pati for fili in [i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".")]: #print fili result=[] fili=codecs.open(os.path.join(input_dir, pati, fili), "r", "utf-8") inputad=ct.adtextextractor(fili.read(), fili) inputad=htmlregex.sub(" ", inputad) words=ct.tokenizer(inputad) #print "\n\n\n", words wordcount=float(len(words)) mistakes=[w for w in words if not americandict.check(w) and w not in goodwords] #print mistakes if wordcount-len(mistakes) < 0: print "WARNING: negative count-mistakes", wordcount, len(correct), os.path.join(input_dir, pati, fili) results.append([(len(mistakes), len(mistakes)/wordcount)]) #print "\n".join([":".join([i, str(dict[i])]) for i in sorted(dict, key=dict.get, reverse=True)]) end=time.time() print "len results", len(results) print "this took us {} minutes".format((end-start)/60) print "shape of results, number of lists:", len(results), "-- length of lists", set([len(i) for i in results]) #for u in [[x[1] for x in i] for i in results]: # print u #print [[x[0] for x in i] for i in results], [[x[1] for x in i] for i in results] return [[x[0] for x in i] for i in results], [[x[1] for x in i] for i in results]
def charactercounter(input_dir, input_dict):
    """
    Count regex matches per character group over the corpus in input_dir.

    input_dict maps group names to lists of patterns; each list is OR-joined
    into one compiled regex. Returns `results`, one list of per-group match
    counts for every file.
    """
    results=[]
    # dicti: {matched string: count}; matchesdicti: {regex pattern: [matches]}
    dicti=defaultdict(float)
    matchesdicti=defaultdict(list)
    #search_terms=set([t for i in input_dict.values() for t in i])
    search_terms=[re.compile("|".join(i)) for i in input_dict.values()]
    print "search terms", [i.pattern for i in search_terms]
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [i for i in os.listdir(os.path.join(input_dir, dir)) if not i.startswith(".")]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r", "utf-8") as inputtext:
                inputad=ct.adtextextractor(inputtext.read(), fili)
                #result is a list of lists which contain matches for each regex/acronym
                #the list incomprehension just deletes empty search results from the "|" search
                result=[([t for m in i.findall(inputad) for t in m if t], i.pattern) for i in search_terms]
                #print result
                results.append([len(matches) for matches, pattern in result])
                for matches, pattern in result:
                    if len(matches) > 0:
                        print "multiple matches", matches, os.path.join(input_dir, dir, fili)
                        #if len(matches) > 0:
                        #print len(matches)
                        #the dicti is {pattern:count, pattern: count, ...}
                        # NOTE(review): matches[0] is a STRING, so this loop
                        # counts its individual characters -- looks like it was
                        # meant to iterate `matches`; confirm intent
                        for res in matches[0]:
                            dicti[res]=dicti[res]+1
                        #print len(matches[0]), 'total', len(matches)
                        #print inputad[inputad.index(matches[0])-20:inputad.index(matches[0])+20]
                    #matchesdicti collects the matches per regex, dicti per feature
                    matchesdicti[pattern]=matchesdicti[pattern]+matches
    #print "\n".join([":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i])))) for i in sorted(dicti, key=dicti.get, reverse=True)])
    for entry in {k:v for k,v in matchesdicti.items()}:
        print "\n", entry, matchesdicti[entry]
    for entry in dicti:
        print entry, dicti[entry]
    # NOTE(review): this calls tokenfinder with the SAME constant arguments on
    # every iteration (entry is unused), and passes a `length` kwarg that the
    # sibling tokenfinder in this file does not accept -- confirm against tk
    for entry in matchesdicti:
        tk.tokenfinder(["(.{,20})(?<![A-Z] [A-Z]|Ave| MA)\s+(N)\s+(?!Houston|Ballard|word|Royaton|Wilmot|Tucson|Dallas|Warren|side|Avalon|St Pete|Scottsdale|Tampa|C[Oo][Uu][Nn][Tt][Yy]|[Rr][Oo][Ll][Ll]|Arl\.|Royaltown|Golden Isles|Oeleans|Ballard Rd|Broward|Ward|angola|Oracle|[Hubert|1st] Ave|European|Tryon|Hill\w+ |Wil\w+|[Ss][Uu][Bb][Jj][Ee][Cc][Tt]|state line|for now|with a dick|OT |of (\s+Dayton|Talla\w+)|THE INSIDE|THE SURROUNDING|TIME|AUGHTY|[A-Z] [A-Z] |& 5th)(.{,20})"], "/home/ps22344/Downloads/craig_0208", length= 50, lower_case=False)
    return results
def characterfinder(input_dir, input_dict): results = [] dicti = defaultdict(float) matchesdicti = defaultdict(list) for entry in input_dict: print entry characterlist = set([re.compile(" " + i + " ") for i in input_dict.keys()]) print[i.pattern for i in characterlist] for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]: print dir for fili in [ i for i in os.listdir(os.path.join(input_dir, dir)) if not i.startswith(".") ]: with codecs.open(os.path.join(input_dir, dir, fili), "r", "utf-8") as inputtext: inputad = ct.adtextextractor(inputtext.read(), fili) #result is a list of lists which contain matches for each regex/acronym result = [([m for m in i.findall(inputad) if not m in excludelist], i.pattern) for i in characterlist] results.append([len(matches) for matches, pattern in result]) for matches, pattern in result: #the dicti is {pattern:count, pattern: count, ...} dicti[pattern] = dicti[pattern] + len(matches) matchesdicti[pattern] = matchesdicti[pattern] + matches print "\n".join([ ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i])))) for i in sorted(dicti, key=dicti.get, reverse=True) ]) for entry in {k: v for k, v in matchesdicti.items() if v > 10}: print entry tk.tokenfinder([re.sub("[\(\)]", "", entry)], "/Users/ps22344/Downloads/craig_0208", lower_case=False) return results
def rebusfinder_too(input_path):
    """
    The rebus_too finder. It uses a list of expressions, pre-established thru
    "identifying_rebus_too_1022.py", to count instances where a writer uses
    "2" instead of "too".

    input_path is the corpus directory (needs to have subfolders). Prints every
    hit plus frequency tables of the pre- and post-context words.
    """
    # frequency of words seen immediately before / after a matched "2"
    predict=defaultdict(int)
    postdict=defaultdict(int)
    for number in [2]:
        results=[]
        #this is the regular expression to identify instances of the number studied
        # groups: (pre word)(optional punct)(the number)(optional punct)(post word)
        numberregex=re.compile("\W([a-z]+)\s*("+punctuationregex+")?\s*("+unicode(number)+")(?:\s+)?("+punctuationregex+")?(?:\s+)?([a-z]+)\W")
        print numberregex.pattern
        #dicts to store statistics about context of number
        h0dict=defaultdict(int)
        h2dict=defaultdict(int)
        #lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns=[]
        results=[]
        for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
            for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
                # NOTE(review): file handle is never closed
                fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
                inputad=ct.adtextextractor(fili.read(), fil)
                inputad=ct.adcleaner(inputad, replace_linebreak=True)
                inputad=inputad.lower()
                hits=numberregex.findall(inputad)
                #this weeds out all the phonenumbers.
                hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
                for h in hits:
                    #this is needed for instance where there is no punctuation
                    h=[" " if i == "" else i for i in h]
                    """
                    thus [(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]
                    pre, "2", optional punctuation, post
                    """
                    # NOTE(review): this rebinds the loop variable `number` to a
                    # (token, tag) tuple for the rest of the iteration
                    [pre, pre_punct, number, punct, post]=pos_tag(h)
                    # hand-tuned catch list: specific (pre, post) word pairs
                    # first, then broader nets via prewords/postwords
                    if (
                    #unique items catcher
                    (pre[0] in ["date"])
                    or
                    (pre[0] in ["it"] and post[0] in ["i"])
                    or
                    (pre[0] in ["cook"] and post[0] in ["im"])
                    or
                    (pre[0] in ["kids"] and post[0] in ["young"])
                    or
                    (pre[0] in ["life", "way"] and post[0] in ["short"])
                    or
                    (pre[0] in ["that"] and post[0] in ["hard"])
                    or
                    (pre[0] in ["real"] and post[0] in ["hope"])
                    or
                    (pre[0] in ["me"] and post[0] in ["if"])
                    or
                    (pre[0] in ["dogs"] and post[0] in ["if"])
                    or
                    (pre[0] in ["can"] and post[0] in ["but"])
                    or
                    (pre[0] in ["kool"] and not post[0] in ["even"])
                    or
                    (pre[0] in ["on"] and punct[0] not in [" "] and inputad.split()[inputad.split().index(pre[0])-1] == "later")# and (h[h.index(pre[0])] == "later"))
                    or
                    (pre[0] in ["love"] and punct[0] not in [" "] and post[0] in ["msg"])
                    or
                    (pre[0] in ["real"] and post[0] in ["have"])
                    or
                    #BIGGER NETS
                    #you be too in front of punctuation catch
                    (pre[0] in ["be", "b", "are", "r"] and punct[0] not in [" ", "-", ")"])
                    or
                    #this is if we know the pre-word and 2 is followed by punctuation
                    # cf 'intellectualy ability 2. '
                    (pre[0] in prewords_withpunct and punct[0] not in [" ", ")", ":"])
                    or
                    #this is if we know the word to follow
                    # cf 'not 2 late.' collected in postwords
                    (post[0] in postwords)
                    or
                    #this is if we know the word to precede
                    (pre[0] in prewords)
                    ):
                        print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
                        results.append((pre, number, punct, post, os.path.join(input_path, pati, fil)))
                        predict[pre[0]]=predict[pre[0]]+1
                        postdict[post[0]]=postdict[post[0]]+1
        print "original result list is", len(results)
        seti=set(results)
        print "\n\n", seti
        print "the set is ", len(seti)
        # how often each unique hit occurred
        overlap={k:results.count(k) for k in seti}
        print overlap
        print {k:overlap[k] for k in overlap if overlap[k] > 1}
        print "PRE CONTEXT"
        print "\n".join([": ".join([k, unicode(predict[k])]) for k in sorted(predict, key=predict.get, reverse=True)])
        print "POST CONTEXT"
        print "\n".join([": ".join([k, unicode(postdict[k])]) for k in sorted(postdict, key=postdict.get, reverse=True)])
""" if replace_linebreak: text=linebreakregex.sub(".", text) text=stopregex.sub(r"\g<1> \g<2>", text) if remove_html: text=htmlregex.sub(" ", text) return text for folder in folderlist: filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith(".")] print "Building vocab: we have {} files in folder {}".format(len(filis), folder) #collect a dictionary with all words #lowercase them for fili in filis[:10]: inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() inputtext=ct.adtextextractor(inputfile, fili) print "\n\n\npre",inputtext #pre-processing here inputtext=adcleaner(inputtext ,replace_linebreak=True, remove_html=False) splittext=word_tokenize(inputtext) splittextlo=[i.lower() for i in splittext] print "\n past", splittextlo finaltext=[punctuationregex.sub("",i) for i in splittextlo] finaltext=[i for i in finaltext if i and i not in ['br']] print finaltext # def sentencefeeder(text): # sents=sent_tokenize(text)
def matrixmachine(folderlist, featuredict, testmode, *args):
    """
    The matrixmachine creates matrices of word frequencies.
    It returns
    wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering.
    wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings.
    catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details.
    filedict, a dictionary that maps file names to rows in the matrix. For later comparison of clusterings.
    It takes
    The folderlist is a collection of folders to iterate over.
    The featuredict is a dictionary containing the words to count.
    If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. (Note that the testmode comes all the way from main())
    The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. Args will be added to the matrix_with_cat.
    """
    print "Starting the matrixmachine"
    print "external categories: ", len(args)
    print args
    #the plus one in here is for the file id
    wordmatrix=np.empty(shape=(1,(len(featuredict)+len(args)+1)))
    print "Matrix initial shape: ", np.shape(wordmatrix)
    # making a dictionary for the categories
    # we need the zero cause the machine returns 2 items
    count=0
    catdicti=categorymachine(folderlist)[0]
    filedict={}
    # flatten the cluster dict to {cluster: set(words)}, minus cluster_stopwords
    featuredict={k:featuredict[k]['words'] for k in featuredict.keys()}
    featuredict={k:set([i for i in featuredict[k] if not i in cluster_stopwords]) for k in featuredict.keys()}
    for folder in folderlist:
        filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
        if testmode:
            print "\n\nRUNNING\nIN\nTEST\nMODE\n"
            filis=filis[:200]
        print "Building matrices: we have {} files in folder {}".format(len(filis), folder)
        for fili in filis:
            # NOTE(review): file handle is never closed
            inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
            inputad=ct.adtextextractor(inputfile, fili)
            #establish category
            # NOTE(review): with several args only the LAST external category
            # survives in `cat` -- confirm this is intended
            for external_cat in args:
                cat=catdicti[ct.tagextractor(inputfile, external_cat, fili)]
            count=count+1
            filedict[count]=os.path.join(pathi, folder, fili)
            addspace=stopregex.sub(r"\g<1> \g<2>", inputad)
            splittext=nltk.word_tokenize(addspace)
            splittext=[s for s in splittext if s not in exclude]
            splittextlo=[s.lower() for s in splittext if s and not excluderegex.match(s)]
            wordcount=float(len(splittextlo))
            #for each word2vec cluster: cluster/total words
            # this is a per word frequency
            #for t in featuredict:
            #print "\n", t#featuredict[t]
            #print [splittextlo.count(i) for i in featuredict[t]]
            #if sum ([splittextlo.count(i) for i in set(featuredict[t])]) > 10:
            #    print [i for i in splittextlo if i in featuredict[t]]
            #addict={k:[i for i in v] for k,v in featuredict.items()}
            # addict: {cluster: summed relative frequency of its words}
            addict={k:sum([float(splittextlo.count(i))for i in v]) for k,v in featuredict.items()}
            addict={k:v/wordcount for k,v in addict.items()}
            #print addict
            # row layout: [category, file id, cluster frequencies...]
            # NOTE(review): relies on addict.values() having a stable order
            # across files -- holds in CPython 2 for identically-built dicts,
            # but is implementation-defined; confirm
            wordvector=np.array([float(cat)]+[float(count)]+addict.values())
            #we append it to the matrix
            wordmatrix=np.append(wordmatrix, [wordvector], axis=0)
    print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype)
    print "---------------\nEnd of public service announcements\n\n"
    #"In 2D, the first dimension corresponds to rows, the second to columns."
    # we don't look at the first row cause that was just for initialization
    # the one without cats we put into the clustering algorithm
    wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],len(args)+1:wordmatrix.shape[1]]
    print "without", np.shape(wordmatrix_without_cat)
    wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],]
    print "with", np.shape(wordmatrix_with_cat)
    return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
dir = '/Users/ps22344/Downloads/craig_0208/' #adfiles_output_0116' #check if we find items starttime = time.time() for pati in [i for i in os.listdir(dir) if not i.startswith(".")]: """ this looks over the keys of a dictionary that are regex patterns. it outputs findings in the corpus given in "dir" with context. dir needs to have subfolders. the twodict counts words with a distance of 2, the onedict counts words with a distance of 1. """ print pati for fil in [ i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".") ]: fili = codecs.open(os.path.join(dir, pati, fil), "r", "utf-8") inputad = ct.adtextextractor(fili.read(), fili) inputad = inputad.lower() matches = [k.findall(inputad) for k in numbersdict.keys()] if sum([len(i) for i in matches]) > 0: print "hits", sum([len(i) for i in matches]), fil print matches print "our numbersdict", numbersdict endtime = time.time() print "This took us {} minutes".format((endtime - starttime) / 60)
def rebusfinder_too(input_path, number_dictionary):
    """
    This finds words that are represented as numbers.
    All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the
    number put in are identified.
    The lists exclude_pre and exclude_post word for negative contexts in 4.
    It prints the results and give type and token counts.

    input_path is the corpus directory (needs subfolders); number_dictionary's
    keys are the numbers to search for. Returns the `results` list built for
    the last number processed.
    """
    for number in number_dictionary.keys():
        #this is for comments to self
        print "PRE"
        #this is the regular expression to identify instances of the number studied
        # groups: (pre word)(optional punct)(the number)(optional punct)(post word)
        numberregex=re.compile("\W([a-z]+)\s*("+punctuationregex+")?\s*("+unicode(number)+")(?:\s+)?("+punctuationregex+")?(?:\s+)?([a-z]+)\W")
        print numberregex.pattern
        #dicts to store statistics about context of number
        h0dict=defaultdict(int)
        h2dict=defaultdict(int)
        #lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns=[]
        results=[]
        for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
            for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
                # NOTE(review): file handle is never closed
                fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
                inputad=ct.adtextextractor(fili.read(), fil)
                inputad=ct.adcleaner(inputad, replace_linebreak=True)
                inputad=inputad.lower()
                hits=numberregex.findall(inputad)
                #this weeds out all the phonenumbers.
                hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
                for h in hits:
                    #this is needed for instance where there is no punctuation
                    h=[" " if i == "" else i for i in h]
                    """
                    thus [(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]
                    pre, "2", optional punctuation, post
                    """
                    # NOTE(review): rebinds the loop variable `number` to a
                    # (token, tag) tuple for the rest of the iteration
                    [pre, pre_punct, number, punct, post]=pos_tag(h)
                    # current live filter: plural noun after the number, plain
                    # space before it; everything below the loop is parked code
                    if (post[1] in ["NNS"]) and (punct[0] in [" "]):
                        print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
                        search_pattern=[re.escape(i) for i in [pre[0],number[0], punct[0], post[0]]]
                        if search_pattern not in previous_patterns:
                            # NOTE(review): `dir` here is the module-level
                            # corpus path set elsewhere in this file -- confirm
                            tk.tokenfinder(["\s*".join(search_pattern)], dir)
                            previous_patterns.append(search_pattern)
                        else:
                            print "SEE TOKENFINDER RESULTS ABOVE\n"
        #error catching here
        #
        # for h in hits:
        #     if h[2]:#==".":
        #         print h, os.path.join(input_path, pati, fil)
        #         print pos_tag(h), "\n"
        #if not any (regex.match(h[2]) for regex in exclude_post_context) and not any (regex.match(h[0]) for regex in exclude_pre_context):
        #tagged=pos_tag(h), fil
        #print tagged
        #if h[2] not in [" "]:
        #    print tagged, os.path.join(input_path, pati, fil)
        #print inputad
        #h0dict[h[0]]=h0dict[h[0]]+1
        #h2dict[h[2]]=h2dict[h[2]]+1
        #h0dict[tagged[0][1]]=h0dict[tagged[0][1]]+1
        #h2dict[tagged[2][1]]=h2dict[tagged[2][1]]+1
        #taking out trash
        # if (
        # (tagged[0][1] in ["DT", "JJS", "TO", "PRP$"])
        # or
        # (tagged[0][1]=="IN" and h[0] not in ["out", "like"])
        # or
        # (tagged[0][1] in ["VBG"] and h[0] not in ["talking", "responding", "waiting", "getting","looking", "going", "trying"])
        # or
        # (tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"] and tagged[2][1] in ["JJ"])
        # or
        # #this is where we screw up
        # (tagged[2][1] in ["NNS"] and h[2] not in ["chat", "kiss", "go", "know", "find", "do", "c", "knees"])
        # or
        # (tagged[2][1]=="IN")
        # or
        # (tagged[2][1]=="CC" and h[2] not in ["but"])
        # or
        # #we don't need this if we are to just ignore whatever goes thru all of it
        # #TEMPTEMPTEMP
        # (h[0] in ["be", "other", "s", "type", "was", "work", "im", "baths", "you", "maybe", "big", "day", "o", "round", "ride", "avengers", "kids", "had", "number", "have", "like", "here", "size", "got", "are", "send", "only", "have", "go", "is", "bedroom", "but", "beautiful", "nice"])
        # or
        # (h[2] in ["face", "new", "faced", "wonderful", "must", "min", "short", "si", "br", "step", "start", "so", "out", "story", "bdrm", "other", "out", "story", "yr", "looking", "more", "but", "hrs", "bedroom"])
        # or
        # (tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])
        # ):
        #     #print "killed",tagged, "\n"
        #     pass
        #
        # #finding the good
        # elif (
        # (tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"])
        # or
        # (tagged[2][1] in ["JJ"] and h[0] in ["opposed"])
        # or
        # (tagged[2][1] in ["PRP"] and not nounregex.match(tagged[0][1]))
        # or
        # (h[0] == "have" and h[2] in ["browse", "force", "go", "send", "talk"])
        # or
        # (h[0] == "like" and h[2] not in ["furry", "cuz", "straight"])
        # or
        # (h[0] in ["here"] and nounregex.match(tagged[2][1]))
        # or
        # #really what we are exluding here is anything non-Verb or Noun
        # # we can consider replacing this with a regex
        # (h[0] in ["need", "me", "pics"] and tagged[2][1] not in ["JJ", "JJR", "MD"])
        # or
        # (h[0] in ["momma", "women", "delighted", "tryn", "respond", "travel", "veldkum", "happness", "pool", "lots", "bbw", "willin", "luvz", "place", "time", "married", "pixs", "boy", "pictures", "brickz", "somebody", "memphis", "cell", "fear", "hoop", "open", "goes", "afraid", "speak", "lady", "needs", "attracted", "doms", "bottom", "head", "apply", "drive", "pic", "newer", "pinned", "luvs", "sumbody", "face", "due", "tryin", "line", "has", "close", "interested", "alot", "oral", "talk", "new", "girl", "up", "scared", "willing", "cam", "loves", "c**k", "out", "u", "nice", "how", "free", "hard", "hope", "able", "someone", "man", "woman", "male", "down", "love", "luv", "ready", "want", "wants"]+["talking", "responding", "waiting", "getting","looking", "lookin", "going", "trying"])
        # or
        # (h[2] in ["survive", "brag", "blow", "grab", "feel", "send", "connect", "hearing", "say", "read", "contact", "please", "run", "host","kno", "talk", "just", "add", "text", "chill", "hang", "date", "find", "chat", "show", "u", "meet", "her", "hear", "me", "my", "b", "know", "play", "do", "suck", "go", "get", "f**k"])
        # ):
        #     #print "hooked the plusloop", tagged
        #     print tagged
        #     results.append(tagged)
        #     h0dict[h[0]]=h0dict[h[0]]+1
        #     h2dict[h[2]]=h2dict[h[2]]+1
        # else:
        #     pass
        #if tagged[2][1]:#=="VB":# in ["VBP", "VBG"]:#=="go":#:# in ['my']:#, 'know', 'my']:#["me", "need", "man"]:# == "down":#h[2] not in ["have", "and", "like", "hear"]:
        #    print tagged
        #print "elseloop", tagged
        # h0dict[h[0]]=h0dict[h[0]]+1
        # h2dict[h[2]]=h2dict[h[2]]+1
        #h0dict[tagged[0][1]]=h0dict[tagged[0][1]]+1
        #h2dict[tagged[2][1]]=h2dict[tagged[2][1]]+1
        # NOTE(review): h0dict/h2dict are only filled by the commented-out code
        # above, so these reports are currently always empty
        print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
        h0dict={k:v for k,v in h0dict.items() if v > 0}
        print "\n\n", number, "\npretext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
        print "\n\n", number, "\nposttext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])
        print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
    return results
return text for folder in folderlist: filis = [ i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".") ] print "Building vocab: we have {} files in folder {}".format( len(filis), folder) #collect a dictionary with all words #lowercase them for fili in filis[:10]: inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read() inputtext = ct.adtextextractor(inputfile, fili) print "\n\n\npre", inputtext #pre-processing here inputtext = adcleaner(inputtext, replace_linebreak=True, remove_html=False) splittext = word_tokenize(inputtext) splittextlo = [i.lower() for i in splittext] print "\n past", splittextlo finaltext = [punctuationregex.sub("", i) for i in splittextlo] finaltext = [i for i in finaltext if i and i not in ['br']] print finaltext # def sentencefeeder(text): # sents=sent_tokenize(text) # #print sents
def rebusfinder(input_path, word_dictionary, number_dictionary, excluded_words):
    """
    This finds the word "to" that represented as the number 2.
    All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
    The lists exclude_pre_context and exclude_post_context exclude instances where a word follows (post) or precedes (pre) the "2" per regex.
    Procedure:
    Eliminate all pre and post contexts;
    POS tag the remaining ones and eliminate certain combinations;
    Find positives by POS tag and a word list;
    Dismiss the remaining ones.
    It returns a list of positives.
    It print the results and give type and token counts.

    NOTE(review): word_dictionary and excluded_words are currently unused --
    the loading/filtering code below is commented out; presumably kept for a
    later step. Confirm before removing the parameters.
    """
    #with codecs.open(word_dictionary, "r", "utf-8") as worddictionary:
    #	worddictionary=json.load(worddictionary)
    #worddictionary={k:v for k,v in worddictionary.items() if not k in excluded_words and worddictionary[k] > 1}
    for number in number_dictionary.keys():
        # captures: (pre word) (the number) (post word), bounded by non-word chars
        numberregex=re.compile("\W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W")
        #just for now
        h0dict=defaultdict(int)   # token counts for pre-context words (h[0])
        h2dict=defaultdict(int)   # token counts for post-context words (h[2])
        print numberregex.pattern
        results=[]
        # corpus layout: input_path/<subfolder>/<file>; dot-entries skipped
        for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
            for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
                # NOTE(review): file handle is never closed explicitly
                fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
                inputad=ct.adtextextractor(fili.read(), fil)
                inputad=inputad.lower()
                hits=numberregex.findall(inputad)
                #this weeds out all the phonenumbers.
                hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
                for h in hits:
                    #print h
                    # module-level regex lists veto known-bad pre/post contexts
                    if not any (regex.match(h[2]) for regex in exclude_post_context) and not any (regex.match(h[0]) for regex in exclude_pre_context):
                        # POS-tag the (pre, number, post) triple as a mini-sentence
                        tagged=pos_tag(h)
                        #taking out trash -- hand-tuned negative filter
                        if (
                        (tagged[0][1] in ["DT", "JJS", "TO", "PRP$"]) or
                        (tagged[0][1]=="IN" and h[0] not in ["out", "like"]) or
                        (tagged[0][1] in ["VBG"] and h[0] not in ["talking", "responding", "waiting", "getting","looking", "going", "trying"]) or
                        (tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"] and tagged[2][1] in ["JJ"]) or
                        #this is where we screw up
                        (tagged[2][1] in ["NNS"] and h[2] not in ["chat", "kiss", "go", "know", "find", "do", "c", "knees"]) or
                        (tagged[2][1]=="IN") or
                        (tagged[2][1]=="CC" and h[2] not in ["but"]) or
                        #we don't need this if we are to just ignore whatever goes thru all of it
                        #TEMPTEMPTEMP
                        (h[0] in ["be", "other", "s", "type", "was", "work", "im", "baths", "you", "maybe", "big", "day", "o", "round", "ride", "avengers", "kids", "had", "number", "have", "like", "here", "size", "got", "are", "send", "only", "have", "go", "is", "bedroom", "but", "beautiful", "nice"]) or
                        (h[2] in ["face", "new", "faced", "wonderful", "must", "min", "short", "si", "br", "step", "start", "so", "out", "story", "bdrm", "other", "out", "story", "yr", "looking", "more", "but", "hrs", "bedroom"]) or
                        (tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])
                        ):
                            #print "killed",tagged, "\n"
                            pass
                        #finding the good -- hand-tuned positive filter
                        elif (
                        (tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"]) or
                        (tagged[2][1] in ["JJ"] and h[0] in ["opposed"]) or
                        (tagged[2][1] in ["PRP"] and not nounregex.match(tagged[0][1])) or
                        (h[0] == "have" and h[2] in ["browse", "force", "go", "send", "talk"]) or
                        (h[0] == "like" and h[2] not in ["furry", "cuz", "straight"]) or
                        (h[0] in ["here"] and nounregex.match(tagged[2][1])) or
                        #really what we are exluding here is anything non-Verb or Noun
                        # we can consider replacing this with a regex
                        (h[0] in ["need", "me", "pics"] and tagged[2][1] not in ["JJ", "JJR", "MD"]) or
                        (h[0] in ["momma", "women", "delighted", "tryn", "respond", "travel", "veldkum", "happness", "pool", "lots", "bbw", "willin", "luvz", "place", "time", "married", "pixs", "boy", "pictures", "brickz", "somebody", "memphis", "cell", "fear", "hoop", "open", "goes", "afraid", "speak", "lady", "needs", "attracted", "doms", "bottom", "head", "apply", "drive", "pic", "newer", "pinned", "luvs", "sumbody", "face", "due", "tryin", "line", "has", "close", "interested", "alot", "oral", "talk", "new", "girl", "up", "scared", "willing", "cam", "loves", "c**k", "out", "u", "nice", "how", "free", "hard", "hope", "able", "someone", "man", "woman", "male", "down", "love", "luv", "ready", "want", "wants"]+["talking", "responding", "waiting", "getting","looking", "lookin", "going", "trying"]) or
                        (h[2] in ["survive", "brag", "blow", "grab", "feel", "send", "connect", "hearing", "say", "read", "contact", "please", "run", "host","kno", "talk", "just", "add", "text", "chill", "hang", "date", "find", "chat", "show", "u", "meet", "her", "hear", "me", "my", "b", "know", "play", "do", "suck", "go", "get", "f**k"])
                        ):
                            #print "hooked the plusloop", tagged
                            print tagged
                            results.append(tagged)
                            h0dict[h[0]]=h0dict[h[0]]+1
                            h2dict[h[2]]=h2dict[h[2]]+1
                        else:
                            # neither clearly bad nor clearly good: silently dismissed
                            pass
                        # historical debugging alternatives, kept for reference:
                        #if tagged[2][1]:#=="VB":# in ["VBP", "VBG"]:#=="go":#:# in ['my']:#, 'know', 'my']:#["me", "need", "man"]:# == "down":#h[2] not in ["have", "and", "like", "hear"]:
                        #	print tagged
                        #print "elseloop", tagged
                        #	h0dict[h[0]]=h0dict[h[0]]+1
                        #	h2dict[h[2]]=h2dict[h[2]]+1
                        #h0dict[tagged[0][1]]=h0dict[tagged[0][1]]+1
                        #h2dict[tagged[2][1]]=h2dict[tagged[2][1]]+1
        print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
        h0dict={k:v for k,v in h0dict.items() if v > 0}
        print "\n\n", number, "\npretext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
        print "\n\n", number, "\nposttext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])
        print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
        # NOTE(review): returning inside the for-number loop means only the first
        # key of number_dictionary is ever processed -- confirm this is intended.
        return results
#check if we find items starttime=time.time() for pati in [i for i in os.listdir(dir) if not i.startswith(".")]: """ this looks over the keys of a dictionary that are regex patterns. it outputs findings in the corpus given in "dir" with context. dir needs to have subfolders. the twodict counts words with a distance of 2, the onedict counts words with a distance of 1. """ print pati for fili in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]: fili=codecs.open(os.path.join(dir, pati, fili), "r", "utf-8") inputad=ct.adtextextractor(fili.read(), fili) words=ct.tokenizer(inputad) words=[w.lower() for w in words] #specific words processing for numbers: introduce space between number immediately followed by word-character if [w for w in words if any(k.match(w) for k in numbersdict.keys())]: if words.index(w) not in [0, 1, len(words) -1, len(words)-2]: twodict[words[words.index(w)-2]]=twodict[words[words.index(w)-2]]+1 twodict[words[words.index(w)+2]]=twodict[words[words.index(w)+2]]+1 if words.index(w) not in [0, len(words)-1]: onedict[words[words.index(w)-1]]=onedict[words[words.index(w)-1]]+1 onedict[words[words.index(w)+1]]=onedict[words[words.index(w)+1]]+1 outifile.write("\n".join([" ".join([words[words.index(w)-2], words[words.index(w)-1],w, words[words.index(w)+1], words[words.index(w)+2]]) for w in words if any(k.match(w) for k in numbersdict.keys()) and words.index(w) not in [0, 1, len(words)-1, len(words)-2]])) else: pass
def matrixmachine(folderlist, featuredict, testmode, *args):
    """
    The matrixmachine creates matrices of word frequencies.
    It returns
    wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering.
    wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings.
    catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details.
    filedict, a dictionary that maps file names to rows in the matrix. For later comparison of clusterings.
    It takes
    The folderlist is a collection of folders to iterate over.
    The featuredict is a dictionary containing the words to count.
    If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way.
    (Note that the testmode comes all the way from main())
    The args are a number of external categories, each defined in the categorydicti created by categorymachine().
    Here, usually a gender category. Args will be added to the matrix_with_cat.
    """
    print "Starting the matrixmachine"
    print "external categories: ", len(args)
    print args
    #the plus one in here is for the file id
    wordmatrix = np.empty(shape=(1, (len(featuredict) + len(args) + 1)))
    print "Matrix initial shape: ", np.shape(wordmatrix)
    # making a dictionary for the categories
    # we need the zero cause the machine returns 2 items
    count = 0
    catdicti = categorymachine(folderlist)
    filedict = {}
    # reduce each featuredict entry to its 'words' set, minus cluster stop words
    featuredict = {k: featuredict[k]['words'] for k in featuredict.keys()}
    featuredict = {
        k: set([i for i in featuredict[k] if not i in cluster_stopwords])
        for k in featuredict.keys()
    }
    for folder in folderlist:
        # pathi is a module-level corpus root; dot-files are skipped
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        if testmode:
            print "\n\nRUNNING\nIN\nTEST\nMODE\n"
            filis = filis[:200]
        print "Building matrices: we have {} files in folder {}".format(
            len(filis), folder)
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                    "utf-8").read()
            inputad = ct.adtextextractor(inputfile, fili)
            #establish category
            # NOTE(review): if args is empty, `cat` is never assigned and the
            # wordvector line below raises NameError; if several args are given
            # only the last category survives. Confirm single-category use.
            for external_cat in args:
                cat = catdicti[ct.tagextractor(inputfile, external_cat, fili)]
            count = count + 1
            filedict[count] = os.path.join(pathi, folder, fili)
            splittext = ct.tokenizer(inputad)
            splittext = [s for s in splittext if s not in exclude]
            splittextlo = [
                s.lower() for s in splittext if s and not excluderegex.match(s)
            ]
            wordcount = float(len(splittextlo))
            #not controlling for cluster size
            #addict={k:sum([float(splittextlo.count(i))for i in v]) for k,v in featuredict.items()}
            #controlling for cluster size
            addict = {
                k: (sum([float(splittextlo.count(i)) for i in v])) / len(v)
                for k, v in featuredict.items()
            }
            # normalize by ad length
            addict = {k: v / wordcount for k, v in addict.items()}
            # row layout: [category, file id, feature values...]
            # NOTE(review): addict.values() order is arbitrary in a Python 2
            # dict (stable within one process, but not across runs) -- feature
            # columns may be ordered differently between runs; consider sorting
            # the keys if column identity matters downstream.
            wordvector = np.array([float(cat)] + [float(count)] + addict.values())
            #we append it to the matrix
            # NOTE(review): np.append copies the whole matrix per row (O(n^2));
            # collecting rows in a list and stacking once would be linear.
            wordmatrix = np.append(wordmatrix, [wordvector], axis=0)
    print "Features of word matrix: shape {}, dtype {}".format(
        np.shape(wordmatrix), wordmatrix.dtype)
    print "---------------\nEnd of public service announcements\n\n"
    #"In 2D, the first dimension corresponds to rows, the second to columns."
    # we don't look at the first row cause that was just for initialization
    # the one without cats we put into the clustering algorithm
    wordmatrix_without_cat = wordmatrix[1:wordmatrix.shape[0], len(args) + 1:wordmatrix.shape[1]]
    print "without", np.shape(wordmatrix_without_cat)
    wordmatrix_with_cat = wordmatrix[1:wordmatrix.shape[0], ]
    print "with", np.shape(wordmatrix_with_cat)
    return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
def rebusfinder(input_path, word_dictionary, number_dictionary, excluded_words): """ This finds words that are represented as numbers. All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified. The lists exclude_pre and exclude_post word for negative contexts in 4. It print the results and give type and token counts. """ #with codecs.open(word_dictionary, "r", "utf-8") as worddictionary: # worddictionary=json.load(worddictionary) #worddictionary={k:v for k,v in worddictionary.items() if not k in excluded_words and worddictionary[k] > 1} for number in number_dictionary.keys(): numberregex = re.compile("\W([a-z]+)\s+(" + unicode(number) + ")\s+([a-z]+)\W") #just for now h0dict = defaultdict(int) h2dict = defaultdict(int) print numberregex.pattern for pati in [ i for i in os.listdir(input_path) if not i.startswith(".") ]: for fil in [ i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".") ]: fili = codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") inputad = ct.adtextextractor(fili.read(), fil) inputad = inputad.lower() hits = numberregex.findall(inputad) #this weeds out all the phonenumbers. 
hits = [ h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict ] for h in hits: if h[0] in include_pre_context or h[ 2] in include_post_context: print h h0dict[h[0]] = h0dict[h[0]] + 1 h2dict[h[2]] = h2dict[h[2]] + 1 elif h[0] not in exclude_pre_context and h[ 2] not in exclude_post_context: if h[2]: #:=="days": print h h0dict[h[0]] = h0dict[h[0]] + 1 h2dict[h[2]] = h2dict[h[2]] + 1 print "We have {} items with a token count of {}".format( len(h0dict.keys()), sum(h0dict.values())) h0dict = {k: v for k, v in h0dict.items() if v > 0} print "\n\n", number, "\n\posttext here be the results\n\n" #print "\n".join([": ".join([k, unicode(h0dict[k])]) for k in sorted(h0dict, key=h0dict.get, reverse=True)]) print "\n".join([ ": ".join([k, unicode(h2dict[k])]) for k in sorted(h2dict, key=h2dict.get, reverse=True) ]) print "We have {} post items with a token count of {}".format( len(h2dict.keys()), sum(h2dict.values())) print "We have {} pre items with a token count of {}".format( len(h0dict.keys()), sum(h0dict.values()))
def rebusfinder_too(input_path, number_dictionary):
    """
    This finds words that are represented as numbers.
    All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
    The lists exclude_pre and exclude_post word for negative contexts in 4.
    It prints the results and give type and token counts.
    Returns the list of POS-tagged positive hits.
    """
    for number in number_dictionary.keys():
        #this is for comments to self
        print "PRE"
        #this is the regular expression to identify instances of the number studied
        # five groups: (pre word) (punct?) (number) (punct?) (post word)
        numberregex = re.compile("\W([a-z]+)\s*(" + punctuationregex +
                                 ")?\s*(" + unicode(number) + ")(?:\s+)?(" +
                                 punctuationregex + ")?(?:\s+)?([a-z]+)\W")
        print numberregex.pattern
        #dicts to store statistics about context of number
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        #lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns = []
        results = []
        for pati in [
                i for i in os.listdir(input_path) if not i.startswith(".")
        ]:
            for fil in [
                    i for i in os.listdir(os.path.join(input_path, pati))
                    if not i.startswith(".")
            ]:
                fili = codecs.open(os.path.join(input_path, pati, fil), "r",
                                   "utf-8")
                inputad = ct.adtextextractor(fili.read(), fil)
                inputad = ct.adcleaner(inputad, replace_linebreak=True)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                #this weeds out all the phonenumbers.
                # NOTE(review): with the five-group pattern above, h[2] is the
                # matched NUMBER itself and the post word is h[4]; all of the
                # h[2]-based tests below look inherited from the three-group
                # rebusfinder version -- verify which index is intended.
                hits = [
                    h for h in hits if h[0] not in writtennumberdict
                    and h[2] not in writtennumberdict
                ]
                for h in hits:
                    # NOTE(review): this comprehension maps "" to "" -- a no-op.
                    h = ["" if i == "" else i for i in h]
                    #print "h in hits", h
                    if not any(
                            regex.match(h[2])
                            for regex in exclude_post_context) and not any(
                                regex.match(h[0])
                                for regex in exclude_pre_context):
                        tagged = pos_tag(h)
                        #print tagged
                        #if h[2] not in [" "]:
                        #print tagged, os.path.join(input_path, pati, fil)
                        #print inputad
                        # unconditional tally: every surviving hit is counted by
                        # word AND by POS tag (words and tags share these dicts)
                        h0dict[h[0]] = h0dict[h[0]] + 1
                        h2dict[h[2]] = h2dict[h[2]] + 1
                        h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                        h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1
                        #taking out trash -- hand-tuned negative filter
                        if ((tagged[0][1] in ["DT", "JJS", "TO", "PRP$"]) or
                            (tagged[0][1] == "IN"
                             and h[0] not in ["out", "like"]) or
                            (tagged[0][1] in ["VBG"] and h[0] not in [
                                "talking", "responding", "waiting", "getting",
                                "looking", "going", "trying"
                            ]) or
                            (tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"]
                             and tagged[2][1] in ["JJ"]) or
                            #this is where we screw up
                            (tagged[2][1] in ["NNS"] and h[2] not in [
                                "chat", "kiss", "go", "know", "find", "do",
                                "c", "knees"
                            ]) or (tagged[2][1] == "IN") or
                            (tagged[2][1] == "CC" and h[2] not in ["but"]) or
                            # #we don't need this if we are to just ignore whatever goes thru all of it
                            # #TEMPTEMPTEMP
                            (h[0] in [
                                "be", "other", "s", "type", "was", "work",
                                "im", "baths", "you", "maybe", "big", "day",
                                "o", "round", "ride", "avengers", "kids",
                                "had", "number", "have", "like", "here",
                                "size", "got", "are", "send", "only", "have",
                                "go", "is", "bedroom", "but", "beautiful",
                                "nice"
                            ]) or (h[2] in [
                                "face", "new", "faced", "wonderful", "must",
                                "min", "short", "si", "br", "step", "start",
                                "so", "out", "story", "bdrm", "other", "out",
                                "story", "yr", "looking", "more", "but",
                                "hrs", "bedroom"
                            ]) or
                            (tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])):
                            #print "killed",tagged, "\n"
                            pass
                        # #finding the good -- hand-tuned positive filter
                        elif ((tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"])
                              or (tagged[2][1] in ["JJ"]
                                  and h[0] in ["opposed"]) or
                              (tagged[2][1] in ["PRP"]
                               and not nounregex.match(tagged[0][1])) or
                              (h[0] == "have" and h[2] in
                               ["browse", "force", "go", "send", "talk"]) or
                              (h[0] == "like"
                               and h[2] not in ["furry", "cuz", "straight"])
                              or (h[0] in ["here"]
                                  and nounregex.match(tagged[2][1])) or
                              #really what we are exluding here is anything non-Verb or Noun
                              # # we can consider replacing this with a regex
                              (h[0] in ["need", "me", "pics"]
                               and tagged[2][1] not in ["JJ", "JJR", "MD"]) or
                              (h[0] in [
                                  "momma", "women", "delighted", "tryn",
                                  "respond", "travel", "veldkum", "happness",
                                  "pool", "lots", "bbw", "willin", "luvz",
                                  "place", "time", "married", "pixs", "boy",
                                  "pictures", "brickz", "somebody", "memphis",
                                  "cell", "fear", "hoop", "open", "goes",
                                  "afraid", "speak", "lady", "needs",
                                  "attracted", "doms", "bottom", "head",
                                  "apply", "drive", "pic", "newer", "pinned",
                                  "luvs", "sumbody", "face", "due", "tryin",
                                  "line", "has", "close", "interested",
                                  "alot", "oral", "talk", "new", "girl", "up",
                                  "scared", "willing", "cam", "loves", "c**k",
                                  "out", "u", "nice", "how", "free", "hard",
                                  "hope", "able", "someone", "man", "woman",
                                  "male", "down", "love", "luv", "ready",
                                  "want", "wants"
                              ] + [
                                  "talking", "responding", "waiting",
                                  "getting", "looking", "lookin", "going",
                                  "trying"
                              ]) or (h[2] in [
                                  "survive", "brag", "blow", "grab", "feel",
                                  "send", "connect", "hearing", "say", "read",
                                  "contact", "please", "run", "host", "kno",
                                  "talk", "just", "add", "text", "chill",
                                  "hang", "date", "find", "chat", "show",
                                  "u", "meet", "her", "hear", "me", "my",
                                  "b", "know", "play", "do", "suck", "go",
                                  "get", "f**k"
                              ])):
                            print "hooked the plusloop", tagged
                            #print tagged
                            results.append(tagged)
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                        else:
                            pass
                        # NOTE(review): a POS tag string is virtually always
                        # truthy, so this block re-counts every surviving hit a
                        # second (for positives, third) time -- confirm intended.
                        if tagged[2][
                                1]:  #=="VB":# in ["VBP", "VBG"]:#=="go":#:# in ['my']:#, 'know', 'my']:#["me", "need", "man"]:# == "down":#h[2] not in ["have", "and", "like", "hear"]:
                            #print tagged
                            #print "elseloop", tagged
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                            h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                            h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1
        print "We have {} items with a token count of {}".format(
            len(h0dict.keys()), sum(h0dict.values()))
        h0dict = {k: v for k, v in h0dict.items() if v > 0}
        print "\n\n", number, "\npretext here be the results\n\n"
        print "\n".join([
            ": ".join([
                k,
                unicode(h0dict[k]), ".".join(
                    word2vecwordfinder([
                        k
                    ], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'
                                       ))
            ]) for k in sorted(h0dict, key=h0dict.get, reverse=True)
        ])
        print "\n\n", number, "\nposttext here be the results\n\n"
        print "\n".join([
            ": ".join([
                k,
                unicode(h2dict[k]), ".".join(
                    word2vecwordfinder([
                        k
                    ], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'
                                       ))
            ]) for k in sorted(h2dict, key=h2dict.get, reverse=True)
        ])
        print "We have {} post items with a token count of {}".format(
            len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(
            len(h0dict.keys()), sum(h0dict.values()))
        # NOTE(review): returning inside the for-number loop means only the
        # first key of number_dictionary is ever processed -- confirm intended.
        return results
def prosodycounter(input_dir):
    """
    Returns a list of lists where each list contains raw and per word counts.

    Concretely: a tuple of two parallel lists of lists -- per-file raw match
    counts and per-file per-word-normalized counts, one inner list per corpus
    file, one entry per prosody pattern.
    """
    start = time.time()
    #creating the search terms -- interjections / prosodic markers as regexes
    prosodyitems = [
        "\s(\*(?:laugh|cough|smack|giggle)\*)\s", "\W([Ee][Rr])\W",
        "\W((?:[Hh][Aa]){1,}[Hh]?)\W", "\W((?:[Hh][Uu]){1,}[Hh]?)\W",
        "\W((?:[Hh][Ee]){2,}[Hh]?)\W", "\W([Hh][Oo]{2,})\W",
        "\W([Hh][Mm]{1,})\W", "\W([Hh]e+y{2,})\W", "\W([Hh]e{2,}[Yy]+)\W",
        "\W" + anyoftheseregex("[Hh]+[Ee]+[Ll][Ll]+[Oo]+") + "\W",
        "\W([Mm]{2,}[Hh]?)\W", "\W((?:[Mm][Hh]){1,})\W", "\W([Ss][Oo]{2,})\W",
        "\W([Uu][Hh]+)\W", "\W([Uu][Mm]+)\W", "\W([Yy][Aa]+[Yy]+)\W",
        "\W([Yy]+[Aa]+[Hh]?)\W"
    ]
    # matches to ignore; currently empty
    excludelist = []
    #dicts to store results
    dicti = defaultdict(float)        # pattern -> total match count over corpus
    matchesdicti = defaultdict(list)  # pattern -> all matched strings
    results = []
    prosody_list = [re.compile(i) for i in prosodyitems]
    print "{} items in the prosody_list, {} unique".format(
        len(prosody_list), len(set(prosody_list)))
    print[i.pattern for i in prosody_list]
    #iterate and match
    # NOTE(review): loop variable `dir` shadows the builtin
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, dir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili).lower()
                #result is a list of lists which contain matches for each regex/acronym
                wordcount = float(len(ct.tokenizer(inputad)))
                result = [([
                    m for m in i.findall(inputad) if not m in excludelist
                ], i.pattern) for i in prosody_list]
                #print result
                # one (raw, normalized) pair per pattern for this file
                results.append([(len(matches), len(matches) / wordcount)
                                for matches, pattern in result])
                for matches, pattern in result:
                    #print pattern
                    #the dicti is {pattern:count, pattern: count, ...}
                    dicti[pattern] = dicti[pattern] + len(matches)
                    matchesdicti[pattern] = matchesdicti[pattern] + matches
    # summary: pattern : total count : distinct matched strings
    print "\n".join([
        ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i]))))
        for i in sorted(dicti, key=dicti.get, reverse=True)
    ])
    end = time.time()
    print "This took us {} minutes".format((end - start) / 60)
    # for u in [[x[0] for x in i] for i in results]:
    #	print u
    print "shape of results, number of lists:", len(
        results), "-- length of lists", set([len(i) for i in results])
    # split the (raw, normalized) pairs into two parallel lists of lists
    return [[x[0] for x in i] for i in results], [[x[1] for x in i]
                                                  for i in results]