# NB: this section assumes the module-level imports (codecs, json, os, re, time,
# collections.defaultdict, nltk's word_tokenize and pos_tag) and the globals
# pathi, chapterdir, punctuationregex, plus the project's own ct/tk helper
# modules, all defined earlier in this script.

def vec2wordclustercounter(folderlist, cluster_dictionary):
    """
    This is stolen from the cluster_analysis dictmaker.
    The dictmaker counts the words/items contained in the files found in the folders of folderlist.
    remove_stopwords uses the stopword list defined above to ignore words.
    remove_punct works with string.punctuation, cf. above.
    This was mainly used to test how well the counting in the word2vec analysis works.
    """
    with codecs.open(cluster_dictionary, "r", "utf-8") as inputjson:
        clusterdict = json.load(inputjson)
    result = defaultdict(int)
    # this is just for qc
    misses = []
    for folder in folderlist:
        filis = [i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
        print "Building vocab: we have {} files in folder {}".format(len(filis), folder)
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
            inputtext = ct.adtextextractor(inputfile, fili)
            # pre-processing here
            inputtext = ct.adcleaner(inputtext, replace_linebreak=True, remove_html=False)
            splittext = word_tokenize(inputtext)
            splittextlo = [i.lower() for i in splittext]
            finaltext = [punctuationregex.sub("", i) for i in splittextlo]
            finaltext = [i for i in finaltext if i and i not in ['br']]
            # do we want to lemmatize or things like that?
            for word in finaltext:
                cluster = [k for k, v in clusterdict.items() if word in v['words']]
                if len(cluster) > 1:
                    print "Warning: The item {} was found in more than one cluster".format(word)
                if len(cluster) < 1:
                    #print "Warning: The item could not be found in a cluster"
                    misses.append(word)
                else:
                    result[cluster[0]] = result[cluster[0]] + 1
    print "Our vocab dictionary has {} entries".format(len(result))
    ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "fulldict_" + time.strftime("%H_%M_%m_%d")), result)
    #featuredict = {key: value for key, value in vocab.items() if value > float(threshold)}
    #print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict))
    #print "This is our featuredict", featuredict
    #ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_" + time.strftime("%H_%M_%m_%d")), featuredict)
    print "misses", len(misses), set(misses)
    print result
    return result
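
# --- usage sketch -----------------------------------------------------------
# A hypothetical invocation of vec2wordclustercounter. The folder names and the
# JSON filename below are placeholders; folders are resolved against the global
# pathi, and the dictionary shape {cluster_name: {"words": [...]}} is inferred
# from the v['words'] lookup above.
def _demo_vec2wordclustercounter():
    # e.g. clusters_74_19_45_07_31.json (assumed shape):
    #   {"cluster_0": {"words": ["hi", "hello", "hey"]},
    #    "cluster_1": {"words": ["msg", "text", "email"]}}
    counts = vec2wordclustercounter(["ads_jan", "ads_feb"], "clusters_74_19_45_07_31.json")
    # counts maps cluster name -> summed token count over all files
    print counts
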
def rebusfinder_too(input_path):
    """
    The rebus_too finder.
    It uses a list of expressions, pre-established through "identifying_rebus_too_1022.py",
    to count instances where a writer uses "2" instead of "too".
    """
    predict = defaultdict(int)
    postdict = defaultdict(int)
    for number in [2]:
        # this is the regular expression to identify instances of the number studied
        numberregex = re.compile("\W([a-z]+)\s*(" + punctuationregex + ")?\s*(" + unicode(number) + ")(?:\s+)?(" + punctuationregex + ")?(?:\s+)?([a-z]+)\W")
        print numberregex.pattern
        # dicts to store statistics about the context of the number (unused in this version)
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        # lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns = []
        results = []
        for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
            for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
                fili = codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
                inputad = ct.adtextextractor(fili.read(), fil)
                inputad = ct.adcleaner(inputad, replace_linebreak=True)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                # this weeds out all the phone numbers
                hits = [h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
                for h in hits:
                    # this is needed for instances where there is no punctuation
                    h = [" " if i == "" else i for i in h]
                    # a tagged hit looks like [(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]:
                    # pre, "2", optional punctuation, post
                    # NB: the unpack below rebinds the loop variable 'number' with the tagged token
                    [pre, pre_punct, number, punct, post] = pos_tag(h)
                    if (
                        # unique items catcher
                        (pre[0] in ["date"])
                        or (pre[0] in ["it"] and post[0] in ["i"])
                        or (pre[0] in ["cook"] and post[0] in ["im"])
                        or (pre[0] in ["kids"] and post[0] in ["young"])
                        or (pre[0] in ["life", "way"] and post[0] in ["short"])
                        or (pre[0] in ["that"] and post[0] in ["hard"])
                        or (pre[0] in ["real"] and post[0] in ["hope"])
                        or (pre[0] in ["me"] and post[0] in ["if"])
                        or (pre[0] in ["dogs"] and post[0] in ["if"])
                        or (pre[0] in ["can"] and post[0] in ["but"])
                        or (pre[0] in ["kool"] and post[0] not in ["even"])
                        or (pre[0] in ["on"] and punct[0] not in [" "] and inputad.split()[inputad.split().index(pre[0]) - 1] == "later")  # and (h[h.index(pre[0])] == "later"))
                        or (pre[0] in ["love"] and punct[0] not in [" "] and post[0] in ["msg"])
                        or (pre[0] in ["real"] and post[0] in ["have"])
                        # BIGGER NETS
                        # "you be too" in front of punctuation catch
                        or (pre[0] in ["be", "b", "are", "r"] and punct[0] not in [" ", "-", ")"])
                        # this is if we know the pre-word and 2 is followed by punctuation,
                        # cf. 'intellectualy ability 2. '
                        or (pre[0] in prewords_withpunct and punct[0] not in [" ", ")", ":"])
                        # this is if we know the word to follow, cf. 'not 2 late.' collected in postwords
                        or (post[0] in postwords)
                        # this is if we know the word to precede
                        or (pre[0] in prewords)
                    ):
                        print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
                        results.append((pre, number, punct, post, os.path.join(input_path, pati, fil)))
                        predict[pre[0]] = predict[pre[0]] + 1
                        postdict[post[0]] = postdict[post[0]] + 1
        print "original result list is", len(results)
        seti = set(results)
        print "\n\n", seti
        print "the set is", len(seti)
        overlap = {k: results.count(k) for k in seti}
        print overlap
        print {k: overlap[k] for k in overlap if overlap[k] > 1}
        print "PRE CONTEXT"
        print "\n".join([": ".join([k, unicode(predict[k])]) for k in sorted(predict, key=predict.get, reverse=True)])
        print "POST CONTEXT"
        print "\n".join([": ".join([k, unicode(postdict[k])]) for k in sorted(postdict, key=postdict.get, reverse=True)])
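
# --- pattern sketch ----------------------------------------------------------
# What the five capture groups of numberregex above yield on a toy ad, assuming
# punctuationregex is a plain pattern string like "[.,!?]" (its real definition
# lives elsewhere in this script). Unmatched optional groups come back as ""
# from findall(), which is why the loop above swaps "" for " " before tagging.
def _demo_numberregex():
    demoregex = re.compile("\W([a-z]+)\s*([.,!?])?\s*(2)(?:\s+)?([.,!?])?(?:\s+)?([a-z]+)\W")
    print demoregex.findall(" hope 2 hear from you 2. have fun ")
    # -> [('hope', '', '2', '', 'hear'), ('you', '', '2', '.', 'have')]
    #     i.e. (pre, pre-punctuation, number, punctuation, post)
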
def rebusfinder_too(input_path, number_dictionary):
    """
    This finds words that are represented as numbers.
    All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
    The lists exclude_pre_context and exclude_post_context work as negative contexts in 4.
    It prints the results and gives type and token counts.
    """
    for number in number_dictionary.keys():
        # this is for comments to self
        print "PRE"
        # this is the regular expression to identify instances of the number studied.
        # NB: the body below indexes each hit as (pre, number, post) -- h[0], h[1], h[2] --
        # so the three-group pattern documented in the docstring is used here; the
        # punctuation-aware five-group variant belongs to the other versions of
        # rebusfinder_too in this file.
        numberregex = re.compile("\W([a-z]+)\s+(" + unicode(number) + ")\s+([a-z]+)\W")
        print numberregex.pattern
        # dicts to store statistics about the context of the number
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        # lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns = []
        results = []
        for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
            for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
                fili = codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
                inputad = ct.adtextextractor(fili.read(), fil)
                inputad = ct.adcleaner(inputad, replace_linebreak=True)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                # this weeds out all the phone numbers
                hits = [h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
                for h in hits:
                    # (the "" -> " " padding used in the other versions is unnecessary here:
                    # all three groups are required)
                    if not any(regex.match(h[2]) for regex in exclude_post_context) and not any(regex.match(h[0]) for regex in exclude_pre_context):
                        tagged = pos_tag(h)
                        h0dict[h[0]] = h0dict[h[0]] + 1
                        h2dict[h[2]] = h2dict[h[2]] + 1
                        h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                        h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1
                        # taking out trash
                        if (
                            (tagged[0][1] in ["DT", "JJS", "TO", "PRP$"])
                            or (tagged[0][1] == "IN" and h[0] not in ["out", "like"])
                            or (tagged[0][1] in ["VBG"] and h[0] not in ["talking", "responding", "waiting", "getting", "looking", "going", "trying"])
                            or (tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"] and tagged[2][1] in ["JJ"])
                            # this is where we screw up
                            or (tagged[2][1] in ["NNS"] and h[2] not in ["chat", "kiss", "go", "know", "find", "do", "c", "knees"])
                            or (tagged[2][1] == "IN")
                            or (tagged[2][1] == "CC" and h[2] not in ["but"])
                            # we don't need this if we are to just ignore whatever goes thru all of it
                            # TEMPTEMPTEMP
                            or (h[0] in ["be", "other", "s", "type", "was", "work", "im", "baths", "you", "maybe", "big", "day", "o", "round", "ride", "avengers", "kids", "had", "number", "have", "like", "here", "size", "got", "are", "send", "only", "have", "go", "is", "bedroom", "but", "beautiful", "nice"])
                            or (h[2] in ["face", "new", "faced", "wonderful", "must", "min", "short", "si", "br", "step", "start", "so", "out", "story", "bdrm", "other", "out", "story", "yr", "looking", "more", "but", "hrs", "bedroom"])
                            or (tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])
                        ):
                            #print "killed", tagged, "\n"
                            pass
                        # finding the good
                        elif (
                            (tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"])
                            or (tagged[2][1] in ["JJ"] and h[0] in ["opposed"])
                            or (tagged[2][1] in ["PRP"] and not nounregex.match(tagged[0][1]))
                            or (h[0] == "have" and h[2] in ["browse", "force", "go", "send", "talk"])
                            or (h[0] == "like" and h[2] not in ["furry", "cuz", "straight"])
                            or (h[0] in ["here"] and nounregex.match(tagged[2][1]))
                            # really what we are excluding here is anything non-verb or non-noun;
                            # we can consider replacing this with a regex
                            or (h[0] in ["need", "me", "pics"] and tagged[2][1] not in ["JJ", "JJR", "MD"])
                            or (h[0] in ["momma", "women", "delighted", "tryn", "respond", "travel", "veldkum", "happness", "pool", "lots", "bbw", "willin", "luvz", "place", "time", "married", "pixs", "boy", "pictures", "brickz", "somebody", "memphis", "cell", "fear", "hoop", "open", "goes", "afraid", "speak", "lady", "needs", "attracted", "doms", "bottom", "head", "apply", "drive", "pic", "newer", "pinned", "luvs", "sumbody", "face", "due", "tryin", "line", "has", "close", "interested", "alot", "oral", "talk", "new", "girl", "up", "scared", "willing", "cam", "loves", "c**k", "out", "u", "nice", "how", "free", "hard", "hope", "able", "someone", "man", "woman", "male", "down", "love", "luv", "ready", "want", "wants"] + ["talking", "responding", "waiting", "getting", "looking", "lookin", "going", "trying"])
                            or (h[2] in ["survive", "brag", "blow", "grab", "feel", "send", "connect", "hearing", "say", "read", "contact", "please", "run", "host", "kno", "talk", "just", "add", "text", "chill", "hang", "date", "find", "chat", "show", "u", "meet", "her", "hear", "me", "my", "b", "know", "play", "do", "suck", "go", "get", "f**k"])
                        ):
                            print "hooked the plusloop", tagged
                            results.append(tagged)
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                        else:
                            pass
                        if tagged[2][1]:  # dev toggles tried here: =="VB", in ["VBP", "VBG"], =="go", in ['my'], ...
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                            h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                            h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1
        print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
        h0dict = {k: v for k, v in h0dict.items() if v > 0}
        print "\n\n", number, "\npretext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
        print "\n\n", number, "\nposttext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])
        print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
    return results
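
# --- usage sketch -----------------------------------------------------------
# Hypothetical call of the classifier version above. Only the keys of
# number_dictionary are used (the digits under study); input_path is assumed to
# hold one subfolder of ads per batch. Both arguments here are made up.
def _demo_rebusfinder_too():
    results = rebusfinder_too("corpusfiles", {2: "too", 4: "for"})
    # each result is pos_tag() output for a kept (pre, number, post) triple,
    # e.g. [(u'want', 'VBP'), (u'2', 'CD'), (u'chat', 'VB')]
    print len(results)
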
def rebusfinder_too(input_path, number_dictionary):
    """
    This finds words that are represented as numbers.
    All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
    The lists exclude_pre_context and exclude_post_context work as negative contexts in 4.
    This QC variant routes hits whose post-word is tagged NNS to tk.tokenfinder for spot-checking.
    """
    # NB: this redefines the rebusfinder_too above; being the later definition,
    # it is the one that is live when the module runs.
    for number in number_dictionary.keys():
        # this is for comments to self
        print "PRE"
        # this is the regular expression to identify instances of the number studied
        numberregex = re.compile("\W([a-z]+)\s*(" + punctuationregex + ")?\s*(" + unicode(number) + ")(?:\s+)?(" + punctuationregex + ")?(?:\s+)?([a-z]+)\W")
        print numberregex.pattern
        # dicts to store statistics about the context of the number
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        # lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns = []
        results = []
        for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
            for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
                fili = codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
                inputad = ct.adtextextractor(fili.read(), fil)
                inputad = ct.adcleaner(inputad, replace_linebreak=True)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                # this weeds out all the phone numbers
                hits = [h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
                for h in hits:
                    # this is needed for instances where there is no punctuation
                    h = [" " if i == "" else i for i in h]
                    # a tagged hit looks like [(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]:
                    # pre, "2", optional punctuation, post
                    [pre, pre_punct, number, punct, post] = pos_tag(h)
                    if (post[1] in ["NNS"]) and (punct[0] in [" "]):
                        print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
                        search_pattern = [re.escape(i) for i in [pre[0], number[0], punct[0], post[0]]]
                        if search_pattern not in previous_patterns:
                            # NB: 'dir' is presumably a search directory defined elsewhere;
                            # as written this passes the builtin dir()
                            tk.tokenfinder(["\s*".join(search_pattern)], dir)
                            previous_patterns.append(search_pattern)
                        else:
                            print "SEE TOKENFINDER RESULTS ABOVE\n"
                # error catching here
                # for h in hits:
                #     if h[2]:  # == ".":
                #         print h, os.path.join(input_path, pati, fil)
                #         print pos_tag(h), "\n"
        # (the trash/keep classification that populates h0dict/h2dict and results
        #  was commented out in this variant; see the previous version of
        #  rebusfinder_too for the full logic)
        print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
        h0dict = {k: v for k, v in h0dict.items() if v > 0}
        print "\n\n", number, "\npretext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
        print "\n\n", number, "\nposttext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])
        print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
    return results
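
# --- pattern-building sketch --------------------------------------------------
# How the deduplicated tokenfinder query above is assembled: the four context
# tokens are re.escape()d and joined with \s*, so the query tolerates arbitrary
# spacing between them. Toy tokens only; tk.tokenfinder itself is the project's
# own helper, defined elsewhere.
def _demo_search_pattern():
    search_pattern = [re.escape(i) for i in ["you", "2", ".", "meet"]]
    print "\s*".join(search_pattern)
    # -> you\s*2\s*\.\s*meet  (passed to tk.tokenfinder as a one-element list)
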