Code example #1
# Imports needed by this example; ct (a custom corpus-tools module), pathi,
# chapterdir, and punctuationregex (a compiled regex here) are globals defined
# elsewhere in the original project.
import codecs
import json
import os
import time
from collections import defaultdict

from nltk import word_tokenize  # assumption: NLTK's tokenizer


def vec2wordclustercounter(folderlist, cluster_dictionary):
    """
	This is stolen from the cluster_analysis dictmaker. 
	The dictmaker counts the words / items contained in the files found in the folders of folderlist.
	remove_stopwords uses the stopword list defined above to ignore words. 
	remove_punct works with string.punctuation, cf above. 
	This was mainly used to test how well the counting in the word2vec analysis works.
	"""
    with codecs.open(cluster_dictionary, "r", "utf-8") as inputjson:
        clusterdict = json.load(inputjson)
    result = defaultdict(int)
    #this is just for qc
    misses = []
    for folder in folderlist:
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        print "Building vocab: we have {} files in folder {}".format(
            len(filis), folder)
        for fili in filis:
            with codecs.open(os.path.join(pathi, folder, fili), "r",
                             "utf-8") as infile:
                inputfile = infile.read()
            inputtext = ct.adtextextractor(inputfile, fili)
            #pre-processing here
            inputtext = ct.adcleaner(inputtext,
                                     replace_linebreak=True,
                                     remove_html=False)
            splittext = word_tokenize(inputtext)
            splittextlo = [i.lower() for i in splittext]
            #strip punctuation, then drop empty strings and leftover "br" tags
            finaltext = [punctuationregex.sub("", i) for i in splittextlo]
            finaltext = [i for i in finaltext if i and i not in ['br']]
            #do we want to lemmatize or things like that
            for word in finaltext:
                cluster = [
                    k for k, v in clusterdict.items() if word in v['words']
                ]
                if len(cluster) > 1:
                    print "Warning: The item {} was found in more than one clusters".format(
                        word)
                if len(cluster) < 1:
                    #print "Warning: The item could not be found in a cluster"
                    misses.append(word)
                else:
                    result[cluster[0]] = result[cluster[0]] + 1
    print "Our vocab dictionary has {} entries".format(len(result))
    ct.dictwriter(
        os.path.expanduser(
            os.path.join("~", chapterdir[0], "outputfiles",
                         "fulldict_" + time.strftime("%H_%M_%m_%d"))), result)
    # 	featuredict= {key:value for key, value in vocab.items() if value > float(threshold) }
    # 	print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict))
    # 	print "This is our featuredict", featuredict
    # 	ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_"+time.strftime("%H_%M_%m_%d")), featuredict)
    print "misses", len(misses), set(misses)
    print result
    return result
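
The cluster file this function reads is plain JSON mapping cluster names to objects with a "words" list. Below is a minimal sketch of that layout and of a call; the file name and cluster contents are invented for illustration, and the inverted index at the end is an optional speedup over the per-token scan, not part of the original.

import codecs
import json

# Invented toy cluster file in the shape vec2wordclustercounter expects.
toy_clusters = {
    "cluster_0": {"words": ["hi", "hello", "hey"]},
    "cluster_1": {"words": ["car", "truck", "bike"]},
}
with codecs.open("toy_clusters.json", "w", "utf-8") as outjson:
    json.dump(toy_clusters, outjson)

# A call would then look like this (folder layout and the pathi/ct globals
# come from the original project):
# result = vec2wordclustercounter(["folder_a", "folder_b"], "toy_clusters.json")

# The per-token scan over clusterdict.items() costs O(number of clusters);
# an inverted word -> cluster index turns each lookup into a dict hit:
word2cluster = {}
for name, entry in toy_clusters.items():
    for w in entry["words"]:
        word2cluster.setdefault(w, []).append(name)
print word2cluster["hi"]
# ['cluster_0']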
Code example #2
import re

from nltk import pos_tag  # assumption: NLTK's tagger

# punctuationregex is a punctuation pattern *string* here (unlike example #1);
# writtennumberdict, prewords, prewords_withpunct, postwords, and ct are
# globals from the original project. Remaining imports as in example #1.
def rebusfinder_too(input_path):
	"""
	The rebus_too finder.
	It uses a list of expressions, pre-established thru "identifying_rebus_too_1022.py", to count 
	instances where a writer uses "2" instead of "too". 
	"""
	predict=defaultdict(int)
	postdict=defaultdict(int)
	
	for number in [2]:
		#this is the regular expression to identify instances of the number studied
		numberregex=re.compile(r"\W([a-z]+)\s*("+punctuationregex+r")?\s*("+unicode(number)+r")(?:\s+)?("+punctuationregex+r")?(?:\s+)?([a-z]+)\W")
		print numberregex.pattern
		#dicts to store statistics about context of number
		h0dict=defaultdict(int)
		h2dict=defaultdict(int)
		#lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
		previous_patterns=[]
		results=[]
		for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
			for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
				with codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") as fili:
					inputad=ct.adtextextractor(fili.read(), fil)
				inputad=ct.adcleaner(inputad, replace_linebreak=True)
				inputad=inputad.lower()
				hits=numberregex.findall(inputad)
				#this weeds out all the phonenumbers. 
				hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
				for h in hits:
					#replace empty captures with a space for instances where there is no punctuation
					h=[" " if i == "" else i for i in h]
					#pos_tag labels each captured group, e.g.
					#[(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]
					#order: pre word, pre-punctuation, the number, post-punctuation, post word
					[pre, pre_punct, num, punct, post]=pos_tag(h)
					
					if (
									
					#unique items catcher
					(pre[0] in ["date"]) 
					or
					(pre[0] in ["it"] and post[0] in ["i"])
					or
					(pre[0] in ["cook"] and post[0] in ["im"])
					or
					(pre[0] in ["kids"] and post[0] in ["young"]) 
					or
					(pre[0] in ["life", "way"] and post[0] in ["short"])
					or
					(pre[0] in ["that"] and post[0] in ["hard"])
					or
					(pre[0] in ["real"] and post[0] in ["hope"])
					or
					(pre[0] in ["me"] and post[0] in ["if"])
					or
					(pre[0] in ["dogs"] and post[0] in ["if"])
					or
					(pre[0] in ["can"] and post[0] in ["but"])
					or
					(pre[0] in ["kool"] and not post[0] in ["even"])
					or
					(pre[0] in ["on"] and punct[0] not in [" "] and inputad.split()[inputad.split().index(pre[0])-1] == "later")# and (h[h.index(pre[0])] == "later"))
					or
					(pre[0] in ["love"] and punct[0] not in [" "] and post[0] in ["msg"])
					or
					(pre[0] in ["real"] and post[0] in ["have"])
					or
					#BIGGER NETS
					#you be too in front of punctuation catch
					(pre[0] in ["be", "b", "are", "r"] and punct[0] not in [" ", "-", ")"])
					or
					#this is if we know the pre-word and 2 is followed by punctuation
					# cf 'intellectualy ability 2. '
					(pre[0] in prewords_withpunct and punct[0] not in [" ", ")", ":"])
					or
					#this is if we know the word to follow
					# cf 'not 2 late.' collected in postwords
					(post[0] in postwords)
					or
					#this is if we know the word to precede
					(pre[0] in prewords)
					):
					
						print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
						results.append((pre, number, punct, post, os.path.join(input_path, pati, fil)))
						predict[pre[0]]=predict[pre[0]]+1
						postdict[post[0]]=postdict[post[0]]+1
		print "original result list is", len(results)
		seti=set(results)
		print "\n\n", seti
		print "the set is ", len(seti)
		overlap={k:results.count(k) for k in seti}
		print overlap
		print {k:overlap[k] for k in overlap if overlap[k] > 1}
		print "PRE CONTEXT"
		print "\n".join([": ".join([k, unicode(predict[k])]) for k in sorted(predict, key=predict.get, reverse=True)])
		print "POST CONTEXT"
		print "\n".join([": ".join([k, unicode(postdict[k])]) for k in sorted(postdict, key=postdict.get, reverse=True)])
Code example #3
# Imports as in the previous examples; exclude_pre_context and
# exclude_post_context (lists of compiled regexes), nounregex,
# writtennumberdict, punctuationregex, ct, and word2vecwordfinder are
# globals from the original project.
def rebusfinder_too(input_path, number_dictionary):
    """
 	This finds words that are represented as numbers. 
 	All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
 	The lists exclude_pre and exclude_post word for negative contexts in 4.
 	It prints the results and give type and token counts. 
	
	"""
    results = []
    for number in number_dictionary.keys():
        #this is for comments to self
        print "PRE"

        #this is the regular expression to identify instances of the number studied
        numberregex = re.compile(r"\W([a-z]+)\s*(" + punctuationregex +
                                 r")?\s*(" + unicode(number) + r")(?:\s+)?(" +
                                 punctuationregex + r")?(?:\s+)?([a-z]+)\W")
        print numberregex.pattern
        #dicts to store statistics about context of number
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        #list to store previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns = []
        for pati in [
                i for i in os.listdir(input_path) if not i.startswith(".")
        ]:
            for fil in [
                    i for i in os.listdir(os.path.join(input_path, pati))
                    if not i.startswith(".")
            ]:
                with codecs.open(os.path.join(input_path, pati, fil), "r",
                                 "utf-8") as fili:
                    inputad = ct.adtextextractor(fili.read(), fil)
                inputad = ct.adcleaner(inputad, replace_linebreak=True)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                #this weeds out all the phonenumbers.
                hits = [
                    h for h in hits if h[0] not in writtennumberdict
                    and h[2] not in writtennumberdict
                ]
                for h in hits:
                    h = ["" if i == "" else i for i in h]
                    #print "h in hits", h
                    if not any(
                            regex.match(h[2])
                            for regex in exclude_post_context) and not any(
                                regex.match(h[0])
                                for regex in exclude_pre_context):
                        tagged = pos_tag(h)
                        #print tagged
                        #if h[2] not in [" "]:
                        #print tagged, os.path.join(input_path, pati, fil)
                        #print inputad
                        h0dict[h[0]] = h0dict[h[0]] + 1
                        h2dict[h[2]] = h2dict[h[2]] + 1
                        h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                        h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1
                        #taking out trash
                        if ((tagged[0][1] in ["DT", "JJS", "TO", "PRP$"])
                                or (tagged[0][1] == "IN"
                                    and h[0] not in ["out", "like"]) or
                            (tagged[0][1] in ["VBG"] and h[0] not in [
                                "talking", "responding", "waiting", "getting",
                                "looking", "going", "trying"
                            ]) or (tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"]
                                   and tagged[2][1] in ["JJ"]) or
                                #this is where we screw up
                            (tagged[2][1] in ["NNS"] and h[2] not in [
                                "chat", "kiss", "go", "know", "find", "do",
                                "c", "knees"
                            ]) or (tagged[2][1] == "IN") or
                            (tagged[2][1] == "CC" and h[2] not in ["but"]) or
                                # 							#we don't need this if we are to just ignore whatever goes thru all of it
                                # 							#TEMPTEMPTEMP
                            (h[0] in [
                                "be", "other", "s", "type", "was", "work",
                                "im", "baths", "you", "maybe", "big", "day",
                                "o", "round", "ride", "avengers", "kids",
                                "had", "number", "have", "like", "here",
                                "size", "got", "are", "send", "only", "have",
                                "go", "is", "bedroom", "but", "beautiful",
                                "nice"
                            ]) or (h[2] in [
                                "face", "new", "faced", "wonderful", "must",
                                "min", "short", "si", "br", "step", "start",
                                "so", "out", "story", "bdrm", "other", "out",
                                "story", "yr", "looking", "more", "but", "hrs",
                                "bedroom"
                            ]) or
                            (tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])):
                            #print "killed",tagged, "\n"
                            pass

                        #finding the good
                        elif ((tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"])
                              or
                              (tagged[2][1] in ["JJ"] and h[0] in ["opposed"])
                              or (tagged[2][1] in ["PRP"]
                                  and not nounregex.match(tagged[0][1]))
                              or (h[0] == "have" and h[2]
                                  in ["browse", "force", "go", "send", "talk"])
                              or (h[0] == "like"
                                  and h[2] not in ["furry", "cuz", "straight"])
                              or (h[0] in ["here"]
                                  and nounregex.match(tagged[2][1])) or
                              #really what we are exluding here is anything non-Verb or Noun
                              # 							# we can consider replacing this with a regex
                              (h[0] in ["need", "me", "pics"]
                               and tagged[2][1] not in ["JJ", "JJR", "MD"])
                              or (h[0] in [
                                  "momma", "women", "delighted", "tryn",
                                  "respond", "travel", "veldkum", "happness",
                                  "pool", "lots", "bbw", "willin", "luvz",
                                  "place", "time", "married", "pixs", "boy",
                                  "pictures", "brickz", "somebody", "memphis",
                                  "cell", "fear", "hoop", "open", "goes",
                                  "afraid", "speak", "lady", "needs",
                                  "attracted", "doms", "bottom", "head",
                                  "apply", "drive", "pic", "newer", "pinned",
                                  "luvs", "sumbody", "face", "due", "tryin",
                                  "line", "has", "close", "interested", "alot",
                                  "oral", "talk", "new", "girl", "up",
                                  "scared", "willing", "cam", "loves", "c**k",
                                  "out", "u", "nice", "how", "free", "hard",
                                  "hope", "able", "someone", "man", "woman",
                                  "male", "down", "love", "luv", "ready",
                                  "want", "wants"
                              ] + [
                                  "talking", "responding", "waiting",
                                  "getting", "looking", "lookin", "going",
                                  "trying"
                              ]) or (h[2] in [
                                  "survive", "brag", "blow", "grab", "feel",
                                  "send", "connect", "hearing", "say", "read",
                                  "contact", "please", "run", "host", "kno",
                                  "talk", "just", "add", "text", "chill",
                                  "hang", "date", "find", "chat", "show", "u",
                                  "meet", "her", "hear", "me", "my", "b",
                                  "know", "play", "do", "suck", "go", "get",
                                  "f**k"
                              ])):
                            print "hooked the plusloop", tagged
                            #print tagged
                            results.append(tagged)
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                        else:
                            pass
                        if tagged[2][
                                1]:  #=="VB":# in ["VBP", "VBG"]:#=="go":#:# in ['my']:#, 'know', 'my']:#["me", "need", "man"]:# == "down":#h[2] not in ["have", "and", "like", "hear"]:
                            #print tagged
                            #print "elseloop", tagged
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                            h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                            h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1

        print "We have {} items with a token count of {}".format(
            len(h0dict.keys()), sum(h0dict.values()))
        h0dict = {k: v for k, v in h0dict.items() if v > 0}
        print "\n\n", number, "\npretext here be the results\n\n"
        print "\n".join([
            ": ".join([
                k,
                unicode(h0dict[k]), ".".join(
                    word2vecwordfinder([
                        k
                    ], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'
                                       ))
            ]) for k in sorted(h0dict, key=h0dict.get, reverse=True)
        ])
        print "\n\n", number, "\nposttext here be the results\n\n"
        print "\n".join([
            ": ".join([
                k,
                unicode(h2dict[k]), ".".join(
                    word2vecwordfinder([
                        k
                    ], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'
                                       ))
            ]) for k in sorted(h2dict, key=h2dict.get, reverse=True)
        ])

        print "We have {} post items with a token count of {}".format(
            len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(
            len(h0dict.keys()), sum(h0dict.values()))
    return results
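
The keep/kill conditions above are driven by NLTK part-of-speech tags of the captured strings. Note the docstring describes a three-group pattern (pre, number, post), under which h[0] and h[2] are the pre and post words the filters index; the five-group pattern actually compiled puts the number itself at index 2. A minimal illustration under the three-group reading, reusing the tag example from the source:

from nltk import pos_tag

# One hit under the three-group pattern: (pre word, number, post word).
h = [u'of', u'2', u'single']
tagged = pos_tag(h)
print tagged
# [(u'of', 'IN'), (u'2', 'CD'), (u'single', 'JJ')]
# h[0] / tagged[0][1] give the pre word and its tag; h[2] / tagged[2][1]
# give the post word and its tag, which is what the conditions above test.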
Code example #4
# Imports as in the previous examples; tk is the project's tokenfinder helper
# module, and the word/context lists are globals from the original project.
def rebusfinder_too(input_path, number_dictionary):
	"""
 	This finds words that are represented as numbers. 
 	All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
 	The lists exclude_pre and exclude_post word for negative contexts in 4.
 	It prints the results and give type and token counts. 
	
	"""
	results=[]
	for number in number_dictionary.keys():
		#this is for comments to self
		print "PRE"
		
		#this is the regular expression to identify instances of the number studied
		numberregex=re.compile(r"\W([a-z]+)\s*("+punctuationregex+r")?\s*("+unicode(number)+r")(?:\s+)?("+punctuationregex+r")?(?:\s+)?([a-z]+)\W")
		print numberregex.pattern
		#dicts to store statistics about context of number
		h0dict=defaultdict(int)
		h2dict=defaultdict(int)
		#list to store previous search patterns fed into tokenfinder to avoid duplicate output
		previous_patterns=[]
		for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
			for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
				with codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") as fili:
					inputad=ct.adtextextractor(fili.read(), fil)
				inputad=ct.adcleaner(inputad, replace_linebreak=True)
				inputad=inputad.lower()
				hits=numberregex.findall(inputad)
				#this weeds out all the phonenumbers. 
				hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
				for h in hits:
					#replace empty captures with a space for instances where there is no punctuation
					h=[" " if i == "" else i for i in h]
					#pos_tag labels each captured group, e.g.
					#[(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]
					#order: pre word, pre-punctuation, the number, post-punctuation, post word
					[pre, pre_punct, num, punct, post]=pos_tag(h)
					if (post[1] in ["NNS"]) and (punct[0] in [" "]):
						print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
						search_pattern=[re.escape(i) for i in [pre[0],number[0], punct[0], post[0]]]
						if search_pattern not in previous_patterns:
							#note: dir here is Python's builtin; the intended argument is presumably a corpus directory path
							tk.tokenfinder([r"\s*".join(search_pattern)], dir)
							previous_patterns.append(search_pattern)
						else:
							print "SEE TOKENFINDER RESULTS ABOVE\n"			
						#error catching here 
						#
				
				
				
				# for h in hits:
# 					if h[2]:#==".":
# 						print  h, os.path.join(input_path, pati, fil)
# 						print pos_tag(h), "\n"
						
					#if not any (regex.match(h[2]) for regex in exclude_post_context) and not any (regex.match(h[0]) for regex in exclude_pre_context):
						#tagged=pos_tag(h), fil
						#print tagged
						#if h[2] not in [" "]:
						#	print tagged, os.path.join(input_path, pati, fil)
							#print inputad
						#h0dict[h[0]]=h0dict[h[0]]+1
 						#h2dict[h[2]]=h2dict[h[2]]+1
						#h0dict[tagged[0][1]]=h0dict[tagged[0][1]]+1
						#h2dict[tagged[2][1]]=h2dict[tagged[2][1]]+1
						#taking out trash
						# if (
# 							(tagged[0][1] in ["DT", "JJS", "TO", "PRP$"]) 
# 							or
# 							(tagged[0][1]=="IN" and h[0] not in ["out", "like"])
# 							or
# 							(tagged[0][1] in ["VBG"] and h[0] not in ["talking", "responding", "waiting", "getting","looking", "going", "trying"])
# 							or
# 							(tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"] and tagged[2][1] in ["JJ"])
# 							or
# 							#this is where we screw up
# 							(tagged[2][1] in ["NNS"] and h[2] not in ["chat", "kiss", "go", "know", "find", "do", "c", "knees"])
# 							or
# 							(tagged[2][1]=="IN")
# 							or
# 							(tagged[2][1]=="CC" and h[2] not in ["but"])
# 							or
# 							#we don't need this if we are to just ignore whatever goes thru all of it
# 							#TEMPTEMPTEMP
# 							(h[0] in ["be", "other", "s", "type", "was", "work", "im", "baths", "you", "maybe", "big", "day", "o", "round", "ride", "avengers", "kids", "had", "number", "have", "like", "here", "size", "got", "are", "send", "only", "have", "go", "is", "bedroom", "but", "beautiful", "nice"])
# 							or
# 							(h[2] in ["face", "new", "faced", "wonderful", "must", "min", "short", "si", "br", "step", "start", "so", "out", "story", "bdrm", "other", "out", "story", "yr", "looking", "more", "but", "hrs", "bedroom"])
# 							or 
# 							(tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])
# 							):
# 							#print "killed",tagged, "\n"
# 							pass
# 						
# 						#finding the good
# 						elif (
# 							(tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"])
# 							or
# 							(tagged[2][1] in ["JJ"] and h[0] in ["opposed"])
# 							or
# 							(tagged[2][1] in ["PRP"] and not nounregex.match(tagged[0][1]))
# 							or
# 							(h[0] == "have" and h[2] in ["browse", "force", "go", "send", "talk"])
# 							or
# 							(h[0] == "like" and h[2] not in ["furry", "cuz", "straight"])
# 							or
# 							(h[0] in ["here"] and nounregex.match(tagged[2][1]))
# 							or
# 							#really what we are exluding here is anything non-Verb or Noun
# 							# we can consider replacing this with a regex
# 							(h[0] in ["need", "me", "pics"] and tagged[2][1] not in ["JJ", "JJR", "MD"])
# 							or 
# 							(h[0] in ["momma", "women", "delighted", "tryn", "respond", "travel", "veldkum", "happness", "pool", "lots", "bbw", "willin", "luvz", "place", "time", "married", "pixs", "boy", "pictures", "brickz", "somebody", "memphis", "cell", "fear", "hoop", "open", "goes", "afraid", "speak", "lady", "needs", "attracted", "doms", "bottom", "head", "apply", "drive", "pic", "newer", "pinned", "luvs", "sumbody", "face", "due", "tryin", "line", "has", "close", "interested", "alot", "oral", "talk", "new", "girl", "up", "scared", "willing", "cam", "loves", "c**k", "out", "u", "nice", "how", "free", "hard", "hope", "able", "someone", "man", "woman", "male", "down", "love", "luv", "ready", "want", "wants"]+["talking", "responding", "waiting", "getting","looking", "lookin", "going", "trying"])
# 							or
# 							(h[2] in ["survive", "brag", "blow", "grab", "feel", "send", "connect", "hearing", "say", "read", "contact", "please", "run", "host","kno", "talk", "just", "add", "text", "chill", "hang", "date", "find", "chat", "show", "u", "meet", "her", "hear", "me", "my", "b", "know", "play", "do", "suck", "go", "get", "f**k"])
# 							):
# 							#print "hooked the plusloop", tagged
# 							print tagged
# 							results.append(tagged)
# 							h0dict[h[0]]=h0dict[h[0]]+1
#  							h2dict[h[2]]=h2dict[h[2]]+1
# 						else:
# 							pass
							#if tagged[2][1]:#=="VB":# in ["VBP", "VBG"]:#=="go":#:# in ['my']:#, 'know', 'my']:#["me", "need", "man"]:# == "down":#h[2] not in ["have", "and", "like", "hear"]:
							#	print tagged
								#print "elseloop", tagged
# 								h0dict[h[0]]=h0dict[h[0]]+1
# 								h2dict[h[2]]=h2dict[h[2]]+1
								#h0dict[tagged[0][1]]=h0dict[tagged[0][1]]+1
								#h2dict[tagged[2][1]]=h2dict[tagged[2][1]]+1
									
	
		
		print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
		h0dict={k:v for k,v in h0dict.items() if v > 0}
		print "\n\n", number, "\npretext here be the results\n\n"
		print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
		print "\n\n", number, "\nposttext here be the results\n\n"
		print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])

		print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
		print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
	return results
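
The tokenfinder step re-locates a hit in the corpus by turning the captured strings back into a pattern: each literal is escaped and the pieces are joined on \s*, mirroring the whitespace tolerance of the original regex. A standalone sketch of just that pattern construction (tk.tokenfinder itself is a project helper, so only the pattern building is shown):

import re

pre, num, punct, post = u"you", u"2", u",", u"honey"
search_pattern = [re.escape(i) for i in [pre, num, punct, post]]
pattern = r"\s*".join(search_pattern)
print pattern
# you\s*2\s*\,\s*honey
print bool(re.search(pattern, u"i like you 2, honey"))
# True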