# NB: this section assumes the module-level imports (codecs, json, os, re, time,
# collections.defaultdict, nltk's word_tokenize and pos_tag) and the globals
# pathi, chapterdir, punctuationregex, plus the project's own ct/tk helper
# modules, all defined earlier in this script.

def vec2wordclustercounter(folderlist, cluster_dictionary):
    """
    This is stolen from the cluster_analysis dictmaker.
    The dictmaker counts the words/items contained in the files found in the folders of folderlist.
    remove_stopwords uses the stopword list defined above to ignore words.
    remove_punct works with string.punctuation, cf. above.
    This was mainly used to test how well the counting in the word2vec analysis works.
    """
    with codecs.open(cluster_dictionary, "r", "utf-8") as inputjson:
        clusterdict = json.load(inputjson)
    result = defaultdict(int)
    # this is just for qc
    misses = []
    for folder in folderlist:
        filis = [i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
        print "Building vocab: we have {} files in folder {}".format(len(filis), folder)
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
            inputtext = ct.adtextextractor(inputfile, fili)
            # pre-processing here
            inputtext = ct.adcleaner(inputtext, replace_linebreak=True, remove_html=False)
            splittext = word_tokenize(inputtext)
            splittextlo = [i.lower() for i in splittext]
            finaltext = [punctuationregex.sub("", i) for i in splittextlo]
            finaltext = [i for i in finaltext if i and i not in ['br']]
            # do we want to lemmatize or things like that?
            for word in finaltext:
                cluster = [k for k, v in clusterdict.items() if word in v['words']]
                if len(cluster) > 1:
                    print "Warning: The item {} was found in more than one cluster".format(word)
                if len(cluster) < 1:
                    #print "Warning: The item could not be found in a cluster"
                    misses.append(word)
                else:
                    result[cluster[0]] = result[cluster[0]] + 1
    print "Our vocab dictionary has {} entries".format(len(result))
    ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "fulldict_" + time.strftime("%H_%M_%m_%d")), result)
    #featuredict = {key: value for key, value in vocab.items() if value > float(threshold)}
    #print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict))
    #print "This is our featuredict", featuredict
    #ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_" + time.strftime("%H_%M_%m_%d")), featuredict)
    print "misses", len(misses), set(misses)
    print result
    return result
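
# --- usage sketch -----------------------------------------------------------
# A hypothetical invocation of vec2wordclustercounter. The folder names and the
# JSON filename below are placeholders; folders are resolved against the global
# pathi, and the dictionary shape {cluster_name: {"words": [...]}} is inferred
# from the v['words'] lookup above.
def _demo_vec2wordclustercounter():
    # e.g. clusters_74_19_45_07_31.json (assumed shape):
    #   {"cluster_0": {"words": ["hi", "hello", "hey"]},
    #    "cluster_1": {"words": ["msg", "text", "email"]}}
    counts = vec2wordclustercounter(["ads_jan", "ads_feb"], "clusters_74_19_45_07_31.json")
    # counts maps cluster name -> summed token count over all files
    print counts
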
def rebusfinder_too(input_path):
    """
    The rebus_too finder.
    It uses a list of expressions, pre-established through "identifying_rebus_too_1022.py",
    to count instances where a writer uses "2" instead of "too".
    """
    predict = defaultdict(int)
    postdict = defaultdict(int)
    for number in [2]:
        # this is the regular expression to identify instances of the number studied
        numberregex = re.compile("\W([a-z]+)\s*(" + punctuationregex + ")?\s*(" + unicode(number) + ")(?:\s+)?(" + punctuationregex + ")?(?:\s+)?([a-z]+)\W")
        print numberregex.pattern
        # dicts to store statistics about the context of the number (unused in this version)
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        # lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns = []
        results = []
        for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
            for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
                fili = codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
                inputad = ct.adtextextractor(fili.read(), fil)
                inputad = ct.adcleaner(inputad, replace_linebreak=True)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                # this weeds out all the phone numbers
                hits = [h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
                for h in hits:
                    # this is needed for instances where there is no punctuation
                    h = [" " if i == "" else i for i in h]
                    # a tagged hit looks like [(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]:
                    # pre, "2", optional punctuation, post
                    # NB: the unpack below rebinds the loop variable 'number' with the tagged token
                    [pre, pre_punct, number, punct, post] = pos_tag(h)
                    if (
                        # unique items catcher
                        (pre[0] in ["date"])
                        or (pre[0] in ["it"] and post[0] in ["i"])
                        or (pre[0] in ["cook"] and post[0] in ["im"])
                        or (pre[0] in ["kids"] and post[0] in ["young"])
                        or (pre[0] in ["life", "way"] and post[0] in ["short"])
                        or (pre[0] in ["that"] and post[0] in ["hard"])
                        or (pre[0] in ["real"] and post[0] in ["hope"])
                        or (pre[0] in ["me"] and post[0] in ["if"])
                        or (pre[0] in ["dogs"] and post[0] in ["if"])
                        or (pre[0] in ["can"] and post[0] in ["but"])
                        or (pre[0] in ["kool"] and post[0] not in ["even"])
                        or (pre[0] in ["on"] and punct[0] not in [" "] and inputad.split()[inputad.split().index(pre[0]) - 1] == "later")  # and (h[h.index(pre[0])] == "later"))
                        or (pre[0] in ["love"] and punct[0] not in [" "] and post[0] in ["msg"])
                        or (pre[0] in ["real"] and post[0] in ["have"])
                        # BIGGER NETS
                        # "you be too" in front of punctuation catch
                        or (pre[0] in ["be", "b", "are", "r"] and punct[0] not in [" ", "-", ")"])
                        # this is if we know the pre-word and 2 is followed by punctuation,
                        # cf. 'intellectualy ability 2. '
                        or (pre[0] in prewords_withpunct and punct[0] not in [" ", ")", ":"])
                        # this is if we know the word to follow, cf. 'not 2 late.' collected in postwords
                        or (post[0] in postwords)
                        # this is if we know the word to precede
                        or (pre[0] in prewords)
                    ):
                        print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
                        results.append((pre, number, punct, post, os.path.join(input_path, pati, fil)))
                        predict[pre[0]] = predict[pre[0]] + 1
                        postdict[post[0]] = postdict[post[0]] + 1
        print "original result list is", len(results)
        seti = set(results)
        print "\n\n", seti
        print "the set is", len(seti)
        overlap = {k: results.count(k) for k in seti}
        print overlap
        print {k: overlap[k] for k in overlap if overlap[k] > 1}
        print "PRE CONTEXT"
        print "\n".join([": ".join([k, unicode(predict[k])]) for k in sorted(predict, key=predict.get, reverse=True)])
        print "POST CONTEXT"
        print "\n".join([": ".join([k, unicode(postdict[k])]) for k in sorted(postdict, key=postdict.get, reverse=True)])
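
# --- pattern sketch ----------------------------------------------------------
# What the five capture groups of numberregex above yield on a toy ad, assuming
# punctuationregex is a plain pattern string like "[.,!?]" (its real definition
# lives elsewhere in this script). Unmatched optional groups come back as ""
# from findall(), which is why the loop above swaps "" for " " before tagging.
def _demo_numberregex():
    demoregex = re.compile("\W([a-z]+)\s*([.,!?])?\s*(2)(?:\s+)?([.,!?])?(?:\s+)?([a-z]+)\W")
    print demoregex.findall(" hope 2 hear from you 2. have fun ")
    # -> [('hope', '', '2', '', 'hear'), ('you', '', '2', '.', 'have')]
    #     i.e. (pre, pre-punctuation, number, punctuation, post)
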
def rebusfinder_too(input_path, number_dictionary):
    """
    This finds words that are represented as numbers.
    All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
    The lists exclude_pre_context and exclude_post_context work as negative contexts in 4.
    It prints the results and gives type and token counts.
    """
    for number in number_dictionary.keys():
        # this is for comments to self
        print "PRE"
        # this is the regular expression to identify instances of the number studied.
        # NB: the body below indexes each hit as (pre, number, post) -- h[0], h[1], h[2] --
        # so the three-group pattern documented in the docstring is used here; the
        # punctuation-aware five-group variant belongs to the other versions of
        # rebusfinder_too in this file.
        numberregex = re.compile("\W([a-z]+)\s+(" + unicode(number) + ")\s+([a-z]+)\W")
        print numberregex.pattern
        # dicts to store statistics about the context of the number
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        # lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns = []
        results = []
        for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
            for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
                fili = codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
                inputad = ct.adtextextractor(fili.read(), fil)
                inputad = ct.adcleaner(inputad, replace_linebreak=True)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                # this weeds out all the phone numbers
                hits = [h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
                for h in hits:
                    # (the "" -> " " padding used in the other versions is unnecessary here:
                    # all three groups are required)
                    if not any(regex.match(h[2]) for regex in exclude_post_context) and not any(regex.match(h[0]) for regex in exclude_pre_context):
                        tagged = pos_tag(h)
                        h0dict[h[0]] = h0dict[h[0]] + 1
                        h2dict[h[2]] = h2dict[h[2]] + 1
                        h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                        h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1
                        # taking out trash
                        if (
                            (tagged[0][1] in ["DT", "JJS", "TO", "PRP$"])
                            or (tagged[0][1] == "IN" and h[0] not in ["out", "like"])
                            or (tagged[0][1] in ["VBG"] and h[0] not in ["talking", "responding", "waiting", "getting", "looking", "going", "trying"])
                            or (tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"] and tagged[2][1] in ["JJ"])
                            # this is where we screw up
                            or (tagged[2][1] in ["NNS"] and h[2] not in ["chat", "kiss", "go", "know", "find", "do", "c", "knees"])
                            or (tagged[2][1] == "IN")
                            or (tagged[2][1] == "CC" and h[2] not in ["but"])
                            # we don't need this if we are to just ignore whatever goes thru all of it
                            # TEMPTEMPTEMP
                            or (h[0] in ["be", "other", "s", "type", "was", "work", "im", "baths", "you", "maybe", "big", "day", "o", "round", "ride", "avengers", "kids", "had", "number", "have", "like", "here", "size", "got", "are", "send", "only", "have", "go", "is", "bedroom", "but", "beautiful", "nice"])
                            or (h[2] in ["face", "new", "faced", "wonderful", "must", "min", "short", "si", "br", "step", "start", "so", "out", "story", "bdrm", "other", "out", "story", "yr", "looking", "more", "but", "hrs", "bedroom"])
                            or (tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])
                        ):
                            #print "killed", tagged, "\n"
                            pass
                        # finding the good
                        elif (
                            (tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"])
                            or (tagged[2][1] in ["JJ"] and h[0] in ["opposed"])
                            or (tagged[2][1] in ["PRP"] and not nounregex.match(tagged[0][1]))
                            or (h[0] == "have" and h[2] in ["browse", "force", "go", "send", "talk"])
                            or (h[0] == "like" and h[2] not in ["furry", "cuz", "straight"])
                            or (h[0] in ["here"] and nounregex.match(tagged[2][1]))
                            # really what we are excluding here is anything non-verb or non-noun;
                            # we can consider replacing this with a regex
                            or (h[0] in ["need", "me", "pics"] and tagged[2][1] not in ["JJ", "JJR", "MD"])
                            or (h[0] in ["momma", "women", "delighted", "tryn", "respond", "travel", "veldkum", "happness", "pool", "lots", "bbw", "willin", "luvz", "place", "time", "married", "pixs", "boy", "pictures", "brickz", "somebody", "memphis", "cell", "fear", "hoop", "open", "goes", "afraid", "speak", "lady", "needs", "attracted", "doms", "bottom", "head", "apply", "drive", "pic", "newer", "pinned", "luvs", "sumbody", "face", "due", "tryin", "line", "has", "close", "interested", "alot", "oral", "talk", "new", "girl", "up", "scared", "willing", "cam", "loves", "c**k", "out", "u", "nice", "how", "free", "hard", "hope", "able", "someone", "man", "woman", "male", "down", "love", "luv", "ready", "want", "wants"] + ["talking", "responding", "waiting", "getting", "looking", "lookin", "going", "trying"])
                            or (h[2] in ["survive", "brag", "blow", "grab", "feel", "send", "connect", "hearing", "say", "read", "contact", "please", "run", "host", "kno", "talk", "just", "add", "text", "chill", "hang", "date", "find", "chat", "show", "u", "meet", "her", "hear", "me", "my", "b", "know", "play", "do", "suck", "go", "get", "f**k"])
                        ):
                            print "hooked the plusloop", tagged
                            results.append(tagged)
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                        else:
                            pass
                        if tagged[2][1]:  # dev toggles tried here: =="VB", in ["VBP", "VBG"], =="go", in ['my'], ...
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                            h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                            h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1
        print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
        h0dict = {k: v for k, v in h0dict.items() if v > 0}
        print "\n\n", number, "\npretext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
        print "\n\n", number, "\nposttext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])
        print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
    return results
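
# --- usage sketch -----------------------------------------------------------
# Hypothetical call of the classifier version above. Only the keys of
# number_dictionary are used (the digits under study); input_path is assumed to
# hold one subfolder of ads per batch. Both arguments here are made up.
def _demo_rebusfinder_too():
    results = rebusfinder_too("corpusfiles", {2: "too", 4: "for"})
    # each result is pos_tag() output for a kept (pre, number, post) triple,
    # e.g. [(u'want', 'VBP'), (u'2', 'CD'), (u'chat', 'VB')]
    print len(results)
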
def rebusfinder_too(input_path, number_dictionary):
    """
    This finds words that are represented as numbers.
    All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
    The lists exclude_pre_context and exclude_post_context work as negative contexts in 4.
    This QC variant routes hits whose post-word is tagged NNS to tk.tokenfinder for spot-checking.
    """
    # NB: this redefines the rebusfinder_too above; being the later definition,
    # it is the one that is live when the module runs.
    for number in number_dictionary.keys():
        # this is for comments to self
        print "PRE"
        # this is the regular expression to identify instances of the number studied
        numberregex = re.compile("\W([a-z]+)\s*(" + punctuationregex + ")?\s*(" + unicode(number) + ")(?:\s+)?(" + punctuationregex + ")?(?:\s+)?([a-z]+)\W")
        print numberregex.pattern
        # dicts to store statistics about the context of the number
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        # lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns = []
        results = []
        for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
            for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
                fili = codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
                inputad = ct.adtextextractor(fili.read(), fil)
                inputad = ct.adcleaner(inputad, replace_linebreak=True)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                # this weeds out all the phone numbers
                hits = [h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
                for h in hits:
                    # this is needed for instances where there is no punctuation
                    h = [" " if i == "" else i for i in h]
                    # a tagged hit looks like [(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]:
                    # pre, "2", optional punctuation, post
                    [pre, pre_punct, number, punct, post] = pos_tag(h)
                    if (post[1] in ["NNS"]) and (punct[0] in [" "]):
                        print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
                        search_pattern = [re.escape(i) for i in [pre[0], number[0], punct[0], post[0]]]
                        if search_pattern not in previous_patterns:
                            # NB: 'dir' is presumably a search directory defined elsewhere;
                            # as written this passes the builtin dir()
                            tk.tokenfinder(["\s*".join(search_pattern)], dir)
                            previous_patterns.append(search_pattern)
                        else:
                            print "SEE TOKENFINDER RESULTS ABOVE\n"
                # error catching here
                # for h in hits:
                #     if h[2]:  # == ".":
                #         print h, os.path.join(input_path, pati, fil)
                #         print pos_tag(h), "\n"
        # (the trash/keep classification that populates h0dict/h2dict and results
        #  was commented out in this variant; see the previous version of
        #  rebusfinder_too for the full logic)
        print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
        h0dict = {k: v for k, v in h0dict.items() if v > 0}
        print "\n\n", number, "\npretext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
        print "\n\n", number, "\nposttext here be the results\n\n"
        print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])
        print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
    return results
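
# --- pattern-building sketch --------------------------------------------------
# How the deduplicated tokenfinder query above is assembled: the four context
# tokens are re.escape()d and joined with \s*, so the query tolerates arbitrary
# spacing between them. Toy tokens only; tk.tokenfinder itself is the project's
# own helper, defined elsewhere.
def _demo_search_pattern():
    search_pattern = [re.escape(i) for i in ["you", "2", ".", "meet"]]
    print "\s*".join(search_pattern)
    # -> you\s*2\s*\.\s*meet  (passed to tk.tokenfinder as a one-element list)
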