Code example #1
def clippingcounter(clipping_list, input_dir):
    """
		The clipping uses the clipping_list to count instances	of the clippings listed in there. 
		Here, we make that list out of the shorteningdict jsons created earlier. 
		The regex is designed to find lowercase and uppercase versions of each, plus plurals.
		The input_dir contains the text files to be iterated over. 
		It returns a list of match counts.
		e.g.
		clipping_list=['LOL', 'ROFL', 'ASL', 'BRB']
		result=[0,0,2,0] 
		"""
    excludelist = []

    #dicts to store results
    dicti = defaultdict(float)
    matchesdicti = defaultdict(list)
    results = []

    #negative lookbehinds exclude e.g. "web cams"/"web sites"; the original
    #character class [^web|i] consumed a character and over-excluded
    clipping_list = [
        re.compile(r"(?<!web)(?<!i)\W(" + i + r")\W")
        if i in ["cams?", "sites?"] else re.compile(r"\W(" + i + r")\W")
        for i in clipping_list
    ]
    clipping_list = set(clipping_list)
    print [i.pattern for i in clipping_list]
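    #e.g. "cams?" compiles to r"(?<!web)(?<!i)\W(cams?)\W": " my cams " matches,
    #"web cams" does not (illustrative)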
    #iterate over subdirectories and match
    for subdir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print subdir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, subdir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, subdir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili).lower()
            #result is a list of lists which contain matches for each regex/acronym
            result = [([m for m in i.findall(inputad)
                        if m not in excludelist], i.pattern)
                      for i in clipping_list]
            results.append([len(matches) for matches, pattern in result])
            for matches, pattern in result:
                #the dicti is {pattern:count, pattern: count, ...}
                dicti[pattern] = dicti[pattern] + len(matches)
                matchesdicti[pattern] = matchesdicti[pattern] + matches
    print "\n".join([
        ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i]))))
        for i in sorted(dicti, key=dicti.get, reverse=True)
    ])
    #inspect patterns with more than 10 collected matches (the original
    #compared the match list itself against 10)
    for entry in [k for k, v in matchesdicti.items() if len(v) > 10]:
        print entry
        tk.tokenfinder([re.sub(r"[\(\)]", "", entry)],
                       "/Users/ps22344/Downloads/craig_0208")
    return results
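
A minimal usage sketch (hedged): ct and tk are the project-specific helper modules the snippet imports elsewhere, and the clippings and corpus path are illustrative.

#count two clippings across every ad under the corpus directory
counts = clippingcounter(["cams?", "sites?"], "/Users/ps22344/Downloads/craig_0208")
#one list of per-pattern match counts per ad, as in the docstring example
print counts[:3]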
Code example #2
def acronymfinder(input_dir, length, output_json):
    """
    Finds acronyms: all-caps tokens of the given length that the spellchecker
    does not recognize. input_dir is the directory of files; length is the
    length of the desired acronym.
    """
    start = time.time()
    capitals = re.compile("^[A-Z]+$")
    featuredict = defaultdict(int)
    # {
    #'lol':0
    # }

    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print "working on", pati
        for fili in [i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".")]:
            #use a context manager and keep fili bound to the file *name*,
            #which adtextextractor expects (cf. the other snippets)
            with codecs.open(os.path.join(input_dir, pati, fili), "r", "utf-8") as inputfile:
                inputad = ct.adtextextractor(inputfile.read(), fili)
            words = [w.strip(string.punctuation) for w in ct.tokenizer(inputad)]
            for item in words:
                if (capitals.match(item)) and (len(item) == length):
                    if not spell.spellchecker(item.lower()):
                        featuredict[item] = featuredict[item] + 1

    print sorted(featuredict.keys())
    print "SO many entries: ", len(featuredict)

    print "\n".join(
        [":".join((i, str(featuredict[i]))) for i in sorted(featuredict, key=featuredict.get, reverse=True)]
    )
    mid = time.time()
    print "this took us {} minutes".format((mid - start) / 60)
    if output_json:
        with codecs.open("output_acronyms" + str(length) + "letters.json", "w", "utf-8") as outputi:
            json.dump(featuredict, outputi)
    else:
        for entry in sorted(featuredict):
            if featuredict[entry] > 5:
                print "\n\n\n***", entry, "\n\n"
                tk.tokenfinder(
                    [r"\s" + entry + "\s"],
                    input_path="/Users/ps22344/Downloads/craig_0208/",
                    length=20,
                    lower_case=False,
                )
    end = time.time()
    print "this took us {} minutes".format((end - start) / 60)
Code example #3
def charactercounter(input_dir, input_dict):
	results=[]
	dicti=defaultdict(float)
	matchesdicti=defaultdict(list)
	search_terms=[re.compile("|".join(i)) for i in input_dict.values()]
	print "search terms",  [i.pattern for i in search_terms]
	for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
		print dir
		for fili in [i for i in os.listdir(os.path.join(input_dir, dir)) if not i.startswith(".")]:
			with codecs.open(os.path.join(input_dir, dir, fili), "r", "utf-8") as inputtext:
				inputad=ct.adtextextractor(inputtext.read(), fili)
			#result is a list of lists which contain matches for each regex/acronym
			#the list comprehension just deletes empty search results from the "|" search
			result=[([t for m in i.findall(inputad) for t in m if t], i.pattern) for i in search_terms] 
			#print result
			results.append([len(matches) for matches, pattern in result])
			for matches, pattern in result:
				if len(matches) > 0:
					print "matches", matches, os.path.join(input_dir, subdir, fili)
					#dicti counts per feature (character), matchesdicti per regex
					for res in matches[0]:
						dicti[res]=dicti[res]+1
					#extend once per ad, not once per character as before
					matchesdicti[pattern]=matchesdicti[pattern]+matches
	#print "\n".join([":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i])))) for i in sorted(dicti, key=dicti.get, reverse=True)])	
	for entry in matchesdicti:
		print "\n", entry, matchesdicti[entry]
	for entry in dicti:
		print entry, dicti[entry]
	#the pattern below is fixed, so a single tokenfinder call suffices (the
	#original repeated the identical call once per matchesdicti entry)
	tk.tokenfinder(["(.{,20})(?<![A-Z] [A-Z]|Ave| MA)\s+(N)\s+(?!Houston|Ballard|word|Royaton|Wilmot|Tucson|Dallas|Warren|side|Avalon|St Pete|Scottsdale|Tampa|C[Oo][Uu][Nn][Tt][Yy]|[Rr][Oo][Ll][Ll]|Arl\.|Royaltown|Golden Isles|Oeleans|Ballard Rd|Broward|Ward|angola|Oracle|[Hubert|1st] Ave|European|Tryon|Hill\w+ |Wil\w+|[Ss][Uu][Bb][Jj][Ee][Cc][Tt]|state line|for now|with a dick|OT |of (\s+Dayton|Talla\w+)|THE INSIDE|THE SURROUNDING|TIME|AUGHTY|[A-Z] [A-Z] |&amp; 5th)(.{,20})"], "/home/ps22344/Downloads/craig_0208", length=50, lower_case=False)
	return results 
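
A sketch of the input_dict shape charactercounter expects, inferred from the "|".join over input_dict.values(); the feature names and patterns here are illustrative, not from the source.

#each value is a list of parenthesized alternatives OR-ed into one regex
feature_dict = {
	"smileys": ["(:-?\))", "(:-?D)"],
	"winks": ["(;-?\))"],
}
counts = charactercounter("/Users/ps22344/Downloads/craig_0208", feature_dict)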
Code example #4
def sampler(json_input, output_name):
	
	sampledict=defaultdict(list)
	with codecs.open(json_input, 'r', 'utf-8') as jsoninput:
		fulldict=json.load(jsoninput)
		print "length of dicti", len(fulldict)

	for entry in fulldict.keys():
		
		print entry
		samples=tk.tokenfinder([r"\W"+entry+r"\W"], "/Users/ps22344/Downloads/craig_0208", length=20)
		sampledict[entry]=samples
	with codecs.open("sampler_yes_"+output_name+".json", "w", "utf-8") as writejson:
		json.dump(sampledict, writejson)
	print "written to ", writejson
Code example #5
def characterfinder(input_dir, input_dict):
    results = []
    dicti = defaultdict(float)
    matchesdicti = defaultdict(list)
    #excludelist was referenced below but never defined; initialize it empty
    excludelist = []
    for entry in input_dict:
        print entry
    characterlist = set([re.compile(" " + i + " ") for i in input_dict.keys()])
    print [i.pattern for i in characterlist]
    for subdir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print subdir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, subdir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, subdir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili)
            #result is a list of lists which contain matches for each regex/acronym
            result = [([m for m in i.findall(inputad)
                        if m not in excludelist], i.pattern)
                      for i in characterlist]
            results.append([len(matches) for matches, pattern in result])
            for matches, pattern in result:
                #the dicti is {pattern:count, pattern: count, ...}
                dicti[pattern] = dicti[pattern] + len(matches)
                matchesdicti[pattern] = matchesdicti[pattern] + matches
    print "\n".join([
        ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i]))))
        for i in sorted(dicti, key=dicti.get, reverse=True)
    ])
    #inspect patterns with more than 10 collected matches (the original
    #compared the match list itself against 10)
    for entry in [k for k, v in matchesdicti.items() if len(v) > 10]:
        print entry
        tk.tokenfinder([re.sub(r"[\(\)]", "", entry)],
                       "/Users/ps22344/Downloads/craig_0208",
                       lower_case=False)
    return results
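
A usage sketch (hedged): only input_dict.keys() is read by this snippet, so the values may stay empty; the keys and path are illustrative.

#search for space-delimited character sequences such as "<3"
characterfinder("/Users/ps22344/Downloads/craig_0208", {"<3": [], "xoxo": []})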
Code example #6
def rebusfinder_too(input_path, number_dictionary):
	"""
 	This finds words that are represented as numbers. 
 	All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
 	The lists exclude_pre and exclude_post word for negative contexts in 4.
 	It prints the results and give type and token counts. 
	
	"""
	for number in number_dictionary.keys():
		#this is for comments to self
		print "PRE"
		
		#this is the regular expression to identify instances of the number studied
		numberregex=re.compile("\W([a-z]+)\s*("+punctuationregex+")?\s*("+unicode(number)+")(?:\s+)?("+punctuationregex+")?(?:\s+)?([a-z]+)\W")
		print numberregex.pattern
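		#e.g. for number 2, "want 2 meet" yields the groups
		#("want", "", "2", "", "meet"): pre, optional punctuation, the number,
		#optional punctuation, post (illustrative)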
		#dicts to store statistics about context of number
		h0dict=defaultdict(int)
		h2dict=defaultdict(int)
		#lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
		previous_patterns=[]
		results=[]
		for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
			for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
				fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
				inputad=ct.adtextextractor(fili.read(), fil)
				inputad=ct.adcleaner(inputad, replace_linebreak=True)
				inputad=inputad.lower()
				hits=numberregex.findall(inputad)
				#this weeds out all the phone numbers
				hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
				for h in hits:
					#this is needed for instance where there is no punctuation
					h=[" " if i == "" else i for i in h]
					"""
					thus
					[(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]
					pre, "2", optional punctuation, post
					"""
					#"num" avoids clobbering the loop variable "number"
					[pre, pre_punct, num, punct, post]=pos_tag(h)
					if (post[1] in ["NNS"]) and (punct[0] in [" "]):
						print "\n\n***", [pre, num, punct, post], "**\n", os.path.join(input_path, pati, fil)
						#tally contexts and collect results; in the original these
						#updates lived only inside the commented-out filtering
						#block below (restored here in simplified form)
						h0dict[pre[0]]=h0dict[pre[0]]+1
						h2dict[post[0]]=h2dict[post[0]]+1
						results.append([pre, num, punct, post])
						search_pattern=[re.escape(i) for i in [pre[0], num[0], punct[0], post[0]]]
						if search_pattern not in previous_patterns:
							#"dir" pointed at the builtin here; use input_path
							tk.tokenfinder([r"\s*".join(search_pattern)], input_path)
							previous_patterns.append(search_pattern)
						else:
							print "SEE TOKENFINDER RESULTS ABOVE\n"
				#(a long commented-out block of exploratory filtering logic
				#followed here: POS-tag rules and hand-tuned include/exclude
				#word lists for classifying pre/post contexts, apparently
				#written for an earlier three-group version of the regex)

		print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
		h0dict={k:v for k,v in h0dict.items() if v > 0}
		print "\n\n", number, "\npretext here be the results\n\n"
		print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
		print "\n\n", number, "\nposttext here be the results\n\n"
		print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])

		print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
		print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
		return results
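
A usage sketch (hedged): punctuationregex, writtennumberdict, and word2vecwordfinder are assumed module-level names, and only the keys of number_dictionary are read by the loop above; the mapping is illustrative.

#look for digits used as words, e.g. "want 2 meet", "looking 4 fun"
rebusfinder_too("/Users/ps22344/Downloads/craig_0208", {2: "to", 4: "for"})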