def wordcounter(folder, list_of_clusters):
    with codecs.open(
        "/Users/ps22344/Downloads/chapter2/current/clusterskmeans_54_19_10_07_30.json", "r", "utf-8"
    ) as jsoninput:
        wordtovecclusters = json.load(jsoninput)
    wordtovecclusters = {int(k): set(v["words"]) for k, v in wordtovecclusters.items() if int(k) in list_of_clusters}
    for key in wordtovecclusters:
        start = time.time()
        print key
        wordcount = {i: 0 for i in wordtovecclusters[key]}
        filis = folderreader(folder)
        print "we have {} files to work with".format(len(filis))
        for fili in filis:
            with codecs.open(fili, "r", "utf-8") as inputfile:
                inputad = ct.adtextextractor(inputfile.read(), fili)
            addspace = stopregex.sub(r"\g<1> \g<2>", inputad)
            splittext = nltk.word_tokenize(addspace)
            splittextlo = [s.lower() for s in splittext if s]
            splittextlo = [s for s in splittextlo if not s in cluster_stopwords]
            if "wan" in splittextlo:
                print splittextlo
                print inputad
            for w in wordcount.keys():
                wordcount[w] = wordcount[w] + splittextlo.count(w)
        end = time.time()
        print "Aha this took us {} minutes".format((end - start) / 60)
        print "\n", key
        print [(k, wordcount[k]) for k in sorted(wordcount, key=wordcount.get, reverse=True)]
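#Minimal standalone sketch of the counting idiom wordcounter relies on: tally a token list
#against a fixed word set and print the counts sorted by frequency. The sample tokens and
#word set below are made up for illustration.
sample_tokens = ["hi", "lol", "hi", "brb", "hi"]
sample_counts = {w: 0 for w in set(["hi", "brb", "omg"])}
for w in sample_counts.keys():
    sample_counts[w] = sample_counts[w] + sample_tokens.count(w)
print [(k, sample_counts[k]) for k in sorted(sample_counts, key=sample_counts.get, reverse=True)]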
def tokenfinder(input_list, input_path, lower_case):
	"""
	the tokenfinder looks over the items in an input_list.
	the regex pattern is ".{,40}ITEM.{,40}".	
	it outputs findings in the corpus given in input_path.
	filename and total number of matches are printed. 
	input_path needs to have subfolders. 
	"""
	starttime=time.time()
	print "search term is ", input_list
	#construct the regexes
	typedict={}
	for item in input_list:
		typedict[re.compile(r".{,40}"+unicode(item)+".{,40}")]=0
	for typi in typedict:
		print typi.pattern	
	totalhits=[]
	#iterate over files
	for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
		#print pati
		for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
			fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
			inputad=ct.adtextextractor(fili.read(), fili)
			if lower_case:
				inputad=inputad.lower()
			matches=[k.findall(inputad) for k in typedict.keys()]
			if sum([len(i) for i in matches]) > 0:
				print "{} hits in file {}".format(sum([len(i) for i in matches]), os.path.join(input_path, pati, fil))
				print matches, "\n"
				totalhits.append(sum([len(i) for i in matches]))
	if sum(totalhits) == 0:
		print "\n---\nNO MATCHES IN TOKENFINDER\n---\n"
	else:
		print "{} matches total".format(sum(totalhits))
	endtime=time.time()
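#Illustrative sketch of the context-window regex tokenfinder builds: ".{,40}ITEM.{,40}"
#grabs up to 40 characters on either side of the search term. The sample text and search
#term below are made up.
import re
sample_pattern = re.compile(r".{,40}" + u"brb" + r".{,40}")
print sample_pattern.findall(u"I will brb in five, text me when you are back")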
# Example #3
def dictbuilder(input_path, output_file, remove_numbers=False):
	"""
	reads files in input_path	
	input_path needs to have subfolders. 
	if "remove_numbers", does not count numbers (as in "\d+").
	This was used for IDing leetspeak.
	"""
	worddict=defaultdict(int)
	for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
		print pati
		for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
			fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
			inputad=ct.adtextextractor(fili.read(), fil)
			inputad=inputad.lower()
			tokenized=ct.tokenizer(inputad)
			tokenized=[re.sub("\W","", i) for i in tokenized]
			if remove_numbers:
				tokenized=[i for i in tokenized if not re.match("\d+", i)]
			for token in [i for i in tokenized if i]:
				worddict[token]=worddict[token]+1
	print ("\n".join([":".join((k, unicode(worddict[k]))) for k in sorted(worddict, key=worddict.get, reverse=True) if worddict[k] > 50]))
	print "We created a dictionary of {} total words with {} types".format(sum(worddict.values()), len(worddict.keys()))		
	if output_file:
		with codecs.open(output_file, "w", "utf-8") as outputfile:
			json.dump(worddict, outputfile)	
			print "Dict written to ", outputfile
def categorymachine(folderlist):

    print "starting category machine"
    catdicti = {}
    catnumber = 0
    for folder in folderlist:
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                    "utf-8").read()
            inputtext = ct.adtextextractor(inputfile, fili)
            #lets establish the category
            #we need to make it numeric, so the numpy won't screw up
            category = ct.tagextractor(inputfile, "category1", fili)
            try:
                cat = catdicti[category]
            except KeyError:
                print "We added {} to the category dictionary, coded as {}".format(
                    category, catnumber)
                catdicti[category] = catnumber
                catnumber = catnumber + 1
                cat = catdicti[category]
    return (catdicti, catnumber)
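#Sketch of the category-coding idiom in categorymachine: each unseen category label gets
#the next free integer code. The labels below are made up; in the function above they come
#from ct.tagextractor.
sketch_catdicti = {}
sketch_catnumber = 0
for label in ["m4w", "w4m", "m4w", "m4m"]:
    if label not in sketch_catdicti:
        sketch_catdicti[label] = sketch_catnumber
        sketch_catnumber = sketch_catnumber + 1
print sketch_catdicti, sketch_catnumber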
# Example #5
def emoticonfinder(dir):
	"""
	The emoticonfinder takes a directory with corpus files as input. 
	We might consider making the file with emoticons an argument as well. 
	The emoticonfinder creates a list of relevant emoticons from a text file. 
	Then counts how often they occur in files in dir.
	--- Source file is /Users/ps22344/Downloads/chapter2/current/emoticoncounter.py ---
	"""
	starttime=time.time()
	#creating a featuredict from file
	featuredict={}
	with codecs.open('/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt', "r", "utf-8") as inputtext:
		for line in inputtext.readlines():
			featuredict[line.rstrip("\n")]=0
	#test formatting
	for k in featuredict:
		if k.startswith(" "):
			print "Warning: emoticon starts with whitespace:", repr(k)
	for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
		print pati
		for fili in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
			fili=codecs.open(os.path.join(dir, pati, fili), "r", "utf-8")
			inputad=ct.adtextextractor(fili.read(), fili)
			words=ct.tokenizer(inputad)
			for item in words:
				if item in featuredict:
					featuredict[item] = featuredict[item]+1
	print featuredict
	endtime=time.time()
	print "This took us {} minutes".format((endtime-starttime)/60)
# Example #6
def nonstandardcounter(filelist):
	"""
	The nonstandardcounter takes a list of files, then iterates over them. 
	Splits according to the same rules as the matrixmachine.
	Checks status of each word in PyEnchant (en_US plus mydictwords.txt) and counts how many are "False".
	Counts these and returns a dictionary of counts per word.
	It outputs the results as a JSON w/ the file name including year, month, and day.
	"""	
	
	count=0
	filedict={}
	typodict=defaultdict(float)
	for fili in filelist:
		#print fili
		#print os.path.join(pathi,  fili)
		inputfile=codecs.open(os.path.join(pathi,  fili), "r", "utf-8").read()
		inputad=ct.adtextextractor(inputfile, fili)
		count=count+1
		filedict[count]=os.path.join(pathi,  fili)
		addspace=stopregex.sub(r"\g<1> \g<2>", inputad)
		addspace=re.sub("<.*?>", " ", addspace)
		splittext=nltk.word_tokenize(addspace)
		#splittext=[s for s in splittext if s not in exclude]
		splittextlo=[s for s in splittext if s]
		for word in [w for w in splittextlo if not spelldicti.check(w) and w not in list(punctuation)]:
			if word == "nofollow":
				print splittextlo
			typodict[word]=typodict[word]+1
	return (typodict)
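#Sketch of the PyEnchant check nonstandardcounter relies on (spelldicti above is presumably
#an enchant dictionary, possibly a DictWithPWL with mydictwords.txt). Needs the pyenchant
#package and an installed en_US backend; the sample words are made up.
import enchant
from string import punctuation
sketch_spelldict = enchant.Dict("en_US")
sample_words = ["hello", "helo", "cu", "later", "!"]
print [w for w in sample_words if w not in list(punctuation) and not sketch_spelldict.check(w)]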
# Example #7
def dictbuilder(input_path, output_file, remove_numbers=False):
    """
	reads files in input_path	
	input_path needs to have subfolders. 
	if "remove_numbers", does not count numbers (as in "\d+").
	This was used for IDing leetspeak.
	"""
    worddict = defaultdict(int)
    for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
        print pati
        for fil in [
                i for i in os.listdir(os.path.join(input_path, pati))
                if not i.startswith(".")
        ]:
            fili = codecs.open(os.path.join(input_path, pati, fil), "r",
                               "utf-8")
            inputad = ct.adtextextractor(fili.read(), fil)
            inputad = inputad.lower()
            tokenized = ct.tokenizer(inputad)
            tokenized = [re.sub("\W", "", i) for i in tokenized]
            if remove_numbers:
                tokenized = [i for i in tokenized if not re.match("\d+", i)]
            for token in [i for i in tokenized if i]:
                worddict[token] = worddict[token] + 1
    print("\n".join([
        ":".join((k, unicode(worddict[k])))
        for k in sorted(worddict, key=worddict.get, reverse=True)
        if worddict[k] > 50
    ]))
    print "We created a dictionary of {} total words with {} types".format(
        sum(worddict.values()), len(worddict.keys()))
    if output_file:
        with codecs.open(output_file, "w", "utf-8") as outputfile:
            json.dump(worddict, outputfile)
            print "Dict written to ", outputfile
# Example #8
def dictmaker(folderlist, threshold=1000):
	#this is our general vocab
	vocab={}
	#collecting words
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith(".")]
		print "Building vocab: we have {} files in folder {}".format(len(filis), folder)
		#collect a dictionary with all words
		#lowercase them    
		for fili in filis:
			inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
			inputtext=ct.adtextextractor(inputfile, fili)
			splittext=nltk.word_tokenize(inputtext)
			splittextlo=[i.lower() for i in splittext]
			#do we want to lemmatize or things like that
			for word in splittextlo:
				if word not in vocab:
					vocab[word]=1
				else:
					vocab[word]=vocab[word]+1
	print "Our vocab dictionary has {} entries".format(len(vocab))
	#here we set the threshold
	featuredict= {key:value for key, value in vocab.items() if value > float(threshold) }
	print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict))
	return featuredict
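#Sketch of the vocab-plus-threshold step in dictmaker: tokenize with nltk, lowercase, count,
#then keep only words above the frequency threshold. Needs nltk with the "punkt" tokenizer
#data; the sample text and threshold are made up.
import nltk
sketch_vocab = {}
for word in [t.lower() for t in nltk.word_tokenize("Hi there. Hi again, hi!")]:
    sketch_vocab[word] = sketch_vocab.get(word, 0) + 1
print {key: value for key, value in sketch_vocab.items() if value > 2}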
def collofinder(main_term,regex):
	"""
	this looks over the keys of a dictionary that are regex patterns. 
	it outputs findings in the corpus given in "dir" with context.
	dir needs to have subfolders. 
	the twodict counts words with a distance of 2, the onedict counts words with a distance of 1.
	"""
	for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
		print pati
		for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
			fili=codecs.open(os.path.join(dir, pati, fil), "r", "utf-8")
			inputad=ct.adtextextractor(fili.read(), fili)
			inputad=tagregex.sub(" ", inputad)
			words=ct.tokenizer(inputad)
			words=[w.lower() for w in words]
			#specific words processing for numbers: introduce space between number immediately followed by word-character
			hits=[w for w in words if regex.match(w) ]
			#determines length of context extracted
			context=[-3,-2,-1,0, 1,2, 3]
			for matched in hits:
				if [i for i in context if words.index(matched) + i > len(words) -1 ] and main_term in words:
					print "too long"
					print [words[words.index(matched)+t] for t in [c for c in context if c <1 ]]
				elif hits and not [i for i in context if words.index(matched) + i > len(words) -1 ] and main_term in [words[words.index(matched)+t] for t in [-1,1]] :
					print fil
					print [words[words.index(matched)+t] for t in context]
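#Sketch of the +/-3 context-window extraction collofinder performs. It uses enumerate rather
#than list.index, which sidesteps repeated tokens pointing back to the first occurrence; the
#sample tokens and target are made up.
sample_words = ["meet", "me", "4", "coffee", "or", "4", "drinks"]
context = [-3, -2, -1, 0, 1, 2, 3]
for pos, w in enumerate(sample_words):
    if w == "4":
        print [sample_words[pos + t] for t in context if 0 <= pos + t < len(sample_words)]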
def wordcounter(folder, list_of_clusters):
    with codecs.open(
            '/Users/ps22344/Downloads/chapter2/current/clusterskmeans_54_19_10_07_30.json',
            'r', 'utf-8') as jsoninput:
        wordtovecclusters = json.load(jsoninput)
    wordtovecclusters = {
        int(k): set(v['words'])
        for k, v in wordtovecclusters.items() if int(k) in list_of_clusters
    }
    for key in wordtovecclusters:
        start = time.time()
        print key
        wordcount = {i: 0 for i in wordtovecclusters[key]}
        filis = folderreader(folder)
        print "we have {} files to work with".format(len(filis))
        for fili in filis:
            with codecs.open(fili, "r", "utf-8") as inputfile:
                inputad = ct.adtextextractor(inputfile.read(), fili)
            addspace = stopregex.sub(r"\g<1> \g<2>", inputad)
            splittext = nltk.word_tokenize(addspace)
            splittextlo = [s.lower() for s in splittext if s]
            splittextlo = [
                s for s in splittextlo if not s in cluster_stopwords
            ]
            if "wan" in splittextlo:
                print splittextlo
                print inputad
            for w in wordcount.keys():
                wordcount[w] = wordcount[w] + splittextlo.count(w)
        end = time.time()
        print "Aha this took us {} minutes".format((end - start) / 60)
        print "\n", key
        print[(k, wordcount[k])
              for k in sorted(wordcount, key=wordcount.get, reverse=True)]
def dictmaker(folderlist, threshold, remove_stopwords=True, remove_punct=True):
    """
	The dictmaker counts the words / items contained in the files found in the folders of folderlist.
	It returns a dictionary of all words that occur more often than the number threshold. 
	remove_stopwords uses the stopword list defined above to ignore words. 
	remove_punct works with string.punctuation, cf above. 
	"""
    #threshold sets how many times a word needs to occur to be included in the featuredict
    vocab = {}
    for folder in folderlist:
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        print "Building vocab: we have {} files in folder {}".format(
            len(filis), folder)
        #collect a dictionary with all words
        #lowercase them
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                    "utf-8").read()
            inputtext = ct.adtextextractor(inputfile, fili)
            splittext = nltk.word_tokenize(inputtext)
            splittextlo = [i.lower() for i in splittext]
            #do we want to lemmatize or things like that
            for word in splittextlo:
                if word not in vocab:
                    vocab[word] = 1
                else:
                    vocab[word] = vocab[word] + 1
    print "Our vocab dictionary has {} entries".format(len(vocab))
    ct.dictwriter(
        os.path.join("~/", chapterdir[0], "outputfiles",
                     "fulldict_" + time.strftime("%H_%M_%m_%d")), vocab)
    if remove_stopwords:
        vocab = {
            key: value
            for key, value in vocab.items() if key not in stopwords
        }
        print "After stop word removal, dict is {} long".format(len(vocab))
    if remove_punct:
        vocab = {
            key: value
            for key, value in vocab.items() if key not in punctuation
        }
        print "After punctuation removal, dict is {} long".format(len(vocab))
    featuredict = {
        key: value
        for key, value in vocab.items() if value > float(threshold)
    }
    print "Our feature dictionary has {} entries\n---------------\n".format(
        len(featuredict))
    print "This is our featuredict", featuredict
    ct.dictwriter(
        os.path.join("~/", chapterdir[0], "outputfiles",
                     "featuredict_" + time.strftime("%H_%M_%m_%d")),
        featuredict)
    return featuredict
# Example #12
def clippingcounter(clipping_list, input_dir):
    """
		The clippingcounter uses the clipping_list to count instances of the clippings listed there. 
		Here, we make that list out of the shorteningdict jsons created earlier. 
		The regex is designed to find lowercase and uppercase versions of each, plus plurals.
		The input_dir contains the text files to be iterated over. 
		It returns a list of match counts.
		e.g.
		clipping_list=['LOL', 'ROFL', 'ASL', 'BRB']
		result=[0,0,2,0] 
		"""
    excludelist = []

    #dicts to store results
    dicti = defaultdict(float)
    matchesdicti = defaultdict(list)
    results = []

    clipping_list = [
        re.compile("[^web|i]\W(" + i + ")\W")
        if i in ["cams?", "sites?"] else re.compile("\W(" + i + ")\W")
        for i in clipping_list
    ]
    #clipping_list=[re.compile("\W("+i+")\W") for i in clipping_list]
    clipping_list = set(clipping_list)
    print[i.pattern for i in clipping_list]
    #iterate and match
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, dir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili).lower()
            #result is a list of lists which contain matches for each regex/acronym
            result = [([m for m in i.findall(inputad)
                        if not m in excludelist], i.pattern)
                      for i in clipping_list]
            # o=[(r,os.path.join(input_dir, dir, fili)) for r in result if len(r[0]) > 2]
            # 				if o:
            # 					print o
            results.append([len(matches) for matches, pattern in result])
            for matches, pattern in result:
                #the dicti is {pattern:count, pattern: count, ...}
                dicti[pattern] = dicti[pattern] + len(matches)
                matchesdicti[pattern] = matchesdicti[pattern] + matches
    print "\n".join([
        ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i]))))
        for i in sorted(dicti, key=dicti.get, reverse=True)
    ])
    for entry in {k: v for k, v in matchesdicti.items() if len(v) > 10}:
        print entry
        tk.tokenfinder([re.sub("[\(\)]", "", entry)],
                       "/Users/ps22344/Downloads/craig_0208")
    return results
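#Sketch of the per-regex result structure clippingcounter builds: a (matches, pattern) pair
#per clipping regex, reduced to per-file match counts. The sample ad text and clipping list
#are made up.
import re
sketch_patterns = [re.compile(r"\W(" + c + r")\W") for c in ["pics?", "convo"]]
sketch_result = [(p.findall(" send pics and lets have a convo soon "), p.pattern) for p in sketch_patterns]
print [len(matches) for matches, pattern in sketch_result]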
# Example #13
def vec2wordclustercounter(folderlist, cluster_dictionary):
    """
	This is stolen from the cluster_analysis dictmaker. 
	The dictmaker counts the words / items contained in the files found in the folders of folderlist.
	remove_stopwords uses the stopword list defined above to ignore words. 
	remove_punct works with string.punctuation, cf above. 
	This was mainly used to test how well the counting in the word2vec analysis works.
	"""
    with codecs.open(cluster_dictionary, "r", "utf-8") as inputjson:
        clusterdict = json.load(inputjson)
    result = defaultdict(int)
    #this is just for qc
    misses = []
    for folder in folderlist:
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        print "Building vocab: we have {} files in folder {}".format(
            len(filis), folder)
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                    "utf-8").read()
            inputtext = ct.adtextextractor(inputfile, fili)
            #pre-processing here
            inputtext = ct.adcleaner(inputtext,
                                     replace_linebreak=True,
                                     remove_html=False)
            splittext = word_tokenize(inputtext)
            splittextlo = [i.lower() for i in splittext]
            finaltext = [punctuationregex.sub("", i) for i in splittextlo]
            finaltext = [i for i in finaltext if i and i not in ['br']]
            #do we want to lemmatize or things like that
            for word in finaltext:
                cluster = [
                    k for k, v in clusterdict.items() if word in v['words']
                ]
                if len(cluster) > 1:
                    print "Warning: The item {} was found in more than one clusters".format(
                        word)
                if len(cluster) < 1:
                    #print "Warning: The item could not be found in a cluster"
                    misses.append(word)
                else:
                    result[cluster[0]] = result[cluster[0]] + 1
    print "Our vocab dictionary has {} entries".format(len(result))
    ct.dictwriter(
        os.path.join("~/", chapterdir[0], "outputfiles",
                     "fulldict_" + time.strftime("%H_%M_%m_%d")), result)
    # 	featuredict= {key:value for key, value in vocab.items() if value > float(threshold) }
    # 	print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict))
    # 	print "This is our featuredict", featuredict
    # 	ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_"+time.strftime("%H_%M_%m_%d")), featuredict)
    print "misses", len(misses), set(misses)
    print result
    return result
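#Sketch of the word-to-cluster lookup in vec2wordclustercounter: for each token, find the
#cluster whose "words" list contains it and bump that cluster's count. The tiny cluster
#dictionary and token list below are made up.
from collections import defaultdict
sketch_clusterdict = {"0": {"words": ["hi", "hello"]}, "1": {"words": ["brb", "lol"]}}
sketch_result = defaultdict(int)
for word in ["hi", "lol", "lol", "zzz"]:
    cluster = [k for k, v in sketch_clusterdict.items() if word in v["words"]]
    if cluster:
        sketch_result[cluster[0]] = sketch_result[cluster[0]] + 1
print dict(sketch_result)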
# Example #14
def acronymcounter(acronym_list, input_dir):
    """
		The acronymcounter uses the acronym_list to count instances of the abbreviations listed there. 
		Here, we make that list out of the shorteningdict jsons created earlier. 
		The regex is designed to find lowercase and uppercase versions of each, plus plurals.
		The input_dir contains the text files to be iterated over. 
		It returns a list of match counts.
		e.g.
		acronym_list=['LOL', 'ROFL', 'ASL', 'BRB']
		result=[0,0,2,0] 
		NOTE:we can consider running location and schools over a different regex that does not include plural s.
		"""
    excludelist = set([
        "oks", "fbs", "PSS", "VAS", "vas", "BCS", "bcs", "NES", "nes", "SMS",
        "sms", "SAS", "SSS", "sss", "nsas", "mias"
    ])

    #dicts to store results
    dicti = defaultdict(float)
    matchesdicti = defaultdict(list)
    results = []

    #regex, lower and pluralize
    acronym_list = [
        re.compile("\W((?:" + i + "|" + i.lower() + ")[sS]?)\W")
        for i in acronym_list
    ]
    acronym_list = set(acronym_list)
    print[i.pattern for i in acronym_list]
    #iterate and match
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, dir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili)
            #result is a list of lists which contain matches for each regex/acronym
            result = [([m for m in i.findall(inputad)
                        if not m in excludelist], i.pattern)
                      for i in acronym_list]
            results.append([len(matches) for matches, pattern in result])
            for matches, pattern in result:
                #the dicti is {pattern:count, pattern: count, ...}
                dicti[pattern] = dicti[pattern] + len(matches)
                matchesdicti[pattern] = matchesdicti[pattern] + matches
    print "\n".join([
        ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i]))))
        for i in sorted(dicti, key=dicti.get, reverse=True)
    ])
    return results
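#Sketch of the case-plus-plural regex acronymcounter builds: "\W((?:LOL|lol)[sS]?)\W"
#catches LOL, lol, LOLs and lols between non-word characters. The sample text is made up.
import re
sketch_acronym = "LOL"
sketch_pattern = re.compile("\W((?:" + sketch_acronym + "|" + sketch_acronym.lower() + ")[sS]?)\W")
print sketch_pattern.findall(" lots of lols and LOL here ")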
# Example #15
def acronymfinder(dir, length, output_json):
    """
	This finds acronyms. 
	Dir is directory of files. 
	Length is length of desired acronym. 
	"""
    start = time.time()
    capitals = re.compile("^[A-Z]+$")
    featuredict = defaultdict(int)
    # {
    #'lol':0
    # }

    for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
        print "working on", pati
        for fili in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
            fili = codecs.open(os.path.join(dir, pati, fili), "r", "utf-8")
            inputad = ct.adtextextractor(fili.read(), fili)
            words = [w.rstrip(string.punctuation).lstrip(string.punctuation) for w in ct.tokenizer(inputad)]
            for item in words:
                if (capitals.match(item)) and (len(item) == length):
                    if not spell.spellchecker(item.lower()):
                        featuredict[item] = featuredict[item] + 1

    print sorted(featuredict.keys())
    print "SO many entries: ", len(featuredict)

    # sorted(d.items(), key=lambda x: x[1])
    # [":".join((i, str(y))) for i, y in sorted(featuredict, key=featuredict.get)]
    print "\n".join(
        [":".join((i, str(featuredict[i]))) for i in sorted(featuredict, key=featuredict.get, reverse=True)]
    )
    mid = time.time()
    print "this took us {} minutes".format((mid - start) / 60)
    if output_json:
        with codecs.open("output_acronyms" + str(length) + "letters.json", "w", "utf-8") as outputi:
            json.dump(featuredict, outputi)
    else:
        for entry in sorted(featuredict):
            if featuredict[entry] > 5:
                print "\n\n\n***", entry, "\n\n"
                tk.tokenfinder(
                    [r"\s" + entry + "\s"],
                    input_path="/Users/ps22344/Downloads/craig_0208/",
                    length=20,
                    lower_case=False,
                )
    end = time.time()
    print "this took us {} minutes".format((end - start) / 60)
# Example #16
def capsfinder(input_dir, input_dict):
    results = []
    #dicti is results by word/item
    dicti = defaultdict(float)
    #matchesdicti is results by Regexpattern
    matchesdicti = defaultdict(list)
    search_terms = [i for i in input_dict.keys()]
    print "search terms", [i.pattern for i in search_terms]
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, dir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili)
            #we exclude anything we have in our abbreviations dict
            #no, we cover this by subtracting the results later
            result = [
                ([t for t in i.findall(inputad)
                  if not t in abbreviations], i.pattern) for i in search_terms
            ]
            #print result
            if len(result) > 1:
                print "warning result > 1", len(result), result
            #this is the count we return
            results.append([len(matches) for matches, pattern in result])
            #here we inspect findings. note resultS vs result
            for matches, pattern in result:
                if len(matches) > 100:
                    print "matches", len(matches), os.path.join(
                        input_dir, dir, fili)
                    #the dicti is {pattern:count, pattern: count, ...}
                for res in matches:
                    dicti[res] = dicti[res] + 1
                    #print len(matches[0]), 'total', len(matches)
                    #matchesdicti collects the matches per regex, dicti per feature
                    matchesdicti[pattern] = matchesdicti[pattern] + matches
    print "\n".join([
        ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i]))))
        for i in sorted(dicti, key=dicti.get, reverse=True)
    ])
    # for entry in {k:v for k,v in matchesdicti.items()}:
    # 		print "\n", entry, matchesdicti[entry]
    # 	for entry in dicti:
    # 		print entry, dicti[entry]
    return results
# Example #17
def tokenfinder(input_list, input_path, length=40, lower_case=True):
    """
	the tokenfinder looks over the items in an input_list.
	the regex pattern is ".{,40}ITEM.{,40}".	
	it outputs findings in the corpus given in "dir".
	filename and total number of matches are printed. 
	dir needs to have subfolders. 
	"""
    starttime = time.time()
    allhits = []
    print "search term is ", input_list
    #construct the regexes
    typedict = {}
    for item in input_list:
        typedict[re.compile(r".{," + str(length) + "}" + unicode(item) +
                            ".{," + str(length) + "}")] = 0
    for typi in typedict:
        print "***", typi.pattern
    totalhits = []
    #iterate over files
    for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
        #print pati
        for fil in [
                i for i in os.listdir(os.path.join(input_path, pati))
                if not i.startswith(".")
        ]:
            fili = codecs.open(os.path.join(input_path, pati, fil), "r",
                               "utf-8")
            inputad = ct.adtextextractor(fili.read(), fili)
            if lower_case:
                #print "were lowercasing"
                inputad = inputad.lower()
            matches = [k.findall(inputad) for k in typedict.keys()]
            if sum([len(i) for i in matches]) > 0:
                print "{} hits in file {}".format(
                    sum([len(i) for i in matches]),
                    os.path.join(input_path, pati, fil))
                print matches, "\n"
                totalhits.append(sum([len(i) for i in matches]))
                allhits.append(matches)
    if sum(totalhits) == 0:
        print "\n---\nNO MATCHES IN TOKENFINDER\n---\n"
    else:
        print "{} matches total".format(sum(totalhits))
    endtime = time.time()
    return allhits
# Example #18
def vec2wordclustercounter(folderlist, cluster_dictionary):
	"""
	This is stolen from the cluster_analysis dictmaker. 
	The dictmaker counts the words / items contained in the files found in the folders of folderlist.
	remove_stopwords uses the stopword list defined above to ignore words. 
	remove_punct works with string.punctuation, cf above. 
	This was mainly used to test how well the counting in the word2vec analysis works.
	"""
	with codecs.open(cluster_dictionary, "r", "utf-8") as inputjson:
		clusterdict=json.load(inputjson)
	result=defaultdict(int)
	#this is just for qc
	misses=[]
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith(".")]
		print "Building vocab: we have {} files in folder {}".format(len(filis), folder)
		for fili in filis:
			inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
			inputtext=ct.adtextextractor(inputfile, fili)
			#pre-processing here
			inputtext=ct.adcleaner(inputtext ,replace_linebreak=True, remove_html=False)
			splittext=word_tokenize(inputtext)
			splittextlo=[i.lower() for i in splittext]	
			finaltext=[punctuationregex.sub("",i) for i in splittextlo]
			finaltext=[i for i in finaltext if i and i not in ['br']]	
			#do we want to lemmatize or things like that
			for word in finaltext:
				cluster= [k for k,v in clusterdict.items() if word in v['words']]
				if len(cluster) > 1:
					print "Warning: The item {} was found in more than one clusters".format(word)
				if len(cluster) < 1:
					#print "Warning: The item could not be found in a cluster"
					misses.append(word)
				else:
					result[cluster[0]]=result[cluster[0]]+1
	print "Our vocab dictionary has {} entries".format(len(result))
	ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "fulldict_"+time.strftime("%H_%M_%m_%d")), result)
# 	featuredict= {key:value for key, value in vocab.items() if value > float(threshold) }
# 	print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict))
# 	print "This is our featuredict", featuredict
# 	ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_"+time.strftime("%H_%M_%m_%d")), featuredict)
	print "misses", len(misses), set(misses)
	print result
	return result
# Example #19
def rebusfinder(input_path, word_dictionary, number_dictionary, excluded_words):
	"""
 	This finds words that are represented as numbers. 
 	All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
 	The lists exclude_pre and exclude_post cover negative contexts for 4.
 	It prints the results and gives type and token counts. 
	
	"""
	#with codecs.open(word_dictionary, "r", "utf-8") as worddictionary:
	#	worddictionary=json.load(worddictionary)
	#worddictionary={k:v for k,v in worddictionary.items() if not k in excluded_words and worddictionary[k] > 1}
	for number in number_dictionary.keys():
		numberregex=re.compile("\W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W")
		#just for now
		h0dict=defaultdict(int)
		h2dict=defaultdict(int)
		print numberregex.pattern
		for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
			for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
				fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
				inputad=ct.adtextextractor(fili.read(), fil)
				inputad=inputad.lower()
				hits=numberregex.findall(inputad)
				#this weeds out all the phonenumbers. 
				hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
				for h in hits:
					if h[0] in include_pre_context or h[2] in include_post_context:
						print h
						h0dict[h[0]]=h0dict[h[0]]+1
						h2dict[h[2]]=h2dict[h[2]]+1
					elif h[0] not in exclude_pre_context and h[2] not in exclude_post_context:
						if h[2]:#:=="days":
							print h
							h0dict[h[0]]=h0dict[h[0]]+1
							h2dict[h[2]]=h2dict[h[2]]+1
		print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
		h0dict={k:v for k,v in h0dict.items() if v > 0}
		print "\n\n", number, "\n\posttext here be the results\n\n"
		#print "\n".join([": ".join([k, unicode(h0dict[k])]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
		print "\n".join([": ".join([k, unicode(h2dict[k])]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])

		print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
		print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
# Example #20
def acronymfinder(dir, length, output_json):
	"""
	This finds acronyms. 
	Dir is directory of files. 
	Length is length of desired acronym. 
	"""
	start=time.time()
	capitals=re.compile("^[A-Z]+$")
	featuredict=defaultdict(int)
	#{
	#'lol':0
	#}
	
	for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
		print "working on", pati
		for fili in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
			fili=codecs.open(os.path.join(dir, pati, fili), "r", "utf-8")
			inputad=ct.adtextextractor(fili.read(), fili)
			words=[w.rstrip(string.punctuation).lstrip(string.punctuation) for w in ct.tokenizer(inputad)]
			for item in words:
				if (capitals.match(item)) and (len(item) == length):
					if not spell.spellchecker(item.lower()):
						featuredict[item] = featuredict[item]+1

	print sorted(featuredict.keys())
	print "SO many entries: ", len(featuredict)
	
	#sorted(d.items(), key=lambda x: x[1])
	#[":".join((i, str(y))) for i, y in sorted(featuredict, key=featuredict.get)]
	print  "\n".join([":".join((i, str(featuredict[i]))) for i in sorted(featuredict, key=featuredict.get, reverse=True)])
	mid=time.time()
	print "this took us {} minutes".format((mid-start)/60)
	if output_json:
		with codecs.open("output_acronyms"+str(length)+"letters.json", "w", "utf-8") as outputi:
			json.dump(featuredict, outputi)
	else:
		for entry in sorted(featuredict):
			if featuredict[entry] > 5:
				print "\n\n\n***",entry,"\n\n"
				tk.tokenfinder([r"\s"+entry+"\s"], input_path='/Users/ps22344/Downloads/craig_0208/', length=20, lower_case=False)
	end=time.time()
	print "this took us {} minutes".format((end-start)/60)
def dictmaker(folderlist, threshold, remove_stopwords=True, remove_punct=True):
	"""
	The dictmaker counts the words / items contained in the files found in the folders of folderlist.
	It returns a dictionary of all words that occur more often than the number threshold. 
	remove_stopwords uses the stopword list defined above to ignore words. 
	remove_punct works with string.punctuation, cf above. 
	"""
	#threshold sets how many times a word needs to occur to be included in the featuredict
	vocab={}
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith(".")]
		print "Building vocab: we have {} files in folder {}".format(len(filis), folder)
		#collect a dictionary with all words
		#lowercase them    
		for fili in filis:
			inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
			inputtext=ct.adtextextractor(inputfile, fili)
			#pre-processing here
			inputtext=adcleaner(inputtext ,replace_linebreak=True, remove_html=False)
			splittext=word_tokenize(inputtext)
			splittextlo=[i.lower() for i in splittext]	
			finaltext=[punctuationregex.sub("",i) for i in splittextlo]
			finaltext=[i for i in finaltext if i and i not in ['br']]	
			#do we want to lemmatize or things like that
			for word in finaltext:
				if word not in vocab:
					vocab[word]=1
				else:
					vocab[word]=vocab[word]+1
	print "Our vocab dictionary has {} entries".format(len(vocab))
	ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "fulldict_"+time.strftime("%H_%M_%m_%d")), vocab)
	if remove_stopwords:
		vocab= {key:value for key, value in vocab.items() if key not in stopwords }
		print "After stop word removal, dict is {} long".format(len(vocab))
	if remove_punct:
		vocab= {key:value for key, value in vocab.items() if key not in punctuation }
		print "After punctuation removal, dict is {} long".format(len(vocab))
	featuredict= {key:value for key, value in vocab.items() if value > float(threshold) }
	print "Our feature dictionary has {} entries\n---------------\n".format(len(featuredict))
	print "This is our featuredict", featuredict
	ct.dictwriter(os.path.join("~/", chapterdir[0], "outputfiles", "featuredict_"+time.strftime("%H_%M_%m_%d")), featuredict)
	return featuredict
def categorymachine(folderlist):
	print "starting category machine"
	catdicti={}
	catnumber=0
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith (".")]
		for fili in filis:
			inputfile=codecs.open(os.path.join(pathi, folder,fili), "r", "utf-8").read()
			inputtext=ct.adtextextractor(inputfile, fili)
			# lets establish the category
			# we need to make it numeric, so the numpy won't screw up
			category=ct.tagextractor(inputfile, "category1", fili)
			try: 
				cat=catdicti[category]
			except KeyError:
				print "We added {} to the category dictionary, coded as {}".format(category, catnumber)
				catdicti[category]=catnumber
				catnumber=catnumber+1
				cat=catdicti[category]
	return (catdicti, catnumber)
def collofinder(main_term, regex):
    """
	this looks over the keys of a dictionary that are regex patterns. 
	it outputs findings in the corpus given in "dir" with context.
	dir needs to have subfolders. 
	the twodict counts words with a distance of 2, the onedict counts words with a distance of 1.
	"""
    for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
        print pati
        for fil in [
                i for i in os.listdir(os.path.join(dir, pati))
                if not i.startswith(".")
        ]:
            fili = codecs.open(os.path.join(dir, pati, fil), "r", "utf-8")
            inputad = ct.adtextextractor(fili.read(), fili)
            inputad = tagregex.sub(" ", inputad)
            words = ct.tokenizer(inputad)
            words = [w.lower() for w in words]
            #specific words processing for numbers: introduce space between number immediately followed by word-character
            hits = [w for w in words if regex.match(w)]
            #determines length of context extracted
            context = [-3, -2, -1, 0, 1, 2, 3]
            for matched in hits:
                if [
                        i for i in context
                        if words.index(matched) + i > len(words) - 1
                ] and main_term in words:
                    print "too long"
                    print[
                        words[words.index(matched) + t]
                        for t in [c for c in context if c < 1]
                    ]
                elif hits and not [
                        i for i in context
                        if words.index(matched) + i > len(words) - 1
                ] and main_term in [
                        words[words.index(matched) + t] for t in [-1, 1]
                ]:
                    print fil
                    print[words[words.index(matched) + t] for t in context]
# Example #24
def wordcounter(input_dir, category_tag, category_dict):
    """
	counts the words per category in the files in input_dir.
	
	Parameters
	----------
	input_dir is the corpus directory
	category_tag is the name of the tag to be extracted with tagextractor. 
	category_dict is a dictionary of categories to be computed over (category names as keys)
	e.g. <location="X"> would be input with "location" as the category_tag and a dict with {"Austin":0, "Dallas":0, ...}
	Returns
	-------
	None; the per-category word counts are written to "wordcounter_<category_tag>.json".
	"""
    print "Running the wordcounter"
    resultdict = category_dict
    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print pati
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, pati))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, pati, fili), "r",
                             "utf-8") as inputfili:
                inputfili = inputfili.read()
            wordcount = len(
                ct.tokenizer(ct.adtextextractor(inputfili, fili),
                             remove_punctuation=True))
            category = ct.tagextractor(inputfili, category_tag, fili)
            if category in resultdict:
                resultdict[category] = resultdict[category] + wordcount
            else:
                print "\n\nWARNING:\n{} is not in the category_dict. What do we do now?\n\n".format(
                    category)
    print "Wordcounter done"

    with codecs.open("wordcounter_" + category_tag + ".json", "w",
                     "utf-8") as jsonout:
        json.dump(resultdict, jsonout)
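#Sketch of the per-category tallying above with stand-ins for the project-specific helpers:
#ct.tagextractor and ct.tokenizer are replaced by a simple regex and str.split, and the tag
#format <location="X">, sample ads, and category dict are all assumptions for illustration.
import re
sketch_ads = ['<location="Austin"> howdy y all', '<location="Dallas"> hi', '<location="Austin"> hey']
sketch_resultdict = {"Austin": 0, "Dallas": 0}
for ad in sketch_ads:
    category = re.search(r'<location="([^"]+)">', ad).group(1)
    wordcount = len(ad.split()) - 1  #minus one for the tag token itself
    if category in sketch_resultdict:
        sketch_resultdict[category] = sketch_resultdict[category] + wordcount
print sketch_resultdict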
# Example #25
def dictbuilder(input_dir, output_name, lowercase=False, print_dict=False):
    """
	The dictbuilder puts all words in the corpus (input_dir) into a dictionary and outputs as json. 
	Name of output file determined by output_name.
	If print_dict is set to True, prints our sorted dictionary.
	Format of the dict returned: {word:count, word:count, }
	"""
    dicti = defaultdict(float)
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, dir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili)
            inputad = [
                w.rstrip(string.punctuation).lstrip(string.punctuation)
                for w in ct.tokenizer(inputad)
            ]
            inputad = [w for w in inputad if w]
            if lowercase:
                for word in inputad:
                    dicti[word.lower()] = dicti[word.lower()] + 1
            else:
                for word in inputad:
                    dicti[word] = dicti[word] + 1
    if print_dict:
        print "\n".join([
            ":".join((i, str(dicti[i])))
            for i in sorted(dicti, key=dicti.get, reverse=True)
        ])
    with codecs.open(output_name + ".json", "w", "utf-8") as outputi:
        json.dump(dicti, outputi, encoding="utf8")
    print "Written dictionary with {} items to ".format(
        len(dicti)), output_name
    return dicti
# Example #26
def spellingcounter(input_dir):
    """
    The spellingcounter counts the number of misspelled words.
    It uses the PyEnchant library for spellchecking.
    It iterates over the files in input_dir.
    It returns a list of lists with (raw count, relative count) tuples.
    """
    start=time.time()
    americandict = enchant.Dict("en_US")
    goodwords=set(["wo", "'ve", "'m", "n't", "'s", "'ll", "'re", "'d", "non-"]+list(string.punctuation))
    htmlregex=re.compile("<.*?>")
    results=[]
    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print pati
        for fili in [i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".")]:
            #print fili
            result=[]
            fili=codecs.open(os.path.join(input_dir, pati, fili), "r", "utf-8")
            inputad=ct.adtextextractor(fili.read(), fili)
            inputad=htmlregex.sub(" ", inputad)
            words=ct.tokenizer(inputad)
            #print "\n\n\n", words
            wordcount=float(len(words))
            mistakes=[w for w in words if not americandict.check(w) and w not in goodwords]
            #print mistakes
            if wordcount-len(mistakes) < 0:
                 print "WARNING: negative count-mistakes", wordcount, len(correct), os.path.join(input_dir, pati, fili)
            results.append([(len(mistakes), len(mistakes)/wordcount)])
            #print "\n".join([":".join([i, str(dict[i])]) for i in sorted(dict, key=dict.get, reverse=True)])
    end=time.time()
    print "len results", len(results)
    print "this took us {} minutes".format((end-start)/60)
    print "shape of results, number of lists:", len(results),  "-- length of lists", set([len(i) for i in results])
    #for u in [[x[1] for x in i] for i in results]:
    #    print u
    #print [[x[0] for x in i] for i in results], [[x[1] for x in i] for i in results]
    return [[x[0] for x in i] for i in results], [[x[1] for x in i] for i in results]
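#Sketch of the (raw count, relative count) pair spellingcounter appends per file; the word
#list and the misspellings below are made up rather than produced by enchant.
sample_words = ["hi", "helo", "ther", "friend"]
sketch_mistakes = ["helo", "ther"]
sketch_wordcount = float(len(sample_words))
print [(len(sketch_mistakes), len(sketch_mistakes) / sketch_wordcount)]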
# Example #27
def charactercounter(input_dir, input_dict):
	results=[]
	dicti=defaultdict(float)
	matchesdicti=defaultdict(list)
	#search_terms=set([t for i in input_dict.values() for t in i])
	search_terms=[re.compile("|".join(i)) for i in input_dict.values()]
	print "search terms",  [i.pattern for i in search_terms]
	for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
		print dir
		for fili in [i for i in os.listdir(os.path.join(input_dir, dir)) if not i.startswith(".")]:
			with codecs.open(os.path.join(input_dir, dir, fili), "r", "utf-8") as inputtext:
				inputad=ct.adtextextractor(inputtext.read(), fili)
			#result is a list of lists which contain matches for each regex/acronym
			#the list comprehension just deletes empty search results from the "|" search
			result=[([t for m in i.findall(inputad) for t in m if t], i.pattern) for i in search_terms] 
			#print result
			results.append([len(matches) for matches, pattern in result])
			for matches, pattern in result:
				if len(matches) > 0:
					print "multiple matches", matches, os.path.join(input_dir, dir, fili)
				#if len(matches) > 0:
					#print len(matches)
					#the dicti is {pattern:count, pattern: count, ...}
					for res in matches[0]:
						dicti[res]=dicti[res]+1
					#print len(matches[0]), 'total', len(matches)
					#print inputad[inputad.index(matches[0])-20:inputad.index(matches[0])+20]
					#matchesdicti collects the matches per regex, dicti per feature
						matchesdicti[pattern]=matchesdicti[pattern]+matches
	#print "\n".join([":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i])))) for i in sorted(dicti, key=dicti.get, reverse=True)])	
	for entry in {k:v for k,v in matchesdicti.items()}:
		print "\n", entry, matchesdicti[entry]
	for entry in dicti:
		print entry, dicti[entry]
	for entry in matchesdicti:
		tk.tokenfinder(["(.{,20})(?<![A-Z] [A-Z]|Ave| MA)\s+(N)\s+(?!Houston|Ballard|word|Royaton|Wilmot|Tucson|Dallas|Warren|side|Avalon|St Pete|Scottsdale|Tampa|C[Oo][Uu][Nn][Tt][Yy]|[Rr][Oo][Ll][Ll]|Arl\.|Royaltown|Golden Isles|Oeleans|Ballard Rd|Broward|Ward|angola|Oracle|[Hubert|1st] Ave|European|Tryon|Hill\w+ |Wil\w+|[Ss][Uu][Bb][Jj][Ee][Cc][Tt]|state line|for now|with a dick|OT |of (\s+Dayton|Talla\w+)|THE INSIDE|THE SURROUNDING|TIME|AUGHTY|[A-Z] [A-Z] |&amp; 5th)(.{,20})"], "/home/ps22344/Downloads/craig_0208", length= 50, lower_case=False)
	return results 
# Example #28
def characterfinder(input_dir, input_dict):
    results = []
    dicti = defaultdict(float)
    matchesdicti = defaultdict(list)
    for entry in input_dict:
        print entry
    characterlist = set([re.compile(" " + i + " ") for i in input_dict.keys()])
    print[i.pattern for i in characterlist]
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, dir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili)
            #result is a list of lists which contain matches for each regex/acronym
            result = [([m for m in i.findall(inputad)
                        if not m in excludelist], i.pattern)
                      for i in characterlist]
            results.append([len(matches) for matches, pattern in result])
            for matches, pattern in result:
                #the dicti is {pattern:count, pattern: count, ...}
                dicti[pattern] = dicti[pattern] + len(matches)
                matchesdicti[pattern] = matchesdicti[pattern] + matches
    print "\n".join([
        ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i]))))
        for i in sorted(dicti, key=dicti.get, reverse=True)
    ])
    for entry in {k: v for k, v in matchesdicti.items() if len(v) > 10}:
        print entry
        tk.tokenfinder([re.sub("[\(\)]", "", entry)],
                       "/Users/ps22344/Downloads/craig_0208",
                       lower_case=False)
    return results
# Example #29
def rebusfinder_too(input_path):
	"""
	The rebus_too finder.
	It uses a list of expressions, pre-established thru "identifying_rebus_too_1022.py", to count 
	instances where a writer uses "2" instead of "too". 
	"""
	predict=defaultdict(int)
	postdict=defaultdict(int)
	
	for number in [2]:
		results=[]
		#this is the regular expression to identify instances of the number studied
		numberregex=re.compile("\W([a-z]+)\s*("+punctuationregex+")?\s*("+unicode(number)+")(?:\s+)?("+punctuationregex+")?(?:\s+)?([a-z]+)\W")
		print numberregex.pattern
		#dicts to store statistics about context of number
		h0dict=defaultdict(int)
		h2dict=defaultdict(int)
		#lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
		previous_patterns=[]
		results=[]
		for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
			for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
				fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
				inputad=ct.adtextextractor(fili.read(), fil)
				inputad=ct.adcleaner(inputad, replace_linebreak=True)
				inputad=inputad.lower()
				hits=numberregex.findall(inputad)
				#this weeds out all the phonenumbers. 
				hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
				for h in hits:
					#this is needed for instances where there is no punctuation
					
					h=[" " if i == "" else i for i in h]
					"""
					thus
					[(u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')]
					pre, "2", optional punctuation, post
					"""
					[pre, pre_punct, number, punct, post]=pos_tag(h)
					
					if (
									
					#unique items catcher
					(pre[0] in ["date"]) 
					or
					(pre[0] in ["it"] and post[0] in ["i"])
					or
					(pre[0] in ["cook"] and post[0] in ["im"])
					or
					(pre[0] in ["kids"] and post[0] in ["young"]) 
					or
					(pre[0] in ["life", "way"] and post[0] in ["short"])
					or
					(pre[0] in ["that"] and post[0] in ["hard"])
					or
					(pre[0] in ["real"] and post[0] in ["hope"])
					or
					(pre[0] in ["me"] and post[0] in ["if"])
					or
					(pre[0] in ["dogs"] and post[0] in ["if"])
					or
					(pre[0] in ["can"] and post[0] in ["but"])
					or
					(pre[0] in ["kool"] and not post[0] in ["even"])
					or
					(pre[0] in ["on"] and punct[0] not in [" "] and inputad.split()[inputad.split().index(pre[0])-1] == "later")# and (h[h.index(pre[0])] == "later"))
					or
					(pre[0] in ["love"] and punct[0] not in [" "] and post[0] in ["msg"])
					or
					(pre[0] in ["real"] and post[0] in ["have"])
					or
					#BIGGER NETS
					#you be too in front of punctuation catch
					(pre[0] in ["be", "b", "are", "r"] and punct[0] not in [" ", "-", ")"])
					or
					#this is if we know the pre-word and 2 is followed by punctuation
					# cf 'intellectualy ability 2. '
					(pre[0] in prewords_withpunct and punct[0] not in [" ", ")", ":"])
					or
					#this is if we know the word to follow
					# cf 'not 2 late.' collected in postwords
					(post[0] in postwords)
					or
					#this is if we know the word to precede
					(pre[0] in prewords)
					):
					
						print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
						results.append((pre, number, punct, post, os.path.join(input_path, pati, fil)))
						predict[pre[0]]=predict[pre[0]]+1
						postdict[post[0]]=postdict[post[0]]+1
		print "original result list is", len(results)
		seti=set(results)
		print "\n\n", seti
		print "the set is ", len(seti)
		overlap={k:results.count(k) for k in seti}
		print overlap
		print {k:overlap[k] for k in overlap if overlap[k] > 1}
		print "PRE CONTEXT"
		print "\n".join([": ".join([k, unicode(predict[k])]) for k in sorted(predict, key=predict.get, reverse=True)])
		print "POST CONTEXT"
		print "\n".join([": ".join([k, unicode(postdict[k])]) for k in sorted(postdict, key=postdict.get, reverse=True)])
# Example #30
	"""
	if replace_linebreak:
		text=linebreakregex.sub(".", text)
	text=stopregex.sub(r"\g<1> \g<2>", text)
	if remove_html:
		text=htmlregex.sub(" ", text)
	return text
	
for folder in folderlist:
	filis=[i for i in os.listdir(os.path.join(pathi,folder)) if not i.startswith(".")]
	print "Building vocab: we have {} files in folder {}".format(len(filis), folder)
	#collect a dictionary with all words
	#lowercase them    
	for fili in filis[:10]:
		inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
		inputtext=ct.adtextextractor(inputfile, fili)
		print "\n\n\npre",inputtext
		#pre-processing here
		inputtext=adcleaner(inputtext ,replace_linebreak=True, remove_html=False)
		splittext=word_tokenize(inputtext)
		splittextlo=[i.lower() for i in splittext]	
 		print "\n past", splittextlo
 		finaltext=[punctuationregex.sub("",i) for i in splittextlo]
 		finaltext=[i for i in finaltext if i and i not in ['br']]	
 		print finaltext
		



# def sentencefeeder(text):
# 	sents=sent_tokenize(text)
def matrixmachine(folderlist, featuredict, testmode, *args):

	"""
	The matrixmachine creates matrices of word frequencies.
	It returns 
	wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering.
	wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings. 
	catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details. 
	filedict, a dictionary that maps file names to rows in the matrix. For later comparison of clusterings. 
	It takes
	The folderlist is a collection of folders to iterate over. 
	The featuredict is a dictionary containing the words to count.
	If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. 
	(Note that the testmode comes all the way from main())
	The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. 
	Args will be added to the matrix_with_cat. 
	"""
	print "Starting the matrixmachine"
	print "external categories: ", len(args)
	print args
	#the plus one in here is for the file id
	wordmatrix=np.empty(shape=(1,(len(featuredict)+len(args)+1)))
	print "Matrix initial shape: ", np.shape(wordmatrix)
	# making a dictionary for the categories
	# we need the zero cause the machine returns 2 items
	count=0
	catdicti=categorymachine(folderlist)[0]
	filedict={}
	featuredict={k:featuredict[k]['words'] for k in featuredict.keys()}
	featuredict={k:set([i for i in featuredict[k] if not i in cluster_stopwords]) for k in featuredict.keys()}
	for folder in folderlist:
		filis=[i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
		if testmode:
			print "\n\nRUNNING\nIN\nTEST\nMODE\n"
			filis=filis[:200]
		print "Building matrices: we have {} files in folder {}".format(len(filis), folder)
		for fili in filis:
			inputfile=codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
			inputad=ct.adtextextractor(inputfile, fili)
			#establish category
			for external_cat in args:
				cat=catdicti[ct.tagextractor(inputfile, external_cat, fili)]
			count=count+1
			filedict[count]=os.path.join(pathi, folder, fili)
			addspace=stopregex.sub(r"\g<1> \g<2>", inputad)
			splittext=nltk.word_tokenize(addspace)
			splittext=[s for s in splittext if s not in exclude]
			splittextlo=[s.lower() for s in splittext if s and not excluderegex.match(s)]
			wordcount=float(len(splittextlo))
			#for each word2vec cluster: cluster/total words
			# this is a per word frequency
			#for t in featuredict:
				#print "\n", t#featuredict[t]
				#print [splittextlo.count(i) for i in featuredict[t]]
				#if sum ([splittextlo.count(i) for i in set(featuredict[t])]) > 10:
				#	print [i for i in splittextlo if i in featuredict[t]]
			#addict={k:[i for i in v] for k,v in featuredict.items()} 
			addict={k:sum([float(splittextlo.count(i))for i in v]) for k,v in featuredict.items()}
			addict={k:v/wordcount for k,v in addict.items()}
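			# worked example (hypothetical numbers): if the words of one cluster occur 5 times
			# in an ad of 200 tokens, that cluster's cell for this ad is 5/200 = 0.025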
			#print addict
			wordvector=np.array([float(cat)]+[float(count)]+addict.values())
			#we append it to the matrix
			wordmatrix=np.append(wordmatrix, [wordvector], axis=0)
	print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype)
	print "---------------\nEnd of public service announcements\n\n"
	#"In 2D, the first dimension corresponds to rows, the second to columns."
	# we don't look at the first row cause that was just for initialization
	# the one without cats we put into the clustering algorithm
	wordmatrix_without_cat=wordmatrix[1:wordmatrix.shape[0],len(args)+1:wordmatrix.shape[1]]
	print "without", np.shape(wordmatrix_without_cat)
	wordmatrix_with_cat=wordmatrix[1:wordmatrix.shape[0],]
	print "with", np.shape(wordmatrix_with_cat)
	return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
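# A minimal numpy sketch (made-up numbers) of the row layout built above: each row is
# [category, file_id, feature_1, ..., feature_n]. The real code also drops the placeholder
# row created by np.empty(), i.e. wordmatrix[1:, ...]; here only the column slice is shown.
import numpy as np

demo = np.array([[1.0, 1.0, 0.002, 0.000, 0.010],    # category, file id, three cluster scores
                 [2.0, 2.0, 0.000, 0.004, 0.003]])
n_args = 1                                            # one external category, e.g. gender
print demo[:, n_args + 1:]                            # feature columns only -> clustering input
print demo                                            # full matrix, kept for later comparison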
예제 #32
def rebusfinder_too(input_path):
	"""
	The rebus_too finder.
	It uses a list of expressions, pre-established thru "identifying_rebus_too_1022.py", to count 
	instances where a writer uses "2" instead of "too". 
	"""
	predict=defaultdict(int)
	postdict=defaultdict(int)
	
	for number in [2]:
		results=[]
		#this is the regular expression to identify instances of the number studied
		numberregex=re.compile("\W([a-z]+)\s*("+punctuationregex+")?\s*("+unicode(number)+")(?:\s+)?("+punctuationregex+")?(?:\s+)?([a-z]+)\W")
		print numberregex.pattern
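		# With number=2 and punctuationregex assumed to match a single punctuation mark,
		# findall() returns five groups per hit, e.g. for " way 2 short. " roughly
		# (u'way', u'', u'2', u'', u'short'): pre word, optional punctuation, the number,
		# optional punctuation, post word (empty strings where no punctuation was present).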
		#dicts to store statistics about context of number
		h0dict=defaultdict(int)
		h2dict=defaultdict(int)
		#lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
		previous_patterns=[]
		results=[]
		for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
			for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
				fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
				inputad=ct.adtextextractor(fili.read(), fil)
				inputad=ct.adcleaner(inputad, replace_linebreak=True)
				inputad=inputad.lower()
				hits=numberregex.findall(inputad)
				#this weeds out all the phonenumbers. 
				hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
				for h in hits:
					#this is needed for instances where there is no punctuation
					
					h=[" " if i == "" else i for i in h]
					"""
					the five groups are unpacked as
					pre, optional pre-punctuation, "2", optional punctuation, post,
					each POS-tagged, e.g. (u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')
					"""
					[pre, pre_punct, number, punct, post]=pos_tag(h)
					
					if (
									
					#unique items catcher
					(pre[0] in ["date"]) 
					or
					(pre[0] in ["it"] and post[0] in ["i"])
					or
					(pre[0] in ["cook"] and post[0] in ["im"])
					or
					(pre[0] in ["kids"] and post[0] in ["young"]) 
					or
					(pre[0] in ["life", "way"] and post[0] in ["short"])
					or
					(pre[0] in ["that"] and post[0] in ["hard"])
					or
					(pre[0] in ["real"] and post[0] in ["hope"])
					or
					(pre[0] in ["me"] and post[0] in ["if"])
					or
					(pre[0] in ["dogs"] and post[0] in ["if"])
					or
					(pre[0] in ["can"] and post[0] in ["but"])
					or
					(pre[0] in ["kool"] and not post[0] in ["even"])
					or
					(pre[0] in ["on"] and punct[0] not in [" "] and inputad.split()[inputad.split().index(pre[0])-1] == "later")# and (h[h.index(pre[0])] == "later"))
					or
					(pre[0] in ["love"] and punct[0] not in [" "] and post[0] in ["msg"])
					or
					(pre[0] in ["real"] and post[0] in ["have"])
					or
					#BIGGER NETS
					#you be too in front of punctuation catch
					(pre[0] in ["be", "b", "are", "r"] and punct[0] not in [" ", "-", ")"])
					or
					#this is if we know the pre-word and 2 is followed by punctuation
					# cf 'intellectualy ability 2. '
					(pre[0] in prewords_withpunct and punct[0] not in [" ", ")", ":"])
					or
					#this is if we know the word to follow
					# cf 'not 2 late.' collected in postwords
					(post[0] in postwords)
					or
					#this is if we know the word to precede
					(pre[0] in prewords)
					):
					
						print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
						results.append((pre, number, punct, post, os.path.join(input_path, pati, fil)))
						predict[pre[0]]=predict[pre[0]]+1
						postdict[post[0]]=postdict[post[0]]+1
		print "original result list is", len(results)
		seti=set(results)
		print "\n\n", seti
		print "the set is ", len(seti)
		overlap={k:results.count(k) for k in seti}
		print overlap
		print {k:overlap[k] for k in overlap if overlap[k] > 1}
		print "PRE CONTEXT"
		print "\n".join([": ".join([k, unicode(predict[k])]) for k in sorted(predict, key=predict.get, reverse=True)])
		print "POST CONTEXT"
		print "\n".join([": ".join([k, unicode(postdict[k])]) for k in sorted(postdict, key=postdict.get, reverse=True)])
dir = '/Users/ps22344/Downloads/craig_0208/'  #adfiles_output_0116'

#check if we find items
starttime = time.time()
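# "numbersdict" is not defined in this snippet. Judging from how it is used below
# (compiled regexes as keys, findall() over the ad text), a hypothetical stand-in
# for running the loop could look like this -- an assumption, not the original dict:
import re
numbersdict = {
    re.compile(r"\b2[a-z]+"): 0,   # hypothetical: "2nite", "2day", ...
    re.compile(r"[a-z]+2\b"): 0,   # hypothetical: "in2", "down2", ...
}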

for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
    """
	this looks over the keys of numbersdict, which are regex patterns. 
	it prints any matches found in the corpus given in "dir". 
	dir needs to have subfolders. 
	"""
    print pati
    for fil in [
            i for i in os.listdir(os.path.join(dir, pati))
            if not i.startswith(".")
    ]:
        fili = codecs.open(os.path.join(dir, pati, fil), "r", "utf-8")
        inputad = ct.adtextextractor(fili.read(), fili)
        inputad = inputad.lower()
        matches = [k.findall(inputad) for k in numbersdict.keys()]
        if sum([len(i) for i in matches]) > 0:
            print "hits", sum([len(i) for i in matches]), fil
            print matches

print "our numbersdict", numbersdict

endtime = time.time()

print "This took us {} minutes".format((endtime - starttime) / 60)
def rebusfinder_too(input_path, number_dictionary):
	"""
	This finds words that are represented as numbers. 
	All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
	The lists exclude_pre_context and exclude_post_context rule out negative contexts. 
	It prints the results and gives type and token counts. 
	
	"""
	for number in number_dictionary.keys():
		#this is for comments to self
		print "PRE"
		
		#this is the regular expression to identify instances of the number studied
		numberregex=re.compile("\W([a-z]+)\s*("+punctuationregex+")?\s*("+unicode(number)+")(?:\s+)?("+punctuationregex+")?(?:\s+)?([a-z]+)\W")
		print numberregex.pattern
		#dicts to store statistics about context of number
		h0dict=defaultdict(int)
		h2dict=defaultdict(int)
		#lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
		previous_patterns=[]
		results=[]
		for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
			for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
				fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
				inputad=ct.adtextextractor(fili.read(), fil)
				inputad=ct.adcleaner(inputad, replace_linebreak=True)
				inputad=inputad.lower()
				hits=numberregex.findall(inputad)
				#this weeds out all the phonenumbers. 
				hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
				for h in hits:
					#this is needed for instances where there is no punctuation
					h=[" " if i == "" else i for i in h]
					"""
					the five groups are unpacked as
					pre, optional pre-punctuation, "2", optional punctuation, post,
					each POS-tagged, e.g. (u'of', 'IN'), (u'2', 'CD'), (u',', ','), (u'single', 'JJ')
					"""
					[pre, pre_punct, number, punct, post]=pos_tag(h)
					if (post[1] in ["NNS"]) and (punct[0] in [" "]):
						print "\n\n***", [pre, number, punct, post], "**\n", os.path.join(input_path, pati, fil)
						search_pattern=[re.escape(i) for i in [pre[0],number[0], punct[0], post[0]]]
						if search_pattern not in previous_patterns:
							tk.tokenfinder(["\s*".join(search_pattern)], dir)
							previous_patterns.append(search_pattern)
						else:
							print "SEE TOKENFINDER RESULTS ABOVE\n"			
						#error catching here 
						#
				
				
				
				# for h in hits:
# 					if h[2]:#==".":
# 						print  h, os.path.join(input_path, pati, fil)
# 						print pos_tag(h), "\n"
						
					#if not any (regex.match(h[2]) for regex in exclude_post_context) and not any (regex.match(h[0]) for regex in exclude_pre_context):
						#tagged=pos_tag(h), fil
						#print tagged
						#if h[2] not in [" "]:
						#	print tagged, os.path.join(input_path, pati, fil)
							#print inputad
						#h0dict[h[0]]=h0dict[h[0]]+1
 						#h2dict[h[2]]=h2dict[h[2]]+1
						#h0dict[tagged[0][1]]=h0dict[tagged[0][1]]+1
						#h2dict[tagged[2][1]]=h2dict[tagged[2][1]]+1
						#taking out trash
						# if (
# 							(tagged[0][1] in ["DT", "JJS", "TO", "PRP$"]) 
# 							or
# 							(tagged[0][1]=="IN" and h[0] not in ["out", "like"])
# 							or
# 							(tagged[0][1] in ["VBG"] and h[0] not in ["talking", "responding", "waiting", "getting","looking", "going", "trying"])
# 							or
# 							(tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"] and tagged[2][1] in ["JJ"])
# 							or
# 							#this is where we screw up
# 							(tagged[2][1] in ["NNS"] and h[2] not in ["chat", "kiss", "go", "know", "find", "do", "c", "knees"])
# 							or
# 							(tagged[2][1]=="IN")
# 							or
# 							(tagged[2][1]=="CC" and h[2] not in ["but"])
# 							or
# 							#we don't need this if we are to just ignore whatever goes thru all of it
# 							#TEMPTEMPTEMP
# 							(h[0] in ["be", "other", "s", "type", "was", "work", "im", "baths", "you", "maybe", "big", "day", "o", "round", "ride", "avengers", "kids", "had", "number", "have", "like", "here", "size", "got", "are", "send", "only", "have", "go", "is", "bedroom", "but", "beautiful", "nice"])
# 							or
# 							(h[2] in ["face", "new", "faced", "wonderful", "must", "min", "short", "si", "br", "step", "start", "so", "out", "story", "bdrm", "other", "out", "story", "yr", "looking", "more", "but", "hrs", "bedroom"])
# 							or 
# 							(tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])
# 							):
# 							#print "killed",tagged, "\n"
# 							pass
# 						
# 						#finding the good
# 						elif (
# 							(tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"])
# 							or
# 							(tagged[2][1] in ["JJ"] and h[0] in ["opposed"])
# 							or
# 							(tagged[2][1] in ["PRP"] and not nounregex.match(tagged[0][1]))
# 							or
# 							(h[0] == "have" and h[2] in ["browse", "force", "go", "send", "talk"])
# 							or
# 							(h[0] == "like" and h[2] not in ["furry", "cuz", "straight"])
# 							or
# 							(h[0] in ["here"] and nounregex.match(tagged[2][1]))
# 							or
# 							#really what we are exluding here is anything non-Verb or Noun
# 							# we can consider replacing this with a regex
# 							(h[0] in ["need", "me", "pics"] and tagged[2][1] not in ["JJ", "JJR", "MD"])
# 							or 
# 							(h[0] in ["momma", "women", "delighted", "tryn", "respond", "travel", "veldkum", "happness", "pool", "lots", "bbw", "willin", "luvz", "place", "time", "married", "pixs", "boy", "pictures", "brickz", "somebody", "memphis", "cell", "fear", "hoop", "open", "goes", "afraid", "speak", "lady", "needs", "attracted", "doms", "bottom", "head", "apply", "drive", "pic", "newer", "pinned", "luvs", "sumbody", "face", "due", "tryin", "line", "has", "close", "interested", "alot", "oral", "talk", "new", "girl", "up", "scared", "willing", "cam", "loves", "c**k", "out", "u", "nice", "how", "free", "hard", "hope", "able", "someone", "man", "woman", "male", "down", "love", "luv", "ready", "want", "wants"]+["talking", "responding", "waiting", "getting","looking", "lookin", "going", "trying"])
# 							or
# 							(h[2] in ["survive", "brag", "blow", "grab", "feel", "send", "connect", "hearing", "say", "read", "contact", "please", "run", "host","kno", "talk", "just", "add", "text", "chill", "hang", "date", "find", "chat", "show", "u", "meet", "her", "hear", "me", "my", "b", "know", "play", "do", "suck", "go", "get", "f**k"])
# 							):
# 							#print "hooked the plusloop", tagged
# 							print tagged
# 							results.append(tagged)
# 							h0dict[h[0]]=h0dict[h[0]]+1
#  							h2dict[h[2]]=h2dict[h[2]]+1
# 						else:
# 							pass
							#if tagged[2][1]:#=="VB":# in ["VBP", "VBG"]:#=="go":#:# in ['my']:#, 'know', 'my']:#["me", "need", "man"]:# == "down":#h[2] not in ["have", "and", "like", "hear"]:
							#	print tagged
								#print "elseloop", tagged
# 								h0dict[h[0]]=h0dict[h[0]]+1
# 								h2dict[h[2]]=h2dict[h[2]]+1
								#h0dict[tagged[0][1]]=h0dict[tagged[0][1]]+1
								#h2dict[tagged[2][1]]=h2dict[tagged[2][1]]+1
									
	
		
		print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
		h0dict={k:v for k,v in h0dict.items() if v > 0}
		print "\n\n", number, "\npretext here be the results\n\n"
		print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
		print "\n\n", number, "\nposttext here be the results\n\n"
		print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])

		print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
		print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
		return results
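# Hedged usage sketch (path and dictionary value are placeholders; only the keys of
# number_dictionary are used above):
# results = rebusfinder_too("/path/to/corpus", {2: "too"})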
예제 #35
    return text


for folder in folderlist:
    filis = [
        i for i in os.listdir(os.path.join(pathi, folder))
        if not i.startswith(".")
    ]
    print "Building vocab: we have {} files in folder {}".format(
        len(filis), folder)
    #collect a dictionary with all words
    #lowercase them
    for fili in filis[:10]:
        inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                "utf-8").read()
        inputtext = ct.adtextextractor(inputfile, fili)
        print "\n\n\npre", inputtext
        #pre-processing here
        inputtext = adcleaner(inputtext,
                              replace_linebreak=True,
                              remove_html=False)
        splittext = word_tokenize(inputtext)
        splittextlo = [i.lower() for i in splittext]
        print "\n past", splittextlo
        finaltext = [punctuationregex.sub("", i) for i in splittextlo]
        finaltext = [i for i in finaltext if i and i not in ['br']]
        print finaltext

# def sentencefeeder(text):
# 	sents=sent_tokenize(text)
# 	#print sents
def rebusfinder(input_path, word_dictionary, number_dictionary, excluded_words):
	"""
	This finds the word "to" when it is represented as the number 2. 
	All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
	The lists exclude_pre_context and exclude_post_context exclude instances where a given word precedes (pre) or follows (post) the "2", per regex. 
	Procedure: 
	Eliminate all excluded pre and post contexts;
	POS tag the remaining hits and eliminate certain combinations;
	Find positives by POS tag and a word list;
	Dismiss whatever remains. 
	It returns a list of positives. 
	It prints the results and gives type and token counts. 
	
	"""
	#with codecs.open(word_dictionary, "r", "utf-8") as worddictionary:
	#	worddictionary=json.load(worddictionary)
	#worddictionary={k:v for k,v in worddictionary.items() if not k in excluded_words and worddictionary[k] > 1}
	for number in number_dictionary.keys():
		numberregex=re.compile("\W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W")
		#just for now
		h0dict=defaultdict(int)
		h2dict=defaultdict(int)
		print numberregex.pattern
		results=[]
		for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
			for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
				fili=codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8")
				inputad=ct.adtextextractor(fili.read(), fil)
				inputad=inputad.lower()
				hits=numberregex.findall(inputad)
				#this weeds out all the phonenumbers. 
				hits=[h for h in hits if h[0] not in writtennumberdict and h[2] not in writtennumberdict]
				for h in hits:
					#print  h
					if not any (regex.match(h[2]) for regex in exclude_post_context) and not any (regex.match(h[0]) for regex in exclude_pre_context):
						tagged=pos_tag(h)
						#taking out trash
						if (
							(tagged[0][1] in ["DT", "JJS", "TO", "PRP$"]) 
							or
							(tagged[0][1]=="IN" and h[0] not in ["out", "like"])
							or
							(tagged[0][1] in ["VBG"] and h[0] not in ["talking", "responding", "waiting", "getting","looking", "going", "trying"])
							or
							(tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"] and tagged[2][1] in ["JJ"])
							or
							#this is where we screw up
							(tagged[2][1] in ["NNS"] and h[2] not in ["chat", "kiss", "go", "know", "find", "do", "c", "knees"])
							or
							(tagged[2][1]=="IN")
							or
							(tagged[2][1]=="CC" and h[2] not in ["but"])
							or
							#we don't need this if we are to just ignore whatever goes thru all of it
							#TEMPTEMPTEMP
							(h[0] in ["be", "other", "s", "type", "was", "work", "im", "baths", "you", "maybe", "big", "day", "o", "round", "ride", "avengers", "kids", "had", "number", "have", "like", "here", "size", "got", "are", "send", "only", "have", "go", "is", "bedroom", "but", "beautiful", "nice"])
							or
							(h[2] in ["face", "new", "faced", "wonderful", "must", "min", "short", "si", "br", "step", "start", "so", "out", "story", "bdrm", "other", "out", "story", "yr", "looking", "more", "but", "hrs", "bedroom"])
							or 
							(tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])
							):
							#print "killed",tagged, "\n"
							pass
						
						#finding the good
						elif (
							(tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"])
							or
							(tagged[2][1] in ["JJ"] and h[0] in ["opposed"])
							or
							(tagged[2][1] in ["PRP"] and not nounregex.match(tagged[0][1]))
							or
							(h[0] == "have" and h[2] in ["browse", "force", "go", "send", "talk"])
							or
							(h[0] == "like" and h[2] not in ["furry", "cuz", "straight"])
							or
							(h[0] in ["here"] and nounregex.match(tagged[2][1]))
							or
							#really what we are exluding here is anything non-Verb or Noun
							# we can consider replacing this with a regex
							(h[0] in ["need", "me", "pics"] and tagged[2][1] not in ["JJ", "JJR", "MD"])
							or 
							(h[0] in ["momma", "women", "delighted", "tryn", "respond", "travel", "veldkum", "happness", "pool", "lots", "bbw", "willin", "luvz", "place", "time", "married", "pixs", "boy", "pictures", "brickz", "somebody", "memphis", "cell", "fear", "hoop", "open", "goes", "afraid", "speak", "lady", "needs", "attracted", "doms", "bottom", "head", "apply", "drive", "pic", "newer", "pinned", "luvs", "sumbody", "face", "due", "tryin", "line", "has", "close", "interested", "alot", "oral", "talk", "new", "girl", "up", "scared", "willing", "cam", "loves", "c**k", "out", "u", "nice", "how", "free", "hard", "hope", "able", "someone", "man", "woman", "male", "down", "love", "luv", "ready", "want", "wants"]+["talking", "responding", "waiting", "getting","looking", "lookin", "going", "trying"])
							or
							(h[2] in ["survive", "brag", "blow", "grab", "feel", "send", "connect", "hearing", "say", "read", "contact", "please", "run", "host","kno", "talk", "just", "add", "text", "chill", "hang", "date", "find", "chat", "show", "u", "meet", "her", "hear", "me", "my", "b", "know", "play", "do", "suck", "go", "get", "f**k"])
							):
							#print "hooked the plusloop", tagged
							print tagged
							results.append(tagged)
							h0dict[h[0]]=h0dict[h[0]]+1
 							h2dict[h[2]]=h2dict[h[2]]+1
						else:
							pass
							#if tagged[2][1]:#=="VB":# in ["VBP", "VBG"]:#=="go":#:# in ['my']:#, 'know', 'my']:#["me", "need", "man"]:# == "down":#h[2] not in ["have", "and", "like", "hear"]:
							#	print tagged
								#print "elseloop", tagged
# 								h0dict[h[0]]=h0dict[h[0]]+1
# 								h2dict[h[2]]=h2dict[h[2]]+1
								#h0dict[tagged[0][1]]=h0dict[tagged[0][1]]+1
								#h2dict[tagged[2][1]]=h2dict[tagged[2][1]]+1
									
	
		
		print "We have {} items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
		h0dict={k:v for k,v in h0dict.items() if v > 0}
		print "\n\n", number, "\npretext here be the results\n\n"
		print "\n".join([": ".join([k, unicode(h0dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
		print "\n\n", number, "\nposttext here be the results\n\n"
		print "\n".join([": ".join([k, unicode(h2dict[k]), ".".join(word2vecwordfinder([k], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'))]) for k in sorted(h2dict, key=h2dict.get, reverse=True)])

		print "We have {} post items with a token count of {}".format(len(h2dict.keys()), sum(h2dict.values()))
		print "We have {} pre items with a token count of {}".format(len(h0dict.keys()), sum(h0dict.values()))
		return results
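# A minimal, self-contained illustration of the POS filter above (assumes nltk and its
# tagger data are installed; the exact tags can vary with the tagger version):
from nltk import pos_tag

candidate = (u"want", u"2", u"meet")    # (pre, number, post) as produced by numberregex
tagged = pos_tag(list(candidate))       # e.g. [('want', 'VBP'), ('2', 'CD'), ('meet', 'VB')]
print tagged
# a post word tagged "VB" (here "meet", also in the post-word whitelist) reaches the "good" branch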

#check if we find items
starttime=time.time()

for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
	"""
	this looks over the keys of a dictionary that are regex patterns. 
	it outputs findings in the corpus given in "dir" with context.
	dir needs to have subfolders. 
	the twodict counts words with a distance of 2, the onedict counts words with a distance of 1.
	"""
	print pati
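	# illustration (hypothetical ad text): for "... call me 2nite at nine ...", the words at
	# distance 1 from the match ("me", "at") are tallied in onedict, the words at distance 2
	# ("call", "nine") in twodict, and the five-word window is written to outifile
	# (onedict, twodict and outifile are not defined in this snippet)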
	for fili in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
		fili=codecs.open(os.path.join(dir, pati, fili), "r", "utf-8")
		inputad=ct.adtextextractor(fili.read(), fili)
		words=ct.tokenizer(inputad)
		words=[w.lower() for w in words]
		#specific words processing for numbers: introduce space between number immediately followed by word-character
		if [w for w in words if any(k.match(w) for k in numbersdict.keys())]:
			if words.index(w) not in [0, 1, len(words) -1, len(words)-2]:
				twodict[words[words.index(w)-2]]=twodict[words[words.index(w)-2]]+1
				twodict[words[words.index(w)+2]]=twodict[words[words.index(w)+2]]+1
			if words.index(w) not in [0, len(words)-1]:
				onedict[words[words.index(w)-1]]=onedict[words[words.index(w)-1]]+1
				onedict[words[words.index(w)+1]]=onedict[words[words.index(w)+1]]+1
				outifile.write("\n".join([" ".join([words[words.index(w)-2], words[words.index(w)-1],w, words[words.index(w)+1], words[words.index(w)+2]]) for w in words if any(k.match(w) for k in numbersdict.keys()) and words.index(w) not in [0, 1, len(words)-1, len(words)-2]]))
			else:
				pass

def matrixmachine(folderlist, featuredict, testmode, *args):
    """
	The matrixmachine creates matrices of word frequencies.
	It returns:
	wordmatrix_without_cat, a matrix of word frequencies only. This is fed into the clustering.
	wordmatrix_with_cat, a matrix of word frequencies with the external categories (defined in *args) added. Kept for later comparison of clusterings. 
	catdicti, a dictionary that maps categories to the numbers used in wordmatrix_with_cat. Created by categorymachine(), cf. that function for details. 
	filedict, a dictionary that maps file names to rows in the matrix. Kept for later comparison of clusterings. 
	It takes:
	folderlist, a collection of folders to iterate over. 
	featuredict, a dictionary containing the words to count.
	testmode: if set to True, a short test run on a fragment of the dataset is conducted to check that everything runs all the way through. 
	(Note that testmode is passed down from main().)
	*args, a number of external categories, each defined in the category dictionary created by categorymachine(); here, usually a gender category. 
	The args are added to wordmatrix_with_cat. 
	"""
    print "Starting the matrixmachine"
    print "external categories: ", len(args)
    print args
    #the plus one in here is for the file id
    wordmatrix = np.empty(shape=(1, (len(featuredict) + len(args) + 1)))
    print "Matrix initial shape: ", np.shape(wordmatrix)
    # making a dictionary for the categories
    # we need the zero cause the machine returns 2 items
    count = 0
    catdicti = categorymachine(folderlist)[0]
    filedict = {}
    featuredict = {k: featuredict[k]['words'] for k in featuredict.keys()}
    featuredict = {
        k: set([i for i in featuredict[k] if not i in cluster_stopwords])
        for k in featuredict.keys()
    }
    for folder in folderlist:
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        if testmode:
            print "\n\nRUNNING\nIN\nTEST\nMODE\n"
            filis = filis[:200]
        print "Building matrices: we have {} files in folder {}".format(
            len(filis), folder)
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                    "utf-8").read()
            inputad = ct.adtextextractor(inputfile, fili)
            #establish category
            for external_cat in args:
                cat = catdicti[ct.tagextractor(inputfile, external_cat, fili)]
            count = count + 1
            filedict[count] = os.path.join(pathi, folder, fili)
            splittext = ct.tokenizer(inputad)
            splittext = [s for s in splittext if s not in exclude]
            splittextlo = [
                s.lower() for s in splittext if s and not excluderegex.match(s)
            ]
            wordcount = float(len(splittextlo))
            #not controlling for cluster size
            #addict={k:sum([float(splittextlo.count(i))for i in v]) for k,v in featuredict.items()}
            #controlling for cluster size
            addict = {
                k: (sum([float(splittextlo.count(i)) for i in v])) / len(v)
                for k, v in featuredict.items()
            }
            addict = {k: v / wordcount for k, v in addict.items()}
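            # worked example (hypothetical numbers): a cluster of 50 words whose members
            # occur 5 times in a 200-token ad scores (5/50)/200 = 0.0005 for that ad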
            wordvector = np.array([float(cat)] + [float(count)] +
                                  addict.values())
            #we append it to the matrix
            wordmatrix = np.append(wordmatrix, [wordvector], axis=0)
    print "Features of word matrix: shape {}, dtype {}".format(
        np.shape(wordmatrix), wordmatrix.dtype)
    print "---------------\nEnd of public service announcements\n\n"
    #"In 2D, the first dimension corresponds to rows, the second to columns."
    # we don't look at the first row cause that was just for initialization
    # the one without cats we put into the clustering algorithm
    wordmatrix_without_cat = wordmatrix[1:wordmatrix.shape[0],
                                        len(args) + 1:wordmatrix.shape[1]]
    print "without", np.shape(wordmatrix_without_cat)
    wordmatrix_with_cat = wordmatrix[1:wordmatrix.shape[0], ]
    print "with", np.shape(wordmatrix_with_cat)
    return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
예제 #39
def rebusfinder(input_path, word_dictionary, number_dictionary,
                excluded_words):
    """
	This finds words that are represented as numbers. 
	All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
	The lists exclude_pre_context and exclude_post_context rule out negative contexts. 
	It prints the results and gives type and token counts. 
	
	"""
    #with codecs.open(word_dictionary, "r", "utf-8") as worddictionary:
    #	worddictionary=json.load(worddictionary)
    #worddictionary={k:v for k,v in worddictionary.items() if not k in excluded_words and worddictionary[k] > 1}
    for number in number_dictionary.keys():
        numberregex = re.compile("\W([a-z]+)\s+(" + unicode(number) +
                                 ")\s+([a-z]+)\W")
        #just for now
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        print numberregex.pattern
        for pati in [
                i for i in os.listdir(input_path) if not i.startswith(".")
        ]:
            for fil in [
                    i for i in os.listdir(os.path.join(input_path, pati))
                    if not i.startswith(".")
            ]:
                fili = codecs.open(os.path.join(input_path, pati, fil), "r",
                                   "utf-8")
                inputad = ct.adtextextractor(fili.read(), fil)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                #this weeds out all the phonenumbers.
                hits = [
                    h for h in hits if h[0] not in writtennumberdict
                    and h[2] not in writtennumberdict
                ]
                for h in hits:
                    if h[0] in include_pre_context or h[
                            2] in include_post_context:
                        print h
                        h0dict[h[0]] = h0dict[h[0]] + 1
                        h2dict[h[2]] = h2dict[h[2]] + 1
                    elif h[0] not in exclude_pre_context and h[
                            2] not in exclude_post_context:
                        if h[2]:  #:=="days":
                            print h
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
        print "We have {} items with a token count of {}".format(
            len(h0dict.keys()), sum(h0dict.values()))
        h0dict = {k: v for k, v in h0dict.items() if v > 0}
        print "\n\n", number, "\nposttext here be the results\n\n"
        #print "\n".join([": ".join([k, unicode(h0dict[k])]) for k in sorted(h0dict, key=h0dict.get, reverse=True)])
        print "\n".join([
            ": ".join([k, unicode(h2dict[k])])
            for k in sorted(h2dict, key=h2dict.get, reverse=True)
        ])

        print "We have {} post items with a token count of {}".format(
            len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(
            len(h0dict.keys()), sum(h0dict.values()))
def rebusfinder_too(input_path, number_dictionary):
    """
	This finds words that are represented as numbers. 
	All combinations \W([a-z]+)\s+("+unicode(number)+")\s+([a-z]+)\W for the number put in are identified.
	The lists exclude_pre_context and exclude_post_context rule out negative contexts. 
	It prints the results and gives type and token counts. 
	
	"""
    for number in number_dictionary.keys():
        #this is for comments to self
        print "PRE"

        #this is the regular expression to identify instances of the number studied
        numberregex = re.compile("\W([a-z]+)\s*(" + punctuationregex +
                                 ")?\s*(" + unicode(number) + ")(?:\s+)?(" +
                                 punctuationregex + ")?(?:\s+)?([a-z]+)\W")
        print numberregex.pattern
        #dicts to store statistics about context of number
        h0dict = defaultdict(int)
        h2dict = defaultdict(int)
        #lists to store results and previous search patterns fed into tokenfinder to avoid duplicate output
        previous_patterns = []
        results = []
        for pati in [
                i for i in os.listdir(input_path) if not i.startswith(".")
        ]:
            for fil in [
                    i for i in os.listdir(os.path.join(input_path, pati))
                    if not i.startswith(".")
            ]:
                fili = codecs.open(os.path.join(input_path, pati, fil), "r",
                                   "utf-8")
                inputad = ct.adtextextractor(fili.read(), fil)
                inputad = ct.adcleaner(inputad, replace_linebreak=True)
                inputad = inputad.lower()
                hits = numberregex.findall(inputad)
                #this weeds out all the phonenumbers.
                hits = [
                    h for h in hits if h[0] not in writtennumberdict
                    and h[2] not in writtennumberdict
                ]
                for h in hits:
                    #this is needed for instances where there is no punctuation
                    h = [" " if i == "" else i for i in h]
                    #print "h in hits", h
                    if not any(
                            regex.match(h[2])
                            for regex in exclude_post_context) and not any(
                                regex.match(h[0])
                                for regex in exclude_pre_context):
                        tagged = pos_tag(h)
                        #print tagged
                        #if h[2] not in [" "]:
                        #print tagged, os.path.join(input_path, pati, fil)
                        #print inputad
                        h0dict[h[0]] = h0dict[h[0]] + 1
                        h2dict[h[2]] = h2dict[h[2]] + 1
                        h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                        h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1
                        #taking out trash
                        if ((tagged[0][1] in ["DT", "JJS", "TO", "PRP$"])
                                or (tagged[0][1] == "IN"
                                    and h[0] not in ["out", "like"]) or
                            (tagged[0][1] in ["VBG"] and h[0] not in [
                                "talking", "responding", "waiting", "getting",
                                "looking", "going", "trying"
                            ]) or (tagged[0][1] in ["VB", "VBD", "VBP", "VBZ"]
                                   and tagged[2][1] in ["JJ"]) or
                                #this is where we screw up
                            (tagged[2][1] in ["NNS"] and h[2] not in [
                                "chat", "kiss", "go", "know", "find", "do",
                                "c", "knees"
                            ]) or (tagged[2][1] == "IN") or
                            (tagged[2][1] == "CC" and h[2] not in ["but"]) or
                                # 							#we don't need this if we are to just ignore whatever goes thru all of it
                                # 							#TEMPTEMPTEMP
                            (h[0] in [
                                "be", "other", "s", "type", "was", "work",
                                "im", "baths", "you", "maybe", "big", "day",
                                "o", "round", "ride", "avengers", "kids",
                                "had", "number", "have", "like", "here",
                                "size", "got", "are", "send", "only", "have",
                                "go", "is", "bedroom", "but", "beautiful",
                                "nice"
                            ]) or (h[2] in [
                                "face", "new", "faced", "wonderful", "must",
                                "min", "short", "si", "br", "step", "start",
                                "so", "out", "story", "bdrm", "other", "out",
                                "story", "yr", "looking", "more", "but", "hrs",
                                "bedroom"
                            ]) or
                            (tagged[2][1] in ["JJ", "VBD", "VBZ", "VBG"])):
                            #print "killed",tagged, "\n"
                            pass


#
# 						#finding the good
                        elif ((tagged[2][1] in ["DT", "CD", "EX", "NNS", "VB"])
                              or
                              (tagged[2][1] in ["JJ"] and h[0] in ["opposed"])
                              or (tagged[2][1] in ["PRP"]
                                  and not nounregex.match(tagged[0][1]))
                              or (h[0] == "have" and h[2]
                                  in ["browse", "force", "go", "send", "talk"])
                              or (h[0] == "like"
                                  and h[2] not in ["furry", "cuz", "straight"])
                              or (h[0] in ["here"]
                                  and nounregex.match(tagged[2][1])) or
                              #really what we are exluding here is anything non-Verb or Noun
                              # 							# we can consider replacing this with a regex
                              (h[0] in ["need", "me", "pics"]
                               and tagged[2][1] not in ["JJ", "JJR", "MD"])
                              or (h[0] in [
                                  "momma", "women", "delighted", "tryn",
                                  "respond", "travel", "veldkum", "happness",
                                  "pool", "lots", "bbw", "willin", "luvz",
                                  "place", "time", "married", "pixs", "boy",
                                  "pictures", "brickz", "somebody", "memphis",
                                  "cell", "fear", "hoop", "open", "goes",
                                  "afraid", "speak", "lady", "needs",
                                  "attracted", "doms", "bottom", "head",
                                  "apply", "drive", "pic", "newer", "pinned",
                                  "luvs", "sumbody", "face", "due", "tryin",
                                  "line", "has", "close", "interested", "alot",
                                  "oral", "talk", "new", "girl", "up",
                                  "scared", "willing", "cam", "loves", "c**k",
                                  "out", "u", "nice", "how", "free", "hard",
                                  "hope", "able", "someone", "man", "woman",
                                  "male", "down", "love", "luv", "ready",
                                  "want", "wants"
                              ] + [
                                  "talking", "responding", "waiting",
                                  "getting", "looking", "lookin", "going",
                                  "trying"
                              ]) or (h[2] in [
                                  "survive", "brag", "blow", "grab", "feel",
                                  "send", "connect", "hearing", "say", "read",
                                  "contact", "please", "run", "host", "kno",
                                  "talk", "just", "add", "text", "chill",
                                  "hang", "date", "find", "chat", "show", "u",
                                  "meet", "her", "hear", "me", "my", "b",
                                  "know", "play", "do", "suck", "go", "get",
                                  "f**k"
                              ])):
                            print "hooked the plusloop", tagged
                            #print tagged
                            results.append(tagged)
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                        else:
                            pass
                        if tagged[2][
                                1]:  #=="VB":# in ["VBP", "VBG"]:#=="go":#:# in ['my']:#, 'know', 'my']:#["me", "need", "man"]:# == "down":#h[2] not in ["have", "and", "like", "hear"]:
                            #print tagged
                            #print "elseloop", tagged
                            h0dict[h[0]] = h0dict[h[0]] + 1
                            h2dict[h[2]] = h2dict[h[2]] + 1
                            h0dict[tagged[0][1]] = h0dict[tagged[0][1]] + 1
                            h2dict[tagged[2][1]] = h2dict[tagged[2][1]] + 1

        print "We have {} items with a token count of {}".format(
            len(h0dict.keys()), sum(h0dict.values()))
        h0dict = {k: v for k, v in h0dict.items() if v > 0}
        print "\n\n", number, "\npretext here be the results\n\n"
        print "\n".join([
            ": ".join([
                k,
                unicode(h0dict[k]), ".".join(
                    word2vecwordfinder([
                        k
                    ], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'
                                       ))
            ]) for k in sorted(h0dict, key=h0dict.get, reverse=True)
        ])
        print "\n\n", number, "\nposttext here be the results\n\n"
        print "\n".join([
            ": ".join([
                k,
                unicode(h2dict[k]), ".".join(
                    word2vecwordfinder([
                        k
                    ], '/Users/ps22344/Downloads/chapter2/current/clusters_74_19_45_07_31.json'
                                       ))
            ]) for k in sorted(h2dict, key=h2dict.get, reverse=True)
        ])

        print "We have {} post items with a token count of {}".format(
            len(h2dict.keys()), sum(h2dict.values()))
        print "We have {} pre items with a token count of {}".format(
            len(h0dict.keys()), sum(h0dict.values()))
        return results
예제 #41
def prosodycounter(input_dir):
    """
	 
	Returns two lists of lists, one inner list per ad and one entry per prosody pattern: the first with raw match counts, the second with per-word frequencies (matches/wordcount).
	
	"""
    start = time.time()

    #creating the search terms
    prosodyitems = [
        "\s(\*(?:laugh|cough|smack|giggle)\*)\s", "\W([Ee][Rr])\W",
        "\W((?:[Hh][Aa]){1,}[Hh]?)\W", "\W((?:[Hh][Uu]){1,}[Hh]?)\W",
        "\W((?:[Hh][Ee]){2,}[Hh]?)\W", "\W([Hh][Oo]{2,})\W",
        "\W([Hh][Mm]{1,})\W", "\W([Hh]e+y{2,})\W", "\W([Hh]e{2,}[Yy]+)\W",
        "\W" + anyoftheseregex("[Hh]+[Ee]+[Ll][Ll]+[Oo]+") + "\W",
        "\W([Mm]{2,}[Hh]?)\W", "\W((?:[Mm][Hh]){1,})\W", "\W([Ss][Oo]{2,})\W",
        "\W([Uu][Hh]+)\W", "\W([Uu][Mm]+)\W", "\W([Yy][Aa]+[Yy]+)\W",
        "\W([Yy]+[Aa]+[Hh]?)\W"
    ]
    excludelist = []

    #dicts to store results
    dicti = defaultdict(float)
    matchesdicti = defaultdict(list)
    results = []

    prosody_list = [re.compile(i) for i in prosodyitems]
    print "{} items in the prosody_list, {} unique".format(
        len(prosody_list), len(set(prosody_list)))
    print [i.pattern for i in prosody_list]
    #iterate and match
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, dir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili).lower()
            #result is a list of lists which contain matches for each regex/acronym
            wordcount = float(len(ct.tokenizer(inputad)))
            result = [([m for m in i.findall(inputad)
                        if not m in excludelist], i.pattern)
                      for i in prosody_list]
            #print result
            results.append([(len(matches), len(matches) / wordcount)
                            for matches, pattern in result])
            for matches, pattern in result:
                #print pattern
                #the dicti is {pattern:count, pattern: count, ...}
                dicti[pattern] = dicti[pattern] + len(matches)
                matchesdicti[pattern] = matchesdicti[pattern] + matches
    print "\n".join([
        ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i]))))
        for i in sorted(dicti, key=dicti.get, reverse=True)
    ])
    end = time.time()
    print "This took us {} minutes".format((end - start) / 60)
    # for u in [[x[0] for x in i] for i in results]:
    # print u
    print "shape of results, number of lists:", len(
        results), "-- length of lists", set([len(i) for i in results])
    return [[x[0] for x in i] for i in results], [[x[1] for x in i]
                                                  for i in results]
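# A quick standalone check of one of the prosody patterns above ("anyoftheseregex" is not
# shown in this snippet, so a literal pattern is tested instead):
import re

hmm_pattern = re.compile("\W([Hh][Mm]{1,})\W")
print hmm_pattern.findall(u" well hmm ok, hm. ")   # -> [u'hmm', u'hm']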