Code example #1
def dictbuilder(input_path, output_file, remove_numbers=False):
	"""
	Reads the files in input_path and builds a {word: count} dictionary.
	input_path needs to have subfolders.
	If remove_numbers is True, tokens consisting of digits (r"\d+") are not counted.
	This was used for identifying leetspeak.
	"""
	worddict=defaultdict(int)
	for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
		print pati
		for fil in [i for i in os.listdir(os.path.join(input_path, pati)) if not i.startswith(".")]:
			with codecs.open(os.path.join(input_path, pati, fil), "r", "utf-8") as fili:
				inputad=ct.adtextextractor(fili.read(), fil)
			inputad=inputad.lower()
			tokenized=ct.tokenizer(inputad)
			tokenized=[re.sub(r"\W", "", i) for i in tokenized]
			if remove_numbers:
				tokenized=[i for i in tokenized if not re.match(r"\d+", i)]
			for token in [i for i in tokenized if i]:
				worddict[token]=worddict[token]+1
	print "\n".join([":".join((k, unicode(worddict[k]))) for k in sorted(worddict, key=worddict.get, reverse=True) if worddict[k] > 50])
	print "We created a dictionary of {} total words with {} types".format(sum(worddict.values()), len(worddict.keys()))
	if output_file:
		with codecs.open(output_file, "w", "utf-8") as outputfile:
			json.dump(worddict, outputfile)
			print "Dict written to", output_file
Code example #2
def dictbuilder(input_path, output_file, remove_numbers=False):
    """
	reads files in input_path	
	input_path needs to have subfolders. 
	if "remove_numbers", does not count numbers (as in "\d+").
	This was used for IDing leetspeak.
	"""
    worddict = defaultdict(int)
    for pati in [i for i in os.listdir(input_path) if not i.startswith(".")]:
        print pati
        for fil in [
                i for i in os.listdir(os.path.join(input_path, pati))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_path, pati, fil), "r",
                             "utf-8") as fili:
                inputad = ct.adtextextractor(fili.read(), fil)
            inputad = inputad.lower()
            tokenized = ct.tokenizer(inputad)
            tokenized = [re.sub(r"\W", "", i) for i in tokenized]
            if remove_numbers:
                tokenized = [i for i in tokenized if not re.match(r"\d+", i)]
            for token in [i for i in tokenized if i]:
                worddict[token] = worddict[token] + 1
    print("\n".join([
        ":".join((k, unicode(worddict[k])))
        for k in sorted(worddict, key=worddict.get, reverse=True)
        if worddict[k] > 50
    ]))
    print "We created a dictionary of {} total words with {} types".format(
        sum(worddict.values()), len(worddict.keys()))
    if output_file:
        with codecs.open(output_file, "w", "utf-8") as outputfile:
            json.dump(worddict, outputfile)
            print "Dict written to ", outputfile
Code example #3
def collofinder(main_term, regex):
	"""
	Looks for tokens that match "regex" and prints them with surrounding context
	whenever "main_term" occurs next to the match.
	It outputs findings in the corpus given in "dir"; dir needs to have subfolders.
	Relies on the module-level globals "dir" (corpus path) and "tagregex".
	"""
	for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
		print pati
		for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
			fili=codecs.open(os.path.join(dir, pati, fil), "r", "utf-8")
			inputad=ct.adtextextractor(fili.read(), fil)
			inputad=tagregex.sub(" ", inputad)
			words=ct.tokenizer(inputad)
			words=[w.lower() for w in words]
			hits=[w for w in words if regex.match(w)]
			#offsets that determine the length of the extracted context
			context=[-3, -2, -1, 0, 1, 2, 3]
			#note: words.index() only finds the first occurrence of a repeated token
			for matched in hits:
				if [i for i in context if words.index(matched) + i > len(words) - 1] and main_term in words:
					print "too long"
					print [words[words.index(matched)+t] for t in [c for c in context if c < 1]]
				elif hits and not [i for i in context if words.index(matched) + i > len(words) - 1] and main_term in [words[words.index(matched)+t] for t in [-1, 1]]:
					print fil
					print [words[words.index(matched)+t] for t in context]
Code example #4
File: egrammartools.py  Project: patrickschu/chapter2
def emoticonfinder(dir):
	"""
	The emoticonfinder takes a directory with corpus files as input. 
	We might consider making the file with emoticons an argument as well. 
	The emoticonfinder creates a list of relevant emoticons from a text file. 
	Then counts how often they occur in files in dir.
	--- Source file is /Users/ps22344/Downloads/chapter2/current/emoticoncounter.py ---
	"""
	starttime=time.time()
	#creating a featuredict from file
	featuredict={}
	with codecs.open('/Users/ps22344/Downloads/chapter2/textfiles/emolist_final.txt', "r", "utf-8") as inputtext:
		for line in inputtext.readlines():
			featuredict[line.rstrip("\n")]=0
	#test formatting
	for k in featuredict:
		if k.startswith(" "):
			#assumed check body: flag emoticons that carry stray leading whitespace
			print "WARNING: leading whitespace in emoticon", repr(k)
	for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
		print pati
		for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
			fili=codecs.open(os.path.join(dir, pati, fil), "r", "utf-8")
			inputad=ct.adtextextractor(fili.read(), fil)
			words=ct.tokenizer(inputad)
			for item in words:
				if item in featuredict:
					featuredict[item] = featuredict[item]+1
	print featuredict
	endtime=time.time()
	print "This took us {} minutes".format((endtime-starttime)/60)
Code example #5
def acronymfinder(dir, length, output_json):
	"""
	This finds acronyms. 
	Dir is directory of files. 
	Length is length of desired acronym. 
	"""
	start=time.time()
	capitals=re.compile("^[A-Z]+$")
	featuredict=defaultdict(int)
	#{
	#'lol':0
	#}
	
	for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
		print "working on", pati
		for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
			fili=codecs.open(os.path.join(dir, pati, fil), "r", "utf-8")
			inputad=ct.adtextextractor(fili.read(), fil)
			words=[w.strip(string.punctuation) for w in ct.tokenizer(inputad)]
			for item in words:
				if (capitals.match(item)) and (len(item) == length):
					if not spell.spellchecker(item.lower()):
						featuredict[item] = featuredict[item]+1

	print sorted(featuredict.keys())
	print "SO many entries: ", len(featuredict)
	
	#sorted(d.items(), key=lambda x: x[1])
	#[":".join((i, str(y))) for i, y in sorted(featuredict, key=featuredict.get)]
	print  "\n".join([":".join((i, str(featuredict[i]))) for i in sorted(featuredict, key=featuredict.get, reverse=True)])
	mid=time.time()
	print "this took us {} minutes".format((mid-start)/60)
	if output_json:
		with codecs.open("output_acronyms"+str(length)+"letters.json", "w", "utf-8") as outputi:
			json.dump(featuredict, outputi)
	else:
		for entry in sorted(featuredict):
			if featuredict[entry] > 5:
				print "\n\n\n***",entry,"\n\n"
				tk.tokenfinder([r"\s"+entry+"\s"], input_path='/Users/ps22344/Downloads/craig_0208/', length=20, lower_case=False)
	end=time.time()
	print "this took us {} minutes".format((end-start)/60)
Code example #6
def collofinder(main_term, regex):
    """
    Looks for tokens that match "regex" and prints them with surrounding context
    whenever "main_term" occurs next to the match.
    It outputs findings in the corpus given in "dir"; dir needs to have subfolders.
    Relies on the module-level globals "dir" (corpus path) and "tagregex".
    """
    for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
        print pati
        for fil in [
                i for i in os.listdir(os.path.join(dir, pati))
                if not i.startswith(".")
        ]:
            fili = codecs.open(os.path.join(dir, pati, fil), "r", "utf-8")
            inputad = ct.adtextextractor(fili.read(), fil)
            inputad = tagregex.sub(" ", inputad)
            words = ct.tokenizer(inputad)
            words = [w.lower() for w in words]
            hits = [w for w in words if regex.match(w)]
            # offsets that determine the length of the extracted context
            context = [-3, -2, -1, 0, 1, 2, 3]
            # note: words.index() only finds the first occurrence of a repeated token
            for matched in hits:
                overflow = [
                    i for i in context
                    if words.index(matched) + i > len(words) - 1
                ]
                if overflow and main_term in words:
                    print "too long"
                    print [
                        words[words.index(matched) + t]
                        for t in [c for c in context if c < 1]
                    ]
                elif hits and not overflow and main_term in [
                        words[words.index(matched) + t] for t in [-1, 1]
                ]:
                    print fil
                    print [words[words.index(matched) + t] for t in context]
Code example #7
def wordcounter(input_dir, category_tag, category_dict):
    """
	counts the words per category in the files in input_dir.
	
	Parameters
	----------
	input_dir is the corpus directoty
	category_tag is the name of the tag to be extracted with tagextractor. 
	category_dict is a dictionary of categories to be computed over (category names as keys)
	e.g. <location="X"> would be input with "location" as the category_tag and a dict with {"Austin":0, "Dallas":0, ...}
	Returns
	-------
	something
	"""
    print "Running the wordcounter"
    # note: this aliases category_dict, so the caller's dict is updated in place
    resultdict = category_dict
    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print pati
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, pati))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, pati, fili), "r",
                             "utf-8") as inputfili:
                inputfili = inputfili.read()
            wordcount = len(
                ct.tokenizer(ct.adtextextractor(inputfili, fili),
                             remove_punctuation=True))
            category = ct.tagextractor(inputfili, category_tag, fili)
            if category in resultdict:
                resultdict[category] = resultdict[category] + wordcount
            else:
                print "\n\nWARNING:\n{} is not in the category_dict. What do we do now?\n\n".format(
                    category)
    print "Wordcounter done"

    with codecs.open("wordcounter_" + category_tag + ".json", "w",
                     "utf-8") as jsonout:
        json.dump(resultdict, jsonout)
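A hedged usage sketch, reusing the location example from the docstring; the corpus path is a placeholder.

# Hypothetical usage sketch: word counts per <location="..."> value are written to
# "wordcounter_location.json".
wordcounter("/path/to/corpus", "location", {"Austin": 0, "Dallas": 0})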
Code example #8
File: dictbuilder.py  Project: patrickschu/chapter2
def dictbuilder(input_dir, output_name, lowercase=False, print_dict=False):
    """
	The dictbuilder puts all words in the corpus (input_dir) into a dictionary and outputs as json. 
	Name of output file determined by output_name.
	If print_dict is set to True, prints our sorted dictionary.
	Format of the dict returned: {word:count, word:count, }
	"""
    dicti = defaultdict(float)
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, dir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili)
            inputad = [
                w.rstrip(string.punctuation).lstrip(string.punctuation)
                for w in ct.tokenizer(inputad)
            ]
            inputad = [w for w in inputad if w]
            if lowercase:
                for word in inputad:
                    dicti[word.lower()] = dicti[word.lower()] + 1
            else:
                for word in inputad:
                    dicti[word] = dicti[word] + 1
    if print_dict:
        print "\n".join([
            ":".join((i, str(dicti[i])))
            for i in sorted(dicti, key=dicti.get, reverse=True)
        ])
    with codecs.open(output_name + ".json", "w", "utf-8") as outputi:
        json.dump(dicti, outputi, encoding="utf8")
    print "Written dictionary with {} items to ".format(
        len(dicti)), output_name
    return dicti
Code example #9
def spellingcounter(input_dir):
    """
    The spellingcounter counts the number of mis-spelled words.
    It uses the PyEnchange library for spellchecking.
    It iterates over the files in input_dir.
    It returns a lists of lists with (raw count, relative count) tuples.
    """
    start=time.time()
    americandict = enchant.Dict("en_US")
    goodwords=set(["wo", "'ve", "'m", "n't", "'s", "'ll", "'re", "'d", "non-"]+list(string.punctuation))
    htmlregex=re.compile("<.*?>")
    results=[]
    for pati in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print pati
        for fil in [i for i in os.listdir(os.path.join(input_dir, pati)) if not i.startswith(".")]:
            #print fil
            fili=codecs.open(os.path.join(input_dir, pati, fil), "r", "utf-8")
            inputad=ct.adtextextractor(fili.read(), fil)
            inputad=htmlregex.sub(" ", inputad)
            words=ct.tokenizer(inputad)
            #print "\n\n\n", words
            wordcount=float(len(words))
            mistakes=[w for w in words if not americandict.check(w) and w not in goodwords]
            #print mistakes
            if wordcount-len(mistakes) < 0:
                print "WARNING: negative count-mistakes", wordcount, len(mistakes), os.path.join(input_dir, pati, fil)
            results.append([(len(mistakes), len(mistakes)/wordcount)])
            #print "\n".join([":".join([i, str(dict[i])]) for i in sorted(dict, key=dict.get, reverse=True)])
    end=time.time()
    print "len results", len(results)
    print "this took us {} minutes".format((end-start)/60)
    print "shape of results, number of lists:", len(results),  "-- length of lists", set([len(i) for i in results])
    #for u in [[x[1] for x in i] for i in results]:
    #    print u
    #print [[x[0] for x in i] for i in results], [[x[1] for x in i] for i in results]
    return [[x[0] for x in i] for i in results], [[x[1] for x in i] for i in results]
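A hedged usage sketch; the corpus path is a placeholder and the variable names are mine.

# Hypothetical usage sketch: per file, raw_counts holds the number of misspelled words
# and relative_counts the misspellings per word.
raw_counts, relative_counts = spellingcounter("/path/to/corpus")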
Code example #10
def matrixmachine(folderlist, featuredict, testmode, *args):

    """
	The matrixmachine creates matrices of word frequencies.
	It returns 
	wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering.
	wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings. 
	catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details. 
	filedict, a dictioanry that maps file names to rows in the matrix. For later comparison of clusterings. 
	It takes
	The folderlist is a collection of folders to iterate over. 
	The featuredict is a dictionary containing the words to count.
	If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. 
	(Note that the testmode comes all the way from main())
	The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. 
	Args will be added to the matrix_with_cat. 
	"""
    print "Starting the matrixmachine"
    print "external categories: ", len(args)
    print args
    # the plus one in here is for the file id
    wordmatrix = np.empty(shape=(1, (len(featuredict) + len(args) + 1)))
    print "Matrix initial shape: ", np.shape(wordmatrix)
    # making a dictionary for the categories
    # we need the zero cause the machine returns 2 items
    count = 0
    catdicti = categorymachine(folderlist)[0]
    filedict = {}
    featuredict = {k: featuredict[k]["words"] for k in featuredict.keys()}
    featuredict = {k: set([i for i in featuredict[k] if not i in cluster_stopwords]) for k in featuredict.keys()}
    for folder in folderlist:
        filis = [i for i in os.listdir(os.path.join(pathi, folder)) if not i.startswith(".")]
        if testmode:
            print "\n\nRUNNING\nIN\nTEST\nMODE\n"
            filis = filis[:200]
        print "Building matrices: we have {} files in folder {}".format(len(filis), folder)
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r", "utf-8").read()
            inputad = ct.adtextextractor(inputfile, fili)
            # establish category
            for external_cat in args:
                cat = catdicti[ct.tagextractor(inputfile, external_cat, fili)]
            count = count + 1
            filedict[count] = os.path.join(pathi, folder, fili)
            splittext = ct.tokenizer(inputad)
            splittext = [s for s in splittext if s not in exclude]
            splittextlo = [s.lower() for s in splittext if s and not excluderegex.match(s)]
            wordcount = float(len(splittextlo))
            # not controlling for cluster size
            # addict={k:sum([float(splittextlo.count(i))for i in v]) for k,v in featuredict.items()}
            # controlling for cluster size
            addict = {k: (sum([float(splittextlo.count(i)) for i in v])) / len(v) for k, v in featuredict.items()}
            addict = {k: v / wordcount for k, v in addict.items()}
            wordvector = np.array([float(cat)] + [float(count)] + addict.values())
            # we append it to the matrix
            wordmatrix = np.append(wordmatrix, [wordvector], axis=0)
    print "Features of word matrix: shape {}, dtype {}".format(np.shape(wordmatrix), wordmatrix.dtype)
    print "---------------\nEnd of public service announcements\n\n"
    # "In 2D, the first dimension corresponds to rows, the second to columns."
    # we don't look at the first row cause that was just for initialization
    # the one without cats we put into the clustering algorithm
    wordmatrix_without_cat = wordmatrix[1 : wordmatrix.shape[0], len(args) + 1 : wordmatrix.shape[1]]
    print "without", np.shape(wordmatrix_without_cat)
    wordmatrix_with_cat = wordmatrix[1 : wordmatrix.shape[0],]
    print "with", np.shape(wordmatrix_with_cat)
    return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
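For reference, matrixmachine indexes featuredict[k]["words"], so the featuredict it expects maps cluster names to dicts holding a "words" list. The sketch below is hedged: the cluster names and words are made up, and the commented call assumes the module-level pathi, exclude, excluderegex and cluster_stopwords globals as well as a gender tag in the corpus files.

# Hypothetical featuredict shape assumed by matrixmachine; names and words are placeholders.
featuredict = {
    "greetings": {"words": ["hi", "hello", "hey"]},
    "laughter": {"words": ["lol", "haha", "hehe"]},
}
# without_cat, with_cat, catdicti, filedict = matrixmachine(["folder1"], featuredict, False, "gender")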
Code example #11
def prosodycounter(input_dir):
    """
	 
	Returns a list of lists where each list contains raw and per word counts.
	
	"""
    start = time.time()

    #creating the search terms
    prosodyitems = [
        "\s(\*(?:laugh|cough|smack|giggle)\*)\s", "\W([Ee][Rr])\W",
        "\W((?:[Hh][Aa]){1,}[Hh]?)\W", "\W((?:[Hh][Uu]){1,}[Hh]?)\W",
        "\W((?:[Hh][Ee]){2,}[Hh]?)\W", "\W([Hh][Oo]{2,})\W",
        "\W([Hh][Mm]{1,})\W", "\W([Hh]e+y{2,})\W", "\W([Hh]e{2,}[Yy]+)\W",
        "\W" + anyoftheseregex("[Hh]+[Ee]+[Ll][Ll]+[Oo]+") + "\W",
        "\W([Mm]{2,}[Hh]?)\W", "\W((?:[Mm][Hh]){1,})\W", "\W([Ss][Oo]{2,})\W",
        "\W([Uu][Hh]+)\W", "\W([Uu][Mm]+)\W", "\W([Yy][Aa]+[Yy]+)\W",
        "\W([Yy]+[Aa]+[Hh]?)\W"
    ]
    excludelist = []

    #dicts to store results
    dicti = defaultdict(float)
    matchesdicti = defaultdict(list)
    results = []

    prosody_list = [re.compile(i) for i in prosodyitems]
    print "{} items in the prosody_list, {} unique".format(
        len(prosody_list), len(set(prosody_list)))
    print [i.pattern for i in prosody_list]
    #iterate and match
    for dir in [i for i in os.listdir(input_dir) if not i.startswith(".")]:
        print dir
        for fili in [
                i for i in os.listdir(os.path.join(input_dir, dir))
                if not i.startswith(".")
        ]:
            with codecs.open(os.path.join(input_dir, dir, fili), "r",
                             "utf-8") as inputtext:
                inputad = ct.adtextextractor(inputtext.read(), fili).lower()
            #result is a list of lists which contain matches for each regex/acronym
            wordcount = float(len(ct.tokenizer(inputad)))
            result = [([m for m in i.findall(inputad)
                        if not m in excludelist], i.pattern)
                      for i in prosody_list]
            #print result
            results.append([(len(matches), len(matches) / wordcount)
                            for matches, pattern in result])
            for matches, pattern in result:
                #print pattern
                #the dicti is {pattern:count, pattern: count, ...}
                dicti[pattern] = dicti[pattern] + len(matches)
                matchesdicti[pattern] = matchesdicti[pattern] + matches
    print "\n".join([
        ":".join((i, str(dicti[i]), "|".join(set(matchesdicti[i]))))
        for i in sorted(dicti, key=dicti.get, reverse=True)
    ])
    end = time.time()
    print "This took us {} minutes".format((end - start) / 60)
    # for u in [[x[0] for x in i] for i in results]:
    # print u
    print "shape of results, number of lists:", len(
        results), "-- length of lists", set([len(i) for i in results])
    return [[x[0] for x in i] for i in results], [[x[1] for x in i]
                                                  for i in results]
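A hedged usage sketch; the corpus path is a placeholder and the variable names are mine.

# Hypothetical usage sketch: per file, raw_counts and per_word_counts hold one entry
# per prosody pattern, in the order of prosody_list.
raw_counts, per_word_counts = prosodycounter("/path/to/corpus")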
Code example #12
#check if we find items
starttime=time.time()

"""
This looks over the keys of a dictionary ("numbersdict") that are regex patterns
and outputs findings in the corpus given in "dir" with context.
dir needs to have subfolders.
twodict counts words at a distance of 2, onedict counts words at a distance of 1.
Relies on the module-level globals dir, numbersdict, twodict, onedict and outifile.
"""
for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
	print pati
	for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
		fili=codecs.open(os.path.join(dir, pati, fil), "r", "utf-8")
		inputad=ct.adtextextractor(fili.read(), fil)
		words=ct.tokenizer(inputad)
		words=[w.lower() for w in words]
		#collect the words that match any of the number patterns
		#note: words.index() only finds the first occurrence of a repeated token
		hits=[w for w in words if any(k.match(w) for k in numbersdict.keys())]
		for w in hits:
			if words.index(w) not in [0, 1, len(words)-1, len(words)-2]:
				twodict[words[words.index(w)-2]]=twodict[words[words.index(w)-2]]+1
				twodict[words[words.index(w)+2]]=twodict[words[words.index(w)+2]]+1
			if words.index(w) not in [0, len(words)-1]:
				onedict[words[words.index(w)-1]]=onedict[words[words.index(w)-1]]+1
				onedict[words[words.index(w)+1]]=onedict[words[words.index(w)+1]]+1
				outifile.write("\n".join([" ".join([words[words.index(w)-2], words[words.index(w)-1], w, words[words.index(w)+1], words[words.index(w)+2]]) for w in words if any(k.match(w) for k in numbersdict.keys()) and words.index(w) not in [0, 1, len(words)-1, len(words)-2]]))


outifile.close()
Code example #13
def matrixmachine(folderlist, featuredict, testmode, *args):
    """
	The matrixmachine creates matrices of word frequencies.
	It returns 
	wordmatrix_without_cat, a matrix of word frequencies only. This is fed into clustering.
	wordmatrix_with_cat, a matrix of word frequencies, where external categories (defined in *args) are added. For later comparison of clusterings. 
	catdicti, a dictionary that maps categories to numbers used in the wordmatrix_with_cat. Created by the categorymachine(), cf for details. 
	filedict, a dictioanry that maps file names to rows in the matrix. For later comparison of clusterings. 
	It takes
	The folderlist is a collection of folders to iterate over. 
	The featuredict is a dictionary containing the words to count.
	If testmode is set to True, a short test run on a fragment of the dataset is conducted to see if this will run all the way. 
	(Note that the testmode comes all the way from main())
	The args are a number of external categories, each defined in the categorydicti created by categorymachine(). Here, usually a gender category. 
	Args will be added to the matrix_with_cat. 
	"""
    print "Starting the matrixmachine"
    print "external categories: ", len(args)
    print args
    #the plus one in here is for the file id
    wordmatrix = np.empty(shape=(1, (len(featuredict) + len(args) + 1)))
    print "Matrix initial shape: ", np.shape(wordmatrix)
    # making a dictionary for the categories
    # we need the zero cause the machine returns 2 items
    count = 0
    catdicti = categorymachine(folderlist)[0]
    filedict = {}
    featuredict = {k: featuredict[k]['words'] for k in featuredict.keys()}
    featuredict = {
        k: set([i for i in featuredict[k] if not i in cluster_stopwords])
        for k in featuredict.keys()
    }
    for folder in folderlist:
        filis = [
            i for i in os.listdir(os.path.join(pathi, folder))
            if not i.startswith(".")
        ]
        if testmode:
            print "\n\nRUNNING\nIN\nTEST\nMODE\n"
            filis = filis[:200]
        print "Building matrices: we have {} files in folder {}".format(
            len(filis), folder)
        for fili in filis:
            inputfile = codecs.open(os.path.join(pathi, folder, fili), "r",
                                    "utf-8").read()
            inputad = ct.adtextextractor(inputfile, fili)
            #establish category
            for external_cat in args:
                cat = catdicti[ct.tagextractor(inputfile, external_cat, fili)]
            count = count + 1
            filedict[count] = os.path.join(pathi, folder, fili)
            splittext = ct.tokenizer(inputad)
            splittext = [s for s in splittext if s not in exclude]
            splittextlo = [
                s.lower() for s in splittext if s and not excluderegex.match(s)
            ]
            wordcount = float(len(splittextlo))
            #not controlling for cluster size
            #addict={k:sum([float(splittextlo.count(i))for i in v]) for k,v in featuredict.items()}
            #controlling for cluster size
            addict = {
                k: (sum([float(splittextlo.count(i)) for i in v])) / len(v)
                for k, v in featuredict.items()
            }
            addict = {k: v / wordcount for k, v in addict.items()}
            wordvector = np.array([float(cat)] + [float(count)] +
                                  addict.values())
            #we append it to the matrix
            wordmatrix = np.append(wordmatrix, [wordvector], axis=0)
    print "Features of word matrix: shape {}, dtype {}".format(
        np.shape(wordmatrix), wordmatrix.dtype)
    print "---------------\nEnd of public service announcements\n\n"
    #"In 2D, the first dimension corresponds to rows, the second to columns."
    # we don't look at the first row cause that was just for initialization
    # the one without cats we put into the clustering algorithm
    wordmatrix_without_cat = wordmatrix[1:wordmatrix.shape[0],
                                        len(args) + 1:wordmatrix.shape[1]]
    print "without", np.shape(wordmatrix_without_cat)
    wordmatrix_with_cat = wordmatrix[1:wordmatrix.shape[0], ]
    print "with", np.shape(wordmatrix_with_cat)
    return (wordmatrix_without_cat, wordmatrix_with_cat, catdicti, filedict)
Code example #14
def acronymfinder(dir, length, output_json):
    """
	This finds acronyms. 
	Dir is directory of files. 
	Length is length of desired acronym. 
	"""
    start = time.time()
    capitals = re.compile("^[A-Z]+$")
    featuredict = defaultdict(int)
    # {
    #'lol':0
    # }

    for pati in [i for i in os.listdir(dir) if not i.startswith(".")]:
        print "working on", pati
        for fil in [i for i in os.listdir(os.path.join(dir, pati)) if not i.startswith(".")]:
            fili = codecs.open(os.path.join(dir, pati, fil), "r", "utf-8")
            inputad = ct.adtextextractor(fili.read(), fil)
            words = [w.strip(string.punctuation) for w in ct.tokenizer(inputad)]
            for item in words:
                if (capitals.match(item)) and (len(item) == length):
                    if not spell.spellchecker(item.lower()):
                        featuredict[item] = featuredict[item] + 1

    print sorted(featuredict.keys())
    print "SO many entries: ", len(featuredict)

    # sorted(d.items(), key=lambda x: x[1])
    # [":".join((i, str(y))) for i, y in sorted(featuredict, key=featuredict.get)]
    print "\n".join(
        [":".join((i, str(featuredict[i]))) for i in sorted(featuredict, key=featuredict.get, reverse=True)]
    )
    mid = time.time()
    print "this took us {} minutes".format((mid - start) / 60)
    if output_json:
        with codecs.open("output_acronyms" + str(length) + "letters.json", "w", "utf-8") as outputi:
            json.dump(featuredict, outputi)
    else:
        for entry in sorted(featuredict):
            if featuredict[entry] > 5:
                print "\n\n\n***", entry, "\n\n"
                tk.tokenfinder(
                    [r"\s" + entry + "\s"],
                    input_path="/Users/ps22344/Downloads/craig_0208/",
                    length=20,
                    lower_case=False,
                )
    end = time.time()
    print "this took us {} minutes".format((end - start) / 60)