def scanfolder():
	"""Build ``./mainDict.json`` from the documents stored under ``./dict``.

	Every entry of ``./dict`` is treated as a category directory. For each
	category, at most the first 900 entries returned by ``os.listdir`` are
	passed to ``Utils.testStripping`` and all items those calls yield are
	collected. The distinct items are then written to ``./mainDict.json`` as
	a JSON object that maps each of them to ``0``.

	Paths are relative, so the result depends on the current working
	directory.

	Returns:
		None. The only result is the file written to disk.
	"""
	# A set removes duplicates while collecting, so no separate
	# list -> set -> list pass is needed afterwards.
	words = set()

	for category in os.listdir('./dict'):
		print("Category: " + category)
		category_dir = './dict/' + category

		# Only the first 900 listed documents of a category are read.
		for doc in os.listdir(category_dir)[:900]:
			words.update(Utils.testStripping(category_dir + '/' + doc))

	# Map every distinct word to 0.
	bigdict = dict.fromkeys(words, 0)

	with open("./mainDict.json", "w") as outfile:
		json.dump(bigdict, outfile, indent=4, separators=(',', ': '))
# Code example #2
0
def sp_getTextDocument(path):
	"""Count how many times each word occurs in the document at ``path``.

	Args:
		path: Location of the document; passed unchanged to
			``Utils.testStripping``.

	Returns:
		dict: Maps every item yielded by ``Utils.testStripping(path)`` to
		the number of times it was yielded.
	"""
	result = {}
	for word in Utils.testStripping(path):
		# Look the running count up by the loop variable. Using the literal
		# key 'word' here would raise KeyError for almost every input and
		# leave each count stuck at 1.
		result[word] = result.get(word, 0) + 1

	return result
def getExampleArticlesFromSubCat(subcat):
	"""Gather the words of the learning articles in one sub-category.

	``subcat`` is a directory path, e.g. ``dict/rec.autos``. The first
	``numberOfLearningArticlesInSubset`` entries returned by ``os.listdir``
	are each run through ``Utils.testStripping`` and the results are
	concatenated, in listing order, into a single list.

	The length of that list is printed before it is returned.
	"""
	# Restrict the directory listing to the learning subset.
	article_names = os.listdir(subcat)[:numberOfLearningArticlesInSubset]

	# Flatten the per-article word sequences into one list.
	collected = [
		word
		for article in article_names
		for word in Utils.testStripping(subcat + "/" + article)
	]

	print("Subset word count: ", len(collected))
	return collected
# Code example #4
0
def getTextDocument(path):
    """Return the result of ``Utils.testStripping`` for ``path``, unchanged."""
    stripped = Utils.testStripping(path)
    return stripped