import json
import os

import Utils


def scanfolder():
    # Build the master vocabulary: strip every learning document in
    # every category under ./dict and collect the words.
    biglist = []
    for category in os.listdir('./dict'):
        print("Category: " + category)
        documents = os.listdir('./dict/' + category)[:900]
        stripped_documents = [Utils.testStripping('./dict/' + category + '/' + doc)
                              for doc in documents]
        for strip_doc in stripped_documents:
            biglist.extend(strip_doc)

    # Remove duplicates.
    print('Current length :', len(biglist))
    biglist = list(set(biglist))
    print('New size       :', len(biglist))

    # Save to disk as a word -> count dict, initialised to zero.
    bigdict = {word: 0 for word in biglist}
    with open("./mainDict.json", "w") as outfile:
        json.dump(bigdict, outfile, indent=4, separators=(',', ': '))
    print("Total length   :", len(bigdict))
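# A minimal usage sketch, not part of the original module: read the
# vocabulary written by scanfolder() back in; every word maps to a
# zeroed count. The function name loadMainDict is hypothetical.
def loadMainDict():
    with open("./mainDict.json") as infile:
        return json.load(infile)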
def sp_getTextDocument(path):
    # Count how often each word occurs in a single stripped document.
    words = Utils.testStripping(path)
    result = {}
    for word in words:
        try:
            result[word] += 1
        except KeyError:
            result[word] = 1
    return result
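# The same tally can be produced with collections.Counter; this is a
# sketch of an equivalent, not the original implementation, and the
# function name sp_getTextDocumentCounter is hypothetical.
from collections import Counter

def sp_getTextDocumentCounter(path):
    # Counter performs the per-word tally the loop above does by hand.
    return dict(Counter(Utils.testStripping(path)))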
def getExampleArticlesFromSubCat(subcat):
    # Ex: subcat = dict/rec.autos
    # Pick the n learning articles from this subset; the cutoff
    # numberOfLearningArticlesInSubset is expected to be defined at
    # module level.
    lst = os.listdir(subcat)[:numberOfLearningArticlesInSubset]

    # Open each file and add its stripped words to one list.
    subsetWords = []
    for fname in lst:
        subsetWords.extend(Utils.testStripping(subcat + "/" + fname))

    print("Subset word count:", len(subsetWords))
    return subsetWords
def getTextDocument(path):
    # Return the stripped word list for a single document.
    return Utils.testStripping(path)
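# Hypothetical driver, assuming the ./dict corpus layout used above
# (./dict/<category>/<article>) and that Utils and
# numberOfLearningArticlesInSubset are available as in the rest of the
# module; rec.autos is the example category from the comments above.
if __name__ == "__main__":
    scanfolder()  # writes ./mainDict.json
    words = getExampleArticlesFromSubCat("dict/rec.autos")
    print("First few words:", words[:10])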