def _shell_build_and_report(corpus, rootpath, filelimit):
    """Run ``buildcorpus`` on one directory and print how long it took.

    Returns a ``(started, finished)`` pair of ``datetime`` objects so the
    caller can keep using the timestamps for later elapsed-time reports.
    """
    started = datetime.now()
    buildcorpus(corpus, rootpath, filelimit)
    finished = datetime.now()
    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function; the double spaces reproduce
    # what the original comma-separated print statements emitted.
    print("build corpus took:  %s" % (finished - started,))
    print("corpus length  %s  words" % (len(corpus.words),))
    return started, finished


def shell(filelimit=0,
          corpuspath="/home/dicle/Dicle/Tez/dataset/readingtest300/"):
    """Build a corpus from ``corpuspath``, POS-tag it and pickle its words.

    If ``corpuspath`` has no sub-folders (as reported by
    ``IOtools.getfoldernames_of_dir``) the directory itself is passed to
    ``buildcorpus``; otherwise every sub-folder is fed, one after another,
    into the same ``Corpus`` object.  Afterwards the words are reloaded via
    ``picklegetwords``, POS tags are assigned, and the words are dumped with
    ``pickledumpwords``.  Progress and timings are printed to stdout.

    Parameters:
        filelimit: forwarded unchanged to ``buildcorpus`` for every directory.
            NOTE(review): the original forwarded it only in the single-folder
            case; its exact meaning is defined by ``buildcorpus`` -- confirm
            that its default there matches the old multi-folder behaviour.
        corpuspath: root directory of the corpus.  Previously a hard-coded
            local; the default keeps the old value.

    Returns:
        None.
    """
    folders = IOtools.getfoldernames_of_dir(corpuspath)
    foldername = ""  # stays empty when the corpus root has no sub-folders
    corpus = Corpus(corpuspath)

    if not folders:
        starttime, endtime_buildcorpus = _shell_build_and_report(
            corpus, corpuspath, filelimit)
    else:
        for foldername in folders:
            print("Folder:  %s" % (foldername,))
            # The trailing "" makes join() end the path with a separator, as
            # the original concatenation did, without doubling the separator
            # when corpuspath already ends with one.
            rootpath = os.path.join(corpuspath, foldername, "")
            # The timestamps are overwritten on every iteration, so after the
            # loop they describe the last folder only (as in the original).
            starttime, endtime_buildcorpus = _shell_build_and_report(
                corpus, rootpath, filelimit)

    print("pickle-getting words")
    corpus.picklegetwords()

    print("assigning pos tags")
    assignPOStags(corpus)
    endtime_postags = datetime.now()
    # This interval starts at the end of the (last) corpus build, so it also
    # covers the picklegetwords() call above.
    print("postag assignment took:  %s"
          % (endtime_postags - endtime_buildcorpus,))

    # Disabled experiment kept from the original (was a bare string literal):
    # get_magnitudewords_doc_matrix(corpus)
    # adjectives = get_words_ofPOStag(corpus, "ADJ")
    # print "numof adjectives, ",len(adjectives)," ",adjectives[:-10]
    # get_docterm_matrix(corpus, adjectives, "adjective-doc-matrix.txt", record = True)

    endtime = datetime.now()
    passtime = endtime - starttime
    print("Elapsed time:  %s  on folder  %s" % (passtime, foldername))

    print("pickle-dumping words")
    corpus.pickledumpwords()
    # Take the timestamp AFTER the dump; the original sampled it before the
    # call, so the reported pickle time never included the dump itself.
    endtimep = datetime.now()

    print("Corpus length:  %s" % (len(corpus.words),))
    print("Elapsed time for pickle:  %s" % (endtimep - endtime,))

    # PICKLE words: reload the dumped words, show a sample, dump them again.
    print("pickle-getting words")
    corpus.picklegetwords()
    print("corpus first 20 words:")
    for word in corpus.words[:20]:
        word.toscreen()

    print("pickle-dumping words")
    corpus.pickledumpwords()