def getText(directory=None):
    """Collect the text of every supported document in a directory.

    Handling per file extension (case-insensitive):

    * ``.docx`` -- one list entry per paragraph of the document.
    * ``.html`` / ``.htm`` -- a single entry built from
      ``Document(source).title()`` and the text of
      ``Document(source).summary()``, joined by ``'.'``, with newlines
      turned into spaces, carriage returns dropped, and the result passed
      through ``replaceTwoOrMore``.
    * ``.pdf`` -- recognised but not extracted yet (TODO).
    * anything else -- skipped.

    Args:
        directory: Folder to scan (not recursive). When omitted, the
            ``unsupervised/documents`` folder relative to the current
            working directory is used, as before.

    Returns:
        A list of text strings, in ``os.listdir`` order.
    """
    if directory is None:
        # os.path.join instead of a literal '\\' so the default location
        # resolves on non-Windows platforms as well.
        directory = os.path.join('unsupervised', 'documents')

    dataList = []
    for f in os.listdir(directory):
        filePath = os.path.join(directory, f)
        extension = os.path.splitext(filePath)[1].lower()

        # The per-type debug messages are disabled; the blank line they left
        # behind is still emitted once per file.
        print('')

        if extension == '.docx':
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)
        elif extension == '.pdf':
            # TODO: PDF text extraction is not implemented.
            pass
        elif extension in ('.html', '.htm'):
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            flatTitle = title.replace('\n', ' ').replace('\r', '')
            flatBody = soup.text.replace('\n', ' ').replace('\r', '')
            dataList.append(replaceTwoOrMore(flatTitle + '.' + flatBody))
        # Any other extension is an unsupported document type: nothing to add.

        # Blank separator line after each file (disabled "-----" rule).
        print('')
    return dataList
def _collectTexts(directory):
    """Return the extracted text of every supported file in ``directory``.

    Same extraction rules as getText(): one entry per ``.docx`` paragraph,
    one cleaned "title.body" entry per ``.html``/``.htm`` file; ``.pdf`` is
    recognised but not parsed yet, and other file types are skipped.
    """
    dataList = []
    for f in os.listdir(directory):
        # os.path.join instead of a literal '\\' keeps the path valid on
        # non-Windows platforms.
        filePath = os.path.join(directory, f)
        extension = os.path.splitext(filePath)[1].lower()

        # The per-type debug messages are disabled; the blank line they left
        # behind is still emitted once per file.
        print('')

        if extension == '.docx':
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)
        elif extension == '.pdf':
            # TODO: PDF text extraction is not implemented.
            pass
        elif extension in ('.html', '.htm'):
            with codecs.open(filePath, errors='ignore') as myfile:
                source = myfile.read()
            article = Document(source).summary()
            title = Document(source).title()
            soup = BeautifulSoup(article, 'lxml')
            flatTitle = title.replace('\n', ' ').replace('\r', '')
            flatBody = soup.text.replace('\n', ' ').replace('\r', '')
            dataList.append(replaceTwoOrMore(flatTitle + '.' + flatBody))
        # Any other extension is an unsupported document type: nothing to add.

        # Blank separator line after each file (disabled "-----" rule).
        print('')
    return dataList


def main():
    """Print TF-IDF scores for the words found in the ``documents`` folder.

    All extracted text is joined into a single blob, every word that is not
    an English stop word is scored with ``tfidf``, and the words are printed
    from highest to lowest score.
    """
    dataList = _collectTexts('documents')

    # Load the stop-word list once; a set gives constant-time membership
    # tests instead of re-reading and scanning the list for every word.
    stopWords = set(stopwords.words("english"))

    combined = ' '.join(dataList)
    bloblist = [tb(combined)]
    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, blob, bloblist)
                  for word in blob.words
                  if word not in stopWords}
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))