Python docxDocument Examples

Programming Language: Python

Namespace/Package Name: docx

Method/Function: docxDocument

Examples at hotexamples.com: 2

Python docxDocument - 2 examples found. These are the top rated real world Python examples of docx.docxDocument extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: chunkedPhrases.py Project: adamstein/mayhem

def getText():
    dataList = []
    for f in os.listdir('unsupervised\\documents'):
        filePath = 'unsupervised\\documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)     #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open (filePath, errors='ignore') as myfile:
                source = myfile.read()
                article = Document(source).summary()
                title = Document(source).title()
                soup = BeautifulSoup(article, 'lxml')
                final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
                dataList.append(final)
                #print '*** TITLE *** \n\"' + title + '\"\n'
                #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undectected document type'
            print '' #"-------------------------------"
    return dataList

Example #2

Show file

File: run.py Project: adamstein/mayhem

def main():
    #print 'Hello there'
    # Command line args are in sys.argv[1], sys.argv[2] ...
    # sys.argv[0] is the script name itself and can be ignored

    dataList = []

    for f in os.listdir('documents'):
        filePath = 'documents\\' + f
        #print filePath
        fileName, fileExtension = os.path.splitext(filePath)
        #print fileExtension
        if fileExtension.lower() == '.docx':
            print '' #'its a {0} {1}{2}'.format('word document', fileName, fileExtension)
            doc = docxDocument(filePath)
            for p in doc.paragraphs:
                dataList.append(p.text)     #print p.text
            #print "-------------------------------"
        elif fileExtension.lower() == '.pdf':
            print '' #'its a {0} {1}{2}'.format('pdf document', fileName, fileExtension)
            # with open(filePath) as f:
            #     doc = slate.PDF(f)
            #     print doc[1]
            #     exit()


            #TODO
        elif ((fileExtension.lower() == '.html') or (fileExtension.lower() == '.htm')):
            print '' #'its a {0} {1}{2}'.format('html file', fileName, fileExtension)
            with codecs.open (filePath, errors='ignore') as myfile:
                source = myfile.read()
                article = Document(source).summary()
                title = Document(source).title()
                soup = BeautifulSoup(article, 'lxml')
                final = replaceTwoOrMore((title.replace('\n', ' ').replace('\r', '') + '.' + soup.text.replace('\n', ' ').replace('\r', '')))
                dataList.append(final)
                #print '*** TITLE *** \n\"' + title + '\"\n'
                #print '*** CONTENT *** \n\"' + soup.text + '[...]\"'
        else:
            print '' # 'undectected document type'
            print '' #"-------------------------------"

    #print dataList
    #for i in dataList:
    #    print i
    cachedStopWords = stopwords.words("english")
    combined = ' '.join(dataList)

    #print combined
    bloblist = [tb(combined)]

    for i, blob in enumerate(bloblist):
        print("Top words in document {}".format(i + 1))
        scores = {word: tfidf(word, blob, bloblist) for word in blob.words if word not in nltk.corpus.stopwords.words('english')}
        #print scores
        sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
        #print sorted_words
        for word, score in sorted_words:
            print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))