Python CreateDatabase.addSingleBookDataの例

プログラミング言語: Python

名前空間/パッケージ名: src.dao.BookDao

クラス/型: CreateDatabase

メソッド/関数: addSingleBookData

hotexamples.comのコード掲載数: 2

Python CreateDatabase.addSingleBookData - 2件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのsrc.dao.BookDao.CreateDatabase.addSingleBookDataの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

CreateDatabase(13)

addingData(8)

creatingDatabase(8)

getMaxBookID(7)

findByBookName(5)

removeBook(2)

findByIsbn_13Name(2)

addSingleBookData(2)

findBySimlarBookName(1)

findBookByIsbn(1)

findBookByPreviousMaxId(1)

findBookByNextMaxId(1)

findBookByFileName(1)

findBook(1)

findAllBook(1)

countAllBooks(1)

saveBook(1)

コード例 #1

ファイルを表示

ファイル: ebook777.py プロジェクト: rashmikeshri/Opal

class ItEbook(object):
    '''
    This class downloads first page of itebookinfo
    '''
    def __init__(self, baseUrl=None):
        '''
        Store the site base URL and prepare the helpers used while crawling.

        @param baseUrl: prefix used by findAllBookUrl to build listing-page URLs.
        '''
        self.baseUrl = baseUrl

        # Base directory under which downloadDir creates one folder per book.
        workspace = Workspace()
        self.directory_name = workspace.libraryPath

        # Database access object used for lookups and for registering books.
        self.createDatabase = CreateDatabase()

        # Headers sent with every HTTP request issued by this class.
        self.header_info = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        }

        # Filled in by findBookDetail with the cover image URL of the current book.
        self.imageUrl = None
        self.bookUrl = None

    def getUrl(self, baseUrl):
        '''this method will find and constuct all url of url given'''
        return self.baseUrl

    def findAllBookUrl(self, subUrl=None):
        '''
        Scan one listing page and download every book that is not stored yet.

        The page at <baseUrl>/<subUrl> (or baseUrl alone when subUrl is None)
        is fetched; every <a class="title"> whose text is not a navigation or
        category caption is treated as a book link. A book is downloaded and
        registered only when neither its name nor its ISBN-13 is found in the
        database.

        @param subUrl: path below baseUrl, e.g. 'page/2/'.
        @return: None. Nothing happens when the page does not answer with 200.
        '''
        # Tolerate the declared default instead of failing on str + None.
        if subUrl is None:
            url = self.baseUrl
        else:
            url = self.baseUrl + '/' + subUrl
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code != 200:
            return

        # Link captions that belong to the site menu rather than to a book.
        skipList = frozenset([
            'HOME', 'Category', 'Animals', 'Architecture', 'Art',
            'Astronomy', 'Biography', 'Biology', 'Business', 'Chemistry',
            'Cinema', 'Cookbooks', 'Cryptography', 'Culture', 'Design',
            'Drawing', 'Economics', 'Encyclopedia and Dictionary',
            'Engineering and Technology', 'Family and Friendship',
            'Fitness', 'Gambling', 'Games', 'Hardware', 'Healthcare',
            'History', 'Hobbies', 'Information Technologies', 'IT ebooks',
            'Languages', 'Martial Arts', 'Mathematics', 'Medicine',
            'Military', 'Music', 'Novels', 'Other', 'Personality',
            'Philosophy', 'Photo', 'Physics', 'Poetry',
            'Politics and Sociology', 'Programming', 'Psychology',
            'Relationships', 'Religion', 'Science', 'Security',
            'Sexuality', 'Software', 'Sport', 'Travel', 'Web Development'
        ])

        soup = BeautifulSoup(r.content, "lxml")
        for link in soup.find_all('a', 'title'):
            bookName = link.text
            if bookName.strip() == '' or bookName in skipList:
                continue
            if self.isBookNameAvailableInDatabase(bookName):
                continue

            href = link.get('href')
            # A broken detail page must not abort the rest of the listing.
            try:
                book, bookUrl = self.findBookDetail(href)
            except Exception:
                print(href)
                traceback.print_exc()
                continue
            if book is None:
                # Detail page did not answer with 200; nothing to download.
                continue
            if self.isIsbnAvailableInDatabase(book.isbn_13):
                continue

            # Download problems are reported and the crawl moves on; catching
            # Exception (not everything) keeps Ctrl-C working.
            try:
                print('uploading database')
                directory_name = self.downloadEbook(book, href, bookUrl)
                self.updateDatabase(directory_name)
            except Exception:
                print(href)
                traceback.print_exc()

    def updateDatabase(self, directory_name):
        #         self.createDatabase.creatingDatabase()
        #         self.createDatabase.addingData()
        self.createDatabase.addSingleBookData(directory_name)

    def isIsbnAvailableInDatabase(self, isbn_13=None):
        isBookPresent = False
        book = self.createDatabase.findByIsbn_13Name(isbn_13)
        if book:
            isBookPresent = True
        return isBookPresent

    def isBookNameAvailableInDatabase(self, bookName=None):
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def findBookDetail(self, subUrl):
        '''
        Fetch a book detail page and build a Book object from it.

        Example page: http://www.ebook777.com/shut-youre-welcome/

        Side effect: self.imageUrl is set to the cover image URL found in the
        detail table.

        @param subUrl: complete URL of the detail page.
        @return: (book, bookUrl). Both are None when the page does not answer
                 with HTTP 200.
        '''
        book = None
        # Initialised up front: previously a non-200 response reached the
        # return statement with bookUrl unbound and raised UnboundLocalError.
        bookUrl = None

        r = requests.get(subUrl, headers=self.header_info, timeout=30)
        if r.status_code != 200:
            return book, bookUrl

        soup = BeautifulSoup(r.content, "lxml")
        content = soup.find(id="main-content-inner")
        details = content.find(class_='article-details')

        book = Book()
        book.bookDescription = content.p.text
        book.bookName = details.find(class_='title').text
        book.subTitle = details.find(class_='subtitle').text
        bookUrl = content.find(class_='download-links').find('a')['href']

        # Captions of two-column rows whose text is copied as-is, mapped to
        # the Book attribute that receives it.
        plainFields = {
            'File size': 'fileSize',
            'Pages': 'numberOfPages',
            'Language': 'inLanguage',
            'File format': 'bookFormat',
            'Category': 'tag',
            'Isbn': 'isbn_13',
        }

        for row in soup.find('table').find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 3:
                # Three-column rows carry the cover image in the first cell.
                book.bookImgName = cols[0].img.attrs['alt']
                self.imageUrl = cols[0].img.attrs['src']
                if cols[1].text == 'Author':
                    author = Author()
                    author.authorName = cols[2].text
                    book.authors.append(author)
            elif len(cols) == 2:
                label = cols[0].text
                value = cols[1].text
                if label == 'Year':
                    # An unparsable year falls back to the current date.
                    try:
                        book.publishedOn = datetime.strptime(value, '%Y')
                    except ValueError:
                        book.publishedOn = datetime.now()
                elif label in plainFields:
                    setattr(book, plainFields[label], value)

        return book, bookUrl

    def downloadEbook(self, book, refUrl, bookUrl):
        '''
        Download cover, metadata and the book file into a fresh book folder.

        Steps: create/resolve the folder via downloadDir, save the cover from
        self.imageUrl, write book.json, download bookUrl into the folder under
        the last path segment of bookUrl, then try to unpack any .rar archive.

        @param book: Book object; bookImgName is used as cover file name.
        @param refUrl: detail-page URL; accepted for interface compatibility,
                       not used here.
        @param bookUrl: direct download URL of the book file.
        @return: the folder the files were written to.
        @raise requests.HTTPError: when the book download answers with an
               error status, so an error page is never stored as a book.
        '''
        directory_name = self.downloadDir()
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        self.downloadBookImage(bookImagePath, self.imageUrl)
        self.writeJsonToDir(directory_name, book)

        r = requests.get(bookUrl, headers=self.header_info, timeout=30)
        print('---------------> %s' % r.url)
        # Fail before writing: the caller only registers the folder in the
        # database when this method returns normally.
        r.raise_for_status()

        bookPath = os.path.join(directory_name, bookUrl.split('/')[-1])
        with open(bookPath, 'wb') as bookFile:
            bookFile.write(r.content)

        # Extraction is best effort; the downloaded file is kept either way.
        try:
            self.extractRar(directory_name)
        except Exception:
            traceback.print_exc()
        return directory_name

    def firefoxDownloadJob(self, book, refUrl):
        '''
        Download a book by driving a Firefox browser through Selenium.

        If the book folder already holds exactly three entries nothing is
        downloaded. Otherwise the folder is emptied, cover and book.json are
        written, Firefox opens refUrl and clicks the first link inside
        ".download-links"; the method then polls the folder every 10 seconds
        until no file with the extension "part" is left. In both cases
        extractRar is called on the folder at the end.

        @param book: Book object; bookImgName is used as cover file name.
        @param refUrl: URL of the page that carries the download link.
        @return: None.
        '''
        # Resolve (and create if needed) the folder for this book.
        directory_name = self.downloadDir()

        # refUrl is used as-is; it is not prefixed with self.baseUrl.
        url = refUrl

        # NOTE(review): lsFiles is filled below but never read afterwards.
        lsFiles = []

        # Exactly three entries in the folder: presumably cover, book.json and
        # the book file are already present -- TODO confirm. Only the names of
        # regular files are collected; no download is started.
        if 3 == len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                if os.path.isfile(os.path.join(directory_name, sName)):
                    lsFiles.append(sName)

        # Any other number of entries (fewer or more than three): remove every
        # entry and start from scratch. This condition is the exact negation
        # of the one above, so it behaves like a plain else.
        elif 3 != len(os.listdir(directory_name)):
            for sName in os.listdir(directory_name):
                os.remove(directory_name + '/' + sName)

            # Download the book cover.
            bookImagePath = os.path.join(directory_name, book.bookImgName)
            self.downloadBookImage(bookImagePath, self.imageUrl)

            # Write the metadata file.
            self.writeJsonToDir(directory_name, book)
            # Hard-coded location of the Firefox executable.
            binary = FirefoxBinary('/docs/python_projects/firefox/firefox')

            fp = webdriver.FirefoxProfile()

            # Profile preferences. Judging by their names they send downloads
            # to directory_name and suppress download dialogs for the listed
            # MIME types -- verify against the Firefox preference reference.
            fp.set_preference("webdriver.log.file", "/tmp/firefox_console")
            fp.set_preference("browser.download.folderList", 2)
            fp.set_preference('browser.download.manager.showWhenStarting',
                              False)
            fp.set_preference('browser.download.manager.focusWhenStarting',
                              False)
            fp.set_preference("browser.download.dir", directory_name)
            fp.set_preference("browser.download.manager.scanWhenDone", False)
            fp.set_preference("browser.download.manager.useWindow", False)
            fp.set_preference(
                "browser.helperApps.neverAsk.saveToDisk",
                "application/octet-stream,application/xml,application/pdf,text/plain,text/xml,image/jpeg,text/csv,application/zip,application/x-rar-compressed"
            )
            fp.set_preference("browser.helperApps.alwaysAsk.force", False)
            fp.set_preference("browser.popups.showPopupBlocker", False)
            fp.update_preferences()
            driver = webdriver.Firefox(firefox_profile=fp,
                                       firefox_binary=binary)
            # Open the page and click the first anchor under ".download-links".
            driver.get(url)
            efd_link = driver.find_element_by_css_selector(
                ".download-links > a:nth-child(1)")
            efd_link.click()

            # Poll until no regular file in the folder has the extension
            # "part" (presumably Firefox's suffix for unfinished downloads).
            # NOTE(review): there is no upper bound on the number of rounds.
            flag = True
            while (flag):
                time.sleep(10)
                # lst: extensions of the regular files; files: their full paths.
                lst = []
                files = []
                for sName in os.listdir(directory_name):
                    if os.path.isfile(os.path.join(directory_name, sName)):
                        lst.append(sName.split('.')[-1:][0])
                        files.append(os.path.join(directory_name, sName))
                if 'part' not in lst:
                    # Wait another 10 seconds before closing the browser.
                    flag = False
                    time.sleep(10)
                    driver.close()
                else:
                    # Still downloading; a stall check via isBookDownloading
                    # was sketched here but is not active.
                    pass
        # Runs for both branches above.
        self.extractRar(directory_name)

    def downloadBookImage(self, bookImagePath=None, imageUrl=None):
        '''
        Download the image at imageUrl and store it at bookImagePath.

        The response body is written unchanged in binary mode. The former
        function-level imports of PIL.Image and StringIO were never used and
        only made the call fail where those modules are unavailable, so they
        are gone.

        @param bookImagePath: target file path of the image.
        @param imageUrl: HTTP(S) URL of the image.
        @return: None.
        '''
        r = requests.get(imageUrl, headers=self.header_info, timeout=30)
        print('---------------> %s' % r.url)
        with open(bookImagePath, 'wb') as imageFile:
            imageFile.write(r.content)

    def writeJsonToDir(self, bookPath=None, book=None):
        '''
        this function will write json file to given dir.
        '''
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__

                authors.append(author)
            row2dict['authors'] = authors
            if not row2dict['isbn_13'] == None:
                if str(row2dict['isbn_13']).strip() == '':
                    row2dict['isbn_13'] = None
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()
        except:
            traceback.print_exc()

    def isBookDownloading(self, files):
        ''' This method will inform that book is getting downloading or not.'''
        # time.sleep(2)
        dic_files = {}
        time_dic_files = {}
        i = 1
        checkFlagForSize = True
        isDownloading = True
        for fl in files:
            dic_files[fl] = str(os.stat(fl).st_size)
        while (checkFlagForSize):

            time_dic_files[i] = dic_files
            i = i + 1
            if i > 4:
                size = set()
                for k in time_dic_files[i - 1]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 1][k])
                for k in time_dic_files[i - 2]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 2][k])
                for k in time_dic_files[i - 3]:
                    if 'part' in k:
                        size.add(time_dic_files[i - 3][k])
#                 print len(list(size))
                if len(list(size)) > 1:
                    isDownloading = False
            checkFlagForSize = False
        logging.info('isDownloading:')
        return isDownloading

    def startDownload(self):
        #         baseUrl = 'http://itebooks.website'
        #         baseUrl = 'http://it-ebooks.directory'
        baseUrl = 'http://www.ebook777.com'
        itebook = ItEbook(baseUrl)
        # TODO need to be updated
        logicTrue = True
        i = 1100
        while logicTrue:
            subUrl = 'page/' + str(i) + '/'
            itebook.findAllBookUrl(subUrl)
            i = i + 1
            print 'startDownload---------->', str(i)


#             if i==4:
#                 break

    def getMaxBookID(self):
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0
        return maxBookId

    def downloadDir(self):
        '''
        This function will create directory to download book.
        @param number:it takes database maxId+1 to create new directory . 
        '''
        directory_name = os.path.join(self.directory_name,
                                      str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name)
            os.chdir(directory_name)
        return directory_name

    def extractRar(self, directory_name):
        '''
        Unpack every .rar archive that lies directly in directory_name.

        Archive members ending in .html, .htm or .txt are skipped; everything
        else is extracted into directory_name. The working directory of the
        process is changed to directory_name, as before.

        @param directory_name: folder that holds the downloaded archive(s).
        @return: None.
        '''
        os.chdir(directory_name)
        skippedSuffixes = ('.html', '.htm', '.txt')

        for fileName in os.listdir(directory_name):
            archivePath = os.path.join(directory_name, fileName)
            if os.path.isdir(archivePath) or not fileName.endswith(".rar"):
                continue
            rar = rarfile.RarFile(archivePath)
            # Close the archive even when extraction of a member fails.
            try:
                for name in rar.namelist():
                    if not name.endswith(skippedSuffixes):
                        rar.extract(name, directory_name)
            finally:
                rar.close()

コード例 #2

ファイルを表示

class FullCircleMagazine():
    
    def __init__(self, baseUrl=None):
        '''
        Store the base URL and prepare library path, database access and HTTP headers.

        @param baseUrl: kept on the instance as given.
        '''
        self.baseUrl = baseUrl

        # Base directory under which createDownloadDir makes one folder per issue.
        workspace = Workspace()
        self.directory_name = workspace.libraryPath

        # Database access object used for lookups and for registering issues.
        self.createDatabase = CreateDatabase()

        # Headers sent with every HTTP request issued by this class.
        userAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0'
        self.header_info = {'User-Agent': userAgent}

        # Cover image source of the issue being processed (set by startDownload).
        self.imageUrl = None
        self.bookUrl = None
    
    def downloadFullCircleMagazine(self, url, book=None, bookUrl=None):
        '''
        Download one magazine issue and register it in the database.

        On HTTP 200 the method stores the file size on book, writes book.json,
        saves the cover from self.imageUrl, writes the PDF into the issue
        folder and calls updateDatabase. On any other status only the folder
        has been prepared.

        @param url: direct URL of the PDF, e.g.
                    http://dl.fullcirclemagazine.org/issue1_en.pdf
        @param book: Book object describing the issue (required despite the
                     default of None; bookImgName is read from it).
        @param bookUrl: accepted for interface compatibility, not used.
        @return: (HTTP status code, issue folder).
        '''
        directory_name = self.createDownloadDir()
        bookImagePath = os.path.join(directory_name, book.bookImgName)
        os.chdir(directory_name)
        r = requests.get(url, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            print('%s %s' % (r.status_code, url))

            # Fall back to the body length when the server sends no
            # content-length header instead of failing with KeyError.
            contentLength = r.headers.get("content-length")
            if contentLength is not None:
                sizeInBytes = int(contentLength)
            else:
                sizeInBytes = len(r.content)
            # Divide by a float: under Python 2 the former int / int dropped
            # the fraction before round() could keep two decimals.
            sizeInMb = round(sizeInBytes / 1000000.0, 2)
            print('-------> %s' % sizeInMb)
            book.fileSize = str(sizeInMb) + ' MB'

            self.writeJsonToDir(directory_name, book)
            self.downloadBookImage(bookImagePath, self.imageUrl)
            print('---------------> %s' % r.url)
            bookPath = os.path.join(directory_name, url.split('/')[-1])
            print(bookPath)
            with open(bookPath, 'wb') as bookFile:
                bookFile.write(r.content)
            self.updateDatabase(directory_name)
        return r.status_code, directory_name
    
    def createBookDetail(self, bookName=None):
        '''
        Build the Book object that describes one Full Circle issue.

        @param bookName: issue label such as 'Issue 12'; it is appended to
                         "Full Circle " for the name and used as the stem of
                         the cover file name.
        @return: a new Book with fixed magazine metadata filled in.
        '''
        book = Book()
        # Attribute/value pairs, applied in this order.
        attributes = (
            ('bookName', "Full Circle "+ bookName),
            ('bookFormat', 'pdf'),
            ('tag', 'Technology'),
            ('inLanguage', 'English'),
            ('subTitle', 'Magazine'),
            ('publisher', "Full Circle"),
            ('bookImgName', bookName + '.jpg'),
            ('hasCover', 'Yes'),
            ('hasCode', 'No'),
        )
        for attributeName, attributeValue in attributes:
            setattr(book, attributeName, attributeValue)
        return book
            
    def writeJsonToDir(self, bookPath=None, book=None):
        try:
            f = open(os.path.join(bookPath, 'book.json'), 'w')
            row2dict = book.__dict__
            authors = []
            if type(row2dict['publishedOn']) == datetime:
                row2dict['publishedOn'] = str(row2dict['publishedOn'])
            for a in row2dict['authors']:
                author = {}
                if type(a) == str:
                    author['authorName'] = a
                else:
                    author = a.__dict__
                
                authors.append(author)
            row2dict['authors'] = authors
            if not row2dict['isbn_13'] == None:
                if str(row2dict['isbn_13']).strip() == '':
                    row2dict['isbn_13'] = None
            f.write(json.dumps(row2dict, sort_keys=False, indent=4))
            f.close()     
        except:
            traceback.print_exc()   
            
    def downloadBookImage(self, bookImagePath=None, imageUrl=None):
        '''
        this method will download image from imageUrl location and keep it at bookImagePath
        '''
        print imageUrl
        head, data = imageUrl.split(',', 1)
        bits = head.split(';')
        mime_type = bits[0] if bits[0] else 'text/plain'
        charset, b64 = 'ASCII', False
        for bit in bits:
            if bit.startswith('charset='):
                charset = bit[8:]
            elif bit == 'base64':
                b64 = True
        
        # Do something smart with charset and b64 instead of assuming
        plaindata = data.decode("base64")
        
        # Do something smart with mime_type
        with open(bookImagePath, 'wb') as f:
            f.write(plaindata)

        print 'write image complete'
#         from PIL import Image   
#         from StringIO import StringIO
#         r = requests.get(imageUrl, headers=self.header_info, timeout=30)
#         print '--------------->', r.url
#         with open(bookImagePath, 'wb') as imageFile:
#             imageFile.write(r.content)    


    def updateDatabase(self, directory_name):
#         self.createDatabase.creatingDatabase()  
#         self.createDatabase.addingData() 
        self.createDatabase.addSingleBookData(directory_name)
           
    def isIsbnAvailableInDatabase(self, isbn_13=None):
        isBookPresent = False
        book = self.createDatabase.findByIsbn_13Name(isbn_13)
        if book:
            isBookPresent = True
        return isBookPresent
    
    def isBookNameAvailableInDatabase(self, bookName=None):
        isBookPresent = False
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent
    
    def createDownloadDir(self):
        '''
        This function will create directory to download book.
        @param number:it takes database maxId+1 to create new directory . 
        '''
        directory_name = os.path.join(self.directory_name, str(self.getMaxBookID() + 1))
        if not os.path.exists(directory_name):
            os.makedirs(directory_name,755)
            os.chdir(directory_name)
        return directory_name
    
    def getMaxBookID(self):
        maxBookId = self.createDatabase.getMaxBookID()
        if not maxBookId:
            maxBookId = 0        
        return maxBookId
    
    
    def getImageUrl(self, completeUrl, issueCount):
        '''
        Look up the cover image source on an issue page.

        The first <img> inside the element with class "issuetable" is accepted
        only when its alt text is exactly
        'Cover for Issue <issueCount> in English'.

        @param completeUrl: URL of the issue page.
        @param issueCount: issue number as a string.
        @return: the image's src value, or None when the page does not answer
                 with 200, has no such image or the alt text differs.
        '''
        print(completeUrl)
        imageUrl = None
        r = requests.get(completeUrl, headers=self.header_info, timeout=30)
        if r.status_code == 200:
            soup = BeautifulSoup(r.content, "lxml")
            # find() yields None for a missing element; check each step so a
            # page without cover table gives None instead of raising.
            issueTable = soup.find(class_='issuetable')
            cover = issueTable.find('img') if issueTable is not None else None
            expectedAlt = 'Cover for Issue '+issueCount+' in English'
            if cover is not None and cover.get('alt') == expectedAlt:
                imageUrl = cover.get('src')
                print(imageUrl)
        return imageUrl
    
    def startDownload(self):
        logic = True
        i = 1
        while logic:
            pdfUrl = 'http://dl.fullcirclemagazine.org/issue' + str(i) + '_en.pdf'
            completeUrl = 'http://fullcirclemagazine.org/issue-' + str(i) + '/'
            if not self.isIssuePresent(str(i)):
                self.imageUrl = self.getImageUrl(completeUrl,str(i))
                book = self.createBookDetail('Issue ' + str(i))
                status_code, directory_name = self.downloadFullCircleMagazine(book=book, url=pdfUrl)
                print completeUrl, status_code
                if status_code != 200:
                    logic = False
            i = i + 1
    
    
    def isIssuePresent(self, issue=None):
        isBookPresent = False
        bookName="Full Circle Issue " + issue
        book = self.createDatabase.findByBookName(bookName)
        if book:
            isBookPresent = True
        return isBookPresent

    def getIssueDetail(self):
        '''
        Fetch the Full Circle issue index from the Ubuntu wiki and print each table.

        Nothing is printed when the page does not answer with HTTP 200.

        @return: None.
        '''
        url='https://wiki.ubuntu.com/UbuntuMagazine/FullIssueIndex'
        response = requests.get(url, headers=self.header_info, timeout=30)
        if response.status_code != 200:
            return
        soup = BeautifulSoup(response.content, "lxml")
        for table in soup.findAll('table'):
            print(table)