Ejemplo n.º 1
0
class DBConfig:
    '''
    Schema set-up helper for the SQLite database used for file storage.

    Creates the shared "flag" and "configure" tables plus, for every site,
    the tables "source_info<N>", "source<N>", "img_info<N>" and "img<N>".
    init() must be called with the site list before initConfig() or
    initSites() are used.
    '''
    def __init__(self):
        '''
        Open the database whose path is returned by Configure().getDBPath().
        '''
        self.configure = Configure()
        # The first and last character of the configured path are dropped
        # (presumably surrounding quotes -- TODO confirm against Configure).
        dbpath = (self.configure.getDBPath())[1:-1]
        # Single-argument print(...) emits the same text as the former
        # print statement and is valid on both Python 2 and Python 3.
        print("dbpath   %s" % dbpath)
        self.cx = sq.connect(dbpath)
        self.cu = self.cx.cursor()

    def __del__(self):
        '''
        Commit any pending transaction when the object is collected.
        '''
        # __init__ may have raised before the connection was opened; in that
        # case there is nothing to commit.
        cx = getattr(self, 'cx', None)
        if cx is not None:
            cx.commit()

    def init(self, home_list):
        '''
        Manual initialisation: remember the list of sites to configure.

        home_list is a list of dicts; initConfig() reads the 'url' and
        'name' keys of each entry.
        '''
        self.home_list = home_list

    def initConfig(self):
        '''
        Create the "flag" and "configure" tables and insert one "configure"
        row per entry of self.home_list.

        Expected shape of self.home_list:
            [
                {
                    url:    urlstr,
                    name:   namestr,
                },
            ]
        Other keys (e.g. 'date') are ignored here.
        '''
        print('init configure')
        print('init flag')
        self.__create_flag()
        self.cu.execute(
            'CREATE  TABLE IF NOT EXISTS configure ("siteID" INTEGER PRIMARY KEY  AUTOINCREMENT  NOT NULL , "url" CHAR NOT NULL , "name" CHAR NOT NULL)')
        # Values are bound as parameters instead of being interpolated into
        # the SQL text, so quotes in a url or name can neither break the
        # statement nor inject SQL.
        insert_sql = "insert into configure (url, name) values(?, ?)"
        for site in self.home_list:
            params = (site['url'], site['name'])
            print("%s %r" % (insert_sql, params))
            self.cu.execute(insert_sql, params)
        # One commit for the whole batch.
        self.cx.commit()

    def __run_ddl(self, statement):
        '''
        Echo a schema statement and execute it on the shared cursor.
        '''
        print(statement)
        self.cu.execute(statement)

    def __create_flag(self):
        '''
        Create the "flag" table, which stores the in-memory information
        saved after an interruption.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "flag" ("id" INTEGER PRIMARY KEY NOT NULL, "info" TEXT)')

    # In the helpers below the table suffix is formatted with %d, which only
    # accepts numbers, so it cannot carry SQL text.

    def __create_source_info(self, siteID):
        '''
        Create the "source_info<siteID>" table.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "source_info%d" ("docID" INTEGER PRIMARY KEY  NOT NULL , "url" CHAR, "title" CHAR, "date" DATETIME)' % siteID)

    def __create_source(self, siteID):
        '''
        Create the "source<siteID>" table.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "source%d" ("docID" INTEGER PRIMARY KEY  NOT NULL , "source" CHAR, "parsedSource" CHAR)' % siteID)

    def __create_img_info(self, siteID):
        '''
        Create the "img_info<siteID>" table.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "img_info%d" ("id" INTEGER PRIMARY KEY  NOT NULL , "url" CHAR, "width" INTEGER, "height" INTEGER)' % siteID)

    def __create_img(self, siteID):
        '''
        Create the "img<siteID>" table.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "img%d" ("id" INTEGER PRIMARY KEY  NOT NULL , "source" blob)' % siteID)

    def initSites(self):
        '''
        Create the four per-site tables for every entry of self.home_list:
            img<N>, img_info<N>, source<N>, source_info<N>
        where N is the entry's 0-based position in the list.
        '''
        print('init Sites')
        # NOTE(review): suffixes run 0..len-1, whereas configure.siteID is
        # AUTOINCREMENT and starts at 1 in a fresh table -- confirm that
        # callers address the per-site tables by list index.
        for siteID in range(len(self.home_list)):
            self.__create_img(siteID)
            self.__create_img_info(siteID)
            self.__create_source(siteID)
            self.__create_source_info(siteID)

    def getSiteUrls(self):
        '''
        Return the cursor produced by selecting every url from "sites".
        '''
        # NOTE(review): this class never creates a "sites" table (urls are
        # inserted into "configure"); verify the table exists elsewhere or
        # whether "configure" was intended.
        strr = "select url from sites"
        return self.cu.execute(strr)
Ejemplo n.º 2
0
class DBConfig:
    '''
    Schema set-up helper for the SQLite database used for file storage.

    Creates the shared "flag" and "configure" tables plus, for every site,
    the tables "source_info<N>", "source<N>", "img_info<N>" and "img<N>".
    init() must be called with the site list before initConfig() or
    initSites() are used.
    '''
    def __init__(self):
        '''
        Open the database whose path is returned by Configure().getDBPath().
        '''
        self.configure = Configure()
        # The first and last character of the configured path are dropped
        # (presumably surrounding quotes -- TODO confirm against Configure).
        dbpath = (self.configure.getDBPath())[1:-1]
        # Single-argument print(...) emits the same text as the former
        # print statement and is valid on both Python 2 and Python 3.
        print("dbpath   %s" % dbpath)
        self.cx = sq.connect(dbpath)
        self.cu = self.cx.cursor()

    def __del__(self):
        '''
        Commit any pending transaction when the object is collected.
        '''
        # __init__ may have raised before the connection was opened; in that
        # case there is nothing to commit.
        cx = getattr(self, 'cx', None)
        if cx is not None:
            cx.commit()

    def init(self, home_list):
        '''
        Manual initialisation: remember the list of sites to configure.

        home_list is a list of dicts; initConfig() reads the 'url' and
        'name' keys of each entry.
        '''
        self.home_list = home_list

    def initConfig(self):
        '''
        Create the "flag" and "configure" tables and insert one "configure"
        row per entry of self.home_list.

        Expected shape of self.home_list:
            [
                {
                    url:    urlstr,
                    name:   namestr,
                },
            ]
        Other keys (e.g. 'date') are ignored here.
        '''
        print('init configure')
        print('init flag')
        self.__create_flag()
        self.cu.execute(
            'CREATE  TABLE IF NOT EXISTS configure ("siteID" INTEGER PRIMARY KEY  AUTOINCREMENT  NOT NULL , "url" CHAR NOT NULL , "name" CHAR NOT NULL)')
        # Values are bound as parameters instead of being interpolated into
        # the SQL text, so quotes in a url or name can neither break the
        # statement nor inject SQL.
        insert_sql = "insert into configure (url, name) values(?, ?)"
        for site in self.home_list:
            params = (site['url'], site['name'])
            print("%s %r" % (insert_sql, params))
            self.cu.execute(insert_sql, params)
        # One commit for the whole batch.
        self.cx.commit()

    def __run_ddl(self, statement):
        '''
        Echo a schema statement and execute it on the shared cursor.
        '''
        print(statement)
        self.cu.execute(statement)

    def __create_flag(self):
        '''
        Create the "flag" table, which stores the in-memory information
        saved after an interruption.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "flag" ("id" INTEGER PRIMARY KEY NOT NULL, "info" TEXT)')

    # In the helpers below the table suffix is formatted with %d, which only
    # accepts numbers, so it cannot carry SQL text.

    def __create_source_info(self, siteID):
        '''
        Create the "source_info<siteID>" table.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "source_info%d" ("docID" INTEGER PRIMARY KEY  NOT NULL , "url" CHAR, "title" CHAR, "date" DATETIME)' % siteID)

    def __create_source(self, siteID):
        '''
        Create the "source<siteID>" table.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "source%d" ("docID" INTEGER PRIMARY KEY  NOT NULL , "source" CHAR, "parsedSource" CHAR)' % siteID)

    def __create_img_info(self, siteID):
        '''
        Create the "img_info<siteID>" table.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "img_info%d" ("id" INTEGER PRIMARY KEY  NOT NULL , "url" CHAR, "width" INTEGER, "height" INTEGER)' % siteID)

    def __create_img(self, siteID):
        '''
        Create the "img<siteID>" table.
        '''
        self.__run_ddl(
            'CREATE TABLE IF NOT EXISTS "img%d" ("id" INTEGER PRIMARY KEY  NOT NULL , "source" blob)' % siteID)

    def initSites(self):
        '''
        Create the four per-site tables for every entry of self.home_list:
            img<N>, img_info<N>, source<N>, source_info<N>
        where N is the entry's 0-based position in the list.
        '''
        print('init Sites')
        # NOTE(review): suffixes run 0..len-1, whereas configure.siteID is
        # AUTOINCREMENT and starts at 1 in a fresh table -- confirm that
        # callers address the per-site tables by list index.
        for siteID in range(len(self.home_list)):
            self.__create_img(siteID)
            self.__create_img_info(siteID)
            self.__create_source(siteID)
            self.__create_source_info(siteID)

    def getSiteUrls(self):
        '''
        Return the cursor produced by selecting every url from "sites".
        '''
        # NOTE(review): this class never creates a "sites" table (urls are
        # inserted into "configure"); verify the table exists elsewhere or
        # whether "configure" was intended.
        strr = "select url from sites"
        return self.cu.execute(strr)
Ejemplo n.º 3
0
class DBSource:
    '''
    Database operations for HTML, image and other file sources.

    Rows are written to the per-site tables selected by self.siteID
    ("source_info<N>", "source<N>", "img_info<N>", "img<N>") and to the
    shared "flag" table. Caller-supplied values are passed to sqlite3 as
    bound parameters, so they must be of a type sqlite3 can bind.
    NOTE(review): on Python 2, sqlite3 is understood to reject byte strings
    with non-ASCII content as parameters -- confirm callers pass unicode.
    '''
    def __init__(self):
        '''
        Open the database whose path is returned by Configure().getDBPath().
        '''
        self.configure = Configure()
        # The first and last character of the configured path are dropped
        # (presumably surrounding quotes -- TODO confirm against Configure).
        dbpath = (self.configure.getDBPath())[1:-1]
        self.cx = sq.connect(dbpath)
        self.cu = self.cx.cursor()
        # -1 marks "no site selected yet"; init() sets the real value.
        self.siteID = -1

    def __del__(self):
        '''
        Commit any pending transaction when the object is collected.
        '''
        # __init__ may have raised before the connection was opened; in that
        # case there is nothing to commit.
        cx = getattr(self, 'cx', None)
        if cx is not None:
            cx.commit()

    def init(self, siteID):
        '''
        Select the site whose tables the save* methods write to.
        '''
        self.siteID = siteID

    def saveFlag(self, info):
        '''
        Store the information saved after an interruption.

        Any previous content of the "flag" table is deleted first, so the
        table holds only the latest info value.
        '''
        self.cu.execute("delete from flag")
        # Bound parameter: info may contain quotes without breaking the SQL.
        self.cu.execute("insert into flag (info) values(?)", (info,))
        self.cx.commit()

    def saveHtml(self, info, source, parsed_source):
        '''
        Save one HTML document for the current site.

        info = {
            url:    urlstr,
            title:  titlestr,
            date:   date  # the crawl date
        }
        A row is added to "source_info<siteID>" from info, and a row to
        "source<siteID>" holding parsed_source. The source argument is
        accepted but not stored: the "source" column receives an empty
        string.
        '''
        print('-' * 200)
        # Only the numeric table suffix is interpolated (%d accepts numbers
        # only); page-derived text is bound so quotes in HTML or titles can
        # neither break the statement nor inject SQL.
        self.cu.execute(
            "insert into source_info%d (url, title, date) values(?, ?, ?)" % self.siteID,
            (info['url'], info['title'], info['date']))
        print('-' * 200)
        self.cu.execute(
            "insert into source%d (source, parsedSource) values(?, ?)" % self.siteID,
            ("", parsed_source))
        print('-' * 200)
        self.cx.commit()

    def saveImg(self, info, source):
        '''
        Save one image for the current site.

        info = {
            url:    urlstr,
            width:  width,
            height: height
        }
        A row is added to "img_info<siteID>" from info, and the raw image
        data in source is stored as a blob in "img<siteID>".
        '''
        info_sql = "insert into img_info%d (url, width, height) values (?, ?, ?)" % self.siteID
        params = (info['url'], info['width'], info['height'])
        print("%s %r" % (info_sql, params))
        self.cu.execute(info_sql, params)
        self.cu.execute('insert into img%d (source) values (?) ' % self.siteID,
                        (sq.Binary(source),))
        self.cx.commit()

    def getImg(self, siteID, imgID):
        '''
        Fetch the stored image data with id imgID from "img<siteID>".

        Returns the first matching row as a one-element tuple, or None when
        no row has that id.
        '''
        data = self.cu.execute(
            "select source from img%d where id=?" % siteID, (imgID,))
        print(data)
        return data.fetchone()
Ejemplo n.º 4
0
class DBSource:
    '''
    Database operations for HTML, image and other file sources.

    Rows are written to the per-site tables selected by self.siteID
    ("source_info<N>", "source<N>", "img_info<N>", "img<N>") and to the
    shared "flag" table. Caller-supplied values are passed to sqlite3 as
    bound parameters, so they must be of a type sqlite3 can bind.
    NOTE(review): on Python 2, sqlite3 is understood to reject byte strings
    with non-ASCII content as parameters -- confirm callers pass unicode.
    '''
    def __init__(self):
        '''
        Open the database whose path is returned by Configure().getDBPath().
        '''
        self.configure = Configure()
        # The first and last character of the configured path are dropped
        # (presumably surrounding quotes -- TODO confirm against Configure).
        dbpath = (self.configure.getDBPath())[1:-1]
        self.cx = sq.connect(dbpath)
        self.cu = self.cx.cursor()
        # -1 marks "no site selected yet"; init() sets the real value.
        self.siteID = -1

    def __del__(self):
        '''
        Commit any pending transaction when the object is collected.
        '''
        # __init__ may have raised before the connection was opened; in that
        # case there is nothing to commit.
        cx = getattr(self, 'cx', None)
        if cx is not None:
            cx.commit()

    def init(self, siteID):
        '''
        Select the site whose tables the save* methods write to.
        '''
        self.siteID = siteID

    def saveFlag(self, info):
        '''
        Store the information saved after an interruption.

        Any previous content of the "flag" table is deleted first, so the
        table holds only the latest info value.
        '''
        self.cu.execute("delete from flag")
        # Bound parameter: info may contain quotes without breaking the SQL.
        self.cu.execute("insert into flag (info) values(?)", (info,))
        self.cx.commit()

    def saveHtml(self, info, source, parsed_source):
        '''
        Save one HTML document for the current site.

        info = {
            url:    urlstr,
            title:  titlestr,
            date:   date  # the crawl date
        }
        A row is added to "source_info<siteID>" from info, and a row to
        "source<siteID>" holding parsed_source. The source argument is
        accepted but not stored: the "source" column receives an empty
        string.
        '''
        print('-' * 200)
        # Only the numeric table suffix is interpolated (%d accepts numbers
        # only); page-derived text is bound so quotes in HTML or titles can
        # neither break the statement nor inject SQL.
        self.cu.execute(
            "insert into source_info%d (url, title, date) values(?, ?, ?)" % self.siteID,
            (info['url'], info['title'], info['date']))
        print('-' * 200)
        self.cu.execute(
            "insert into source%d (source, parsedSource) values(?, ?)" % self.siteID,
            ("", parsed_source))
        print('-' * 200)
        self.cx.commit()

    def saveImg(self, info, source):
        '''
        Save one image for the current site.

        info = {
            url:    urlstr,
            width:  width,
            height: height
        }
        A row is added to "img_info<siteID>" from info, and the raw image
        data in source is stored as a blob in "img<siteID>".
        '''
        info_sql = "insert into img_info%d (url, width, height) values (?, ?, ?)" % self.siteID
        params = (info['url'], info['width'], info['height'])
        print("%s %r" % (info_sql, params))
        self.cu.execute(info_sql, params)
        self.cu.execute('insert into img%d (source) values (?) ' % self.siteID,
                        (sq.Binary(source),))
        self.cx.commit()

    def getImg(self, siteID, imgID):
        '''
        Fetch the stored image data with id imgID from "img<siteID>".

        Returns the first matching row as a one-element tuple, or None when
        no row has that id.
        '''
        data = self.cu.execute(
            "select source from img%d where id=?" % siteID, (imgID,))
        print(data)
        return data.fetchone()