def seturlinfos(params):
        """Write the page details carried by ``params`` into the news table.

        When ``params.url`` is already stored, the existing row is updated;
        otherwise a complete new row is inserted.

        Args:
            params: object exposing ``url``, ``type``, ``title``, ``body``,
                ``pubtime``, ``cmtnum``, ``clicknum``, ``fansnum`` and
                ``votenum``. ``query``, ``channel`` and ``createtime`` are
                read as well when a new row has to be inserted.
        """

        def fill_text(row):
            # The title is always stored; the body is skipped for video pages.
            row[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(
                params.title)
            if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
                row[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(
                    params.body)

        def fill_counters(row):
            # Counters shared by both paths, stamped with the current time.
            row[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
            row[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
            row[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
            row[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
            row[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()

        news_id = NewsStorage.getid(params.url)

        if not NewsStorage.exist(params.url):
            # Unknown URL: build the full row and insert it.
            row = {SQLDAO.SPIDER_TABLE_NEWS_TYPE: params.type}
            fill_text(row)
            row[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = (
                TimeUtility.getuniformtime(params.pubtime))
            fill_counters(row)
            row.update({
                SQLDAO.SPIDER_TABLE_NEWS_ID: news_id,
                SQLDAO.SPIDER_TABLE_NEWS_URL: params.url,
                SQLDAO.SPIDER_TABLE_NEWS_QUERY: params.query,
                SQLDAO.SPIDER_TABLE_NEWS_CHANNEL: params.channel,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: params.createtime,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                NewsStorage.LOCALMACHINEFLAG,
            })
            SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                        SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                        SQLDAO.getvaluesfromkeys(row))
            return

        # Known URL: refresh the mutable columns only. The type column is
        # not rewritten on this path.
        doc = NewsStorage.getdoc(params.url)
        row = {}
        fill_text(row)
        # Only overwrite the publish date while the stored one is still the
        # placeholder value (or absent from the stored document).
        stored_pubdate = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                                 TimeUtility.getintformtime(0))
        if stored_pubdate == TimeUtility.getintformtime(0):
            row[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = (
                TimeUtility.getuniformtime(params.pubtime))
        fill_counters(row)
        SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                    {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id},
                                    row)
Example #2
0
 def updatedb(self):
     """Move expired news rows of this machine into the cold table.

     Every row of ``SPIDER_TABLE_NEWS`` whose machine flag equals
     ``ETLController.LOCALMACHINEFLAG`` is inspected. A row counts as
     expired when its age in seconds exceeds ``self.period * 24*60*60``.
     The age is measured from the create date when the publish date equals
     ``TimeUtility.getintformtime(0)``, and from the publish date otherwise.

     Expired rows are deleted from the news table one at a time and then
     inserted together into ``SPIDER_TABLE_NEWS_COLD``. A failure on a single
     row is logged and that row is skipped.
     """
     # Do not delete the commented-out query below.
     #wheref = '{key1}={val1} and \
     #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) > {secs}) or \
     #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) > {secs}))'
     #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
     #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
     #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
     #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
     #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
     #secs =self.period * 24*60*60
     #)
     where = {
         SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
         ETLController.LOCALMACHINEFLAG
     }
     results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
     colddata = []
     for result in results:
         data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
         try:
             publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
             createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
             # Rows without a real publish date age from their create date.
             if publishdate == TimeUtility.getintformtime(0):
                 age = SQLDAO.gettime() - createdate
             else:
                 age = SQLDAO.gettime() - TimeUtility.getinttime(publishdate)
             if age > self.period * 24 * 60 * 60:
                 news_id = data[SQLDAO.SPIDER_TABLE_NEWS_ID]
                 colddata.append(result)
                 SQLDAO.getinstance().delete(
                     SQLDAO.SPIDER_TABLE_NEWS,
                     {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id})
         except Exception:
             # Best effort per row. Catching Exception instead of a bare
             # except lets KeyboardInterrupt/SystemExit stop the job.
             Logger.printexception()
             Logger.log(data[SQLDAO.SPIDER_TABLE_NEWS_URL],
                        constant.ERRORCODE_WARNNING_OTHERS)
     if colddata:
         # NOTE(review): the keyword is spelled 'mutli' on purpose here --
         # presumably it matches the DAO's parameter name; confirm.
         SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS_COLD,
                                     SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                     colddata,
                                     mutli=True)
 def seturlinfo(url, key=None, value=None, data=None):
     """Set one field (or a batch of fields) on the news row of ``url``.

     Args:
         url: page URL identifying the row; its id comes from
             ``NewsStorage.getid``.
         key: column name to set when ``data`` is empty.
         value: new value for ``key``. When ``key`` is the publish-date
             column the value is first passed through
             ``TimeUtility.getuniformtime``.
         data: optional mapping of column -> value. When non-empty it is
             written with a single update call and ``key``/``value`` are
             ignored; the update date is not touched on this path.

     Without ``data``: an existing row is updated only if the stored value
     differs from ``value`` (the update date is refreshed as well); an
     unknown URL gets a new row built from the current spider configuration.
     """
     news_id = NewsStorage.getid(url)
     if data:
         SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                     {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id},
                                     data)
         return
     if SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE == key:
         value = TimeUtility.getuniformtime(value)
     if NewsStorage.exist(url):
         doc = NewsStorage.getdoc(url)
         # Skip the write when the stored value is already up to date.
         if doc.get(key, '') != value:
             changes = {
                 key: value,
                 SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime()
             }
             SQLDAO.getinstance().update(
                 SQLDAO.SPIDER_TABLE_NEWS,
                 {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id},
                 changes)
     else:
         row = {}
         row[SQLDAO.SPIDER_TABLE_NEWS_ID] = news_id
         row[SQLDAO.SPIDER_TABLE_NEWS_URL] = url
         row[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = SpiderConfigure.getinstance(
         ).getquery()
         row[SQLDAO.
             SPIDER_TABLE_NEWS_CHANNEL] = SpiderConfigure.getinstance(
             ).getchannel()
         row[SQLDAO.
             SPIDER_TABLE_NEWS_MACHINEFLAG] = NewsStorage.LOCALMACHINEFLAG
         row[SQLDAO.
             SPIDER_TABLE_NEWS_CREATE_DATE] = SpiderConfigure.getinstance(
             ).starttime()
         row[key] = value
         row[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
         SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                     SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                     SQLDAO.getvaluesfromkeys(row))
 def getcount(url, before=False):
     """Count the comments stored for ``url``.

     Args:
         url: page URL whose comments are counted.
         before: when true, only comments whose create date is not later
             than ``SQLDAO.gettime()`` are counted; otherwise every comment
             of the URL is counted.

     Returns:
         The result of ``SQLDAO.getinstance().count`` for the chosen filter.
     """
     if before:
         # NOTE(review): url is interpolated into the condition text without
         # escaping; a URL containing a double quote breaks the condition.
         # Prefer a parameterized filter if the DAO offers one -- confirm.
         condition = '{urlkey}=\"{url}\" and {datekey}<={date}'.format(
             urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
             url=url,
             datekey=SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE,
             date=SQLDAO.gettime())
         return SQLDAO.getinstance().count(SQLDAO.SPIDER_TABLE_COMMENTS,
                                           where=condition)
     url_filter = {SQLDAO.SPIDER_TABLE_COMMENTS_URL: url}
     return SQLDAO.getinstance().count(SQLDAO.SPIDER_TABLE_COMMENTS,
                                       url_filter)
 def storeurl(url):
     """Insert a bare news row for ``url`` unless one is already stored.

     The new row carries the id from ``NewsStorage.getid``, the URL, the
     query/channel/start time of the current ``SpiderConfigure`` instance,
     the local machine flag and the current time as update date.
     """
     news_id = NewsStorage.getid(url)
     if NewsStorage.exist(url):
         # Nothing to do for a URL that is already known.
         return
     row = {
         SQLDAO.SPIDER_TABLE_NEWS_ID: news_id,
         SQLDAO.SPIDER_TABLE_NEWS_URL: url,
         SQLDAO.SPIDER_TABLE_NEWS_QUERY:
         SpiderConfigure.getinstance().getquery(),
         SQLDAO.SPIDER_TABLE_NEWS_CHANNEL:
         SpiderConfigure.getinstance().getchannel(),
         SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: NewsStorage.LOCALMACHINEFLAG,
         SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
         SpiderConfigure.getinstance().starttime(),
         SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime(),
     }
     dao = SQLDAO.getinstance()
     dao.insert(SQLDAO.SPIDER_TABLE_NEWS,
                SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                SQLDAO.getvaluesfromkeys(row))
Example #6
0
    def dumpurls(self):
        #dump本台机器query对应的urllsit, 并存储到对应的文件中
        s2file = SpiderConfigure.getinstance().gets2file()
        s2temppath = Storage.getstoragelocation(
            const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
        #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
        querys = QueryStorage.getinstance().getlocalquerys(
            s2temppath, ETLController.LOCALMACHINEFLAG)
        for query in querys:
            Logger.getlogging().debug(
                'Now, Starting Select url to Insert and Update for uploading location urlfile!'
            )
            self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
            self.conf.setquery(query)
            #此处注释请勿删除
            #1.转换周期内数据
            # 1.1pulishdate存在,时间为最近一周
            # 2.1publistdate为0,使用创建时间,时间为最近一周
            #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
            #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
            #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
            #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
            #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
            #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #starttime = SpiderConfigure.getinstance().starttime(),
            #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
            #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
            #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
            #secs =self.period * 24*60*60
            #)
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG
            }
            Logger.getlogging().debug(
                'Query condition: {where}'.format(where=str(where)))
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urltemplist = []
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
                createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
                   (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                    if url not in urltemplist:
                        urltemplist.append(url)
                        params = PageBasicInfo()
                        params.url = url
                        NewsStorage.seturlinfos(params)

            #2.抽取createdate为本次开始时间的数据
            URLFileManager.getinstance().generateurlfilepath()
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
                SpiderConfigure.getinstance().starttime()
            }
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urllist = []
            linecount = 0
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                urllist.append(url)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                    SQLDAO.SPIDER_TABLE_NEWS_TYPE]
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context,
                                                  REQUEST_TYPE_WEBKIT)
                linecount += 1