Ejemplo n.º 1
0
 def storeurl(self, url, originalurl, step, others=None):
     """Queue *url* for download as an S1 main-body IMG request.

     url: the URL to fetch.
     originalurl: the page the URL was extracted from.
     step: spider processing step stored on the context.
     others: optional dict of extra context values, stored as
         urlparam.customized.
     """
     # None instead of {}: a mutable default is shared across calls, so
     # any caller mutating urlparam.customized would leak state.
     if others is None:
         others = {}
     urlparam = URLContext()
     urlparam.url = url
     urlparam.originalurl = originalurl
     urlparam.step = step
     urlparam.type = URLContext.S1_MAIN_BODY
     urlparam.customized = others
     URLManager.getinstance().storeurl(url, urlparam, constant.REQUEST_TYPE_IMG)
Ejemplo n.º 2
0
 def storeurl(self, url, originalurl, step, others=None):
     """Queue *url* for download as an S1 comments request.

     url: the URL to fetch.
     originalurl: the page the URL was extracted from.
     step: spider processing step stored on the context.
     others: optional dict of extra context values, stored as
         urlparam.customized.
     """
     # None instead of {}: a mutable default is shared across calls, so
     # any caller mutating urlparam.customized would leak state.
     if others is None:
         others = {}
     urlparam = URLContext()
     urlparam.url = url
     urlparam.originalurl = originalurl
     urlparam.step = step
     urlparam.type = URLContext.S1_COMMENTS
     urlparam.customized = others
     URLManager.getinstance().storeurl(url, urlparam)
Ejemplo n.º 3
0
 def storeposturl(self, url, originalurl, step, data, others=None):
     """Queue a POST request as an S1 comments download.

     The target URL and the form *data* are packed into one JSON string
     (urlcontext.url) so the downloader can unpack both.

     url: the POST endpoint.
     originalurl: the page the request was derived from.
     step: spider processing step stored on the context.
     data: mapping of POST form fields (urlencoded before packing).
     others: optional dict of extra context values, stored as
         urlcontext.customized.
     """
     # None instead of {}: avoid the shared-mutable-default pitfall.
     if others is None:
         others = {}
     urlcontext = URLContext()
     urlcontext.url = json.dumps({'url': url, 'data': urllib.urlencode(data)})
     urlcontext.originalurl = originalurl
     urlcontext.step = step
     urlcontext.type = URLContext.S1_COMMENTS
     urlcontext.customized = others
     URLManager.getinstance().storeurl(urlcontext.url, urlcontext, constant.REQUEST_TYPE_POST)
Ejemplo n.º 4
0
 def __storeqeuryurl__(self, url, step, data, customized=None):
     """Queue a POST-style S2 query request.

     The target URL and form *data* are packed into one JSON string
     (urlcontext.url); originalurl falls back to self.fakeoriginalurl.

     url: the POST endpoint.
     step: spider processing step stored on the context.
     data: mapping of POST form fields (urlencoded before packing).
     customized: optional dict of extra context values; the S2 channel
         flag is added to it.
     """
     # Bug fix: the old default customized={} was a single shared dict,
     # and the line below mutated it — so the channel flag (and anything
     # else written by callers) leaked between unrelated calls.
     if customized is None:
         customized = {}
     customized[constant.SPIDER_CHANNEL] = constant.SPIDER_CHANNEL_S2
     urlcontext = URLContext()
     urlcontext.url = json.dumps({'url': url, 'data': urllib.urlencode(data)})
     urlcontext.originalurl = self.fakeoriginalurl
     urlcontext.step = step
     urlcontext.type = URLContext.S2_QUERY
     urlcontext.customized = customized
     URLManager.getinstance().storeurl(urlcontext.url, urlcontext, constant.REQUEST_TYPE_POST)
Ejemplo n.º 5
0
 def s1upload(self, sfile):
     """Read URLs (one per line) from *sfile* and queue each as an
     S1 main-body WEBKIT request.

     Does nothing if *sfile* does not exist.  A failure on one line is
     logged and must not abort the remaining lines.
     """
     if FileUtility.exists(sfile):
         lines = FileUtility.readlines(sfile)
         self.conf.setchannel(SPIDER_CHANNEL_S1)
         self.conf.setquery('')
         URLFileManager.getinstance().generateurlfilepath()
         for line in lines:
             try:
                 url = line.strip()
                 params = PageBasicInfo()
                 params.url = url
                 #NewsStorage.seturlinfos(params)
                 context = URLContext()
                 context.originalurl = url
                 context.type = URLContext.S1_MAIN_BODY
                 Logger.getlogging().debug(url)
                 URLManager.getinstance().storeurl(url, context,
                                                   REQUEST_TYPE_WEBKIT)
             except Exception:
                 # Narrowed from a bare except: so KeyboardInterrupt and
                 # SystemExit are no longer swallowed; per-line errors
                 # are still best-effort logged and skipped.
                 Logger.printexception()
Ejemplo n.º 6
0
 def __storeqeuryurllist__(self, urllist, step, customized=None):
     """Queue every URL in *urllist* as an S2 query request.

     urllist: iterable of URLs to store; originalurl falls back to
         self.fakeoriginalurl for each.
     step: spider processing step stored on each context.
     customized: optional dict of extra context values; the S2 channel
         flag is added to it.
     """
     # Bug fix: the old default customized={} was a single shared dict
     # mutated in the loop below, leaking state between unrelated calls.
     if customized is None:
         customized = {}
     for url in urllist:
         customized[constant.SPIDER_CHANNEL] = constant.SPIDER_CHANNEL_S2
         urlcontext = URLContext()
         urlcontext.url = url
         urlcontext.originalurl = self.fakeoriginalurl
         urlcontext.type = URLContext.S2_QUERY
         urlcontext.step = step
         urlcontext.customized = customized
         URLManager.getinstance().storeurl(url, urlcontext)
Ejemplo n.º 7
0
 def storeurl(self, url, originalurl, step, customized=None):
     """Queue *url* for download with the default request type.

     url: the URL to fetch.
     originalurl: the page the URL was extracted from.
     step: spider processing step stored on the context.
     customized: optional dict of extra context values, stored as
         urlparam.customized.
     """
     # None instead of {}: a mutable default is shared across calls, so
     # any caller mutating urlparam.customized would leak state.
     if customized is None:
         customized = {}
     urlparam = URLContext()
     urlparam.url = url
     urlparam.originalurl = originalurl
     urlparam.step = step
     urlparam.customized = customized
     URLManager.getinstance().storeurl(url, urlparam)
Ejemplo n.º 8
0
    def dumpurls(self):
        #dump本台机器query对应的urllsit, 并存储到对应的文件中
        s2file = SpiderConfigure.getinstance().gets2file()
        s2temppath = Storage.getstoragelocation(
            const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
        #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
        querys = QueryStorage.getinstance().getlocalquerys(
            s2temppath, ETLController.LOCALMACHINEFLAG)
        for query in querys:
            Logger.getlogging().debug(
                'Now, Starting Select url to Insert and Update for uploading location urlfile!'
            )
            self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
            self.conf.setquery(query)
            #此处注释请勿删除
            #1.转换周期内数据
            # 1.1pulishdate存在,时间为最近一周
            # 2.1publistdate为0,使用创建时间,时间为最近一周
            #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
            #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
            #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
            #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
            #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
            #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #starttime = SpiderConfigure.getinstance().starttime(),
            #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
            #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
            #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
            #secs =self.period * 24*60*60
            #)
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG
            }
            Logger.getlogging().debug(
                'Query condition: {where}'.format(where=str(where)))
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urltemplist = []
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
                createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
                   (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                    if url not in urltemplist:
                        urltemplist.append(url)
                        params = PageBasicInfo()
                        params.url = url
                        NewsStorage.seturlinfos(params)

            #2.抽取createdate为本次开始时间的数据
            URLFileManager.getinstance().generateurlfilepath()
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
                SpiderConfigure.getinstance().starttime()
            }
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urllist = []
            linecount = 0
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                urllist.append(url)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                    SQLDAO.SPIDER_TABLE_NEWS_TYPE]
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context,
                                                  REQUEST_TYPE_WEBKIT)
                linecount += 1