Example 1
 def retrydownload(self, jsonfile, urlset):
     Logger.getlogging().warning(
         'upload failed urls {num}'.format(num=len(urlset)))
     context = URLFileManager.getinstance().geturlfilecontext(
         FileUtility.getfilename(jsonfile))
     if context.retry >= 2:
         Logger.getlogging().error('do not upload for failed again')
         for key in urlset.keys():
             Logger.getlogging().error(
                 'download {url} failed'.format(url=key))
     else:
         urls = []
         for key in urlset.keys():
             Logger.getlogging().warning(
                 'retry download {url}'.format(url=key))
             for _ in range(urlset[key]):
                 urls.append(key)
         newurlfile = URLFileManager.getinstance().generateurlfilepath(
             context.retry + 1)
         Logger.getlogging().warning(
             'Retry download URL {file}'.format(file=newurlfile))
         if constant.POST_FILE_SUFFIX in jsonfile:
             URLManager.getinstance().storeurls(urls,
                                                constant.REQUEST_TYPE_POST)
         elif constant.WEBKIT_FILE_SUFFIX in jsonfile:
             URLManager.getinstance().storeurls(
                 urls, constant.REQUEST_TYPE_WEBKIT)
         else:
             URLManager.getinstance().storeurls(
                 urls, constant.REQUEST_TYPE_COMMON)
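The inner loop above flattens urlset, a mapping from each failed url to its remaining download count, into a flat list with one entry per pending attempt before re-storing the urls. A minimal standalone sketch of that step (the sample dict is hypothetical):

 failed = {'http://example.com/a': 2, 'http://example.com/b': 1}

 urls = []
 for url, count in failed.items():
     urls.extend([url] * count)  # one list entry per pending retry

 print(urls)
 # ['http://example.com/a', 'http://example.com/a', 'http://example.com/b']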
Example 2
 def preprocess(self, filepath):
     result = False
     context = URLFileManager.getinstance().geturlfilecontext(
         FileUtility.getfilename(filepath))
     if context:
         self.conf.setchannel(context.channel)
         if context.channel == SPIDER_CHANNEL_S2:
             self.conf.setquery(context.query)
         else:
             self.conf.setquery('')
         URLFileManager.getinstance().generateurlfilepath()
         result = True
     return result
Example 3
 def s2upload(self, sfile):
     if FileUtility.exists(sfile):
         lines = FileUtility.readlines(sfile)
         for line in lines:
             try:
                 query = line.strip()
                 self.conf.setchannel(SPIDER_CHANNEL_S2)
                 self.conf.setquery(query)
                 URLFileManager.getinstance().generateurlfilepath()
                 allsite = self.factory.getall()
                 for site in allsite:
                     site.s2query(query)
             except:
                 Logger.printexception()
Example 4
 def renewfilename(self, file):
     """Regenerate the url file name with a fresh timestamp and update its context."""
     filename = FileUtility.getfilename(file)
     context = URLFileManager.getinstance().geturlfilecontext(filename)
     if not context:
         return False
     if self.filetime == int(time.time()):
         time.sleep(1)
     self.filetime = int(time.time())
     newfilename = filename.replace(
         re.findall(r'\d+', filename)[-1], str(self.filetime))
     urlsfile = self.tempurlpath + newfilename
     context.filename = urlsfile
     URLFileManager.getinstance().updateurlfilecontext(
         FileUtility.getfilename(urlsfile), context)
     return urlsfile
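renewfilename swaps the last run of digits in the old filename (the previous timestamp) for the current epoch second; the one-second sleep above guards against producing the same name twice in a row. A standalone sketch of the renaming step, with a hypothetical filename:

 import re
 import time

 filename = 'urls_1500000000.txt'          # hypothetical url file name
 stamp = re.findall(r'\d+', filename)[-1]  # last run of digits = old timestamp
 newfilename = filename.replace(stamp, str(int(time.time())))
 print(newfilename)                        # e.g. urls_1739480000.txt

Note that str.replace substitutes every occurrence of that digit run, a caveat the original shares; filenames with a single trailing timestamp are unaffected.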
Example 5
 def s3upload(self, tiebafile):
     lines = FileUtility.readlines(tiebafile)
     querylist = []
     sitelist = []
     self.conf.setchannel(SPIDER_CHANNEL_S2)
     for strquery in lines:
         query = strquery.split('\t')[0].strip()
         url = strquery.split('\t')[1].strip()
         Logger.getlogging().debug(query)
         Logger.getlogging().debug(url)
         self.conf.setquery(query)
         URLFileManager.getinstance().generateurlfilepath()
         querylist.append(query)
         site = self.factory.getsite(url)
         site.s2query(url)
         if site not in sitelist:
             sitelist.append(site)
Example 6
 def s1upload(self, sfile):
     if FileUtility.exists(sfile):
         lines = FileUtility.readlines(sfile)
         self.conf.setchannel(SPIDER_CHANNEL_S1)
         self.conf.setquery('')
         URLFileManager.getinstance().generateurlfilepath()
         for line in lines:
             try:
                 url = line.strip()
                 params = PageBasicInfo()
                 params.url = url
                 #NewsStorage.seturlinfos(params)
                 context = URLContext()
                 context.originalurl = url
                 context.type = URLContext.S1_MAIN_BODY
                 Logger.getlogging().debug(url)
                 URLManager.getinstance().storeurl(url, context,
                                                   REQUEST_TYPE_WEBKIT)
             except:
                 Logger.printexception()
Example 7
 def copyfiles(self):
     # s1/s2 input paths
     s1file = SpiderConfigure.getinstance().gets1file()
     s2file = SpiderConfigure.getinstance().gets2file()
     # s1/s2 history paths
     self.conf.setchannel(SPIDER_CHANNEL_S1)
     # s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
     if FileUtility.exists(s1file):
         lines = 0
         firstline = True
         with open(s1file, 'r') as fp:
             rows = []
             for line in fp.readlines():
                 line = line.strip()
                 if firstline:
                     firstline = False
                     if line[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning(
                         'Remove BOM from {file}!'.format(file=s1file))
                         line = line[3:]
                 if line:
                     lines += 1
                     rows.append(line)
                 if lines % constant.SPIDER_S1_MAX_LINE_PER_FILE == 0:
                     s1tempfile = URLFileManager.generateurlfilepath(
                     ) + constant.WEBKIT_FILE_SUFFIX
                     FileUtility.writelines(s1tempfile, rows)
                     rows = []
             if rows:
                 s1tempfile = URLFileManager.generateurlfilepath(
                 ) + constant.WEBKIT_FILE_SUFFIX
                 FileUtility.writelines(s1tempfile, rows)
                 rows = []
     if FileUtility.exists(s2file):
         FileUtility.copy(s2file, s2temppath)
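Two details of copyfiles are worth isolating: the first line may carry a UTF-8 byte-order mark that has to be stripped, and non-empty lines are flushed to a new url file every SPIDER_S1_MAX_LINE_PER_FILE rows. A self-contained sketch of both, reading in binary mode so the BOM comparison is unambiguous (the path and chunk size are hypothetical):

 import codecs

 def iter_chunks(path, max_lines=100):
     rows, first = [], True
     with open(path, 'rb') as fp:
         for raw in fp:
             line = raw.strip()
             if first:
                 first = False
                 if line.startswith(codecs.BOM_UTF8):
                     line = line[len(codecs.BOM_UTF8):]  # drop the BOM
             if line:
                 rows.append(line.decode('utf-8'))
                 if len(rows) == max_lines:  # flush a full chunk
                     yield rows
                     rows = []
     if rows:                                # flush the remainder
         yield rows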
Example 8
 def processfile(self, jsonfile):
     if not self.preprocess(jsonfile):
         return
     method = self.requesttype(jsonfile)
     urls = self.backupfile(jsonfile)
     context = URLFileManager.getinstance().geturlfilecontext(
         FileUtility.getfilename(jsonfile))
     with open(jsonfile, 'r') as fp:
         lines = fp.readlines()
     for line in lines:
         param = self.analysis(line, method)
         if param is None:
             continue
         url = param.url
         if context.retry >= 2:
             param.lastretry = True
         if method == constant.REQUEST_TYPE_POST:
             url = json.dumps({'url': param.url, 'data': param.data})
         info = None
         if URLManager.getinstance().exist(url):
             info = URLManager.getinstance().geturlcontext(url)
             param.originalurl = info.originalurl
             param.step = info.step
             param.type = info.type
             param.customized = info.customized
         else:
             param.originalurl = param.url
             param.type = URLContext.S1_MAIN_BODY
         if SiteS2Query.REFER_URL in param.customized:
             site = self.factory.getsite(
                 param.customized[SiteS2Query.REFER_URL])
         else:
             site = self.factory.getsite(param.originalurl)
         res = site.process(param)
         if not res:
             if info:
                 URLManager.getinstance().seturlcontext(param.url, info)
         else:
             if url in urls:
                 urls[url] -= 1
                 if urls[url] == 0:
                     urls.pop(url)
     # upload failed urls
     if urls:
         self.retrydownload(jsonfile, urls)
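For POST requests, processfile builds the bookkeeping key by serializing the url together with its form data, so the same url with different payloads is tracked separately in urls. A minimal sketch of that key construction (url and payload are hypothetical):

 import json

 param_url = 'http://example.com/api'  # hypothetical request url
 param_data = {'page': 1}              # hypothetical POST payload

 key = json.dumps({'url': param_url, 'data': param_data})
 print(key)  # {"url": "http://example.com/api", "data": {"page": 1}}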
Example 9
    def dumpurls(self):
        # dump the url list for each query on this machine and store it to the corresponding file
        s2file = SpiderConfigure.getinstance().gets2file()
        s2temppath = Storage.getstoragelocation(
            const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
        #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
        querys = QueryStorage.getinstance().getlocalquerys(
            s2temppath, ETLController.LOCALMACHINEFLAG)
        for query in querys:
            Logger.getlogging().debug(
                'Now starting to select urls to insert and update for the upload url file!'
            )
            self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
            self.conf.setquery(query)
            # do not delete the comments below
            # 1. convert data within the period
            # 1.1 publishdate exists and is within the last week
            # 1.2 publishdate is 0: use the creation time, also within the last week
            #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
            #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
            #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
            #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
            #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
            #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #starttime = SpiderConfigure.getinstance().starttime(),
            #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
            #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
            #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
            #secs =self.period * 24*60*60
            #)
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG
            }
            Logger.getlogging().debug(
                'Query condition: {where}'.format(where=str(where)))
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urltemplist = []
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
                createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
                   (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                    if url not in urltemplist:
                        urltemplist.append(url)
                        params = PageBasicInfo()
                        params.url = url
                        NewsStorage.seturlinfos(params)

            # 2. extract data whose createdate equals this run's start time
            URLFileManager.getinstance().generateurlfilepath()
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
            }
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urllist = []
            linecount = 0
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                urllist.append(url)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                    SQLDAO.SPIDER_TABLE_NEWS_TYPE]
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context,
                                                  REQUEST_TYPE_WEBKIT)
                linecount += 1
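The first query in dumpurls keeps a record when either its publish date (if set) or its creation date falls within the last self.period days. A simplified sketch of that window check, using plain epoch seconds in place of the TimeUtility helpers (all values are hypothetical):

 import time

 PERIOD_DAYS = 7
 WINDOW = PERIOD_DAYS * 24 * 60 * 60

 def within_period(publishdate, createdate, now=None):
     now = int(time.time()) if now is None else now
     if publishdate == 0:  # no publish date: fall back to the creation time
         return now - createdate <= WINDOW
     return now - publishdate <= WINDOW

 now = int(time.time())
 print(within_period(0, now - 3 * 24 * 60 * 60))     # True: created 3 days ago
 print(within_period(now - 10 * 24 * 60 * 60, now))  # False: published 10 days ago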