Example No. 1
 def initlocal(self):
     """"""
     for dl in SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN,
             const.SPIDER_LOCAL_DOWNLOADER_LIST).split(','):
         info = LocalDownloaderInfo()
         dl = dl.strip()
         info.ip = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                             dl + constant.DOWNLOADER_IP)
         info.port = int(
             SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN,
                                       dl + constant.DOWNLOADER_PORT))
         info.username = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_USERNAME)
         info.password = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_PASSWORD)
         info.urlpath = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_URL_PATH)
         info.donepath = SpiderConfigure.getconfig(
             const.SPIDER_LOCAL_DOMAIN, dl + constant.DOWNLOADER_DONE_PATH)
         info.localdonepath = Storage.getstoragelocation(
             const.SPIDER_DONE_TEMP_PATH)
         info.jsonpath = Storage.getstoragelocation(
             const.SPIDER_JSON_TEMP_PATH)
         self.limpls.append(LocalDownloader(info))
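For context, initlocal() reads a comma-separated downloader list from the configuration and builds one LocalDownloader per entry. A minimal standalone sketch of that pattern using ConfigParser directly (Python 2, matching the snippets; the section name, key suffixes, and file name here are assumptions, not the project's actual constants):

import ConfigParser  # Python 2 module name

parser = ConfigParser.ConfigParser()
parser.read('spider.conf')  # hypothetical config file
for name in parser.get('local', 'local_downloader_list').split(','):
    name = name.strip()
    # One ip/port group per downloader, keyed by its name
    ip = parser.get('local', name + '_ip')
    port = parser.getint('local', name + '_port')
    print name, ip, port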
Example No. 2
 def copyfiles(self):
     # s1/s2 input paths
     s1file = SpiderConfigure.getinstance().gets1file()
     s2file = SpiderConfigure.getinstance().gets2file()
     # s1/s2 history paths
     self.conf.setchannel(SPIDER_CHANNEL_S1)
     s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
     if FileUtility.exists(s1file):
         lines = 0
         firstline = True
         with open(s1file, 'r') as fp:
             for line in fp.readlines():
                 line = line.strip()
                 if firstline:
                     firstline = False
                     if line[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                         line = line[3:]
                 if line:
                     lines += 1
                     SpiderReport.puts1url(line)
         if lines > 0:
             FileUtility.copy(s1file, s1tempfile)
             SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
     if FileUtility.exists(s2file):
         FileUtility.copy(s2file, s2temppath)
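copyfiles() (and s2query() in the next example) strips a UTF-8 byte-order mark from the first line by hand. A minimal standalone sketch of that check, assuming Python 2 where a file opened with 'r' yields byte strings:

import codecs

def strip_bom(line):
    # codecs.BOM_UTF8 is the 3-byte sequence '\xef\xbb\xbf'; in Python 2
    # the slice and the constant are both byte strings, so == compares bytes.
    if line[:3] == codecs.BOM_UTF8:
        return line[3:]
    return line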
Example No. 3
 def s2query(self):
     self.conf.setchannel(SPIDER_CHANNEL_S2)
     s2file = SpiderConfigure.getinstance().gets2file()
     file = FileUtility.getfilename(s2file)
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
     if FileUtility.exists(s2temppath):
         with open(s2temppath, 'r') as fp:
             querylist = []
             firstline = True
             for strquery in fp.readlines():
                 if firstline:
                     firstline = False
                     if strquery[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                         strquery = strquery[3:]
                 strquery = Common.strip(strquery)
                 if not strquery:
                     continue
                 Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                 self.conf.setquery(strquery)
                 URLStorage.updaterecycle()
                 querylist.append(strquery)
                 for site in self.factory.getall():
                     site.s2query(strquery.replace('&', ' '))
             sitelist = []
             for site in self.factory.getall():
                 if site.exists2():
                     sitelist.append(site)
             SpiderReport.loadquery(querylist)
             SpiderReport.loadsites(sitelist)
Example No. 4
 def __init__(self, taskinfo=None, download_path=None):
     self.taskinfo = taskinfo
     self.maxfilenum = 100
     self.cache_path = Storage.getstoragelocation(
         const.SPIDER_DONE_TEMP_PATH)
     path = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
     if download_path:
         self.download_path = download_path
     else:
         self.download_path = PUCDownloader.DOWNLOAD_PATH.format(
             path=path, taskid=self.taskinfo.taskid)
     self.parse_tool = SpiderConfigure.getconfig(
         const.SPIDER_TENCENT_PLATFORM_DOMAIN,
         const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
     #self.json_path = Storage.getstoragelocation(const.SPIDER_JSON_TEMP_PATH)
     self.pucbackpath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_PUC_BACKUP_PATH) + self.taskinfo.taskid
     self.pucbacktoday = os.path.join(self.pucbackpath,
                                      TimeUtility.getcurrentdate())
     if not FileUtility.exists(self.pucbackpath):
         FileUtility.mkdirs(self.pucbackpath)
     if not FileUtility.exists(self.pucbacktoday):
         FileUtility.mkdirs(self.pucbacktoday)
     self.done_file = self.pucbacktoday + '/done/'
     self.json_path = self.pucbacktoday + '/json/'
     if not FileUtility.exists(self.done_file):
         FileUtility.mkdirs(self.done_file)
     if not FileUtility.exists(self.json_path):
         FileUtility.mkdirs(self.json_path)
     self.pucsavedays = 0
     self.clear()
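The constructor above lays out a dated backup tree: a per-task root, a per-day directory under it, and done/ and json/ leaves. A hypothetical reconstruction with made-up values, using os.makedirs in place of FileUtility.mkdirs:

import os

pucbackpath = '/data/puc_backup/' + 'task001'          # backup path + taskid (values made up)
pucbacktoday = os.path.join(pucbackpath, '20240101')   # TimeUtility.getcurrentdate() stand-in
done_file = pucbacktoday + '/done/'
json_path = pucbacktoday + '/json/'
for path in (pucbackpath, pucbacktoday, done_file, json_path):
    if not os.path.exists(path):
        os.makedirs(path)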
Example No. 5
 def getqueryfromdb(self):
     # Path of the s2 query output file
     s2file = SpiderConfigure.getinstance().gets2file()
     temppath = Storage.getstoragelocation(
         const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
     QueryStorage.getinstance().getlocalquerys(
         temppath, ETLController.LOCALMACHINEFLAG)
     if FileUtility.exists(temppath):
         return temppath
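Note that getqueryfromdb() returns the temp path only when QueryStorage actually produced the file; otherwise it falls through and returns None, so a caller should guard the result. A hypothetical call site (controller and process are placeholders):

temppath = controller.getqueryfromdb()
if temppath is not None:
    process(temppath)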
Example No. 6
 def gettiebaqueryfromdb(self):
     # Path of the s2 query output file
     tiebafile = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                           const.SPIDER_S3_INPUT_FILE)
     temppath = Storage.getstoragelocation(
         const.SPIDER_TIEBA_TEMP_PATH) + FileUtility.getfilename(tiebafile)
     QueryStorage.getinstance().getlocalquerys_tieba(
         temppath, ETLController.LOCALMACHINEFLAG)
     if FileUtility.exists(temppath):
         return temppath
Example No. 7
    def __init__(self, taskinfo):
        self.taskinfo = taskinfo
        self.upload_url = SpiderConfigure.getconfig(
            const.SPIDER_TENCENT_PLATFORM_DOMAIN,
            const.SPIDER_TENCENT_PLATFORM_UPLOAD_URL)
        self.cache_path = Storage.getstoragelocation(
            const.SPIDER_DONE_TEMP_PATH)
        path = SpiderConfigure.getconfig(
            const.SPIDER_TENCENT_PLATFORM_DOMAIN,
            const.SPIDER_TENCENT_PLATFORM_OUTPUT_PATH)
        self.download_path = TencentDownloader.DOWNLOAD_PATH.format(
            path=path, taskid=self.taskinfo.taskid)

        self.parse_tool = SpiderConfigure.getconfig(
            const.SPIDER_TENCENT_PLATFORM_DOMAIN,
            const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL)
        self.parse_tool_img = SpiderConfigure.getconfig(
            const.SPIDER_TENCENT_PLATFORM_DOMAIN,
            const.SPIDER_TENCENT_PLATFORM_PARSE_TOOL_IMG)
        self.json_path = Storage.getstoragelocation(
            const.SPIDER_JSON_TEMP_PATH)
        self.upload_file_list = {}
        self.recycle_times = 0
        self.download_file_list = []
        self.download_file_list2 = []
        self.retrytimes = int(
            SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                      const.SPIDER_UPLOAD_RETRY_TIMES))
        # Newly added variables
        self.uploadfile_retranslist = {}
        self.outtimelimit = int(
            SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                      const.SPIDER_WAIT_PLATFORM_TIMEOUT))
        # self.outtimelimit = 10
        self.download_time = time.time()
        self.taskstatusflag = True
        self.start_time = 0
Example No. 8
 def __init__(self, info):
     self.token = info.token
     self.appid = info.appid
     self.taskname = info.taskname
     self.jobid = ''
     #self.times = time.strftime('%Y%m%d',time.localtime())
     ts = 'start={start}000000&end={end}235959'
     t1 = time.strftime('%Y%m%d',
                        time.localtime(time.time() - 60 * 60 * 24 * 1))
     t2 = time.strftime('%Y%m%d',
                        time.localtime(time.time() - 60 * 60 * 24 * 8))
     if int(self.appid) == 180:
         self.times = ts.format(start=t2, end=t1)
     else:
         self.times = ts.format(start=t1, end=t1)
     self.download_path = Storage.getstoragelocation(
         const.SPIDER_WAIBU_TEMP_PATH)
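The constructor builds its query window from two date stamps: t1 is yesterday and t2 is eight days ago, so appid 180 gets a trailing eight-day window while every other app gets yesterday alone. The same computation as a standalone sketch (Python 2 print syntax):

import time

ts = 'start={start}000000&end={end}235959'
t1 = time.strftime('%Y%m%d', time.localtime(time.time() - 60 * 60 * 24 * 1))
t2 = time.strftime('%Y%m%d', time.localtime(time.time() - 60 * 60 * 24 * 8))
print ts.format(start=t2, end=t1)  # appid 180: eight days ago through yesterday
print ts.format(start=t1, end=t1)  # all other appids: yesterday only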
Example No. 9
 def __init__(self):
     self.upload_file_list = {}
     self.impls = []
     self.implsindex = 0
     self.initcommon()
     self.wimpls = []
     self.wimplsindex = 0
     self.initwebkit()
     self.limpls = []
     self.limplsindex = 0
     self.initlocal()
     self.tempurlpath = Storage.getstoragelocation(
         const.SPIDER_URLS_TEMP_PATH)
     self.urlbackuppath = SpiderConfigure.getconfig(
         const.SPIDER_STORAGE_DOMAIN,
         const.SPIDER_URL_BACKUP_PATH) + TimeUtility.getcurrentdate()
     # Retry mechanism for failed file downloads
     self.retransmissionfiles = {}
     self.all_retransmissionfiles = {}
     self.retransmissionlimitnum = 3
     self.filetime = 0
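Each downloader pool above is paired with an index counter (impls/implsindex, wimpls/wimplsindex, limpls/limplsindex), which suggests round-robin dispatch across the pool. A hypothetical helper along those lines (not taken from the source):

def nextdownloader(pool, index):
    # Pick the downloader at the current position, then advance the
    # index so the next call rotates to the following instance.
    downloader = pool[index % len(pool)]
    return downloader, (index + 1) % len(pool)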
Example No. 10
 def copyfiles(self):
     # s1/s2 input paths
     s1file = SpiderConfigure.getinstance().gets1file()
     s2file = SpiderConfigure.getinstance().gets2file()
     # s1/s2 history paths
     self.conf.setchannel(SPIDER_CHANNEL_S1)
     # s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
     if FileUtility.exists(s1file):
         lines = 0
         firstline = True
         with open(s1file, 'r') as fp:
             rows = []
             for line in fp.readlines():
                 line = line.strip()
                 if firstline:
                     firstline = False
                     if line[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning(
                             'Remove BOM from {file}!'.format(file=s1file))
                         line = line[3:]
                 if line:
                     lines += 1
                     rows.append(line)
                     # Flush a full batch once the per-file line limit is reached
                     if lines % constant.SPIDER_S1_MAX_LINE_PER_FILE == 0:
                         s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                         FileUtility.writelines(s1tempfile, rows)
                         rows = []
             if rows:
                 s1tempfile = URLFileManager.generateurlfilepath() + constant.WEBKIT_FILE_SUFFIX
                 FileUtility.writelines(s1tempfile, rows)
     if FileUtility.exists(s2file):
         FileUtility.copy(s2file, s2temppath)
Example No. 11
    def dumpurls(self):
        # Dump the URL list for this machine's queries and store it in the corresponding file
        s2file = SpiderConfigure.getinstance().gets2file()
        s2temppath = Storage.getstoragelocation(
            const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
        #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
        querys = QueryStorage.getinstance().getlocalquerys(
            s2temppath, ETLController.LOCALMACHINEFLAG)
        for query in querys:
            Logger.getlogging().debug(
                'Now, Starting Select url to Insert and Update for uploading location urlfile!'
            )
            self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
            self.conf.setquery(query)
            # Do not delete the commented-out block below
            # 1. Convert data within the period:
            #    1.1 publishdate is set: use it, window is the last week
            #    1.2 publishdate is 0: fall back to the create time, last week
            #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
            #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
            #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
            #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
            #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
            #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #starttime = SpiderConfigure.getinstance().starttime(),
            #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
            #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
            #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
            #secs =self.period * 24*60*60
            #)
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG
            }
            Logger.getlogging().debug(
                'Query condition: {where}'.format(where=str(where)))
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urltemplist = []
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
                createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
                   (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                    if url not in urltemplist:
                        urltemplist.append(url)
                        params = PageBasicInfo()
                        params.url = url
                        NewsStorage.seturlinfos(params)

            # 2. Extract records whose createdate equals this run's start time
            URLFileManager.getinstance().generateurlfilepath()
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
            }
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urllist = []
            linecount = 0
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                urllist.append(url)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                    SQLDAO.SPIDER_TABLE_NEWS_TYPE]
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context,
                                                  REQUEST_TYPE_WEBKIT)
                linecount += 1
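The period filter above keeps a record when it falls inside the last self.period days, using publishdate when it is set and falling back to createdate when publishdate is zero. The same predicate unrolled into a standalone sketch, simplifying TimeUtility's conversions to plain epoch seconds (the function wrapper is hypothetical):

def within_period(publish_ts, create_ts, period_days, now):
    # Timestamps in epoch seconds; publish_ts == 0 means no publish date.
    limit = period_days * 24 * 60 * 60
    if publish_ts == 0:
        # No publish date recorded: fall back to the row's creation time
        return now - create_ts <= limit
    return now - publish_ts <= limit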
Example No. 12
 def upload(self):
     FileUtility.flush()
     upfiles = FileUtility.getfilelist(
         Storage.getstoragelocation(const.SPIDER_URLS_TEMP_PATH), [])
     return self.downloader.upload(upfiles)