Example #1
 def __upload__(self, filepath):
     flag = True
     FileUtility.mkdirs(self.urlbackuppath)
     FileUtility.copy(filepath, self.urlbackuppath)
     self.upload_file_list[FileUtility.getfilename(filepath)] = []
     # if filepath.endswith(constant.POST_FILE_SUFFIX) or FileUtility.getfilelines(filepath) <= constant.REMOTE_DOWNLOADER_MIN_LINES:
     #     if self.limpls:
     #         if self.limplsindex >= len(self.limpls):
     #             self.limplsindex = 0
     #         flag = self.limpls[self.limplsindex].upload(filepath)
     #         self.limplsindex += 1
     if filepath.endswith(constant.WEBKIT_FILE_SUFFIX):
         if self.wimpls:
             if self.wimplsindoex >= len(self.wimpls):
                 self.wimplsindoex = 0
             flag = self.wimpls[self.wimplsindoex].upload(filepath)
             self.wimplsindoex += 1
     elif self.impls:
         if self.implsindex >= len(self.impls):
             self.implsindex = 0
         flag = self.impls[self.implsindex].upload(filepath)
         self.implsindex += 1
     else:
         flag = False
         Logger.getlogging().warning('No taskid or download platform!')
     return flag
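
The branches above all share one pattern: walk a list of uploader implementations with an index that wraps back to zero. A minimal, self-contained sketch of that round-robin dispatch (the class and attribute names below are illustrative, not taken from the source):

class RoundRobinUploader(object):
    """Illustrative round-robin dispatcher over a list of uploader backends."""

    def __init__(self, impls):
        self.impls = impls
        self.index = 0

    def upload(self, filepath):
        if not self.impls:
            return False
        if self.index >= len(self.impls):
            self.index = 0          # wrap around, as __upload__ does above
        ok = self.impls[self.index].upload(filepath)
        self.index += 1
        return ok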
Example #2
    def step2(self, params):
        try:
            Logger.getlogging().info("xinhuaComments.STEP_2")
            # Pass the commentinfo_url down from STEP_1
            newsId = params.customized['newsId']
            comments_info = json.loads(params.content)
            comments_count = comments_info['totalRows']
            NewsStorage.setcmtnum(params.originalurl, comments_count)
            page_count = comments_info['totalPage']

            # Incremental check: skip if there are no new comments
            cmtnum = CMTStorage.getcount(params.originalurl, True)
            if cmtnum >= comments_count:
                return

            # Cap the page count at the configured maximum
            if page_count >= self.maxpages:
                page_count = self.maxpages

            for index in range(0, int(page_count)):
                commentinfo_url = xinhuaNewsComments.COMMENTS_URL_NEWS.format(
                    newsId=newsId, pid=(index + 1))
                self.storeurl(commentinfo_url, params.originalurl,
                              xinhuaNewsComments.STEP_3)
        except:
            Logger.printexception()
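
The incremental logic in step2 boils down to: skip the url when the stored comment count already covers totalRows, otherwise fetch at most maxpages pages. A small distilled helper, a sketch assuming nothing beyond the values already read above:

def pages_to_fetch(stored_count, total_count, total_pages, max_pages):
    # Returns 0 when there is nothing new, otherwise the capped page count.
    if stored_count >= total_count:
        return 0
    return min(int(total_pages), int(max_pages))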
Example #3
 def substep1(self, params, formats):
     value = self.r.parse(formats, params.url)[0]
     Logger.getlogging().debug(value)
     type = value[0]
     sid = int(value[1])
     Logger.getlogging().debug(type)
     Logger.getlogging().debug(sid)
     # Fetch the play count; the 'youxi' (games) category has no play count
     others = ['video', 'yule']
     if type in others:
         # Albums under the entertainment ('yule') category need their own handling
         if params.originalurl.find('album') > 0:
             sid = int(self.albumfilter(params))
         url = self.CLICK_URL.format(type=type, id1=str(sid)[:3], id2=sid)
         self.storeurl(url, params.originalurl, KanKanComments.STEP_CLICK,
                       {'sid': sid})
     else:
         Logger.getlogging().warning(
             '{url} :40000 Sorry, {type} maybe others!'.format(
                 url=params.url, type=type))
     # Convert the type used by the comments API
     type = self.typeconvert(value[0], params.url)
     commentinfo_url = KanKanComments.COMMENTS_URL2 % (type, sid, 1,
                                                       self.PERPAGE)
     Logger.getlogging().debug(commentinfo_url)
     self.storeurl(commentinfo_url, params.originalurl,
                   KanKanComments.STEP_2, {
                       'type': type,
                       'sid': sid
                   })
Example #4
 def process(self, params):
     try:
         if params.step is AllComments.STEP_1:
             try:
                 threadid = self.r.parse('data-thread-key="(.*?)"', params.content)[0]
                 comments_url = AllComments.COMMENTS_URL % (threadid, 1)
                 self.storeurl(comments_url, params.originalurl, AllComments.STEP_2, {'threadid':threadid,'pageno':1})  
             except:
                 return
         elif params.step is AllComments.STEP_2:
             try:
                 comments = json.loads(params.content)
                 pagetotal = int(comments['cursor']['pages'])
                 comments_url = AllComments.COMMENTS_URL % (params.customized['threadid'], params.customized['pageno'])
                 self.storeurl(comments_url, params.originalurl, AllComments.STEP_3, 
                               {'threadid':params.customized['threadid'], 
                                'pageno':params.customized['pageno'],
                                'totalpage':pagetotal})        
             except:
                 return
                                        
         elif params.step is AllComments.STEP_3:
             try:
                 if params.customized['pageno'] <= params.customized['totalpage']:
                     comments = json.loads(params.content)
                     roll = len(comments['response'])
                     ptimer = []
                     pcontent = []
                     for key in comments['parentPosts'].keys():
                         ptime = comments['parentPosts'][key]['created_at']
                         ptime = ptime.split("+")[0]
                         ptime = ptime.replace("T", " ")
                         ptimer.append(datetime.datetime.strptime(ptime, '%Y-%m-%d %H:%M:%S'))
                         pcontent.append(comments['parentPosts'][key]['message'])
                     for ctime in range(0, len(ptimer)):
                         ptimer[ctime] = datetime.datetime.strptime(str(ptimer[ctime]), '%Y-%m-%d %H:%M:%S')
                     index = 0
                     comments = []
                     complete = False
                     for comment in pcontent:
                         cmti = CommentInfo()
                         cmti.content = comment
                         if URLStorage.storeupdatetime(params.originalurl, str(ptimer[index])):
                             comments.append(cmti)
                         else:
                             complete = True
                             break
                         index = index + 1
                     self.commentstorage.store(params.originalurl, comments)
                     if not complete:
                         comments_url = AllComments.COMMENTS_URL % (params.customized['threadid'], params.customized['pageno'] + 1)
                         self.storeurl(comments_url, params.originalurl, AllComments.STEP_2,
                                       {'threadid': params.customized['threadid'],
                                        'pageno': params.customized['pageno'] + 1,
                                        'totalpage': params.customized['totalpage']})
             except:
                 return
     except Exception, e:
         traceback.print_exc()
         Logger.getlogging().error(e.message)
Example #5
 def download(self):
     files = []
     if self.completed():
         return files
     Logger.getlogging().debug(self.info.donepath)
     srclist = self.sshls(self.info.donepath)
     for donefile in srclist:
         donefile = donefile.strip()
         filename = FileUtility.getfilename(donefile)
         if donefile.endswith(
                 'done') and filename not in self.download_file_list:
             self.download_file_list.append(filename)
             for upfile in self.upload_file_list.keys():
                 if filename.startswith(upfile):
                     FileUtility.mkdirs(self.info.localdonepath)
                     self.sshdownload(donefile)
                     dfile = self.info.localdonepath + FileUtility.getfilename(
                         donefile)
                     if self.info.jsonpath:
                         dfile = self.bin2json(dfile)
                     files.append(dfile)
                     self.download_time = int(time.time())
                     self.upload_file_list.pop(upfile)
                     self.uploadfile_retranslist.pop(upfile)
                     if not FileUtility.exists(dfile):
                         Logger.getlogging().error(
                             'no json file generated from done file: {done}'.
                             format(done=dfile))
                     break
     return files
Example #6
    def query(self, info):
        Logger.getlogging().info("AngeeksS2Query.query")
        keyvalue = Common.urlenc(info)

        # step1: build the query url below from the key
        if int(self.querylastdays) <= 7:
            datevalue = self.WEEKLY
        elif int(self.querylastdays) <= 30:
            datevalue = self.MONTHLY
        else:
            datevalue = None

        if datevalue is None:
            urls = [
                AngeeksS2Query.QUERY_TEMPLATE_ALL.format(key=keyvalue, page=0)
            ]
        else:
            urls = [
                AngeeksS2Query.QUERY_TEMPLATE.format(key=keyvalue,
                                                     page=0,
                                                     date=datevalue)
            ]

        Logger.getlogging().debug(urls[0])
        self.__storeqeuryurllist__(urls, self.S2QUERY_FIRST_PAGE, {
            'query': info,
            'date': datevalue
        })
 def get(self, url):
     saveJson = {}
     try:
         Logger.getlogging().debug('Downloading: {url}'.format(url=url))
         request = urllib2.Request(url, headers=self.headers)
         response = urllib2.urlopen(request, timeout=self.timeout)
         code = response.getcode()
         info = response.info()
         # Check the response code; if it is not 200, return None
         if code == 200:
             html = response.read()
             if (("Content-Encoding" in info) and (info['Content-Encoding'] == "gzip")):
                 html = zlib.decompress(html, 16 + zlib.MAX_WBITS);
             Logger.getlogging().debug('Request Sucessed: {url}'.format(url=url))
         else:
             Logger.getlogging().error('open {url} error, code = {code}'.format(url=url, code=code))
             Logger.getlogging().error('Request Failed: {url}'.format(url=url))
             return None
     except:
         Logger.getlogging().error('Request Failed: {url}'.format(url=url))
         Logger.printexception()
         return None
     charset = RegexUtility.getid('charset', html)
     html = Common.trydecode(html, charset)
     saveJson['foundin'] = Common.urlenc(url)
     saveJson['html'] = Common.urlenc(html.encode(constant.CHARSET_UTF8))
     saveJson['crawler_time'] = int(time.time())
     jsonStr = json.dumps(saveJson)
     return jsonStr     
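
For reference, the JSON line assembled by get() carries three fields: a url-encoded 'foundin' url, a url-encoded 'html' body, and an integer 'crawler_time'. A rough stand-alone sketch of the same record shape, with Python 2's urllib.quote standing in for Common.urlenc (that substitution is an assumption, not from the source):

import json
import time
import urllib

def build_crawl_record(url, html):
    # Mirrors the saveJson dict built in get() above.
    record = {
        'foundin': urllib.quote(url, safe=''),
        'html': urllib.quote(html, safe=''),
        'crawler_time': int(time.time()),
    }
    return json.dumps(record)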
 def process(self, params):
     try:
         if params.step is AllComments.STEP_1:
             key = int(re.findall("\d+", params.url.split("/")[-1])[0])
             comments_url = AllComments.COMMENTS_URL % (key)
             self.storeurl(comments_url, params.originalurl,
                           AllComments.STEP_2, {'key': key})
         elif params.step is AllComments.STEP_2:
             jsoncontent = self.r.parse('data\((.*?)\)', params.content)[0]
             comments = json.loads(jsoncontent)
             pcontent = []
             ptime = []
             index = 0
             for index in range(0, len(comments['comments'])):
                 pcontent.append(
                     comments['comments'][index]['comment_content'])
                 ptime.append(comments['comments'][index]['comment_date'])
             dataresult = {}
             for i in range(len(pcontent)):
                 dataresult[ptime[i]] = pcontent[i]
             comments = []
             dataresult = sorted(dataresult.iteritems(),
                                 key=lambda dataresult: dataresult[0],
                                 reverse=True)
             for k in range(0, len(dataresult)):
                 if URLStorage.storeupdatetime(params.originalurl,
                                               dataresult[k][0]):
                     cmti = CommentInfo()
                     cmti.content = dataresult[k][1]
                     comments.append(cmti)
             self.commentstorage.store(params.originalurl, comments)
     except Exception, e:
         traceback.print_exc()
         Logger.getlogging().error(e.message)
Example #9
 def step2_2(self, params):
     """"""
     try:
         jsondata = json.loads(params.content)
         data = jsondata['data']
         soup = BeautifulSoup(data, 'html5lib')
         divs = soup.select('.comment')
     except:
         Logger.getlogging().warning(
             '{url}:30000 No comments'.format(url=params.originalurl))
         return
     #comments_total = len(divs)
     #cmtnum = URLStorage.getcmtnum(params.originalurl)
     #if cmtnum >= comments_total:
     #return
     #URLStorage.setcmtnum(params.originalurl, comments_total)
     comments = []
     #divs.reverse()
     for div in divs:
         cmti = CommentInfo()
         cmti.content = div.find(attrs={
             'style': re.compile('padding-top')
         }).get_text().strip()
         tm = div.select_one('.show-time').get_text()
         tm = getuniformtime(tm)
         if not tm:
             continue
         if URLStorage.storeupdatetime(params.originalurl, tm):
             comments.append(cmti)
     if len(comments) > 0:
         # Save the collected comments
         self.commentstorage.store(params.originalurl, comments)
Example #10
 def s2query(self):
     self.conf.setchannel(SPIDER_CHANNEL_S2)
     s2file = SpiderConfigure.getinstance().gets2file()
     file = FileUtility.getfilename(s2file)
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + file
     if FileUtility.exists(s2temppath):
         with open(s2temppath, 'r') as fp:
             querylist = []
             firstline = True
             for strquery in fp.readlines():
                 if firstline:
                     firstline = False
                     if strquery[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=file))
                         strquery = strquery[3:]
                 strquery = Common.strip(strquery)
                 if not strquery:
                     continue
                 Logger.getlogging().info('S2 {query} start...'.format(query=strquery))
                 self.conf.setquery(strquery)
                 URLStorage.updaterecycle()
                 querylist.append(strquery)
                 for site in self.factory.getall():
                     site.s2query(strquery.replace('&', ' '))
             sitelist = []
             for site in self.factory.getall():
                 if site.exists2():
                     sitelist.append(site)
             SpiderReport.loadquery(querylist)
             SpiderReport.loadsites(sitelist)
Example #11
 def copyfiles(self):
     # s1/s2 input paths
     s1file = SpiderConfigure.getinstance().gets1file()
     s2file = SpiderConfigure.getinstance().gets2file()
     # s1/s2 history (backup) paths
     self.conf.setchannel(SPIDER_CHANNEL_S1)
     s1tempfile = URLStorage.updaterecycle() + constant.WEBKIT_FILE_SUFFIX
     s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH)
     if FileUtility.exists(s1file):
         lines = 0
         firstline = True
         with open(s1file, 'r') as fp:
             for line in fp.readlines():
                 line = line.strip()
                 if firstline:
                     firstline = False
                     if line[:3] == codecs.BOM_UTF8:
                         Logger.getlogging().warning('Remove BOM from {file}!'.format(file=s1file))
                         line = line[3:]
                 if line:
                     lines += 1
                     SpiderReport.puts1url(line)
         if lines > 0:
             FileUtility.copy(s1file, s1tempfile)
             SpiderReport.update(SPIDER_CHANNEL_S1, '', SpiderReport.URL_UPLOAD, lines)
     if FileUtility.exists(s2file):
         FileUtility.copy(s2file, s2temppath)
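
Both s2query() and copyfiles() strip a UTF-8 byte-order mark from the first line of the input file before using it. That check can be kept as a small helper, sketched here with the same codecs constant the code already relies on:

import codecs

def strip_utf8_bom(line, is_first_line):
    # Drop the 3-byte UTF-8 BOM that some editors prepend to the first line.
    if is_first_line and line[:3] == codecs.BOM_UTF8:
        return line[3:]
    return line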
Example #12
    def step1(self, params):
        info = params.customized['info'] 
        pages_num = params.customized['pages_num'] 
        soup = BeautifulSoup(params.content,'html5lib')
        #print soup
        if soup.find(attrs={"id":re.compile('noresult_part._container')}) and int(pages_num) == 1:
            Logger.getlogging().warning('{0}:40000 No urllist!'.format(params.url))
            return
        pages = soup.find_all(attrs={'id':re.compile('sogou_page_.*')})
        if not pages and int(pages_num) == 1:
            self.step2(params)
            return
        nexted = soup.select_one('#sogou_next')
        temp = pages_num
        # Refresh to the latest page count
        if nexted:
            pages_num = int(pages[-1].get_text())
        elif not soup.find(attrs={"id":re.compile('noresult_part._container')}):
            pages_num = int(pages[-1].get_text())
            if pages_num <= temp:
                pages_num = temp       

        if pages_num >= self.maxpages:
            pages_num = self.maxpages
        querylist = [] 
        
        # The maximum on the first page is 10; after that the maximum grows by 5 each time
        maxpage = 10+int(math.ceil(float(pages_num-10)/5))*5 
        if not nexted or pages_num == self.maxpages or (nexted and pages_num < max(pages_num, 10) ):
            for page in range(1,pages_num+1):
                querylist.append(Newstencent.COMMON_URL.format(info=info, page=page))
            self.__storeqeuryurllist__(querylist, self.NEWS_EACH)
            return
        querylist.append(Newstencent.COMMON_URL.format(info=info, page=pages_num))
        self.__storeqeuryurllist__(querylist, self.NEWS_FIRST, {'info': info,'pages_num':pages_num})     
Example #13
 def step3(self,params):
     soup = BeautifulSoup(params.content, 'html5lib')
     if soup.find(attrs={"id":re.compile('noresult_part._container')}):
         Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
         return 
     results = soup.select('.results > .vrwrap')
     if not results:
         Logger.getlogging().warning('{url}:40000 No results'.format(url=params.url))
         return 
     urllist = []
     for item in results:
         try:
             if not item.select_one('h3.vrTitle > a'):
                 continue
             if item.select_one('#hint_container'):
                 continue
             title = item.select_one('h3.vrTitle > a').get_text()
             href = item.select_one('h3.vrTitle > a').get('href')
             timestr = item.select_one('.news-detail > .news-info > .news-from').get_text()
             times = getuniformtime(timestr)
             Logger.getlogging().debug('title:'+ title)
             Logger.getlogging().debug('time:'+ times)
             if compareNow(times, self.querylastdays):
                 Logger.getlogging().debug('href:'+ href)
                 urllist.append(href) 
         except:
             Logger.printexception()
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_NEWS)            
    def flush():
        # dump s1 download failed url
        SpiderConfigure.getinstance().setchannel(constant.SPIDER_CHANNEL_S1)
        SpiderConfigure.getinstance().setquery('')
        for url in SpiderReport.getinstance().s1urls:
            Logger.log(url, constant.ERRORCODE_FAIL_LOAD_DOWN)
        # dump none url got from website for query
        querynositemap = {}
        for query in SpiderReport.getinstance().querysitesmap.keys():
            querynositemap[query] = 0
            for site in SpiderReport.getinstance().querysitesmap[query]:
                SpiderReport.s2queryurl(query, site, None, True)
                querynositemap[query] += 1
#
        for query in SpiderReport.getinstance().querysitesmap.keys():
            if query in querynositemap:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum - querynositemap[query], True)
            else:
                SpiderReport.s2queryurl(query, SpiderReport.getinstance().s2sitenum,
                                        SpiderReport.getinstance().s2sitenum, True)
#
        # report
        filename = SpiderConfigure.getconfig(const.SPIDER_STORAGE_DOMAIN,
                                             const.SPIDER_INFO_REPORT_FILE).format(
            date=TimeUtility.getcurrentdate())
        FileUtility.remove(filename)
        FileUtility.writeline(filename, SpiderReport.REPORT_FORMAT.format(
            ch='CHANNEL',
            query='QUERY',
            type='TYPE',
            v1='UPLOAD',
            v2='DOWNLOAD',
            v3='NO_TEMPLATE',
            v4='NO_SITE',
            v5='WITH_CMT',
            v6='FAILED'
        ))
        for key in SpiderReport.getinstance().reportlist.keys():
            for type in SpiderReport.getinstance().reportlist[key].keys():
                r = SpiderReport.getinstance().reportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        for key in SpiderReport.getinstance().s2sitereportlist.keys():
            for type in SpiderReport.getinstance().s2sitereportlist[key].keys():
                r = SpiderReport.getinstance().s2sitereportlist[key][type]
                FileUtility.writeline(filename, r.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring())
        FileUtility.writeline(filename, SpiderReport.getinstance().totalreport.tostring2())
        FileUtility.flush()
        threshold = float(SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                                    const.SPIDER_FAILED_THRESHOLD))
        rate = SpiderReport.getinstance().totalreport.getsuccess()
        if rate < threshold:
            Logger.getlogging().warning('success rate is lower than threshold')
            param = NotifyParam()
            param.code = NotifyParam.SPIDER_NOTIFY_OVER_FAILED
            param.message = 'success rate {rate} is lower than threshold {th}'.format(rate=Common.float2percent(rate),
                                                                                      th=Common.float2percent(
                                                                                          threshold))
            SpiderNotify.notify(param)
Example #15
 def analysis(self, line, method):
     try:
         js = json.loads(line)
         param = ProcessParam()
         param.crawler_time = TimeUtility.getuniformtime(js['crawler_time'])
         param.url = Common.urldec(js['foundin'])
         param.content = js['html']
         if method == constant.REQUEST_TYPE_POST:
             param.data = js['data']
         if js['html'][:3] == constant.GZIP_CODE:
             param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
         # decode
         content = Common.urldec(param.content)
         charset = RegexUtility.getid('charset', content)
         content = Common.trydecode(content, charset)
         param.content = content
         return param
     except:
         line = line.replace('\n', '').strip()
         if not line or line[0] == '#':
             return
         Logger.getlogging().debug(line)
         param = ProcessParam()
         param.url = line
         if method == constant.REQUEST_TYPE_POST:
             js = json.loads(line)
             param.url = js['url']
             param.data = js['data']
         param.content = HttpCacher.getcontent(line, method)
         if param.content is None:
             return
         return param
Example #16
    def step2(self, params):
        """"""
        print params.content
        try:
            jsondata = json.loads(params.content)
            comments_total = int(jsondata['comments_total'])
            comments_data = jsondata['comments']
        except:
            Logger.getlogging().warning(
                '{url}:30000 No comments'.format(url=params.originalurl))
            return
        #cmtnum = URLStorage.getcmtnum(params.originalurl)
        #if cmtnum >= comments_total:
        #return
        #URLStorage.setcmtnum(params.originalurl, comments_total)

        comments = []
        for comment in comments_data:
            cmti = CommentInfo()
            cmti.content = comment['txtcontent']
            tm = comment['addtime']
            if URLStorage.storeupdatetime(params.originalurl, tm):
                comments.append(cmti)
        if len(comments) > 0:
            # Save the collected comments
            self.commentstorage.store(params.originalurl, comments)

        self.post_data['p'] = str(int(self.data['p'] + self.page_size))
        self.post_data['t'] = TimeUtility.getuniformdate(tm, '%Y-%m-%d+%H%M%S')
        self.storeposturl(self.post_url, params.originalurl, self.STEP_2,
                          self.post_data)
 def j_step2(self, proparam):
     Logger.getlogging().info("Comments163.STEP_1_5")
     #http://sdk.comment.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/C22OL63405178D8P
     productKey = proparam.customized['productKey']
     docId = proparam.customized['docId']
     field = proparam.customized['field']
     try:
         commentsinfo = json.loads(proparam.content)
         ugcarr = commentsinfo['ugc'].split(',')
     except:
         Logger.getlogging().warning('{0}:30000 No comments'.format(
             proparam.originalurl))
         return
     if len(ugcarr) < 3:
         if ugcarr[0].strip() == 'comment_bbs':
             # No comments; return directly
             return
         ugcval = ugcarr[0].split('_')
         field = ugcval[0].strip()
     else:
         field = ugcarr[2].strip()
     commentinfo_url = 'http://sdk.comment.163.com/api/v1/products/{key}/threads/{docid}'.format(
         key=productKey, docid=docId)
     self.storeurl(commentinfo_url, proparam.originalurl,
                   JComments.J_STEP_3, {
                       'productKey': productKey,
                       'docId': docId,
                       'field': field
                   })
Example #18
    def step2bbs(self, params):
        Logger.getlogging().info("Dm5Commnets.STEP_2")
        # Pass the docurl down from STEP_1
        docurl = params.customized['docurl']

        comments_count = int(self.r.parse(ur'(\d+)个回复', params.content)[0])
        # Incremental check: skip if there are no new comments
        cmtnum = URLStorage.getcmtnum(params.originalurl)
        if cmtnum >= comments_count:
            return
        URLStorage.setcmtnum(params.originalurl, comments_count)

        # Dividing the total by page_size and adding 1 gives the total number of comment pages
        pagenum = 0
        xparser = XPathUtility(params.content)
        if not xparser.xpath('//*[@class="inkk ma5"]'):
            Logger.getlogging().warning('{0}:30001'.format(params.originalurl))
            return
        pageList = xparser.xpath('//*[@id="search_fy"]/a/text()')
        if not pageList:
            pagenum = 1
        else:
            pagenum = int(pageList[-2])

        for page in range(1, pagenum + 1, 1):
            comment_url = Dm5Commnets.COMMENT_URL.format(docurl=docurl,
                                                         page=page)
            self.storeurl(comment_url, params.originalurl,
                          Dm5Commnets.STEP_3_BBS)
def download(urlfilepath):
    whoami = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, const.SPIDER_LOCAL_WHOAMI)
    donepath = SpiderConfigure.getconfig(const.SPIDER_LOCAL_DOMAIN, whoami + constant.DOWNLOADER_DONE_PATH)
    FileUtility.mkdirs(donepath)  
    filename = os.path.basename(urlfilepath)
    writeTmpfile = os.path.join(donepath, filename+'.temp')
    writefile = os.path.join(donepath, filename + '.txt.' + str(int(time.time())) + '.done')
    if os.path.exists(writeTmpfile):
        os.remove(writeTmpfile)
    if os.path.exists(writefile):
        os.remove(writefile) 
    httpsflag = False
    if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
        readlines = FileUtility.readlines(urlfilepath)
        for line in readlines:
            if line.strip().startswith('https'):
                httpsflag = True
                break
    # Create an empty placeholder file
    with open(writeTmpfile,'a+') as filetemp:
        filetemp.write('')    
    if urlfilepath.endswith(constant.WEBKIT_FILE_SUFFIX) or httpsflag:
        downWebkit(urlfilepath, writeTmpfile)
    elif urlfilepath.endswith(constant.POST_FILE_SUFFIX):
        downPost(urlfilepath, writeTmpfile)
    else:
        downGet(urlfilepath, writeTmpfile)
    if os.path.exists(writeTmpfile):
        os.rename(writeTmpfile, writefile)
        Logger.getlogging().debug('DoneFile Download Success: {f}'.format(f=writefile))
    FileUtility.remove(urlfilepath)       
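
download() writes results to a '.temp' file and only renames it to the final '.done' name once everything has succeeded, so consumers polling the done directory never pick up a half-written file. A minimal sketch of that write-then-rename pattern (names here are illustrative):

import os

def write_done_file(final_path, lines):
    # Write to a temporary file first, then rename it into place when complete.
    tmp_path = final_path + '.temp'
    with open(tmp_path, 'w') as fp:
        for line in lines:
            fp.write(line + '\n')
    os.rename(tmp_path, final_path)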
Example #20
    def step2(self, params):
        try:
            Logger.getlogging().info("Kr36Comments.STEP_2")
            # Pass the cid down from STEP_1
            cid = params.customized['cid']

            jsoncontent = json.loads(params.content)
            comments_count = jsoncontent['data']['total_items']
            page_count = jsoncontent['data']['total_pages']
            # Incremental check: skip if there are no new comments
            cmtnum = CMTStorage.getcount(params.originalurl)
            if cmtnum >= comments_count:
                return

            # Fetch at most ten pages of comments
            # page_num = int(math.ceil(float(comments_count - cmtnum) / self.page_size))
            if page_count >= self.maxpages:
                page_count = self.maxpages
            lasttime = CMTStorage.getlastpublish(params.originalurl,True)

            for page in range(1, page_count+1, 1):
                commentinfo_url = Kr36Comments.COMMENT_URL.format(cid, self.page_size, page)
                self.storeurl(commentinfo_url, params.originalurl, Kr36Comments.STEP_3,lasttime)
        except:
            Logger.printexception()
Example #21
    def process(self, params):
        # Use the result count from the search landing page to build the search page URLs
        if params.step == LaohuS2Query.LAOHU_S2QUERY_FIRST_PAGE:
            # Get the url parameters from the first page
            KEY = params.customized['KEY']
            time = params.customized['time']
            # Get the total number of pages
            xparser = XPathUtility(params.content)
            pageCounts = xparser.getlist('//*[@id="main"]/div[2]/span')
            if len(pageCounts) > 0:
                page = str(pageCounts[0]).split('/')[1]

                # Fetch the search results from the first page
                self.pageprocess(params)

                if int(page) > 1:
                    if int(page) >= self.maxpages:
                        page = self.maxpages
                    querylist = []
                    # Build the query list from the total page count (page 1 is already fetched, so build urls from page 2 onward)
                    for pages in range(2, int(page) + 1, 1):
                        url = LaohuS2Query.LAOHU_QUERY_TEMPLATE.format(
                            KEY=KEY, pn=pages, time=time)
                        querylist.append(url)
                    self.__storeqeuryurllist__(
                        querylist, LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE,
                        {'KEY': KEY})

            else:
                Logger.getlogging().debug('Sorry, no posts related to ' + KEY + ' were found')

        # Get video URLs from the query page
        elif params.step == LaohuS2Query.LAOHU_S2QUERY_EACH_PAGE:
            self.pageprocess(params)
Example #22
    def process(self, params):
        # Use the result count from the search landing page to build the search page URLs
        if params.step == LeQuery.LETV_S2QUERY_FIRST_PAGE:
            # Get the url parameters from the first page
            q = params.customized['query']
            content = json.loads(params.content)
            count = content['video_count']
            if int(count) == 0:
                Logger.getlogging().info('count:{count}'.format(count=count))
                return

            # List of all query urls to iterate over
            querylist = []
            if count > 510:
                totalpage = 17
            else:
                totalpage = int(
                    math.ceil(float(count) / LeQuery.DEFAULT_PAGE_SIZE))

            # Fetch the search results from the first page
            self.gets2url(params)
            if totalpage > self.maxpages:
                totalpage = self.maxpages
            # Build the query list from the total page count (page 1 is already fetched, so start from page 2)
            for page in range(2, totalpage + 1, 1):
                url = LeQuery.LETV_QUERY_TEMPLATE.format(
                    pn=page, q=params.customized['query'])
                querylist.append(url)
            self.__storeqeuryurllist__(querylist,
                                       LeQuery.LETV_S2QUERY_EACH_PAGE,
                                       {'query': q})

        # Get video URLs from the query page
        elif params.step == LeQuery.LETV_S2QUERY_EACH_PAGE:
            self.gets2url(params)
    def step3bbs(self, params):
        Logger.getlogging().info("Tmtpostcommnets.STEP_3")
        # Step3: fetch all comments from the url set in Step2 and extract them
        commentsinfo = json.loads(params.content)
        comments = []

        #for index in range(0, int(len(commentsinfo['data'])), 1):
        ## Extract the time
        #cmti = CommentInfo()
        #cmti.content = commentsinfo['data'][index]['comment']
        #tm = TimeUtility.getuniformtime(commentsinfo['data'][index]['time_created'], u'%Y-%m-%d %H:%M')
        #if URLStorage.storeupdatetime(params.originalurl, tm):
        #comments.append(cmti)

        jsondata = commentsinfo['data']
        if not jsondata:
            return
        for data in jsondata:
            cmti = CommentInfo()
            cmti.content = data['comment']
            tm = gettimeutil.getuniformtime(data['time_created'])
            if URLStorage.storeupdatetime(params.originalurl, tm):
                comments.append(cmti)

        # Save the collected comments
        if len(comments) > 0:
            self.commentstorage.store(params.originalurl, comments)
Example #24
    def step2(self, params):
        """"""
        try:
            key = params.customized['key']
            soup = BeautifulSoup(params.content, 'html5lib')
            #print soup
            #searchListOne = soup.select('.searchListOne > ul')
            searchListOne = soup.select('.searchListOne > ul > li > div')
            if not searchListOne:
                Logger.getlogging().warning('{}:40000 No urllist'.format(
                    params.originalurl))
                return
            lis = soup.select(
                '.searchListOne > ul > li'
            )[:-1]  # filter out the trailing <li id=search_msg style="display:none"></li>
            urllist = []
            for li in lis:
                url = li.select_one('h3 > a').get('href')
                #print '*********',url
                tm = li.select('.source > span')[0].get_text()
                tm = getuniformtime(tm)
                now = getuniformtime(str(time.time()))
                cmt_num = li.select('.source > span')[-1].get_text()

                title = li.select_one('h3').get_text()
                if Common.checktitle(Common.urldec(key), title):
                    if compareNow(tm, self.querylastdays):
                        urllist.append(url)
            if len(urllist) > 0:
                self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_TIEBA)
        except:
            #traceback.print_exc()
            Logger.printexception()
            Logger.getlogging().error(
                'extract comment error from {site}'.format(site=params.url))
 def pageprocess(self, params):
     # Step3: from the returned html, use xpath //*[@class="scout_anim_titletext"] to get the result titles
     # and //*[@class="scout_anim_title"]/div/a/@href to get the result urls
     #Logger.getlogging().debug(params.content)
     indexstart = params.content.find('(')
     indexstop = params.content.rfind(')')
     if indexstart > -1 and indexstop > -1:
         jsonvalue = params.content[indexstart + 1:indexstop]
         jsondata = json.loads(jsonvalue)
         info = params.customized['query']
         soup = BeautifulSoup(jsondata['content'], 'html5lib')
         uls = soup.select('.scout_anim_odd > .scout_anim_odd_ul')
         if uls:
             for ul in uls:
                 #titles = ul.select_one('.scout_anim_titletext')
                 titles = ul.select_one('.scout_anim_titletext').get_text()
                 Logger.getlogging().debug(titles)
                 # if info not in titles:
                 if not Common.checktitle(info, titles):
                     return
                 content = ul.select('.scout_anim_content > div > ul > li')
                 if content:
                     if len(content) > 3:
                         content = content[-3:]
                     urllist = [
                         'https://donghua.dmzj.com' +
                         item.find('a').get('href') for item in content
                     ]
                     self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
 def step2(self, params):
     """"""
     q = params.customized['query']
     soup = BeautifulSoup(params.content, 'html5lib')
     divs = soup.select('.videobox')
     if not divs:
         Logger.log(params.originalurl,
                    constant.ERRORCODE_SITE_NOGET_COMMNETS)
         return
     urllist = []
     for div in divs:
         title = div.select_one('.title').get_text()
         #print title
         tm = getuniformtime(div.select_one('.date').get_text())
         url = div.select_one('.title > a').get('href')
         Logger.getlogging().debug(title)
         if not compareNow(tm, self.querylastdays):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTIME)
             continue
         if not Common.checktitle(Common.urldec(q), title):
             Logger.log(url, constant.ERRORCODE_WARNNING_NOMATCHTITLE)
             continue
         urllist.append(url)
         # Collect the final url list
     if len(urllist) > 0:
         self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)
 def upload(self, upfile):
     cmd = self.UPLOADCMD.format(appId=self.appid,
                                 token=self.token,
                                 times=self.times,
                                 path=upfile)
     if constant.DEBUG_FLAG == constant.DEBUG_FLAG_WINDOWS:
         Logger.getlogging().debug(cmd)
         self.jobid = 'test'
         return True
     exedata = self.execute(cmd)
     code = exedata.get('code', 0)
     if int(code) == 1:
         self.jobid = exedata['jobId']
         return True
     secs = 5
     for count in range(0, self.RETRYTIMES):
         time.sleep(secs)
         secs *= 2
         exedata = self.execute(cmd)
         code = exedata.get('code', 0)
         if int(code) == 1:
             self.jobid = exedata['jobId']
             return True
     else:
         param = NotifyParam()
         param.code = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED
         param.message = NotifyParam.SPIDER_NOTIFY_UPLOAD_FAILED_MESSAGE_FORMAT.format(
             file=upfile, taskid=self.appid)
         SpiderNotify.notify(param)
         return False
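
upload() retries a failed remote submission with an exponentially growing sleep between attempts, and only notifies after every retry has failed. The retry/backoff skeleton, separated from the project-specific command handling (the callable and names here are assumptions for illustration):

import time

def retry_with_backoff(attempt, retries, first_delay=5):
    # attempt() should return True on success; the delay doubles after each try.
    delay = first_delay
    for _ in range(retries):
        time.sleep(delay)
        delay *= 2
        if attempt():
            return True
    return False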
Example #28
    def common_step3(self, proparam):
        # Handling for NetEase (163.com) pages outside the cloud reader
        try:
            commentsinfo = json.loads(proparam.content)
        except:
            Logger.getlogging().warning(
                '{url}:30000 No comments'.format(url=proparam.originalurl))
            return
        #commentsinfo = json.loads(proparam.content)
        comments = []

        # Extract the comments
        key_comments = 'comments'
        if key_comments in commentsinfo:
            for key in commentsinfo[key_comments].keys():
                try:
                    nickname = commentsinfo[key_comments][key]['user'][
                        'nickname']
                except:
                    nickname = 'anonymous'
                if not CMTStorage.exist(
                        proparam.originalurl,
                        commentsinfo[key_comments][key]['content'],
                        commentsinfo[key_comments][key]['createTime'],
                        nickname):
                    CMTStorage.storecmt(
                        proparam.originalurl,
                        commentsinfo[key_comments][key]['content'],
                        commentsinfo[key_comments][key]['createTime'],
                        nickname)
                else:
                    break
Example #29
    def process(self, params):
        Logger.getlogging().info(params.url)
        try:
            if params.step is KanKanComments.STEP_1:
            # Get the play count (not every video has one)
                self.setclicknum(params)
                if self.r.match(self.TYPE1, params.originalurl):
                # Step1: get the movieid from the original url and build the first comments page url
                    movieid = self.r.parse(self.TYPE1, params.url)[0]
                    Logger.getlogging().debug(movieid)
                    commentinfo_url = KanKanComments.COMMENTS_URL1.format(
                        movieid=movieid, page=1, perpage=self.PERPAGE)
                    self.storeurl(commentinfo_url, params.originalurl,
                                  KanKanComments.STEP_2, {'movieid': movieid})
                elif self.r.match(self.TYPE2, params.originalurl):
                # Step1: get the type and sid from the original url and build the first comments page url
                    self.substep1(params, self.TYPE2)
                elif self.r.match(self.TYPE3, params.originalurl):
                # Step1: get the type, sid and vchannel from the original url and build the first comments page url
                    self.substep1(params, self.TYPE3)

            elif params.step == KanKanComments.STEP_2:
                self.step2(params)
            elif params.step == KanKanComments.STEP_3:
                self.step3(params)
            elif params.step == KanKanComments.STEP_CLICK:
                self.step_click(params)
        except:
            Logger.printexception()
 def storecmt(url, content, pubdate, user):
     content = Common.strfilter(content)
     user = Common.strfilter(user)
     pubdate = TimeUtility.getuniformtime(pubdate)
     if not CMTStorage.exist(url, content, pubdate, user):
         Logger.getlogging().debug(
             'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.
             format(url=url, content=content, pubdate=pubdate, user=user))
         id = CMTStorage.getid(url, content, pubdate, user)
         data = {
             SQLDAO.SPIDER_TABLE_COMMENTS_ID:
             id,
             SQLDAO.SPIDER_TABLE_COMMENTS_URL:
             url,
             SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE:
             pubdate,
             SQLDAO.SPIDER_TABLE_COMMENTS_USER:
             user,
             SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT:
             content,
             SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE:
             SpiderConfigure.getinstance().starttime()
         }
         SQLDAO.getinstance().insert(
             SQLDAO.SPIDER_TABLE_COMMENTS,
             SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
             SQLDAO.getvaluesfromkeys(data,
                                      SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
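
storecmt() deduplicates by checking CMTStorage.exist() before inserting and derives a row id from the same four fields via CMTStorage.getid(). A hypothetical sketch of such a key function, only to illustrate the idea (the real getid() implementation is not shown in the source):

import hashlib

def comment_key(url, content, pubdate, user):
    # Stable key over the four fields used in the exist() check, so the same
    # comment (same url, content, publish time and user) maps to one id.
    raw = u'|'.join([url, content, unicode(pubdate), user])
    return hashlib.md5(raw.encode('utf-8')).hexdigest()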