def process(self, params):
    """Dispatch ``params`` to the handler for its crawl step.

    Logs ``params.url``, then runs ``step1`` for ``STEP_PAGES`` or ``step2``
    for ``STEP_CMTS``. Any other step value is logged as an error.
    Exceptions raised while dispatching are logged and not propagated.
    """
    Logger.getlogging().info(params.url)
    try:
        if params.step == self.STEP_PAGES:
            self.step1(params)
        elif params.step == self.STEP_CMTS:
            self.step2(params)
        else:
            Logger.getlogging().error(
                'proparam.step == {step}'.format(step=params.step))
    except Exception:
        # The original only referenced Logger.printexception without calling
        # it, so every failure was swallowed without a trace.
        Logger.printexception()
 def dmzjvideo_step3(self, params):
     """Parse the comment list out of the page HTML and store each comment.

     A failure on one comment node is logged and does not stop the others.
     """
     page = BeautifulSoup(params.content, 'html5lib')
     for node in page.select('#comment_list_div > .online_anim_debate_mr'):
         try:
             posted = node.select_one('.anim_debate_mr_right_title').get_text()
             text = node.select_one('.anim_debate_mr_right_mr').get_text()
             CMTStorage.storecmt(
                 params.originalurl,
                 text,
                 TimeUtility.getuniformtime(posted),
                 '',
             )
         except:
             Logger.printexception()
 def getcomments_step1(self, params):
     """Queue the first comment request for a zongheng book page.

     The numeric book id is parsed from ``params.url`` and handed to
     ``storeposturl`` together with ``PubComments.COMMENTS_URL`` and
     ``PubComments.STEP_2``. Raises whatever ``self.r.parse(...)[0]`` or
     ``int`` raise when the URL does not contain a book id.
     """
     # Raw string, with the dot before "html" escaped so that it matches a
     # literal '.' only (the original matched any character there).
     bookId = int(
         self.r.parse(r'^http://pub\.zongheng\.com/book/(\d+)\.html$',
                      params.url)[0])
     Logger.getlogging().debug(bookId)
     self.storeposturl(PubComments.COMMENTS_URL, params.originalurl,
                       PubComments.STEP_2,
                       {'bookId': bookId, 'pageNum': '1'},
                       {'bookId': bookId})
Beispiel #4
0
 def process(self, params):
     """Route onlylady BBS URLs to the shared comment processor.

     Logs ``params.url`` first. Any exception is printed with
     ``traceback.print_exc`` and not re-raised.
     """
     Logger.getlogging().info(params.url)
     try:
         # Initialise the internal helper objects.
         self.createobject()
         # BBS comments are fetched through the common implementation.
         if self.r.match('http://bbs.onlylady.com/.*', params.originalurl):
             CommenComments(self).process(params)
     except Exception:
         # The exception object itself was never used, so the Python-2-only
         # "except Exception, e" binding is dropped.
         traceback.print_exc()
 def process(self, params):
     """Dispatch ``params`` to the handler matching its crawl step.

     Unknown steps are ignored. Exceptions raised while dispatching are
     logged via ``Logger.printexception`` and not propagated.
     """
     try:
         # Compare by value: the original used `is` for STEP_1, which only
         # works when both sides happen to be the very same object.
         if params.step == IqiyiComments.STEP_1:
             self.step1(params)
         elif params.step == IqiyiComments.STEP_2:
             self.step2(params)
         elif params.step == IqiyiComments.STEP_3:
             self.step3(params)
         elif params.step == IqiyiComments.STEP_PLAYCOUNT:
             self.geturlplaycount(params)
     except Exception:
         Logger.printexception()
 def query(self, info):
     """Queue the first-page POST search request for ``info``.

     Does nothing (beyond a debug log) when ``self.post_url`` is not in
     ``BBSS2PostQuery.post_urllist``.
     """
     registered = self.post_url in BBSS2PostQuery.post_urllist
     if not registered:
         Logger.debug('{}:Not in tasking'.format(self.post_url))
         return

     # URLs flagged by isgbk_posturl get the search text run through
     # Common.trydecode first.
     if BBSS2PostQuery.isgbk_posturl(self.post_url):
         info = Common.trydecode(info)

     BBSS2PostQuery.POST_DATA['srchtxt'] = info
     self.queryinfo = info
     self.__storeqeuryurl__(
         self.post_url,
         BBSS2PostQuery.S2QUERY_FIRST_PAGE,
         BBSS2PostQuery.POST_DATA,
         {'info': info},
     )
 def ls(self, host, port, username, pwd, lsPath):
     """Open an SSH connection to ``host`` using password authentication.

     Returns an empty list when the connection attempt raises; the failure
     is logged.

     NOTE(review): the visible body stops right after the connection attempt
     and never uses ``lsPath`` -- the listing logic appears to be missing
     from this snippet; confirm against the full source.
     """
     # NOTE: this local name shadows the builtin ``list``.
     list = []
     # Instantiate the SSHClient.
     client = paramiko.SSHClient()
     # Auto-add policy: save the server's host name and key information.
     client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
     # Connect to the SSH server, authenticating with username and password.
     try:
         client.connect(host, port, username=username, password=pwd)
     except Exception, e:
         Logger.getlogging().error('ssh连接失败{Exception}:{error}'.format(Exception=Exception, error=e))
         return list
Beispiel #8
0
    def step1(self, params):
        """Build the first comment-page URL for a flash8 page and queue it.

        The document id is taken from ``params.originalurl`` and passed on to
        ``Flash8Comments.STEP_2`` as ``docurl``.
        """
        Logger.getlogging().info("Flash8Comments.STEP_1")

        # 1. Derive the document id from the original URL.
        matches = self.r.parse('^http://www\.flash8\.net\/flash\/(\d+)\.shtml',
                               params.originalurl)
        doc_id = matches[0]

        # 2. First page of the comment list.
        first_page = 'http://www.flash8.net/newgbook/list_iframe.aspx?nsort=flash&iid={docurl}&page=1'.format(
            docurl=doc_id)

        # 3. Hand over to the next step.
        customized = {'docurl': doc_id}
        self.storeurl(first_page, params.originalurl, Flash8Comments.STEP_2,
                      customized)
 def step1(self, params):
     """Queue the first comment page for the news id found in ``params.url``.

     The id is the last run of three or more digits in the URL. Any failure
     is logged and swallowed.
     """
     try:
         digit_runs = self.r.parse('\d{3,}', params.url)
         newsId = digit_runs[-1]
         first_page_url = self.COMMENTS_URL.format(
             self.pageno, self.page_size, newsId)
         customized = {'newsId': newsId, 'pageno': self.pageno}
         self.storeurl(first_page_url, params.originalurl,
                       self.STEP_COMMENT_FIRST_PAGE, customized)
     except:
         Logger.printexception()
 def process(self, params):
     """Run the handler that matches ``params.step``; ignore unknown steps.

     Exceptions are logged via ``Logger.printexception`` and not propagated.
     """
     try:
         if params.step == self.STEP_PAGES:
             handler = self.step1
         elif params.step == self.STEP_2:
             handler = self.step2
         elif params.step == self.STEP_3:
             handler = self.step3
         elif params.step == self.STEP_4:
             handler = self.setplayinfo
         else:
             return
         handler(params)
     except:
         Logger.printexception()
Beispiel #11
0
 def rename(self, host, port, username, pwd, before, after):
     """Rename ``before`` to ``after`` on a remote host by running ``mv`` over SSH.

     Connects with password authentication, runs the command, waits for it
     to finish and always closes the connection. Failures are logged and not
     propagated.
     """
     # Instantiate the SSHClient.
     client = paramiko.SSHClient()
     # Auto-add policy: save the server's host name and key information.
     client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
     try:
         # Connect to the SSH server, authenticating with username and password.
         client.connect(host, port, username=username, password=pwd)
         # NOTE(review): the paths are interpolated into a shell command
         # unquoted; callers must not pass untrusted or space-containing paths.
         stdin, stdout, stderr = client.exec_command('mv ' + before + ' ' + after)
         # exec_command returns immediately; wait for mv to complete so that
         # closing the connection below cannot cut it short.
         stdout.channel.recv_exit_status()
     except Exception as e:
         Logger.getlogging().error('ssh连接失败{Exception}:{error}'.format(Exception=Exception, error=e))
     finally:
         # The original never closed the client, leaking one connection per call.
         client.close()
Beispiel #12
0
 def step3(self, params):
     """Extract every comment from the JSON response and store the new ones."""
     Logger.getlogging().info("ToutiaoNewsComments.STEP_3")
     # Step 3: the URL queued by step 2 returns the comments; extract them.
     payload = json.loads(params.content)
     if not payload:
         return
     entries = payload['data']
     if len(entries) == 0:
         return
     for entry in entries:
         text = entry['comment']['text']
         stamp = TimeUtility.getuniformtime(entry['comment']['create_time'])
         # Skip comments that are already stored.
         if CMTStorage.exist(params.originalurl, text, stamp, ''):
             continue
         CMTStorage.storecmt(params.originalurl, text, stamp, '')
Beispiel #13
0
 def step2(self, params):
     """Store every reply found under ``data.replies`` in the JSON response.

     A missing ``data`` or ``replies`` key still raises ``KeyError``; a
     ``null`` or empty ``replies`` value is treated as "no comments". A
     failure on one reply is logged and does not stop the others.
     """
     # ``replies`` can be JSON null; iterating None would raise TypeError
     # outside the per-comment try block, so fall back to an empty list.
     jsonReplies = json.loads(params.content)['data']['replies'] or []
     for cmt in jsonReplies:
         try:
             cmtContent = cmt['content']['message']
             cmtPubDate = cmt['ctime']
             cmtUser = ''
             # Store the comment.
             CMTStorage.storecmt(params.originalurl, cmtContent, cmtPubDate, cmtUser)
         except Exception:
             Logger.printexception()
Beispiel #14
0
def sshdownload(host, port, username, pwd, targetFilePath, localPath):
    """Download ``targetFilePath`` from a remote host into ``localPath``.

    The file is first written as ``<name>.tmp`` and moved to its final name
    once the transfer has finished. ``localPath`` is used as a plain string
    prefix, so it must already end with a path separator.

    Returns True after a completed download, False when the connection
    could not be established. Exceptions from the download propagate, but
    the connection is closed first.
    """
    Logger.getlogging().info('scp -P {port} {username}@{host}:{file} {path}'.format(port=port, username=username, host=host, file=targetFilePath, path=localPath))
    ssh = SSHConnection(host, port, username, pwd)
    if not ssh.connect():
        return False
    # Last path component of the remote (slash-separated) path.
    fileName = targetFilePath.rsplit('/', 1)[-1]
    tmpPath = localPath + fileName + '.tmp'
    try:
        ssh.download(targetFilePath, tmpPath)
    finally:
        # Close even when the download raises; the original leaked the
        # connection in that case.
        ssh.close()
    FileUtility.move(tmpPath, localPath + fileName)
    return True
Beispiel #15
0
 def step3_club(self, params):
     """Store the comments contained in a JSON object wrapped in other text.

     The JSON object is taken as everything from the first '{' to the last
     '}' of ``params.content``; ``ValueError`` is raised when either brace
     is absent. A failure on one comment is logged and skipped.
     """
     raw = params.content
     start = raw.index('{')
     stop = raw.rindex('}') + 1
     payload = json.loads(raw[start:stop])
     for entry in payload['l']:
         try:
             posted = entry['cd']
             html = entry['nr']
             text = XPathUtility(html).getstring('//p')
             CMTStorage.storecmt(params.originalurl, text, posted, '')
         except:
             Logger.printexception()
Beispiel #16
0
    def step1(self, params):
        """Build the dm5 comic URL from the original URL and queue it.

        The slug captured after ``manhua-`` is passed on to
        ``Dm5Commnets.STEP_2_BBS`` as ``docurl``.
        """
        Logger.getlogging().info("Dm5Commnets.STEP_1")

        # 1. Extract the comic slug from the original URL.
        slug = self.r.parse('^http://www\.dm5\.com/manhua-(.*)/',
                            params.originalurl)[0]

        # 2. Comment entry URL.
        entry_url = 'http://www.dm5.com/manhua-{docurl}'.format(docurl=slug)

        # 3. Hand over to the BBS step.
        customized = {'docurl': slug}
        self.storeurl(entry_url, params.originalurl,
                      Dm5Commnets.STEP_2_BBS, customized)
Beispiel #17
0
 def process(self, params):
     """Run the handler registered for ``params.step``; ignore unknown steps.

     Exceptions are logged via ``Logger.printexception`` and not propagated.
     """
     # (step-constant attribute, handler attribute), checked in this order.
     routes = (
         ('STEP_COUNT', 'step0'),
         ('STEP_PAGES', 'step1'),
         ('STEP_CMTS', 'step2'),
         ('STEP_PLAY', 'getclick'),
     )
     try:
         for step_attr, handler_attr in routes:
             if params.step == getattr(self, step_attr):
                 getattr(self, handler_attr)(params)
                 break
     except:
         Logger.printexception()
Beispiel #18
0
 def updatedb(self):
     """Remove stored entries whose timestamp is before the validity cut-off.

     The cut-off comes from ``TimeUtility.getuniformdatebefore`` applied to
     the configured valid period. Each removed entry is logged at debug level.
     """
     stored = SpiderDao().getall()
     if not stored:
         return
     period = SpiderConfigure.getinstance().getvalidperiod()
     cutoff = TimeUtility.getuniformdatebefore(period)
     expired = []
     for key in stored.keys():
         record = URLCommentInfo.fromstring(stored[key])
         if record.timestamp < cutoff:
             Logger.getlogging().debug(stored[key])
             expired.append(key)
     SpiderDao().remove(expired)
Beispiel #19
0
 def getclick(self, params):
     """Extract the click count from the response and store it.

     The count is looked up first in a literal ``<click>N</click>`` element
     and then in its HTML-escaped form. When neither is present, the URL is
     logged with ``ERRORCODE_SITE_NOGET_XPATHVALUE``.
     """
     # Leftover debug output now goes through the logger instead of a
     # Python 2 ``print`` statement writing to stdout.
     Logger.getlogging().debug(
         params.content.replace('\n', ' ').replace('\r', ''))
     patterns = (
         r'<click>(\d+)</click>',
         r'&lt;click&gt;(\d+)&lt;/click&gt;',
     )
     for pattern in patterns:
         if self.r.search(pattern, params.content):
             click = self.r.parse(pattern, params.content)[0]
             NewsStorage.setclicknum(params.originalurl, int(click))
             return
     Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_XPATHVALUE)
Beispiel #20
0
 def step3_yunqi(self, params):
     """Store every comment listed under ``#commentList`` in the page HTML.

     A failure on one list item is logged and does not stop the others.
     """
     # Step 3: the URL queued by step 2 returns the comments; extract them.
     page = BeautifulSoup(params.content, 'html5lib')
     for node in page.select('#commentList> li'):
         try:
             text = node.select_one('.textBox').get_text()
             posted = node.select_one('.userName > span').get_text()
             CMTStorage.storecmt(params.originalurl, text, posted, '')
         except:
             Logger.printexception()
 def processVideo(self, params):
     """Run the three-step comment crawl for a Mofang video page.

     STEP_1 reads the ``data-flag`` id from the page and queues a comment
     request with page size '4'. STEP_2 reads ``data.total`` from that
     response and queues a request sized to fetch everything. STEP_3 collects
     each comment's time and HTML content and stores the comments accepted by
     ``URLStorage.storeupdatetime``, stopping at the first one it rejects.

     Exceptions are logged via ``Logger.printexception`` and not propagated.
     """
     try:
         # Steps are compared by value; the original used `is`, which only
         # works when both sides happen to be the very same object.
         if params.step == MofangComments.STEP_1:
             flag_pattern = r'data-flag="(.*?)">'
             if not self.r.search(flag_pattern, params.content):
                 return
             cmsid = self.r.parse(flag_pattern, params.content)[0]
             comments_url = MofangComments.COMMENTS_URL % (cmsid, '4')
             self.storeurl(comments_url, params.originalurl,
                           MofangComments.STEP_2, {
                               'cmsid': cmsid,
                               'pagesize': '4'
                           })
         elif params.step == MofangComments.STEP_2:
             payload = json.loads(params.content)
             pagesize = payload['data']['total']
             comments_url = MofangComments.COMMENTS_URL % (
                 params.customized['cmsid'], pagesize)
             self.storeurl(comments_url, params.originalurl,
                           MofangComments.STEP_3, {
                               'cmsid': params.customized['cmsid'],
                               'pagesize': pagesize
                           })
         elif params.step == MofangComments.STEP_3:
             payload = json.loads(params.content)
             if params.customized['pagesize'] != '0':
                 pcontent = []
                 ptime = []
                 for key in range(int(params.customized['pagesize'])):
                     entry = payload['data']['list'][key]
                     ptime.append(
                         TimeUtility.getuniformtime2(entry['create_time']))
                     pcontent.append(entry['html_content'])
                 if ptime:
                     accepted = []
                     for text, stamp in zip(pcontent, ptime):
                         cmti = CommentInfo()
                         cmti.content = text
                         # Only comments whose time is accepted as new are
                         # written to the incremental list; stop at the
                         # first one that is not.
                         if not URLStorage.storeupdatetime(
                                 params.originalurl, str(stamp)):
                             break
                         accepted.append(cmti)
                     self.commentstorage.store(params.originalurl, accepted)
     except Exception:
         Logger.printexception()
    def step1(self, params):
        """Build the Changyan liteload URL for the page and queue it for STEP_2.

        Pages that visibly have no comments are logged with
        ``ERRORCODE_SITE_NOGET_COMMNETS`` and skipped:

        * sohu.com pages whose comment counter text starts with the number 0;
        * other pages that contain a ``prompt-null-w`` element.

        The Changyan client id is taken from a fixed per-site table, falling
        back to the ``appid`` found in the page content.
        """
        if re.search(r'http://.*\.sohu\.com/', params.originalurl):
            cmttext = XPathUtility(params.content).getstring(
                '//*[@class="c-num-red"][2]|//*[@id="changyan_parti_unit"]|//*[@class="remark-tit"]')
            # -1 means "count unknown": carry on and query the API anyway.
            cmtnum = -1
            if cmttext:
                try:
                    cmtnum = int(re.findall(r'\d+', cmttext)[0])
                except Exception:
                    cmtnum = -1
            if cmtnum == 0:
                Logger.log(params.originalurl,
                           constant.ERRORCODE_SITE_NOGET_COMMNETS)
                return
        elif XPathUtility(params.content).xpath('//*[@class="prompt-null-w"]'):
            Logger.log(params.originalurl,
                       constant.ERRORCODE_SITE_NOGET_COMMNETS)
            return

        liteloadApi = ChangyanComments.liteloadApi
        commentsApi = ChangyanComments.commentsApi

        # Known sites and their client ids, checked in order (first match wins).
        known_clients = (
            (r'http://\w+\.sohu\.com.*', 'cyqemw6s1'),
            (r'^http://\w+\.(17173|shouyou|yeyou)\.com/.*', 'cyqvqDTV5'),
            (r'^http://sports\.le\.com/.*', 'cyrJ22d8v'),
            # Special handling for zdface.com.
            (r'^http://\w+\.zdface\.com.*', 'cyrJOywnM'),
            # e.g. http://xx.yzz.cn/xiuba/201609/1017135.shtml
            (r'^http://\w+\.yzz\.cn.*', 'cyrtYf3sa'),
            (r'^http://\w+\.178\.com.*', 'cysrntF12'),
            (r'^http://.*\.cyol\.com/.*', 'cys3X3zo9'),
        )
        for pattern, known_id in known_clients:
            if re.match(pattern, params.originalurl):
                client_id = known_id
                break
        else:
            # Unknown site: read the client id from the page itself.
            client_id = self.r.getid('appid', params.content)

        topic_url = urllib.quote_plus(params.originalurl)
        topic_source_id = self.r.getid('sid', params.content)
        if not topic_source_id:
            topic_source_id = self.r.getid('data-widget-sid', params.content)
        comment_url = ChangyanComments.LITELOAD_URL.format(
            liteloadApi=liteloadApi,
            client_id=client_id,
            topic_url=topic_url,
            topic_source_id=topic_source_id)
        self.storeurl(comment_url, params.originalurl, ChangyanComments.STEP_2, {
            'client_id': client_id,
            'liteloadApi': liteloadApi,
            'topic_url': topic_url,
            'commentsApi': commentsApi,
        })
Beispiel #23
0
    def step2_news(self, params):
        """Store one page of news comments and queue the next page if needed.

        The response is expected to contain a JSON object (everything from
        the first '{' to the last '}') with a ``list`` of comments and a
        total count ``cnum``. On page 1 the total is recorded. Paging stops
        when the list is empty, when ``isnewesttime`` rejects the oldest time
        on this page, or when ``maxpages`` / the computed page count is
        reached.
        """
        objectid = params.customized['objectid']
        channel = params.customized['channel']
        cmttype = params.customized['type']
        clienttype = params.customized['clienttype']
        key = params.customized['key']
        pageno = params.customized['pageno']

        raw = params.content
        try:
            data = raw[raw.index('{'):raw.rindex('}') + 1]
        except Exception:
            # Log before returning: the original returned first, which left
            # the logging call unreachable.
            Logger.printexception()
            return
        data = json.loads(data)
        datalist = data['list']
        if not datalist:
            return
        timelist = []
        for item in datalist:
            curtime = item['createTime']
            CMTStorage.storecmt(params.originalurl, item['content'], curtime, '')
            timelist.append(TimeUtility.getuniformtime(curtime))
        curcmtnum = data['cnum']
        if pageno == 1:
            NewsStorage.setcmtnum(params.originalurl, curcmtnum)
        if not self.isnewesttime(params.originalurl, min(timelist)):
            return
        pages = int(math.ceil(float(curcmtnum) / self.news_pagesize))
        if pageno >= self.maxpages or pageno >= pages:
            return
        # The next page is requested relative to the last comment on this one.
        lastcmtid = datalist[-1]['id']
        pageno = pageno + 1
        comment_url = self.new_commonurl.format(objectid=objectid,
                                                channel=channel,
                                                type=cmttype,
                                                clienttype=clienttype,
                                                key=key,
                                                pageno=pageno,
                                                lastcmtid=lastcmtid)
        self.storeurl(
            comment_url, params.originalurl, self.STEP_COMMENT_EACH_PAGE, {
                'objectid': objectid,
                'channel': channel,
                'type': cmttype,
                'clienttype': clienttype,
                'key': key,
                'pageno': pageno
            })
 def process(self, params):
     """Route dm123 URLs to the BBS or the news comment processor.

     Logs ``params.url`` first. Any exception is printed with
     ``traceback.print_exc`` and not re-raised.
     """
     Logger.getlogging().info(params.url)
     try:
         # Initialise the internal helper objects.
         self.createobject()
         # Forum comments.
         if self.r.match('http://bbs.dm123.cn/.*', params.originalurl):
             self.dm123Bbs.process(params)
         # News comments.
         elif self.r.match('http://www.dm123.cn/.*', params.originalurl):
             self.dm123News.process(params)
     except Exception:
         # The exception object itself was never used, so the Python-2-only
         # "except Exception, e" binding is dropped.
         traceback.print_exc()
Beispiel #25
0
    def step3(self, params):
        """Store each comment under ``contentAll`` that is not stored yet."""
        Logger.getlogging().info("xinhuaComments.STEP_3")
        # Step 3: the URLs queued by step 1 return the comments; extract them.
        comment_json = json.loads(params.content)

        for entry in comment_json['contentAll']:
            posted = TimeUtility.getuniformtime(entry['commentTime'])
            text = entry['content']
            author = entry['nickName']
            already_stored = CMTStorage.exist(params.originalurl, text,
                                              posted, author)
            if not already_stored:
                CMTStorage.storecmt(params.originalurl, text, posted, author)
 def process(self, params):
     """Dispatch a Baidu Tieba URL to the handler for its crawl step.

     URLs whose sub-domain is not ``tieba`` (or that do not look like a
     baidu.com URL at all) are logged with ``ERRORCODE_SITE_NOGET_SITE`` and
     skipped. Unknown steps are ignored.
     """
     # 1. Derive the sub-domain from the original URL.
     fields = self.r.parse(r'^http[s]{0,1}://(\w+)\.baidu\.com.*',
                           params.originalurl)
     # An empty parse result is treated like a non-tieba URL instead of
     # failing on the [0] lookup.
     if not fields or fields[0] != 'tieba':
         Logger.log(params.originalurl, constant.ERRORCODE_SITE_NOGET_SITE)
         return
     # Compare by value: the original used `is` for the first step.
     if params.step == BaiduTiebaComments.BAIDU_STEP1:
         self.getcomments_step1(params)
     elif params.step == BaiduTiebaComments.BAIDU_TIEBA_EACH_PAGE:
         self.getpagecomments_step2(params)
     elif params.step == BaiduTiebaComments.BAIDU_TIEBA_HUIFU_PAGE:
         self.get_comment_reply_step3(params)
 def process(self, params):
     """Call the concrete step that fetches the URL list.

     ``params.step`` is compared against the first-page / each-page step
     constants of ``DmzjS2Query`` (handled by ``self.getQuery``) and of
     ``BBSS2PostQuery`` (handled by ``self.postQuery``). Exceptions are
     logged via ``Logger.printexception`` and not propagated.
     """
     try:
         # NOTE(review): these are four independent ``if`` statements, not an
         # ``elif`` chain -- if a Dmzj constant and a BBS constant compare
         # equal, more than one handler runs. Confirm whether that is intended.
         if params.step == DmzjS2Query.S2QUERY_FIRST_PAGE:
             self.getQuery.step2(params)
         if params.step == DmzjS2Query.S2QUERY_EACH_PAGE:
             self.getQuery.pageprocess(params)
         if params.step == BBSS2PostQuery.S2QUERY_FIRST_PAGE:
             self.postQuery.step1(params)
         if params.step == BBSS2PostQuery.S2QUERY_EACH_PAGE:
             self.postQuery.step2(params)
     except:
         Logger.printexception()
Beispiel #28
0
 def process(self, params):
     """Dispatch ``params`` to the handler matching its crawl step.

     An unknown step is logged as an error. Any exception is printed with
     ``traceback.print_exc`` and not re-raised.
     """
     try:
         # Compare by value: the original used `is` for STEP_1, which only
         # works when both sides happen to be the very same object.
         if params.step == Kr36Comments.STEP_1:
             self.step1(params)
         elif params.step == Kr36Comments.STEP_2:
             self.step2(params)
         elif params.step == Kr36Comments.STEP_3:
             self.step3(params)
         else:
             Logger.getlogging().error('proparam.step == {step}'.format(step=params.step))
     except Exception:
         traceback.print_exc()
Beispiel #29
0
 def step4bbs(self, params):
     """Queue every comment page of a dm5 thread for ``STEP_5_BBS``.

     The base page built from ``COMMENT_URL_PAGE`` is always queued; one
     additional URL is queued per distinct page number found in the content.
     """
     Logger.getlogging().info("Dm5Commnets.STEP_4")
     # Local renamed so it no longer shadows the builtin ``id``; the format
     # keyword stays ``id`` because the URL templates use that field name.
     thread_id = params.customized['id']
     # Collect the distinct page numbers of all comment-page links. The
     # pattern has the same value as the original ur'' literal, written in a
     # form that is valid on both Python 2 and Python 3.
     pages = set(self.r.parse(u'/tiezi-\\d+-p(\\d+)/"', params.content))
     comment_url = Dm5Commnets.COMMENT_URL_PAGE.format(id=thread_id)
     self.storeurl(comment_url, params.originalurl, Dm5Commnets.STEP_5_BBS)
     for page in pages:
         comment_url = Dm5Commnets.COMMENT_URL_PAGE_2.format(id=thread_id,
                                                             page=page)
         self.storeurl(comment_url, params.originalurl,
                       Dm5Commnets.STEP_5_BBS)
    def step1(self, params):
        """Queue one query URL per result page for the search key.

        Pages run from 1 to ``DEFAULT_PAGES`` inclusive. When that yields no
        URLs at all, a warning is logged instead.
        """
        key = params.customized['key']
        querylist = [
            self.QUERY_TEMPLATE.format(key=key, page=page)
            for page in range(1, self.DEFAULT_PAGES + 1)
        ]
        if not querylist:
            Logger.getlogging().warning(
                '{url}:40000 No results'.format(url=params.originalurl))
            return
        self.__storeqeuryurllist__(querylist, self.EACH, {'key': key})