Ejemplo n.º 1
0
    def step2(self, params):
        """"""
        print params.content
        try:
            jsondata = json.loads(params.content)
            comments_total = int(jsondata['comments_total'])
            comments_data = jsondata['comments']
        except:
            Logger.getlogging().warning(
                '{url}:30000 No comments'.format(url=params.originalurl))
            return
        #cmtnum = URLStorage.getcmtnum(params.originalurl)
        #if cmtnum >= comments_total:
        #return
        #URLStorage.setcmtnum(params.originalurl, comments_total)

        comments = []
        for comment in comments_data:
            cmti = CommentInfo()
            cmti.content = comment['txtcontent']
            tm = comment['addtime']
            if URLStorage.storeupdatetime(params.originalurl, tm):
                comments.append(cmti)
        if len(comments) > 0:
            # 保存获取的评论
            self.commentstorage.store(params.originalurl, comments)

        self.post_data['p'] = str(int(self.data['p'] + self.page_size))
        self.post_data['t'] = TimeUtility.getuniformdate(tm, '%Y-%m-%d+%H%M%S')
        self.storeposturl(self.post_url, params.originalurl, self.STEP_2,
                          self.post_data)
Ejemplo n.º 2
0
    def getsearchresult(self, params):
        info = params.customized['query']

        xpath = XPathUtility(html=params.content)
        hrefs = xpath.xpath('//li/h3/a/@href')
        titles = xpath.getlist('//li/h3/a')
        pubtimes = xpath.xpath('//li/p')

        today = datetime.datetime.strptime(
            TimeUtility.getcurrentdate(),
            TimeUtility.DATE_FORMAT_DEFAULT).date()

        urllist = []
        for index in range(0, len(titles), 1):
            # 标题中包含指定要查询的关键字
            # if titles[index].find(info) > -1:
            if Common.checktitle(info, titles[index]):
                pubtimestr = TimeUtility.getuniformdate(pubtimes[index].text)
                pubtime = datetime.datetime.strptime(
                    pubtimestr, TimeUtility.DATE_FORMAT_DEFAULT).date()
                inteveral = today - pubtime
                # 时间在指定周期内
                if inteveral.days <= self.querylastdays:
                    urllist.append(hrefs[index])
                else:
                    # 因为是按照时间排序的,第一条时间不满足检索周期的话,后面所有的都不满足。
                    break

        if len(urllist) > 0:
            self.__storeurllist__(urllist, SPIDER_S2_WEBSITE_VIDEO)