Esempio n. 1
0
 def analysis(self, line, post=False):
     """Parse one downloaded-page JSON record into a ProcessParam.

     Args:
         line: JSON text. The keys 'crawler_time', 'foundin' and 'html' are
             always read; 'data' is read when ``post`` is true; 'property'
             is optional.
         post: when true, ``js['data']`` is copied to ``param.data``.

     Returns:
         The populated ProcessParam. ``page_body``, ``page_title`` and
         ``html_time`` are only assigned when a matching entry with a
         non-empty 'result' list exists under 'property'.
     """
     param = ProcessParam()
     js = json.loads(line)
     param.crawler_time = TimeUtility.getuniformtime2(js['crawler_time'])
     param.url = Common.urldec(js['foundin'])
     param.content = js['html']
     if post:
         param.data = js['data']
     if js['html'][:3] == constant.GZIP_CODE:
         # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
         param.content = zlib.decompress(param.content, 16 + zlib.MAX_WBITS)
     # Decode the body using the charset found in the content itself.
     content = Common.urldec(param.content)
     charset = RegexUtility.getid('charset', content)
     param.content = Common.trydecode(content, charset)
     if 'property' in js:
         for prop in js['property']:
             # Skip entries without a usable result: a missing key and an
             # empty list are treated alike, so ``results[0]`` cannot raise
             # IndexError and abort the whole record.
             results = prop.get('result')
             if not results:
                 continue
             name = prop['property_name']
             if name == u'page_body':
                 param.page_body = Common.trydecode(
                     Common.urldec(results[0]['text']), constant.CHARSET_GBK)
             elif name == u'page_title':
                 param.page_title = Common.trydecode(
                     Common.urldec(results[0]['text']), constant.CHARSET_GBK)
             elif name == u'html_time':
                 param.html_time = TimeUtility.getuniformtime2(
                     results[0]['text'])
     return param
Esempio n. 2
0
    def process(self, params):
        """Advance the comment crawl for one page by a single step.

        The action depends on ``params.step``:

        * STEP_1: take the first run of digits in the last path segment of
          ``params.url`` as ``aid`` and queue ``AID_URL`` for STEP_2.
        * STEP_2: extract the cms id from ``params.content`` with a regex
          and queue ``KEYID_URL`` for STEP_3.
        * STEP_3: read ``data._id`` from the JSON content as ``sid`` and
          queue page '1' of ``COMMENTS_URL`` for STEP_4.
        * STEP_4: store the comments of the current page for which
          ``URLStorage.storeupdatetime`` returns true, then queue the next
          page for STEP_4 again.

        Args:
            params: crawl parameters; the attributes read here are ``step``,
                ``url``, ``originalurl``, ``content`` and ``customized``.

        Returns:
            None. Errors are logged and never propagated to the caller.
        """
        try:
            # Compare step constants by value; ``is`` only works when both
            # sides happen to be the very same object.
            if params.step == AllComments.STEP_1:
                aid = re.findall(r"\d+", params.url.split("/")[-1])[0]
                aid_url = AllComments.AID_URL % (aid)
                self.storeurl(aid_url, params.originalurl, AllComments.STEP_2,
                              {'aid': aid})
            elif params.step == AllComments.STEP_2:
                cms_id = re.findall(r'appidArr \= \["cms\|(.+?)",',
                                    str(params.content))[0]
                cms_url = AllComments.KEYID_URL % (
                    cms_id, params.customized['aid'], params.originalurl)
                self.storeurl(cms_url, params.originalurl, AllComments.STEP_3,
                              {
                                  'aid': params.customized['aid'],
                                  'cmsid': cms_id
                              })
            elif params.step == AllComments.STEP_3:
                comments = json.loads(params.content)
                sid = comments['data']['_id']
                comment_url = AllComments.COMMENTS_URL % (
                    sid, '1', params.customized['cmsid'])
                self.storeurl(comment_url, params.originalurl,
                              AllComments.STEP_4, {
                                  'sid': sid,
                                  'page': '1',
                                  'cmsid': params.customized['cmsid']
                              })
            elif params.step == AllComments.STEP_4:
                comments = json.loads(params.content)
                try:
                    items = comments['data']
                    fresh = []
                    for item in items:
                        ctime = TimeUtility.getuniformtime2(item['ctime'])
                        if URLStorage.storeupdatetime(params.originalurl,
                                                      str(ctime)):
                            cmti = CommentInfo()
                            cmti.content = item['content']
                            fresh.append(cmti)
                    self.commentstorage.store(params.originalurl, fresh)
                    if not items:
                        # An empty page means there is nothing further to
                        # fetch; queueing page + 1 would paginate forever.
                        return
                    next_page = str(int(params.customized['page']) + 1)
                    comment_url = AllComments.COMMENTS_URL % (
                        params.customized['sid'], next_page,
                        params.customized['cmsid'])
                    self.storeurl(
                        comment_url, params.originalurl, AllComments.STEP_4, {
                            'sid': params.customized['sid'],
                            'page': next_page,
                            'cmsid': params.customized['cmsid']
                        })
                except (KeyError, IndexError, TypeError, ValueError):
                    # A payload without the expected shape ends pagination
                    # quietly. Any other error reaches the outer handler
                    # and is logged instead of being swallowed.
                    return

        except Exception as e:
            traceback.print_exc()
            Logger.getlogging().error(e.message)
Esempio n. 3
0
 def parseinfofromjson(self, jsondata):
     """Fill crawler_time, page_title and page_body from a parsed record.

     ``jsondata`` must provide 'crawler_time' (converted with ``int``) and
     'property', an iterable of entries that each carry 'property_name'.
     Entries named 'page_title' or 'page_body' supply the matching
     attribute from ``entry['result'][0]['text']``.
     """
     # Page crawl time.
     crawl_stamp = int(jsondata['crawler_time'])
     self.crawler_time = TimeUtility.getuniformtime2(crawl_stamp)
     # Title and body.
     for entry in jsondata['property']:
         kind = entry['property_name']
         if kind not in ('page_title', 'page_body'):
             continue
         text = entry['result'][0]['text']
         if kind == 'page_title':
             self.page_title = text
         else:
             self.page_body = text
 def processVideo(self, params):
     """Advance the video-page comment crawl by a single step.

     The action depends on ``params.step``:

     * STEP_1: read the ``data-flag`` attribute from the page content as
       ``cmsid`` and queue ``COMMENTS_URL`` with a page size of '4' for
       STEP_2. Returns without queueing when the attribute is not found.
     * STEP_2: read ``data.total`` from the JSON content and queue
       ``COMMENTS_URL`` again with that value as the page size for STEP_3.
     * STEP_3: store the leading comments of ``data.list`` for which
       ``URLStorage.storeupdatetime`` returns true, stopping at the first
       one for which it returns false.

     Args:
         params: crawl parameters; the attributes read here are ``step``,
             ``content``, ``originalurl`` and ``customized``.

     Returns:
         None. Any exception is reported through ``Logger.printexception``.
     """
     try:
         if params.step == MofangComments.STEP_1:
             flag_pattern = 'data-flag="(.*?)">'
             if not self.r.search(flag_pattern, params.content):
                 return
             cmsid = self.r.parse(flag_pattern, params.content)[0]
             comments_url = MofangComments.COMMENTS_URL % (cmsid, '4')
             self.storeurl(comments_url, params.originalurl,
                           MofangComments.STEP_2, {
                               'cmsid': cmsid,
                               'pagesize': '4'
                           })
         elif params.step == MofangComments.STEP_2:
             comments = json.loads(params.content)
             pagesize = comments['data']['total']
             comments_url = MofangComments.COMMENTS_URL % (
                 params.customized['cmsid'], pagesize)
             self.storeurl(comments_url, params.originalurl,
                           MofangComments.STEP_3, {
                               'cmsid': params.customized['cmsid'],
                               'pagesize': pagesize
                           })
         elif params.step == MofangComments.STEP_3:
             comments = json.loads(params.content)
             pagesize = params.customized['pagesize']
             if pagesize != '0':
                 limit = int(pagesize)
                 # Slice the list that actually arrived instead of indexing
                 # up to the advertised total: a short list would otherwise
                 # raise IndexError and discard every comment.
                 entries = []
                 if limit > 0:
                     entries = comments['data']['list'][:limit]
                 timed = [(TimeUtility.getuniformtime2(entry['create_time']),
                           entry['html_content']) for entry in entries]
                 if timed:
                     fresh = []
                     for created, html in timed:
                         cmti = CommentInfo()
                         cmti.content = html
                         # Only comments whose time is accepted as new are
                         # added to the incremental list; the first one
                         # that is rejected ends the scan.
                         if not URLStorage.storeupdatetime(
                                 params.originalurl, str(created)):
                             break
                         fresh.append(cmti)
                     self.commentstorage.store(params.originalurl, fresh)
     except Exception:
         Logger.printexception()
            info.clicknum -= 10
        else:
            info.clicknum = 0
        if info.cmtnum > 10:
            info.cmtnum -= 10
        else:
            info.cmtnum = 0
        if info.votenum > 10:
            info.votenum -= 10
        else:
            info.votenum = 0
        if info.fansnum > 10:
            info.fansnum -= 10
        else:
            info.fansnum = 0
        if info.realnum > 10:
            info.realnum -= 10
        else:
            info.realnum = 0
        if info.updatetime > TimeUtility.getuniformtime2(0):
            if len(info.updatetime) != 19:
                info.updatetime = getuniformtime(info.updatetime)
            dt = datetime.datetime.strptime(info.updatetime,
                                            TimeUtility.TIME_FORMAT_DEFAULT)
            info.updatetime = TimeUtility.getuniformtime(
                str(dt - datetime.timedelta(days=int(1))))

        print info.tostring()
        db.put(key, info.tostring())
    db.flush()