def storecmt(url, content, pubdate, user):
     """Persist one comment row unless an identical comment already exists.

     Content and user strings are sanitised and the publish date is
     normalised before the duplicate check, so equivalent inputs map to
     the same comment id.
     """
     content = Common.strfilter(content)
     user = Common.strfilter(user)
     pubdate = TimeUtility.getuniformtime(pubdate)
     # Skip duplicates entirely; only new comments are logged and inserted.
     if CMTStorage.exist(url, content, pubdate, user):
         return
     Logger.getlogging().debug(
         'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.
         format(url=url, content=content, pubdate=pubdate, user=user))
     row = {
         SQLDAO.SPIDER_TABLE_COMMENTS_ID:
         CMTStorage.getid(url, content, pubdate, user),
         SQLDAO.SPIDER_TABLE_COMMENTS_URL: url,
         SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE: pubdate,
         SQLDAO.SPIDER_TABLE_COMMENTS_USER: user,
         SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT: content,
         SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE:
         SpiderConfigure.getinstance().starttime(),
     }
     SQLDAO.getinstance().insert(
         SQLDAO.SPIDER_TABLE_COMMENTS,
         SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
         SQLDAO.getvaluesfromkeys(row, SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
 def show():
     """Dump all news rows created during the current run to the debug log."""
     log = Logger.getlogging()
     log.debug('Now, Results Extract From Database Showing: ')
     log.debug(
         u'channel\tquery\tcmtnum\tclicknum\tfansnum\tvotenum\tpublishdate\tcreatedate\turl'
     )
     rows = SQLDAO.getinstance().find(
         SQLDAO.SPIDER_TABLE_NEWS,
         {SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
          SpiderConfigure.getinstance().starttime()})
     for row in rows:
         doc = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, row)
         log.debug(NewsStorage.NEWS_FORMAT.format(
             channel=doc[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL],
             query=doc[SQLDAO.SPIDER_TABLE_NEWS_QUERY],
             cmtnum=doc[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM],
             clicknum=doc[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM],
             fansnum=doc[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM],
             votenum=doc[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM],
             publishdate=doc[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE],
             createdate=doc[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE],
             url=doc[SQLDAO.SPIDER_TABLE_NEWS_URL]))
Exemple #3
0
 def updatecommentsflag(self, urllist):
     """Mark all still-unflagged (key1 NULL) comment rows of *urllist* as
     exported by setting key1=1.

     Bug fix: the old code interpolated ``tuple(urllist)`` into the SQL,
     and a one-element tuple renders as ``('x',)`` whose trailing comma is
     invalid SQL. The IN (...) list is now built explicitly.
     """
     if not urllist:
         return
     urllist = [item.encode(constant.CHARSET_UTF8) for item in urllist]
     # Build the IN (...) list by hand so a single url stays valid SQL.
     inlist = '({0})'.format(', '.join("'{0}'".format(u) for u in urllist))
     sqlf = 'UPDATE {table} SET {key1}=1  WHERE {key1} is null and {url} in {urllist}'
     sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                       key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1,
                       url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                       urllist=inlist)
     Logger.getlogging().info('UPDATE comments SET key1=1 where key1 is null')
     SQLDAO.getinstance().execute(sql)
Exemple #4
0
 def updatenewsflag(self, idlist):
     """Mark all still-unflagged (key1 NULL) news rows of *idlist* as
     exported by setting key1=1.

     Bug fix: the old code interpolated ``tuple(idlist)`` into the SQL,
     and a one-element tuple renders as ``('x',)`` whose trailing comma is
     invalid SQL. The IN (...) list is now built explicitly.
     """
     if not idlist:
         return
     idlist = [item.encode(constant.CHARSET_UTF8) for item in idlist]
     # Build the IN (...) list by hand so a single id stays valid SQL.
     inlist = '({0})'.format(', '.join("'{0}'".format(i) for i in idlist))
     sqlf = 'UPDATE {table} SET {key1}=1  WHERE {key1} is null and {id} in {idlist}'
     sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_NEWS,
                       key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1,
                       id=SQLDAO.SPIDER_TABLE_NEWS_ID,
                       idlist=inlist)
     Logger.getlogging().info('UPDATE news SET key1=1 where key1 is null')
     SQLDAO.getinstance().execute(sql)
 def getcount(url, before=False):
     """Count stored comments for *url*.

     With before=True the count is restricted to rows whose create date is
     not later than the current SQLDAO time.
     """
     dao = SQLDAO.getinstance()
     if before:
         condition = '{urlkey}=\"{url}\" and {datekey}<={date}'.format(
             urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
             url=url,
             datekey=SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE,
             date=SQLDAO.gettime())
         return dao.count(SQLDAO.SPIDER_TABLE_COMMENTS, where=condition)
     return dao.count(SQLDAO.SPIDER_TABLE_COMMENTS,
                      {SQLDAO.SPIDER_TABLE_COMMENTS_URL: url})
Exemple #6
0
 def __init__(self):
     """Initialise the controller: touch the DB singleton, create the
     download platforms and the ETL controller, and read wait/timeout
     limits from configuration."""
     # Download platforms; SQLDAO.getinstance() is called first, presumably
     # to initialise the database singleton early — TODO confirm.
     SQLDAO.getinstance()
     self.downloader = Downloader()
     self.wdownloader = WDownloader()
     # ETL controller
     self.etl = ETLController()
     # Seconds to wait between polls, from the exception-domain config.
     self.waitingperiod = int(
         SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                   const.SPIDER_WAITING_PERIOD))
     # Overall platform timeout: twice the configured per-platform value.
     self.timeout = int(2 * int(
         SpiderConfigure.getconfig(const.SPIDER_EXCEPTION_DOMAIN,
                                   const.SPIDER_WAIT_PLATFORM_TIMEOUT)))
     # Epoch seconds when this spider run started.
     self.spiderstarttime = int(time.time())
     # Hard cap of 2 hours for the "waibu" (external) phase.
     self.waibutimeout = 2 * 60 * 60
Exemple #7
0
 def dereplicate(self):
     """After aggregation, suppress channel-202 duplicates of channel-201 urls.

     For every not-yet-exported (key1 NULL) url in channel 201, any
     channel-202 rows sharing that url are flagged via updatenewsflag so
     they are excluded from the output.
     """
     sql = 'SELECT url from news where key1 is null and channel=201'
     sqlf = 'SELECT id  from news where url=\"{url}\" and key1 is null and channel=202'
     duplicateids = []
     for row in SQLDAO.getinstance().execute(sql, find=True):
         matches = SQLDAO.getinstance().execute(sqlf.format(url=row[0]),
                                                find=True)
         if matches:
             Logger.getlogging().info(
                 'dereplicated url:\t{url}'.format(url=row[0]))
         duplicateids.extend(match[0] for match in matches)
     self.updatenewsflag(duplicateids)
Exemple #8
0
 def find(query,
          machineflaglist=MACHINEFLAGLIST,
          table=SQLDAO.SPIDER_TABLE_QUERYS):
     """Look up the single query row matching *query* on the given machines."""
     condition = '{querykey}=\"{query}\" and {machikey} in ({machine})'.format(
         querykey=SQLDAO.SPIDER_TABLE_QUERYS_QUERY,
         query=query,
         machikey=SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG,
         machine=','.join(machineflaglist))
     return SQLDAO.getinstance().find(table, condition, multi=False)
Exemple #9
0
 def aggregate_curcmtnum(self):
     """Count not-yet-pushed comments (key1 NULL) per url into
     url_curcmtnum_map, keyed by md5 of the stripped url."""
     sqlf = 'SELECT {url},count(*) from {table} where {key1} is null group by {url}'
     sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                       url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                       key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1)
     for url, num in SQLDAO.getinstance().execute(sql, find=True):
         urlkey = Common.md5(url.strip())
         # first count seen for a url wins
         self.url_curcmtnum_map.setdefault(urlkey, int(num))
 def storeurl(url):
     """Insert a skeleton news row for *url* unless it is already stored."""
     newsid = NewsStorage.getid(url)
     if NewsStorage.exist(url):
         return
     conf = SpiderConfigure.getinstance()
     row = {
         SQLDAO.SPIDER_TABLE_NEWS_ID: newsid,
         SQLDAO.SPIDER_TABLE_NEWS_URL: url,
         SQLDAO.SPIDER_TABLE_NEWS_QUERY: conf.getquery(),
         SQLDAO.SPIDER_TABLE_NEWS_CHANNEL: conf.getchannel(),
         SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: NewsStorage.LOCALMACHINEFLAG,
         SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: conf.starttime(),
         SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime(),
     }
     SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                 SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                 SQLDAO.getvaluesfromkeys(row))
 def exist(url, content, pubdate, user):
     """Return True when this comment is already known, via the in-memory
     id cache or the database.

     Database hits are memoised into the class-level id set so repeated
     checks for the same comment stay cheap.
     """
     content = Common.strfilter(content)
     user = Common.strfilter(user)
     pubdate = TimeUtility.getuniformtime(pubdate)
     cid = CMTStorage.getid(url, content, pubdate, user)
     if cid in CMTStorage.__cidset:
         return True
     if SQLDAO.getinstance().exists(SQLDAO.SPIDER_TABLE_COMMENTS,
                                    {SQLDAO.SPIDER_TABLE_COMMENTS_ID: cid}):
         CMTStorage.__cidset.add(cid)
         return True
     return False
Exemple #12
0
 def aggregate_beforenewsnum(self):
     """Count, per url, the news rows already pushed before this run.

     key1=1 marks a news id as pushed; NULL means it was not pushed yet.
     Results land in url_beforenewsnum_map keyed by md5 of the url.
     """
     sqlf = 'SELECT {url},count(*) from {table} where {key1}=1 group by {url}'
     sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_NEWS,
                       url=SQLDAO.SPIDER_TABLE_NEWS_URL,
                       key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
     for url, num in SQLDAO.getinstance().execute(sql, find=True):
         urlkey = Common.md5(url.strip())
         # first count seen for a url wins
         self.url_beforenewsnum_map.setdefault(urlkey, int(num))
    def seturlinfos(params):
        """Create or refresh the news row for *params.url*.

        Shared counter/title/body columns are always written. An existing
        row keeps its publish date unless it still holds the zero sentinel;
        a new row additionally gets its identity columns (id, url, query,
        channel, create date, machine flag) and is inserted.
        """
        rowid = NewsStorage.getid(params.url)
        # Columns common to both the update and the insert path.
        data = {}
        data[SQLDAO.SPIDER_TABLE_NEWS_TITLE] = Common.strfilter(params.title)
        if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
            data[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(params.body)
        data[SQLDAO.SPIDER_TABLE_NEWS_CMTNUM] = params.cmtnum
        data[SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM] = params.clicknum
        data[SQLDAO.SPIDER_TABLE_NEWS_FANSNUM] = params.fansnum
        data[SQLDAO.SPIDER_TABLE_NEWS_VOTENUM] = params.votenum
        data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
        if NewsStorage.exist(params.url):
            doc = NewsStorage.getdoc(params.url)
            # Only overwrite a publish date that is still the zero sentinel.
            zerotime = TimeUtility.getintformtime(0)
            if doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                       zerotime) == zerotime:
                data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
                    TimeUtility.getuniformtime(params.pubtime)
            SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                        {SQLDAO.SPIDER_TABLE_NEWS_ID: rowid},
                                        data)
        else:
            # New row: add identity columns before inserting.
            data[SQLDAO.SPIDER_TABLE_NEWS_TYPE] = params.type
            data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = \
                TimeUtility.getuniformtime(params.pubtime)
            data[SQLDAO.SPIDER_TABLE_NEWS_ID] = rowid
            data[SQLDAO.SPIDER_TABLE_NEWS_URL] = params.url
            data[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = params.query
            data[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = params.channel
            data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = params.createtime
            data[SQLDAO.
                 SPIDER_TABLE_NEWS_MACHINEFLAG] = NewsStorage.LOCALMACHINEFLAG
            SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                        SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                        SQLDAO.getvaluesfromkeys(data))
Exemple #14
0
 def dumplocalquerys(queryfile=LOCALQUERYPATH,
                     localmachine=LOCALMACHINEFLAG):
     """Write this machine's valid querys to *queryfile*, one per line,
     and return them as a list."""
     rows = SQLDAO.getinstance().find(
         SQLDAO.SPIDER_TABLE_QUERYS,
         {
             SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: localmachine,
             SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
         },
         keys=[SQLDAO.SPIDER_TABLE_QUERYS_QUERY])
     querys = [''.join(row) for row in rows]
     with open(queryfile, 'w+') as fp:
         fp.write('\n'.join(querys))
     return querys
Exemple #15
0
 def aggregate_curcomments(self):
     """Collect all not-yet-pushed comments (key1 NULL), grouped by url md5.

     Each entry appended to url_curcmtcontent_map[md5(url)] has the form
     '<filtered content>_<publish epoch>'.
     """
     sqlf = 'SELECT {url},{content},{publish} from {table} where {key1} is null'
     sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                       url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                       content=SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT,
                       publish=SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE,
                       key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1)
     for url, rawcontent, rawpublish in SQLDAO.getinstance().execute(
             sql, find=True):
         urlmd5 = Common.md5(url)
         publish = TimeUtility.getinttime(rawpublish)
         entry = self.strfilter(rawcontent) + '_' + str(int(publish))
         self.url_curcmtcontent_map.setdefault(urlmd5, []).append(entry)
 def seturlinfo(url, key=None, value=None, data=None):
     """Set one column (*key*/*value*) or a whole dict (*data*) for *url*.

     When *data* is given it is written directly via update. Otherwise the
     single key/value pair is written: an existing row is updated only when
     the stored value actually differs, and an unknown url gets a full
     skeleton row inserted with the pair included.

     Fix: the mutable default ``data={}`` was replaced by ``None`` to avoid
     the shared-default-argument pitfall; behaviour is unchanged because
     ``if data:`` was the only use of the default.
     """
     id = NewsStorage.getid(url)
     if data:
         SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                     {SQLDAO.SPIDER_TABLE_NEWS_ID: id},
                                     data)
         return
     # Publish dates are normalised before comparison/storage.
     if SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE == key:
         value = TimeUtility.getuniformtime(value)
     if NewsStorage.exist(url):
         doc = NewsStorage.getdoc(url)
         # Skip the write when the stored value already matches.
         if doc.get(key, '') != value:
             data = {
                 key: value,
                 SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime()
             }
             SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                         {SQLDAO.SPIDER_TABLE_NEWS_ID: id},
                                         data)
     else:
         # First sighting: insert identity columns plus the given pair.
         conf = SpiderConfigure.getinstance()
         data = {}
         data[SQLDAO.SPIDER_TABLE_NEWS_ID] = id
         data[SQLDAO.SPIDER_TABLE_NEWS_URL] = url
         data[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = conf.getquery()
         data[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = conf.getchannel()
         data[SQLDAO.
              SPIDER_TABLE_NEWS_MACHINEFLAG] = NewsStorage.LOCALMACHINEFLAG
         data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = conf.starttime()
         data[key] = value
         data[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
         SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                     SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                     SQLDAO.getvaluesfromkeys(data))
Exemple #17
0
 def dumplocalquerys_tieba(queryfile=LOCALQUERYPATH,
                           localmachine=LOCALMACHINEFLAG):
     """Write this machine's valid tieba querys (query<TAB>queryurl lines)
     to *queryfile* and return them as a list."""
     condition = {
         SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: localmachine,
         SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
     }
     rows = SQLDAO.getinstance().find(
         SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
         condition,
         keys=[
             SQLDAO.SPIDER_TABLE_QUERYS_QUERY,
             SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL
         ])
     querys = ['\t'.join(row) for row in rows]
     with open(queryfile, 'w+') as fp:
         fp.write('\n'.join(querys))
     return querys
Exemple #18
0
 def storetiebaquery(self,
                     query,
                     queryurl,
                     machineflaglist=MACHINEFLAGLIST_TIEBA):
     """Store or refresh a tieba query assignment.

     If the query already exists on one of *machineflaglist*, its update
     time is refreshed and it is re-marked valid. Otherwise it is assigned
     to the machine currently holding the fewest queries and inserted.
     Either way the per-machine in-memory counter is incremented.
     """
     # Check whether the query exists; if it does, refresh its updatetime.
     # If it does not, find the machine with the smallest query count
     # and store the query there.
     query = query.strip()
     queryurl = queryurl.strip()
     result = QueryStorage.find(query,
                                machineflaglist,
                                table=SQLDAO.SPIDER_TABLE_QUERYS_TIEBA)
     if result:
         resultdict = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
                                         result)
         machine = resultdict[SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG]
         id = QueryStorage.getid(query, machine)
         # Refresh the update date and re-validate the existing row.
         SQLDAO.getinstance().update(
             SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
             {SQLDAO.SPIDER_TABLE_QUERYS_ID: id}, {
                 SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                 SpiderConfigure.getinstance().starttime(),
                 SQLDAO.SPIDER_TABLE_QUERYS_VALID:
                 1
             })
     else:
         # Least-loaded machine according to the in-memory counters.
         machine = min(self.querystorage_tieba.iteritems(),
                       key=lambda x: x[1])[0]
         data = {
             SQLDAO.SPIDER_TABLE_QUERYS_ID:
             QueryStorage.getid(query, machine),
             SQLDAO.SPIDER_TABLE_QUERYS_QUERY:
             query,
             SQLDAO.SPIDER_TABLE_QUERYS_CREATEDATE:
             SpiderConfigure.getinstance().starttime(),
             SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
             SpiderConfigure.getinstance().starttime(),
             SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG:
             machine,
             SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL:
             queryurl,
             SQLDAO.SPIDER_TABLE_QUERYS_VALID:
             1
         }
         SQLDAO.getinstance().insert(
             SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
             SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
             SQLDAO.getvaluesfromkeys(data,
                                      SQLDAO.SPIDER_TABLE_QUERYS_KEYS))
     # Keep the live per-machine storage counter up to date.
     self.querystorage_tieba[machine] = self.querystorage_tieba.get(
         machine, 0) + 1
 def writetofile(filename, cond=None):
     """Export all comments matching *cond* (default: no filter) to
     *filename*, one UTF-8 encoded formatted line per comment.

     Fix: the mutable default ``cond={}`` was replaced by ``None`` to avoid
     the shared-default-argument pitfall; an empty dict means "no filter"
     and the behaviour is unchanged.
     """
     if cond is None:
         cond = {}
     Logger.getlogging().debug(
         'Now {t}, Starting Output Comments To {f}'.format(t=int(
             time.time()),
                                                           f=filename))
     for doc in SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_COMMENTS,
                                          cond):
         url = doc[SQLDAO.SPIDER_TABLE_COMMENTS_URL]
         fstring = CMTStorage.COMMENTS_FORMAT.format(
             channel=ChannelDao.getchannel(url),
             content=doc[SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT],
             cmtnum=CMTStorage.getcount(url),
             publishdate=doc[SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE],
             user=doc[SQLDAO.SPIDER_TABLE_COMMENTS_USER],
             url=doc[SQLDAO.SPIDER_TABLE_COMMENTS_URL],
             title=NewsStorage.gettitle(url))
         FileUtility.writeline(filename, fstring.encode(CHARSET_UTF8))
     FileUtility.flush()
     Logger.getlogging().debug(
         '{t} Comments Finish'.format(t=int(time.time())))
Exemple #20
0
 def updatedb(self):
     """Age stale news rows out of the live table into the cold table.

     A row is stale when its publish date (or, if the publish date is the
     zero sentinel, its create date) is older than ``self.period`` days.
     Stale rows are deleted from the news table and bulk-inserted into the
     cold table at the end.
     """
     # Do not delete the commented-out SQL below (kept for reference).
     #wheref = '{key1}={val1} and \
     #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) > {secs}) or \
     #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) > {secs}))'
     #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
     #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
     #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
     #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
     #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
     #secs =self.period * 24*60*60
     #)
     where = {
         SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
         ETLController.LOCALMACHINEFLAG
     }
     results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
     colddata = []
     for result in results:
         data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
         try:
             publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
             createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
             # Stale if: publish date is the zero sentinel and the row was
             # created more than `period` days ago, OR a real publish date
             # is itself older than `period` days.
             if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate > self.period * 24*60*60) or \
                (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) > self.period * 24*60*60):
                 id = data[SQLDAO.SPIDER_TABLE_NEWS_ID]
                 colddata.append(result)
                 SQLDAO.getinstance().delete(
                     SQLDAO.SPIDER_TABLE_NEWS,
                     {SQLDAO.SPIDER_TABLE_NEWS_ID: id})
         except:
             # Best-effort: log the failure and continue with other rows.
             Logger.printexception()
             Logger.log(data[SQLDAO.SPIDER_TABLE_NEWS_URL],
                        constant.ERRORCODE_WARNNING_OTHERS)
     if colddata:
         # NOTE(review): 'mutli' looks like a typo for 'multi' — confirm it
         # matches SQLDAO.insert's keyword before renaming.
         SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS_COLD,
                                     SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                     colddata,
                                     mutli=True)
 def getlastpublish(url, before=True):
     """Return the newest comment publish date for *url*.

     With before=True only comments created before this run's start time
     count. Falls back to the zero-time sentinel when nothing matches.
     """
     if before:
         where = '{urlkey}=\"{url}\" and {datekey} < {date}'.format(
             urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
             url=url,
             datekey=SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE,
             date=SpiderConfigure.getinstance().starttime())
     else:
         where = '{urlkey}=\"{url}\"'.format(
             urlkey=SQLDAO.SPIDER_TABLE_COMMENTS_URL, url=url)
     sql = 'SELECT MAX({key}) FROM {table} WHERE {where}'.format(
         key=SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE,
         table=SQLDAO.SPIDER_TABLE_COMMENTS,
         where=where)
     rows = SQLDAO.getinstance().execute(sql, find=True)
     latest = rows[0][0]
     return latest if latest else TimeUtility.getintformtime(0)
Exemple #22
0
 def updatedb():
     """Invalidate every machine's valid query rows (valid 1 -> 0) ahead of
     a new assignment cycle, for both the plain and tieba query tables."""
     def invalidate(table, machineflag):
         # Flip valid=1 rows of one machine on one table to valid=0.
         SQLDAO.getinstance().update(
             table,
             where={
                 SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machineflag,
                 SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1
             },
             update={SQLDAO.SPIDER_TABLE_QUERYS_VALID: 0})
     for matchine in QueryStorage.MACHINEFLAGLIST:
         invalidate(SQLDAO.SPIDER_TABLE_QUERYS, matchine)
         invalidate(SQLDAO.SPIDER_TABLE_QUERYS_TIEBA, matchine)
     # External ("waibu") machines only use the plain query table.
     for matchine in QueryStorage.MACHINEFLAGLIST_WAIBU:
         invalidate(SQLDAO.SPIDER_TABLE_QUERYS, matchine)
Exemple #23
0
 def fileformat(self):
     """Aggregate all counters, then write every not-yet-exported news row
     (key1 NULL) to the output file — or the error file when the title is
     missing — and finally flag the exported news ids and comment urls so
     they are not pushed again."""
     self.aggregate_beforenewsinfo()
     self.aggregate_beforenewsnum()
     self.aggregate_curcomments()
     self.aggregate_curcmtnum()
     self.aggregate_beforecmtsnum()
     self.dereplicate()
     urllist = []
     idlist = []
     newscond = '{key} is null'.format(key=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
     results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where=newscond)
     for result in results:
         doc = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
         id = doc[SQLDAO.SPIDER_TABLE_NEWS_ID]
         url = doc[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
         try:
             urlmd5 = Common.md5(url)
             channel = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CHANNEL, '201')
             title = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TITLE, '')
             body = doc.get(SQLDAO.SPIDER_TABLE_NEWS_BODY, '')
             commentlist = self.url_curcmtcontent_map.get(urlmd5, [])
             comments = ' '.join(commentlist)
             pubtime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE, TimeUtility.getintformtime(0))
             crawlertime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE, TimeUtility.getintformtime(0))
             type = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TYPE, '')
             query = doc.get(SQLDAO.SPIDER_TABLE_NEWS_QUERY, '')
             # Incremental push of the comment count:
             #   First push is a full push: if none of this url's comments
             #   were taken before (no key1=1 mark), push the full amount —
             #   news.cmtnum when > 0, otherwise the comment-table count
             #   (already aggregated into url_curcmtnum_map).
             #   Later pushes are incremental: once some comments carry the
             #   key1=1 mark, push only the comment-table count
             #   (url_curcmtnum_map).
             cmtkey1flag = self.url_beforecmtnum_map.get(urlmd5, -1)
             if cmtkey1flag <= 0:
                 cmtnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CMTNUM, -1)
                 if cmtnum < 0:
                     cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
             else:
                 cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
             # The other counters are pushed as increments.
             clicknum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, -1)
             clicknum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, clicknum)
             votenum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, -1)
             votenum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, votenum)
             fansnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, -1)
             fansnum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, fansnum)
             string = FileFormat.DEFAULT_NEWS_FORMAT.format(channel=channel,
                                                            url=url,
                                                            title=self.strfilter(title),
                                                            body=self.strfilter(body),
                                                            comments=comments,
                                                            cmtnum=cmtnum,
                                                            clicknum=clicknum,
                                                            votenum=votenum,
                                                            fansnum=fansnum,
                                                            pubtime=TimeUtility.getinttime(pubtime),
                                                            crawlertime=crawlertime,
                                                            type=type,
                                                            query=self.strfilter(query))
             Logger.getlogging().info(u'{channel}\t{query}\t{url}'.format(channel=channel, query=query, url=url).encode(constant.CHARSET_UTF8))
             # Rows without a title go to the error file for inspection.
             if not title:
                 FileUtility.writeline(self.errorinfopath, string.encode(constant.CHARSET_UTF8))
             else:
                 FileUtility.writeline(self.outputpath, string.encode(constant.CHARSET_UTF8))

             if id not in idlist:
                 idlist.append(id)
             if title and commentlist:
                 if url not in urllist:
                     urllist.append(url)
         except:
             Logger.getlogging().error(str(result))
             Logger.printexception()
     # Already extracted: flip the key1 flag to 1 so rows are not re-pushed.
     self.updatenewsflag(idlist)
     self.updatecommentsflag(urllist)
Exemple #24
0
    def dumpurls(self):
        """Dump this machine's query url lists and queue the urls for upload.

        Two passes per local query: (1) refresh rows whose publish/create
        time falls within the configured period via NewsStorage.seturlinfos;
        (2) queue every url created in the current run with the URL manager
        for webkit download.
        """
        # Dump the urllist for this machine's querys into the matching file.
        s2file = SpiderConfigure.getinstance().gets2file()
        s2temppath = Storage.getstoragelocation(
            const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
        #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
        querys = QueryStorage.getinstance().getlocalquerys(
            s2temppath, ETLController.LOCALMACHINEFLAG)
        for query in querys:
            Logger.getlogging().debug(
                'Now, Starting Select url to Insert and Update for uploading location urlfile!'
            )
            self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
            self.conf.setquery(query)
            # Do not delete the commented-out SQL below (kept for reference).
            # 1. Refresh rows within the period:
            #    1.1 publishdate present: within the last week
            #    2.1 publishdate is 0: use create time, within the last week
            #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
            #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
            #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
            #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
            #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
            #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #starttime = SpiderConfigure.getinstance().starttime(),
            #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
            #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
            #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
            #secs =self.period * 24*60*60
            #)
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG
            }
            Logger.getlogging().debug(
                'Query condition: {where}'.format(where=str(where)))
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urltemplist = []
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
                createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                # In-period test: zero publish date falls back to create date.
                if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
                   (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                    if url not in urltemplist:
                        urltemplist.append(url)
                        params = PageBasicInfo()
                        params.url = url
                        NewsStorage.seturlinfos(params)

            # 2. Extract rows whose createdate equals this run's start time.
            URLFileManager.getinstance().generateurlfilepath()
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY:
                query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG:
                ETLController.LOCALMACHINEFLAG,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
                SpiderConfigure.getinstance().starttime()
            }
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urllist = []
            linecount = 0
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                urllist.append(url)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                    SQLDAO.SPIDER_TABLE_NEWS_TYPE]
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context,
                                                  REQUEST_TYPE_WEBKIT)
                linecount += 1
Exemple #25
0
 def aggregate_beforenewsinfo(self):
     """Aggregate, per url, the latest valid counter values of already
     pushed (key1=1) news rows into url_beforenewsinfo_map.

     For each counter (cmtnum, clicknum, votenum, fansnum) only positive
     values are kept, remembered with their crawl time; per url the value
     recorded at the most recent crawl time wins.

     Improvement: the four copy-pasted per-counter pipelines are collapsed
     into a single data-driven loop with identical semantics.
     """
     # SELECT column order (indices 2..5) matches this key order.
     metrickeys = [SQLDAO.SPIDER_TABLE_NEWS_CMTNUM,
                   SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM,
                   SQLDAO.SPIDER_TABLE_NEWS_VOTENUM,
                   SQLDAO.SPIDER_TABLE_NEWS_FANSNUM]
     sqlf = 'SELECT {url},{createtime},{cmtnum}, {clicknum},{votenum},{fansnum} from {table} where  {key1}=1'
     sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_NEWS,
                       url=SQLDAO.SPIDER_TABLE_NEWS_URL,
                       createtime=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
                       cmtnum=SQLDAO.SPIDER_TABLE_NEWS_CMTNUM,
                       clicknum=SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM,
                       votenum=SQLDAO.SPIDER_TABLE_NEWS_VOTENUM,
                       fansnum=SQLDAO.SPIDER_TABLE_NEWS_FANSNUM,
                       key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
     results = SQLDAO.getinstance().execute(sql, find=True)
     # valuemaps[metric][urlmd5] -> {str(createtime): value}
     valuemaps = dict((key, {}) for key in metrickeys)
     for result in results:
         urlmd5 = Common.md5(result[0].strip())
         createtime = str(result[1])
         for offset, key in enumerate(metrickeys):
             value = result[2 + offset]
             permap = valuemaps[key].setdefault(urlmd5, {})
             # Keep only valid (>0) values; for the same crawl time the
             # larger value wins.
             if value > 0 and permap.get(createtime, 0) <= value:
                 permap[createtime] = value
     for key in metrickeys:
         for urlmd5, timed in valuemaps[key].iteritems():
             if timed:
                 # The value at the latest (max) crawl time wins.
                 self.url_beforenewsinfo_map[key][urlmd5] = timed[max(timed)]
 def getdoc(url):
     """Fetch the stored news row for *url* as a key-to-value dict."""
     condition = {SQLDAO.SPIDER_TABLE_NEWS_ID: NewsStorage.getid(url)}
     row = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, condition,
                                     multi=False)
     return SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, row)
 def getcount(url):
     """Count stored news rows for *url*, looked up by its id."""
     condition = {SQLDAO.SPIDER_TABLE_NEWS_ID: NewsStorage.getid(url)}
     return SQLDAO.getinstance().count(SQLDAO.SPIDER_TABLE_NEWS, condition)
 def exist_cold(url):
     """Return True when *url* is present in the cold news table."""
     matches = SQLDAO.getinstance().find(
         SQLDAO.SPIDER_TABLE_NEWS_COLD, {SQLDAO.SPIDER_TABLE_NEWS_URL: url})
     return bool(matches)