Esempio n. 1
0
class LogDao(object):
    def __init__(self, logger, belongTo=''):
        self.connector = Connector()
        self.belongTo = belongTo
        self.logger = logger

    '''
    scrapy_log
        id
        info 具体内容
        level 级别:info warn
        save_time 时间
        belong_to 所属模块
        attach 附加信息
    '''

    def queryAll(self):
        cursor = self.connector.cursor()
        if not cursor:
            return []
        sql_query = "select id,info,level,save_time,belong_to,attach from scrapy_log "
        cursor.execute(sql_query)
        results = cursor.fetchall()
        cursor.close()
        return results or []

    def queryPart(self, belong_to):
        cursor = self.connector.cursor()
        if not cursor:
            return []
        sql_query = "select id,info,level,save_time,belong_to,attach from scrapy_log where belong_to=%s"
        cursor.execute(sql_query, (belong_to, ))
        results = cursor.fetchall()
        cursor.close()
        return results or []

    def save(self, info, level, belong_to='', attach=''):
        cursor = self.connector.cursor()
        if not cursor:
            return
        self.logger.info(belong_to + info)
        cursor = self.connector.cursor()
        sql_query = 'insert into scrapy_log (info,level,save_time,belong_to,attach) values (%s,%s,%s,%s,%s)'
        save_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time()))
        cursor.execute(sql_query, (info, level, save_time, belong_to, attach))
        cursor.close()
        self.connector.commit()

    def info(self, info, belong_to='', attach=''):
        belong_to = belong_to if belong_to else self.belongTo
        self.save(info, 'info', belong_to, attach)

    def warn(self, info, belong_to='', attach=''):
        belong_to = belong_to if belong_to else self.belongTo
        self.save(info, 'warn', belong_to, attach)
Esempio n. 2
0
class MysqlPipeline(object):
    def __init__(self):
        self.connector = Connector()

    def process_item(self, item, spider):
        cursor = self.connector.cursor()
        if not cursor:
            spider.logDao.info(u'数据库连接不上暂时不存:' + item['title'])
            return item
        if isinstance(item, ContentItem):
            # 如果存在,则不做处理
            spider.logDao.info(u'存微信详情:' + item['title'])
            sql = "insert into weixin_detail (" \
                  "content_txt,title,source_url,post_date,sub_channel,post_user,tags,styles," \
                  "content_html,hash_code,info_type,src_source_id,src_account_id,src_channel,src_ref,wx_account,update_time) " \
                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) "

            update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            try:
                cursor.execute(sql, (
                    item['content_txt'],
                    item['title'],
                    item['source_url'],
                    item['post_date'],
                    item['sub_channel'],
                    item['post_user'],
                    item['tags'],
                    item['styles'],
                    item['content_html'],
                    item['hash_code'],
                    item['info_type'],
                    item['src_source_id'],
                    item['src_account_id'],
                    item['src_channel'],
                    item['src_ref'],
                    item['wx_account'],
                    update_time))
                spider.logDao.info(u'存微信详情:' + item['title'] + u'  成功' + u' ' + item['post_date'])
            except Exception as e:
                spider.logDao.warn(str(e))
                spider.logDao.warn(u'存微信详情:' + item['title'] + u'  失败')
        else:
            pass
        cursor.close()
        self.connector.commit()
        return item

    def close_spider(self, spider):
        self.connector.commit()
Esempio n. 3
0
class CheckDao(object):
    def __init__(self):
        self.connector = Connector()
        self.hashList = []  # 代表此次已经存在的hash,防止同一时间得到相同文章进行抓取

    def resetHashList(self):
        # 每次重新抓取的时候清除
        self.hashList = []

    def checkExist(self, source_url):
        """
        存在逻辑判断
        :return:
        """
        hash_code = self.getHashCode(source_url)
        cursor = self.connector.cursor()
        if not cursor:
            return True
        sql_query = 'select id from tengxun_detail where hash_code=%s'
        cursor.execute(sql_query, (hash_code, ))
        results = cursor.fetchall()
        cursor.close()
        if results or self.isInHashList(hash_code):
            return True
        else:
            return False

    def isInHashList(self, hash_code):
        return hash_code in self.hashList

    def getHashCode(self, source_url):
        # 具体逻辑
        return EncryptUtil.md5(source_url)

    def getHtml(self, pageIndex):
        """
        获取所有html逻辑
        :return:
        """
        cursor = self.connector.cursor()
        if not cursor:
            return []
        sql_query = 'select id,content_html from tengxun_detail group by id limit %s, %s'
        cursor.execute(sql_query, ((pageIndex - 1) * 15, 15))
        results = cursor.fetchall()
        cursor.close()
        return results or []

    def getPostTime(self, pageIndex):
        """
        时间逻辑
        :return:
        """
        cursor = self.connector.cursor()
        if not cursor:
            return []
        sql_query = 'select id,post_date from tengxun_detail group by id limit %s, %s'
        cursor.execute(sql_query, ((pageIndex - 1) * 15, 15))
        results = cursor.fetchall()
        cursor.close()
        return results or []

    def updatePostTime(self, id, post_date):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update tengxun_detail set post_date=%s where id=%s"
        cursor.execute(sql_query, (post_date, id))
        cursor.close()
        self.connector.commit()

    def updateStyles(self, id, styles):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update tengxun_detail set styles=%s where id=%s"
        cursor.execute(sql_query, (styles, id))
        cursor.close()
        self.connector.commit()

    def updateHtml(self, id, content_html):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update tengxun_detail set content_html=%s where id=%s"
        cursor.execute(sql_query, (content_html, id))
        cursor.close()
        self.connector.commit()


# checkDao = CheckDao()
# pageIndex = 1
# while True:
#     results = checkDao.getPostTime(pageIndex)
#     if not len(results):
#         print 'end'
#         break
#     print 'pageIndex', pageIndex
#     for result in results:
#         id, post_date = result
#         print 'in: ', id, post_date
#         # post_date = post_date.replace(u'\xa0', u' ')
#         # 处理标签
#         try:
#             # post_date = time.strptime(post_date, "%Y-%m-%d %H:%M:%S")  # time.strftime("%Y-%m-%d %H:%M:%S", )
#             timeArray = time.strptime(str(post_date), "%Y-%m-%d %H:%M:%S")
#             update_time_long = int(time.mktime(timeArray))
#         except Exception as e:
#             print u'出错..........................................................................'
#             pass
#         print 'out: ', id, update_time_long
#         x = time.localtime(update_time_long)
#         print 'out: ', id, time.strftime('%Y-%m-%d %H:%M:%S', x)
#         # checkDao.updatePostTime(id, post_date)
#     pageIndex += 1
Esempio n. 4
0
class CheckDao(object):
    def __init__(self):
        self.connector = Connector()
        self.hashList = []  # 代表此次已经存在的hash,防止同一时间得到相同文章进行抓取

    def resetHashList(self):
        # 每次重新抓取的时候清除
        self.hashList = []

    def checkExist(self, source_url):
        """
        存在逻辑判断
        """
        hash_code = self.getHashCode(source_url)
        cursor = self.connector.cursor()
        if not cursor:
            return True
        sql_query = 'select id from sina_detail where hash_code=%s'
        cursor.execute(sql_query, (hash_code, ))
        results = cursor.fetchall()
        cursor.close()
        if results or self.isInHashList(hash_code):
            return True
        else:
            self.hashList.append(hash_code)
            return False

    def isInHashList(self, hash_code):
        return hash_code in self.hashList

    def getHashCode(self, source_url):
        # 具体逻辑
        return EncryptUtil.md5(source_url)

    def getAllHtml(self):
        """
        存在逻辑判断
        """
        cursor = self.connector.cursor()
        if not cursor:
            return []
        sql_query = 'select id,styles,content_html from sina_detail'
        cursor.execute(sql_query)
        results = cursor.fetchall()
        cursor.close()
        return results or []

    def update(self, id, styles, content_html):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update sina_detail set content_html=%s,styles=%s where id=%s"
        cursor.execute(sql_query, (content_html, styles, id))
        cursor.close()
        self.connector.commit()

    def getHtml(self, pageIndex):
        """
        存在逻辑判断
        """
        cursor = self.connector.cursor()
        if not cursor:
            return []
        sql_query = 'select id,content_html from sina_detail group by id limit %s, %s'
        cursor.execute(sql_query, ((pageIndex - 1) * 15, 15))
        results = cursor.fetchall()
        cursor.close()
        return results or []

    def updateStyles(self, id, styles):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update sina_detail set styles=%s where id=%s"
        cursor.execute(sql_query, (styles, id))
        cursor.close()
        self.connector.commit()

    def updateHtml(self, id, content_html):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update sina_detail set content_html=%s where id=%s"
        cursor.execute(sql_query, (content_html, id))
        cursor.close()
        self.connector.commit()
Esempio n. 5
0
class DataMonitorDao(object):
    def __init__(self):
        self.connector = Connector()

    def getAllHeartBeatTime(self, cursor_out=None):
        """
                :param cursor_out:
                :param types:
                    sina_heartbeat
                    tengxun_heartbeat
                    wangyi_heartbeat
                    weixin_heartbeat
                    weixin_source_heartbeat
                :return:
                得到心跳更新时间
                """
        if not cursor_out:
            cursor = self.connector.cursor()
            if not cursor:
                return
        else:
            cursor = cursor_out

        typeStr = " where type in ('sina_heartbeat','tengxun_heartbeat','wangyi_heartbeat','weixin_heartbeat','weixin_source_heartbeat')"

        sql_query = "select type, update_time from data_monitor " + typeStr
        try:
            cursor.execute(sql_query)
        except Exception as e:
            print e.msg
        results = cursor.fetchall()
        if not cursor_out:
            cursor.close()
            self.connector.close()
        return results or []

    def heartBeat(self, type=''):
        """
        :param type:
            sina_heartbeat
            tengxun_heartbeat
            wangyi_heartbeat
            weixin_heartbeat
            weixin_source_heartbeat
        :return:
        更新跳动时间
        """
        if not type:
            return
        cursor = self.connector.cursor()
        if not cursor:
            return
        info = u'跳动中'
        remark = ''
        if type == 'sina_heartbeat':
            remark = u'新浪_心跳1分钟更新一次'
        elif type == 'tengxun_heartbeat':
            remark = u'腾讯_心跳1分钟更新一次'
        elif type == 'wangyi_heartbeat':
            remark = u'网易_心跳1分钟更新一次'
        elif type == 'weixin_heartbeat':
            remark = u'微信_心跳1分钟更新一次'
        elif type == 'weixin_source_heartbeat':
            remark = u'微信源_心跳1分钟更新一次'
        # 检测是否存在type ,  update_time ,  info , remark, account
        if self.checkExist(cursor_out=cursor, type=type):
            update_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                        time.localtime(time.time()))
            sql_query = "update data_monitor set update_time=%s,info=%s,remark=%s  where type=%s"
            values = (update_time, info, remark, type)
        else:
            sql_query = "INSERT INTO data_monitor ( type, update_time, info, remark) VALUES (" \
                        "%s, %s, %s, %s); "
            update_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                        time.localtime(time.time()))
            values = (type, update_time, info, remark)
        print u'跳一下', update_time
        try:
            cursor.execute(sql_query, values)
        except Exception as e:
            print e.msg
        cursor.close()
        self.connector.commit()
        self.connector.close()

    def updateTotal(self, type, account=''):
        """
        :param account:
        :param type:
            sina_total
            tengxun_total
            wangyi_total
            weixin_total
            weixin_source_total
            weixin_account_total
        :return:
        更新总数
        """
        if not type:
            return

        cursor = self.connector.cursor()
        if not cursor:
            return

        count = self.getTotal(cursor_out=cursor, type=type, account=account)

        info = u'总数:' + str(count)
        remark = ''
        if type == 'sina_total':
            remark = u'新浪条数'
        elif type == 'tengxun_total':
            remark = u'腾讯条数'
        elif type == 'wangyi_total':
            remark = u'网易条数'
        elif type == 'weixin_total':
            remark = u'微信条数'
        elif type == 'weixin_source_total':
            remark = u'微信源条数'
        elif type == 'weixin_account_total':
            remark = u'微信源_对应账号条数'
        # 检测是否存在type ,  update_time ,  info , remark, account
        if self.checkExist(cursor_out=cursor, type=type, account=account):
            update_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                        time.localtime(time.time()))
            sql_query = "update data_monitor set update_time=%s,info=%s,remark=%s  where type=%s and account=%s"
            values = (update_time, info, remark, type, account)
        else:
            sql_query = "INSERT INTO data_monitor ( type, update_time, info, remark, account) VALUES (" \
                        "%s, %s, %s, %s, %s); "
            update_time = time.strftime('%Y-%m-%d %H:%M:%S',
                                        time.localtime(time.time()))
            values = (type, update_time, info, remark, account)
        try:
            cursor.execute(sql_query, values)
        except Exception as e:
            print e.msg
        cursor.close()
        self.connector.commit()

    def checkExist(self, cursor_out=None, type='', account=''):
        """
        :param cursor_out:
        :param account:
        :param type:
            sina_total
            tengxun_total
            wangyi_total
            weixin_total
            weixin_source_total
            weixin_account_total
        :return:
        得到总数
        """
        if not cursor_out:
            cursor = self.connector.cursor()
            if not cursor:
                return
        else:
            cursor = cursor_out

        accountWhere = ''
        if account:
            accountWhere = " and account='%s'" % account

        sql_query = "select * from data_monitor where type=%s" + accountWhere
        try:
            cursor.execute(sql_query, (type, ))
        except Exception as e:
            print e.msg
        results = cursor.fetchall()
        if not cursor_out:
            cursor.close()
        if results:
            return True
        else:
            return False

    def getTotal(self, cursor_out=None, type='', account=''):
        """
        :param cursor_out:
        :param account:
        :param type:
            sina_total
            tengxun_total
            wangyi_total
            weixin_total
            weixin_source_total
            weixin_account_total
        :return:
        得到总数
        """
        if not cursor_out:
            cursor = self.connector.cursor()
            if not cursor:
                return
        else:
            cursor = cursor_out

        table = ''
        accountWhere = ''

        if type == 'sina_total':
            table = 'sina_detail'
        elif type == 'tengxun_total':
            table = 'tengxun_detail'
        elif type == 'wangyi_total':
            table = 'wangyi_detail'
        elif type == 'weixin_total':
            table = 'weixin_detail'
        elif type == 'weixin_source_total':
            table = 'weixin_source'
        elif type == 'weixin_account_total':
            table = 'weixin_detail'
            if not account:
                return 0
            accountWhere = " where wx_account='%s'" % account

        if not table:
            return 0
        sql_query = "select count(*) from " + table + accountWhere
        try:
            cursor.execute(sql_query)
        except Exception as e:
            print e.msg
        results = cursor.fetchone()
        if not cursor_out:
            cursor.close()
        if results:
            (count, ) = results
            return count
        else:
            return 0
Esempio n. 6
0
class WxSourceDao(object):
    def __init__(self):
        self.connector = Connector()
        self.orderType = 'desc'  # 用于判断是升序还是降序

    def queryEnable(self, isRandom=False):
        cursor = self.connector.cursor()
        if not cursor:
            return []
        if self.orderType == "desc":
            self.orderType = "asc"
        else:
            self.orderType = "desc"
        # """
        #     select wx_name,wx_account,wx_url,wx_avatar,update_status,is_enable,update_time from weixin_source
        #     where is_enable='1'
        #     and
        #     (
        #         (update_status='last' and round((UNIX_TIMESTAMP(NOW())-UNIX_TIMESTAMP(update_time))/60)>20)
        #         or
        #         (update_status='updating' and round((UNIX_TIMESTAMP(NOW())-UNIX_TIMESTAMP(update_time))/60)>20)
        #         or update_status='updateFail'
        #         or update_status='none'
        #     )
        #     order by id
        # """
        # 可用的 且( 更新状态为last且时间大于20分钟/ 更新状态为updating且时间大于20分钟/更新状态为updating/更新状态为none)
        sql_query = "select id,wx_name,wx_account,wx_url,wx_avatar,update_status,is_enable,update_time from weixin_source " \
                    "where is_enable='1' and ((update_status='last' and round((UNIX_TIMESTAMP(NOW())-UNIX_TIMESTAMP(" \
                    "update_time))/60)>40) or (update_status='updating' and round((UNIX_TIMESTAMP(NOW())-UNIX_TIMESTAMP(" \
                    "update_time))/60)>40) or update_status='updateFail' or update_status='none') order by id "+self.orderType
        cursor.execute(sql_query)
        results = cursor.fetchall()
        cursor.close()
        results = results or []
        # print "sources长度", len(results)
        if isRandom and results:
            # 随机排序 防止出现都是请求同一个
            random.shuffle(results)
        return results

    def queryWxUrl_special(self, isRandom=False, wx_accounts=[]):
        """
        获取wxUrl有值,且是有效的
        :return:
        """
        cursor = self.connector.cursor()
        if not cursor:
            return []

        if self.orderType == "desc":
            self.orderType = "asc"
        else:
            self.orderType = "desc"

        accountStr = ''
        if len(wx_accounts):
            accountStr = ','.join(wx_accounts)
            accountStr = "and wx_account in ('%s')"%accountStr

        sql_query = "select id,wx_name,wx_account,wx_url,wx_avatar,update_status,is_enable,update_time from weixin_source " \
                    "where is_enable='1' and wx_url !='' "+accountStr+" order by id " + self.orderType
        cursor.execute(sql_query)
        results = cursor.fetchall()
        cursor.close()
        results = results or []
        if isRandom and results:
            # 随机排序 防止出现都是请求同一个
            random.shuffle(results)
        return results

    def queryWxUrl(self, isRandom=False):
        """
        获取wxUrl有值,且是有效的
        :return:
        """
        cursor = self.connector.cursor()
        if not cursor:
            return []

        if self.orderType == "desc":
            self.orderType = "asc"
        else:
            self.orderType = "desc"
        sql_query = "select id,wx_name,wx_account,wx_url,wx_avatar,update_status,is_enable,update_time from weixin_source " \
                    "where is_enable='1' and wx_url !='' order by id " + self.orderType
        cursor.execute(sql_query)
        results = cursor.fetchall()
        cursor.close()
        results = results or []
        if isRandom and results:
            # 随机排序 防止出现都是请求同一个
            random.shuffle(results)
        return results

    def updateStatus(self, wx_account, update_status):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update weixin_source set update_status=%s,update_time=%s where wx_account=%s"
        update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        cursor.execute(sql_query, (update_status, update_time, wx_account))
        cursor.close()
        self.connector.commit()

    def resetUpdating(self):
        """
        重置更新中的的为updateFail,如果出现网络问题,似乎无法回调到而是会一直retry,所以先尝试手动在外部重置
        """
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update weixin_source set update_status='updateFail',update_time=%s where update_status='updating'"
        update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        cursor.execute(sql_query, (update_time,))
        cursor.close()
        self.connector.commit()

    def updateSource(self, wx_account, wx_name, wx_url, update_status):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update weixin_source set wx_name=%s,wx_url=%s,update_status=%s,update_time=%s where " \
                    "wx_account=%s "
        update_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        cursor.execute(sql_query, (wx_name, wx_url, update_status, update_time, wx_account))
        cursor.close()
        self.connector.commit()
Esempio n. 7
0
class CheckDao(object):
    def __init__(self):
        self.connector = Connector()
        self.hashList = []  # 代表此次已经存在的hash,防止同一时间得到相同文章进行抓取

    def resetHashList(self):
        # 每次重新抓取的时候清除
        self.hashList = []

    def checkExist(self, source_url):
        """
        存在逻辑判断
        """
        hash_code = self.getHashCode(source_url)
        cursor = self.connector.cursor()
        if not cursor:
            return True
        sql_query = 'select id from wangyi_detail where hash_code=%s'
        cursor.execute(sql_query, (hash_code,))
        results = cursor.fetchall()
        cursor.close()
        if results or self.isInHashList(hash_code):
            return True
        else:
            self.hashList.append(hash_code)
            return False

    def isInHashList(self, hash_code):
        return hash_code in self.hashList

    def getHashCode(self, source_url):
        # 具体逻辑
        return EncryptUtil.md5(source_url)

    def getHtml(self, pageIndex):
        """
        存在逻辑判断
        """
        cursor = self.connector.cursor()
        if not cursor:
            return []
        sql_query = 'select id,content_html from wangyi_detail group by id limit %s, %s'
        cursor.execute(sql_query, ((pageIndex-1)*15, 15))
        results = cursor.fetchall()
        cursor.close()
        return results or []

    def updateStyles(self, id, styles):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update wangyi_detail set styles=%s where id=%s"
        cursor.execute(sql_query, (styles, id))
        cursor.close()
        self.connector.commit()

    def updateHtml(self, id, content_html):
        cursor = self.connector.cursor()
        if not cursor:
            return
        sql_query = "update wangyi_detail set content_html=%s where id=%s"
        cursor.execute(sql_query, (content_html, id))
        cursor.close()
        self.connector.commit()

# 去除alt和title内容
# checkDao = CheckDao()
# pageIndex = 1
# while True:
#     results = checkDao.getHtml(pageIndex)
#     if not len(results):
#         print 'end'
#         break
#     print 'pageIndex', pageIndex
#     for result in results:
#         id, content_html = result
#         # 处理标签
#         print id
#         selector = Selector(text=content_html)
#         imgAltTitles = selector.xpath('//img/@alt|//img/@title').extract()
#         # 处理提示块img的 alt title, 关注//img/@alt|//img/@title
#         print len(imgAltTitles)
#         for imgAltTitle in imgAltTitles:
#             if imgAltTitle.strip(' '):
#                 print 'here', imgAltTitle
#                 content_html = content_html.replace(imgAltTitle, '')
#         # 更新
#         checkDao.updateHtml(id, content_html)
#     pageIndex += 1