Exemple #1
0
class ToutiaoPipeline(object):
    """Persist Toutiao list-page items into the bb_toutiao_sources table."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = 'bb_toutiao_sources'
        self.item = None

    def process_item(self, item, spider):
        """Insert one row per crawled URL.

        ``item`` is expected to carry parallel lists under the keys
        'title', 'url', 'share_num' and 'rss_num'; entry i of each list
        describes the same article.  Always returns True so the pipeline
        chain keeps running.
        """
        if not item:
            logging.info('-----------------------list page repeat : %s' % item)
            return True

        # One timestamp for the whole batch; both columns intentionally
        # share the crawl time.
        now = int(time.time())
        public_time = now
        create_time = now

        # enumerate() replaces the original xrange(0, len(...)) index
        # loop; it is idiomatic and works on both Python 2 and 3.
        for i, url in enumerate(item['url']):
            insertData = {
                'title': item['title'][i],
                'url': url,
                'unique_code': toMd5(url),
                'share_num': item['share_num'][i],
                'rss_num': item['rss_num'][i],
                'public_time': public_time,
                'create_time': create_time
            }
            self.db.insert(self.tableName, insertData)

        return True
Exemple #2
0
class ToutiaoPipeline(object):
    """Persist Toutiao list-page items into the bb_toutiao_sources table."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = 'bb_toutiao_sources'
        self.item = None

    def process_item(self, item, spider):
        """Insert one row per crawled URL.

        ``item`` is expected to carry parallel lists under the keys
        'title', 'url', 'share_num' and 'rss_num'; entry i of each list
        describes the same article.  Always returns True so the pipeline
        chain keeps running.
        """
        if not item:
            logging.info('-----------------------list page repeat : %s' % item)
            return True

        # One timestamp for the whole batch; both columns intentionally
        # share the crawl time.
        now = int(time.time())
        public_time = now
        create_time = now

        # enumerate() replaces the original xrange(0, len(...)) index
        # loop; it is idiomatic and works on both Python 2 and 3.
        for i, url in enumerate(item['url']):
            insertData = {
                'title': item['title'][i],
                'url': url,
                'unique_code': toMd5(url),
                'share_num': item['share_num'][i],
                'rss_num': item['rss_num'][i],
                'public_time': public_time,
                'create_time': create_time
            }
            self.db.insert(self.tableName, insertData)

        return True
Exemple #3
0
class RssPipeline(object):
    """Persist RSS items, optionally de-duplicating via a remote service."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item):
        """Filter duplicates out of ``item`` and insert the remaining rows.

        ``item`` is treated as a dict keyed by unique code.  Always
        returns True so the pipeline chain keeps running.
        """
        if not item:
            logging.info('------------page not crawl data ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        for index in insertDataList:
            self.db.insert(self.tableName, insertDataList[index])

        return True

    def filterAndPackageDgrate(self):
        """Drop entries the dedup service reports as already seen.

        Returns ``self.item`` unchanged when OPEN_REDIS_DISTINCT is off,
        otherwise with duplicate keys removed in place.
        """
        if not OPEN_REDIS_DISTINCT:
            return self.item

        # Snapshot the keys with list(): dict.keys() is a live view on
        # Python 3 and must not be kept across the deletions below.
        uniqueCodeList = list(self.item.keys())
        repeatUniqueCode = requstDistinct(uniqueCodeList)
        logging.info('------------distinct before : %s ' % uniqueCodeList)
        # The original enumerate() index was unused; iterate directly.
        for unique in repeatUniqueCode:
            del self.item[unique]
        logging.info('------------distinct after : %s ' % self.item.keys())
        return self.item
Exemple #4
0
class RssPipeline(object):
    """Persist RSS items, optionally de-duplicating via a remote service."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item):
        """Filter duplicates out of ``item`` and insert the remaining rows.

        ``item`` is treated as a dict keyed by unique code.  Always
        returns True so the pipeline chain keeps running.
        """
        if not item:
            logging.info('------------page not crawl data ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        for index in insertDataList:
            self.db.insert(self.tableName, insertDataList[index])

        return True

    def filterAndPackageDgrate(self):
        """Drop entries the dedup service reports as already seen.

        Returns ``self.item`` unchanged when OPEN_REDIS_DISTINCT is off,
        otherwise with duplicate keys removed in place.
        """
        if not OPEN_REDIS_DISTINCT:
            return self.item

        # Snapshot the keys with list(): dict.keys() is a live view on
        # Python 3 and must not be kept across the deletions below.
        uniqueCodeList = list(self.item.keys())
        repeatUniqueCode = requstDistinct(uniqueCodeList)
        logging.info('------------distinct before : %s ' % uniqueCodeList)
        # The original enumerate() index was unused; iterate directly.
        for unique in repeatUniqueCode:
            del self.item[unique]
        logging.info('------------distinct after : %s ' % self.item.keys())
        return self.item
Exemple #5
0
class CommonCrawlPipeline(object):
    """Persist articles produced by the generic crawl spider."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Insert one article row; items without a description are skipped.

        Always returns True so the pipeline chain keeps running.
        """
        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        # Check the description first so no serialization work is done
        # for items that are discarded anyway (the original ran
        # json.dumps before this early return).
        description = item['description']
        if not description:
            return True

        rule_id = item['rule_id']
        # Both timestamp columns get the crawl time.
        public_time = int(time.time())
        create_time = int(time.time())
        img_url = json.dumps(item['img_url'])

        # Truncate the title to 255 characters (not bytes) before storing.
        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': rule_id,
            'title': title,
            'description': description,
            'img_url': img_url,
            'public_time': public_time,
            'create_time': create_time
        }
        self.db.insert(self.tableName, insertData)
        return True
Exemple #6
0
class CommonCrawlPipeline(object):
    """Persist articles produced by the generic crawl spider."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Insert one article row; items without a description are skipped.

        Always returns True so the pipeline chain keeps running.
        """
        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        # Check the description first so no serialization work is done
        # for items that are discarded anyway (the original ran
        # json.dumps before this early return).
        description = item['description']
        if not description:
            return True

        rule_id = item['rule_id']
        # Both timestamp columns get the crawl time.
        public_time = int(time.time())
        create_time = int(time.time())
        img_url = json.dumps(item['img_url'])

        # Truncate the title to 255 characters (not bytes) before storing.
        title = item['title'].decode('utf8')[0:255].encode('utf8')
        insertData = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': rule_id,
            'title': title,
            'description': description,
            'img_url': img_url,
            'public_time': public_time,
            'create_time': create_time
        }
        self.db.insert(self.tableName, insertData)
        return True
Exemple #7
0
class CrawlPipeline(object):
    """Store crawled articles, overwriting the stored row on duplicates."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Insert the item; if it already exists, update the existing row.

        Items with neither description nor content are dropped.  Always
        returns True so the pipeline chain keeps running.
        """
        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        create_time = int(time.time())
        img_url = json.dumps(item['img_url'])
        # De Morgan form of the original "(not a) and (not b)" test.
        if not (item['description'] or item['content']):
            return True

        # Truncate the title to 255 characters (not bytes) before storing.
        title = item['title'].decode('utf8')[0:255].encode('utf8')
        row = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': item['rule_id'],
            'title': title,
            'description': item['description'],
            'content': item['content'],
            'img_url': img_url,
            'source_score': item['source_score'],
            'is_sync': '0',
            'public_time': item['public_time'],
            'create_time': create_time
        }
        inserted = self.db.insert(self.tableName, row)
        if not inserted and spider.is_duplicate:
            # NOTE(review): the WHERE clause is built by concatenation;
            # unique_code is an md5 hex digest so it cannot escape the
            # quotes, but a parameterized condition would be safer.
            condition = "unique_code = '" + row['unique_code'] + "'"
            self.db.update(self.tableName, row, condition)
            logging.info('========update.unique_code : %s' %
                         row['unique_code'])

        return True
Exemple #8
0
class CrawlPipeline(object):
    """Store crawled articles, overwriting the stored row on duplicates."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Insert the item; if it already exists, update the existing row.

        Items with neither description nor content are dropped.  Always
        returns True so the pipeline chain keeps running.
        """
        if not item:
            logging.info('--------item is empty : %s' % item)
            return True

        create_time = int(time.time())
        img_url = json.dumps(item['img_url'])
        # De Morgan form of the original "(not a) and (not b)" test.
        if not (item['description'] or item['content']):
            return True

        # Truncate the title to 255 characters (not bytes) before storing.
        title = item['title'].decode('utf8')[0:255].encode('utf8')
        row = {
            'source_url': item['source_url'],
            'unique_code': toMd5(item['source_url']),
            'rule_id': item['rule_id'],
            'title': title,
            'description': item['description'],
            'content': item['content'],
            'img_url': img_url,
            'source_score': item['source_score'],
            'is_sync': '0',
            'public_time': item['public_time'],
            'create_time': create_time
        }
        inserted = self.db.insert(self.tableName, row)
        if not inserted and spider.is_duplicate:
            # NOTE(review): the WHERE clause is built by concatenation;
            # unique_code is an md5 hex digest so it cannot escape the
            # quotes, but a parameterized condition would be safer.
            condition = "unique_code = '" + row['unique_code'] + "'"
            self.db.update(self.tableName, row, condition)
            logging.info('========update.unique_code : %s' %
                         row['unique_code'])

        return True
Exemple #9
0
class XmlFeedPipeline(object):
    """Persist items parsed from XML/RSS feeds, with optional dedup."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Package the multi-entry feed item and insert one row per entry.

        Always returns True so the pipeline chain keeps running.
        """
        if not item:
            logging.info('-----------------------list page repeat ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        for index in insertDataList:
            self.db.insert(self.tableName, insertDataList[index])

        return True

    def filterAndPackageDgrate(self):
        """Build rows keyed by unique code and drop known duplicates.

        The item holds parallel lists ('title', 'source_url', 'img_url',
        'description'); entries without a description are skipped.
        Returns a dict mapping unique code -> row dict.
        """
        uniqueCodeList = []
        insertData = {}
        item = self.item

        rule_id = item['rule_id']
        # One timestamp for the whole batch; both columns intentionally
        # share the crawl time.
        now = int(time.time())
        public_time = now
        create_time = now

        for index, title in enumerate(item['title']):

            uniqueCode = toMd5(item['source_url'][index])
            # img_url and description lists may be shorter than the
            # title list, so guard each index before using it.
            if index < len(item['img_url']) and item['img_url'][index]:
                img_url = json.dumps(item['img_url'][index])
            else:
                img_url = ''

            if index < len(item['description']) and item['description'][index]:
                description = item['description'][index]
            else:
                # A row without a description is useless; skip it.
                continue

            # Truncate the title to 255 characters (not bytes).
            title = title.decode('utf8')[0:255].encode('utf8')
            uniqueCodeList.append(uniqueCode)
            insertData[uniqueCode] = {
                'source_url': item['source_url'][index],
                'unique_code': uniqueCode,
                'rule_id': rule_id,
                'title': title,
                'description': description,
                'img_url': img_url,
                'public_time': public_time,
                'create_time': create_time
            }

        if uniqueCodeList and OPEN_REDIS_DISTINCT:
            repeatUniqueCode = requstDistinct(uniqueCodeList)
            # The original enumerate() index was unused; iterate directly.
            for unique in repeatUniqueCode:
                del insertData[unique]

        return insertData
Exemple #10
0
class XmlFeedPipeline(object):
    """Persist items parsed from XML/RSS feeds, with optional dedup."""

    def __init__(self):
        # Mysql and the db_* settings are module-level names defined
        # elsewhere in this file.
        config = {'host': db_host, 'user': db_user, 'passwd': db_password}
        database = db_name
        self.db = Mysql(config, database)
        self.tableName = db_table_name
        self.item = None

    def process_item(self, item, spider):
        """Package the multi-entry feed item and insert one row per entry.

        Always returns True so the pipeline chain keeps running.
        """
        if not item:
            logging.info('-----------------------list page repeat ')
            return True

        self.item = item
        insertDataList = self.filterAndPackageDgrate()
        for index in insertDataList:
            self.db.insert(self.tableName, insertDataList[index])

        return True

    def filterAndPackageDgrate(self):
        """Build rows keyed by unique code and drop known duplicates.

        The item holds parallel lists ('title', 'source_url', 'img_url',
        'description'); entries without a description are skipped.
        Returns a dict mapping unique code -> row dict.
        """
        uniqueCodeList = []
        insertData = {}
        item = self.item

        rule_id = item['rule_id']
        # One timestamp for the whole batch; both columns intentionally
        # share the crawl time.
        now = int(time.time())
        public_time = now
        create_time = now

        for index, title in enumerate(item['title']):

            uniqueCode = toMd5(item['source_url'][index])
            # img_url and description lists may be shorter than the
            # title list, so guard each index before using it.
            if index < len(item['img_url']) and item['img_url'][index]:
                img_url = json.dumps(item['img_url'][index])
            else:
                img_url = ''

            if index < len(item['description']) and item['description'][index]:
                description = item['description'][index]
            else:
                # A row without a description is useless; skip it.
                continue

            # Truncate the title to 255 characters (not bytes).
            title = title.decode('utf8')[0:255].encode('utf8')
            uniqueCodeList.append(uniqueCode)
            insertData[uniqueCode] = {
                'source_url': item['source_url'][index],
                'unique_code': uniqueCode,
                'rule_id': rule_id,
                'title': title,
                'description': description,
                'img_url': img_url,
                'public_time': public_time,
                'create_time': create_time
            }

        if uniqueCodeList and OPEN_REDIS_DISTINCT:
            repeatUniqueCode = requstDistinct(uniqueCodeList)
            # The original enumerate() index was unused; iterate directly.
            for unique in repeatUniqueCode:
                del insertData[unique]

        return insertData