Ejemplo n.º 1
0
class ProxyDataStorePipeline(object):
    """Item pipeline that stores scraped proxies through one DB session.

    The session is created in ``open_spider``, items are staged on it as
    they arrive, and everything is committed once in ``close_spider``.
    """

    def open_spider(self, spider):
        """Create the database session used until ``close_spider``."""
        self.session = DBSession()

    def process_item(self, item, spider):
        """Re-enable a known proxy or stage a new ``Proxy`` row.

        Proxies are looked up by ``item['host']`` only. A proxy that is
        already stored is flagged as available again; otherwise a new row
        is added to the session (it is not committed here).

        :param item: mapping with ``host``, ``port`` and ``protocol`` keys.
        :param spider: the running spider (unused).
        :returns: ``item`` unchanged.
        """
        now = datetime.datetime.now()
        host = item['host']
        existing = self.session.query(Proxy).filter(
            Proxy.host == host).first()
        if existing is not None:
            if not existing.available:
                existing.available = True
                # The row really changes here, so keep its modification
                # timestamp in step with the flag.
                existing.modify_time = now
            return item

        proxy = Proxy(host=host,
                      port=item['port'],
                      create_time=now,
                      modify_time=now,
                      available=True)
        # Anything that is not explicitly HTTP is stored as HTTPS.
        if item['protocol'].upper() == ProtocolChoice.HTTP:
            proxy.protocol = ProtocolChoice.HTTP
        else:
            proxy.protocol = ProtocolChoice.HTTPS
        self.session.add(proxy)
        return item

    def close_spider(self, spider):
        """Commit everything staged during the run, then close the session.

        A failed commit is logged and rolled back; the session is closed
        in either case.
        """
        try:
            self.session.commit()
        except Exception as e:
            logger.exception(e)
            self.session.rollback()
        finally:
            self.session.close()
Ejemplo n.º 2
0
class ArticleDataStorePipeline(DataStorePipelineBase):
    """Item pipeline that stores articles with their author and column.

    Two sessions are used. ``self.session`` is not created in this class
    (presumably ``DataStorePipelineBase`` sets it up -- confirm); it
    collects articles and is committed after every 10 stored articles.
    ``self.tmp_session`` commits users and columns immediately, because
    their ids are read right after creation to link the article.
    """

    # Last id handed out by get_id(); 0 means "not primed yet".
    user_cache_count = 0
    column_cache_count = 0

    def __init__(self):
        super(ArticleDataStorePipeline, self).__init__()
        self.tmp_session = DBSession()

    def close_spider(self, spider):
        """Commit pending articles and close both sessions.

        A failed commit is rolled back and logged; both sessions are
        closed in either case.
        """
        try:
            self.session.commit()
            self.session._unique_cache = None
        except Exception as e:
            self.session.rollback()
            logger.exception(e)
        finally:
            self.session.close()
            self.tmp_session.close()

    def get_id(self, model, id_type=1):
        """Return the next id for ``model`` from an in-memory counter.

        ``id_type == 1`` uses the user counter, any other value the column
        counter. While a counter is 0 it is primed from the highest
        ``model.id`` found through ``self.session``; an empty table
        starts the sequence at 1.
        """
        if id_type == 1:
            counter = 'user_cache_count'
        else:
            counter = 'column_cache_count'
        current = getattr(self, counter)
        if current == 0:
            row = self.session.query(model.id).order_by(
                model.id.desc()).first()
            # first() returns None when the table has no rows yet.
            current = row[0] if row is not None else 0
        current += 1
        setattr(self, counter, current)
        return current

    def check_column_exist(self, md5):
        """Return the ``ZHColumn`` whose ``hash`` equals ``md5``, else False."""
        exist = self.session.query(ZHColumn).filter(
            ZHColumn.hash == md5).first()
        return exist if exist else False

    def check_user_exist(self, md5):
        """Return the ``ZHUser`` whose ``slug`` equals ``md5``, else False.

        Despite the parameter name, the value is compared against
        ``ZHUser.slug`` (callers in this class pass a slug).
        """
        exist = self.session.query(ZHUser).filter(ZHUser.slug == md5).first()
        return exist if exist else False

    def _commit_now(self, obj):
        """Add ``obj`` to ``tmp_session`` and commit it immediately.

        On failure the session is rolled back before the error is
        re-raised, so one bad row does not leave ``tmp_session`` unusable
        for every following item.
        """
        try:
            self.tmp_session.add(obj)
            self.tmp_session.commit()
        except Exception:
            self.tmp_session.rollback()
            raise
        return obj

    def create_column(self, item, creator_id=None):
        """Create and immediately commit a ``ZHColumn`` built from ``item``.

        The column slug is also added to the ``total_column`` redis set.

        :param item: mapping with ``name``, ``link``, ``hash``, ``slug``,
            ``description`` and ``avatar`` keys.
        :param creator_id: id of the user who created the column.
        :returns: the stored ``ZHColumn``.
        """
        self.get_now()
        column = ZHColumn(name=item['name'],
                          link=item['link'],
                          hash=item['hash'],
                          slug=item['slug'],
                          description=item['description'],
                          avatar=item['avatar'],
                          creator_id=creator_id,
                          create_time=self.now,
                          modify_time=self.now)
        self._commit_now(column)
        self.redis.sadd('total_column', column.slug)
        return column

    def create_user(self, item):
        """Create and immediately commit a ``ZHUser`` built from ``item``.

        :param item: mapping with ``zuid``, ``name``, ``link``, ``hash``,
            ``slug``, ``description``, ``headline`` and ``avatar`` keys.
        :returns: the stored ``ZHUser``.
        """
        self.get_now()
        user = ZHUser(zuid=item['zuid'],
                      name=item['name'],
                      link=item['link'],
                      hash=item['hash'],
                      slug=item['slug'],
                      description=item['description'],
                      headline=item['headline'],
                      avatar=item['avatar'],
                      create_time=self.now,
                      modify_time=self.now)
        return self._commit_now(user)

    def fix_image(self, item):
        """Rewrite image sources and article links in ``item['content']``.

        * Each ``<img>`` ``src`` is prefixed with one of the
          ``pic1``..``pic4`` ``zhimg.com`` hosts, picked at random.
          Images without a ``src`` attribute are left alone.
        * When ``item['cover']`` is empty, the first rewritten image
          source becomes the cover.
        * Links whose ``href`` contains ``/p/<digits>`` are pointed at
          ``https://www.wznav.com/article/<digits>/``.

        :returns: ``item`` with ``content`` (and possibly ``cover``)
            updated in place.
        """
        soup = BeautifulSoup(item['content'], 'lxml')

        sources = []
        for img in soup.find_all('img'):
            src = img.get('src')
            if not src:
                # Nothing to rewrite; indexing img['src'] would raise.
                continue
            img['src'] = 'https://pic{0}.zhimg.com/{1}'.format(
                random.randint(1, 4), src)
            sources.append(img['src'])
        if not item['cover'] and sources:
            item['cover'] = sources[0]

        for anchor in soup.find_all('a'):
            match = re.search(r'/p/([0-9]+)', anchor.get('href', ''))
            if match:
                anchor['href'] = 'https://www.wznav.com/article/{0}/'.format(
                    match.group(1))

        item['content'] = soup.prettify()
        return item

    def create_article(self, item, author_id, column_id):
        """Build the article for ``item`` through ``ZHArticle.as_unique``.

        The content is passed through ``fix_image`` first. Timestamps are
        parsed with a fixed ``+08:00`` suffix.

        :returns: the ``(article, new)`` pair returned by ``as_unique``.
        """
        time_format = '%Y-%m-%dT%H:%M:%S+08:00'
        item = self.fix_image(item)
        article, new = ZHArticle.as_unique(
            self.session,
            title=item['title'],
            content=item['content'],
            cover=item['cover'],
            md5=item['md5'],
            link=item['link'],
            token=item['token'],
            summary=item['summary'],
            keywords='',
            create_time=datetime.datetime.strptime(item['create_time'],
                                                   time_format),
            modify_time=datetime.datetime.strptime(item['modify_time'],
                                                   time_format),
            author_id=author_id,
            belong_id=column_id)
        return article, new

    def process_item(self, item, spider):
        """Store the item's author, column creator, column and article.

        Missing users and the column are created on the fly. The batch
        counter is only advanced for articles that are new.

        :returns: ``item`` unchanged.
        :raises DropItem: when the article already exists.
        """
        author = self.check_user_exist(item.author['slug'])
        if not author:
            author = self.create_user(item.author)
        if author.slug == item.creator['slug']:
            creator = author
        else:
            creator = self.check_user_exist(item.creator['slug'])
            if not creator:
                creator = self.create_user(item.creator)
        column = self.check_column_exist(item.column['hash'])
        if not column:
            column = self.create_column(item.column, creator.id)
        article, new = self.create_article(item.article, author.id, column.id)
        if not new:
            raise DropItem('Article item {0} already exist'.format(
                item.article['title']))
        self.periodic_commit()
        return item

    def _dispatch_new_article_tasks(self):
        """Queue keyword generation and URL notification for cached articles."""
        cache = getattr(self.session, '_unique_cache', None) or {}
        links = []
        for itm in cache.values():
            # NOTE(review): this runs after commit(), by which point stored
            # rows normally already carry an id, so this filter may never
            # match -- confirm against how as_unique fills the cache.
            if not itm.id:
                generate_keywords_task.apply_async((itm.md5, ),
                                                   countdown=5)
                links.append('https://www.wznav.com/article/{0}\n'.format(
                    itm.token))
        if links:
            notify_baidu_new_url.apply_async((''.join(links), ))

    def periodic_commit(self):
        """Commit ``self.session`` after every 10th stored article.

        On the 10th call the counters are reset and the session is
        committed; a failed commit is logged and rolled back. After a
        successful commit the follow-up tasks are queued and the session's
        unique-object cache is cleared, even if queuing fails.
        """
        self.count += 1
        if self.count != 10:
            return

        logger.info('Periodic commit to database')
        self.count = 0
        self.user_cache_count = 0
        self.column_cache_count = 0
        try:
            self.session.commit()
        except Exception as e:
            logger.exception(e)
            self.session.rollback()
            return

        # The data is already committed here, so a failure to queue tasks
        # is only logged; there is nothing left to roll back.
        try:
            self._dispatch_new_article_tasks()
        except Exception as e:
            logger.exception(e)
        finally:
            self.session._unique_cache = None