Example #1
def iter_all_data(exist=None):
    """Walk every ZHArticle in pages of `limit` rows and generate keywords
    for each one, optionally resuming after `exist` already-processed rows."""
    import math
    import logging
    from sqlalchemy import func
    from lg_data.db.models import ZHArticle, DBSession
    from lg_data.queue.utils import generate_keywords
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    if exist:
        # Resume from the page that contains the `exist`-th row.
        start = exist // limit
        count = start * limit - 1
    else:
        start = 0  # begin with the first page (offset 0)
        count = 0
    for i in xrange(start, total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by(ZHArticle.id).limit(
            limit).offset(offset).all()

        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            generate_keywords(article)
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('generate keywords done, fail: {0}'.format(fail_list))
def fix_image_in_article():
    """Clear the placeholder cover ('/s/image/default.jpg') on every row."""
    import math
    import logging
    from sqlalchemy import func
    from lg_data.db.models import ZHArticle, DBSession
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    count = 0
    for i in xrange(total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by(ZHArticle.id).limit(
            limit).offset(offset).all()

        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            if article.cover == '/s/image/default.jpg':
                article.cover = ''
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('fix image done, fail: {0}'.format(fail_list))
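Both loops above page with LIMIT/OFFSET, so the database re-scans every skipped row as the offset grows. A keyset variant that filters on the last seen id avoids that; a minimal sketch against the same models, assuming ZHArticle.id is an ordered integer key:

def iter_all_data_keyset():
    import logging
    from lg_data.db.models import ZHArticle, DBSession
    from lg_data.queue.utils import generate_keywords
    session = DBSession()
    last_id = 0
    while True:
        batch = (session.query(ZHArticle)
                 .filter(ZHArticle.id > last_id)
                 .order_by(ZHArticle.id)
                 .limit(1000)
                 .all())
        if not batch:
            break
        for article in batch:
            generate_keywords(article)
            last_id = article.id
        session.commit()  # one commit per page instead of per row
        logging.info('processed up to id {0}'.format(last_id))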
Example #3
class ProxyDataStorePipeline(object):
    """Upsert crawled proxies by host and commit them in one batch when the
    spider closes. Assumes module-level imports of datetime, DBSession,
    Proxy, ProtocolChoice, and a configured logger."""

    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        now = datetime.datetime.now()
        host = item['host']
        exist_proxy = self.session.query(Proxy).filter(
            Proxy.host == host).first()
        if exist_proxy:
            exist_proxy.available = True
        else:
            proxy = Proxy(host=item['host'],
                          port=item['port'],
                          create_time=now,
                          modify_time=now,
                          available=True)
            if item['protocol'].upper() == ProtocolChoice.HTTP:
                proxy.protocol = ProtocolChoice.HTTP
            else:
                proxy.protocol = ProtocolChoice.HTTPS
            self.session.add(proxy)
        return item

    def close_spider(self, spider):
        try:
            self.session.commit()
        except Exception as e:
            logger.exception(e)
            self.session.rollback()
        finally:
            self.session.close()
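A pipeline like this is enabled per spider through ITEM_PIPELINES. A minimal sketch of a spider wiring it in, mirroring the Shadow.pipelines paths used by the spiders later on this page (the spider itself is hypothetical):

import scrapy

class ProxySpider(scrapy.Spider):
    name = 'proxy'
    custom_settings = {
        'ITEM_PIPELINES': {
            'Shadow.pipelines.ProxyDataStorePipeline': 300,
        },
    }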
Example #4
def generate_keywords_task(token):
    """Generate keywords for the single article whose md5 matches `token`."""
    from lg_data.queue.utils import generate_keywords
    from lg_data.db.models import ZHArticle, DBSession

    session = DBSession()
    article = session.query(ZHArticle).filter(ZHArticle.md5 == token).first()
    if not article:
        return False
    generate_keywords(article)
    session.commit()
    return True
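A direct call for a single article; the token here is a made-up md5 hex digest, which in the project would come from whatever enqueues the task:

if __name__ == '__main__':
    ok = generate_keywords_task('9a0364b9e99bb480dd25e1f0284c8555')
    print('generated' if ok else 'article not found')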
Example #5
def fix_image_in_article(exist=None):
    """Rewrite <img> sources to absolute picN.zhimg.com URLs and backfill
    missing covers, optionally resuming after `exist` processed rows."""
    import math
    import random
    import logging
    from sqlalchemy import func
    from lg_data.db.models import ZHArticle, DBSession
    from bs4 import BeautifulSoup
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    if exist:
        # Resume from the page that contains the `exist`-th row.
        start = exist // limit
        count = start * limit - 1
    else:
        start = 0  # begin with the first page (offset 0)
        count = 0
    for i in xrange(start, total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by(ZHArticle.id).limit(
            limit).offset(offset).all()

        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            soup = BeautifulSoup(article.content, 'html.parser')
            finds = soup.find_all('img')
            for itm in finds:
                # Spread image loads across the picN.zhimg.com mirrors.
                host_random = random.randint(1, 4)
                itm['src'] = 'https://pic{0}.zhimg.com/{1}'.format(
                    host_random, itm['src'])
            if not article.cover and finds:
                article.cover = finds[0]['src']
            article.content = soup.prettify()
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('fix image done, fail: {0}'.format(fail_list))
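The rewrite prefixes whatever is already in src, so it assumes the stored content carries bare image names rather than full URLs. A quick self-contained check of the transformation (the fragment is made up):

import random
from bs4 import BeautifulSoup

html = '<p><img src="v2-abc123_r.jpg"/></p>'
soup = BeautifulSoup(html, 'html.parser')
for img in soup.find_all('img'):
    img['src'] = 'https://pic{0}.zhimg.com/{1}'.format(
        random.randint(1, 4), img['src'])
print(soup.prettify())
# <img src="https://pic3.zhimg.com/v2-abc123_r.jpg"/> (mirror number varies)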
Example #6
class RedisCachePaginator(Paginator):
    """Django Paginator that caches the expensive total count (Redis, per
    the name) and computes it via SQLAlchemy on a cache miss. Assumes
    module-level imports of Paginator, cache, func, models, ZHArticle,
    and DBSession."""

    def __init__(self,
                 object_list,
                 per_page,
                 orphans=0,
                 allow_empty_first_page=True,
                 *args,
                 **kwargs):
        self.session = DBSession()
        super(RedisCachePaginator,
              self).__init__(object_list, per_page, orphans,
                             allow_empty_first_page)

    def _get_count(self):
        """
        Returns the total number of objects, across all pages.
        """
        if self._count is None:
            count = cache.get('article_count')
            if count is not None:  # a cached count of 0 must not look like a miss
                self._count = count
                return self._count
            q = self.session.query(ZHArticle)
            self._count = self.get_count_from_db(q)
            cache.set('article_count', self._count, 60 * 60 * 6)
        return self._count

    count = property(_get_count)

    def page(self, number):
        """
        Returns a Page object for the given 1-based page number.
        """
        number = self.validate_number(number)
        bottom = (number - 1) * self.per_page
        top = bottom + self.per_page
        if top + self.orphans >= self.count:
            top = self.count
        self.object_list = models.ZHArticle.objects.raw(
            '{0} OFFSET {1} LIMIT {2}'.format(self.object_list.raw_query,
                                              bottom, top - bottom))
        return self._get_page(self.object_list, number, self)

    def get_count_from_db(self, q):
        count_q = q.statement.with_only_columns([func.count()])
        count = q.session.execute(count_q).scalar()
        return count
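page() reads self.object_list.raw_query, so the paginator expects a raw Django queryset. A minimal usage sketch; the table name in the raw SQL is hypothetical:

paginator = RedisCachePaginator(
    models.ZHArticle.objects.raw('SELECT * FROM zh_article ORDER BY id'),
    per_page=20)
first_page = paginator.page(1)  # total count comes from the cache when warm
for article in first_page.object_list:
    print(article.title)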
Example #7
class ProxyMiddleware(object):
    """Downloader middleware that assigns a proxy to outgoing requests."""
    session = None
    proxies = None

    def get_proxy(self):
        # Load the proxy list once and cache it on the class.
        self.session = DBSession()
        if not self.proxies:
            self.proxies = self.session.query(Proxy).all()
        return random.choice(self.proxies)

    def process_request(self, request, spider):
        # proxy = self.get_proxy()
        # if proxy.protocol == ProtocolChoice.HTTP:
        #     request.meta['proxy'] = "http://{host}:{port}".format(host=proxy.host, port=proxy.port)
        # else:
        #     request.meta['proxy'] = "https://{host}:{port}".format(host=proxy.host, port=proxy.port)
        request.meta['proxy'] = "https://10.4.18.169:3128"
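process_request above pins a single hard-coded proxy; the commented-out lines show the intended per-request selection. A sketch of that path, reusing get_proxy():

    def process_request(self, request, spider):
        proxy = self.get_proxy()
        scheme = 'http' if proxy.protocol == ProtocolChoice.HTTP else 'https'
        request.meta['proxy'] = '{0}://{1}:{2}'.format(
            scheme, proxy.host, proxy.port)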
Example #8
class ZHPeopleFollowsSpider(scrapy.Spider):
    """Crawl follower/followee lists for queued ZHUser rows via Zhihu's
    members API."""
    name = 'follow'
    host = 'https://www.zhihu.com/'
    start_urls = ['https://zhuanlan.zhihu.com/p/20580194']
    user_follower_api = 'https://www.zhihu.com/api/v4/members/{slug}/followers?limit=20&offset={offset}'
    user_followee_api = 'https://www.zhihu.com/api/v4/members/{slug}/followees?limit=20&offset={offset}'
    response = None
    headers = {}

    custom_settings = {
        'ITEM_PIPELINES': {
            # 'Shadow.pipelines.CheckAvailablePipeline': 200,
            'Shadow.pipelines.UserStorePipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'Shadow.middlewares.UserAgentMiddleware': 1,
            # 'Shadow.middlewares.ProxyMiddleware': 2,

        },
        'COOKIES_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'DOWNLOAD_DELAY': 3,
        'CONCURRENT_REQUESTS': 1
    }

    def __init__(self, *args, **kwargs):
        self.session = DBSession()
        self.user = None
        # self.user = self.session.query(ZHUser).filter(ZHUser.crawl_follow == False).first()
        # if not self.user:
        #     raise CloseSpider('No available user follow to crawl, spider exit')
        # self.user.crawl_follow = True
        # self.session.commit()
        super(ZHPeopleFollowsSpider, self).__init__(*args, **kwargs)

    def fetch_obj(self):
        self.user = self.session.query(ZHUser).filter(ZHUser.crawl_follow == False).first()
        return self.user

    def modify_obj(self):
        if self.user:
            self.user.crawl_follow = True
            self.session.commit()
        return self.user

    def start_requests(self):
        # Re-crawl the seed page once for every user whose follows are
        # still uncrawled; fetch_obj() advances through them.
        while 1:
            url = self.start_urls[0]
            if self.fetch_obj():
                yield self.make_requests_from_url(url)
            else:
                break
        raise CloseSpider('No available user item to crawl follows')

    def get_client_config(self, response):
        matches = re.findall(r'<textarea id="clientConfig" hidden="">(.*?)</textarea>', response.body)
        html_parser = HTMLParser.HTMLParser()
        unescape_data = html_parser.unescape(matches[0])
        data = json.loads(unescape_data)
        return data

    def parse(self, response):
        data = self.get_client_config(response)
        tokens = data.get('tokens')
        headers = response.headers
        headers['referer'] = response.url
        headers['authorization'] = tokens.get('Authorization')
        headers['x-xsrf-token'] = tokens.get('X-XSRF-TOKEN')
        self.headers = headers
        url = self.user_follower_api.format(slug=self.user.slug, offset=0)
        yield Request(url, callback=self.parse_follow, headers=self.headers)
        url = self.user_followee_api.format(slug=self.user.slug, offset=0)
        yield Request(url, callback=self.parse_follow, headers=headers)
        self.modify_obj()
        # self.session.close()

    def parse_follow(self, response):
        data = json.loads(response.body)
        pagination = data.get('paging')
        followers = data.get('data', [])
        for follower in followers:
            item = ZHUserItem()
            item['avatar'] = follower.get('avatar_url')
            item['name'] = follower.get('name')
            item['zuid'] = follower.get('id')
            item['slug'] = follower.get('url_token')
            item['hash'] = md5(item['slug'])
            item['headline'] = follower.get('headline')
            item['link'] = 'https://www.zhihu.com/people/{0}'.format(item['slug'])
            item['description'] = ''
            yield item
        is_end = pagination.get('is_end')
        if not is_end:
            url = pagination.get('next')
            yield Request(url, callback=self.parse_follow, headers=self.headers)
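Both spiders hash slugs through an md5() helper defined elsewhere in the project; a plausible reconstruction (an assumption, not the project's confirmed code), keeping the Python 2 idioms used here:

import hashlib

def md5(value):
    # Hypothetical helper: hex digest of the UTF-8 bytes of `value`.
    if isinstance(value, unicode):
        value = value.encode('utf-8')
    return hashlib.md5(value).hexdigest()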
Example #9
class ZhuanLanSpider(scrapy.Spider):
    """Crawl a queued zhuanlan column: its metadata, creator, and posts."""
    name = 'zhuanlan'
    host = 'https://zhuanlan.zhihu.com/'
    start_urls = ['https://zhuanlan.zhihu.com/HicRhodushicsalta']
    api_urls = 'https://zhuanlan.zhihu.com/api/columns/{0}/posts?limit=20&offset={1}'
    column_api_url = 'https://zhuanlan.zhihu.com/api/columns/{slug}'
    offset = 0
    total = 0
    url_name = ''
    column = None
    creator = None

    custom_settings = {
        'ITEM_PIPELINES': {
            # 'Shadow.pipelines.CheckAvailablePipeline': 200,
            'Shadow.pipelines.ArticleDataStorePipeline': 300,
            # 'Shadow.pipelines.WechatSenderPipeline': 400,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'Shadow.middlewares.UserAgentMiddleware': 1,
            # 'Shadow.middlewares.ProxyMiddleware': 2,
        },
        'COOKIES_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'CONCURRENT_REQUESTS': 1
    }

    # def __init__(self, *args, **kwargs):
    #     session = DBSession()
    #     self.obj = session.query(ZHRandomColumn).first()
    #     if self.obj:
    #         self.start_urls = [self.obj.link]
    #         session.close()
    #     else:
    #         session.close()
    #         raise CloseSpider("No random column item to crawling")
    #     self.start_urls = ['https://zhuanlan.zhihu.com/chuapp']
    #     super(ZhuanLanSpider, self).__init__(*args, **kwargs)

    def __init__(self, *args, **kwargs):
        self.session = DBSession()
        self.obj = None
        super(ZhuanLanSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        # Yield one request per queued column; modify_obj() deletes the row
        # after a successful parse, so first() advances through the queue.
        while 1:
            self.obj = self.session.query(ZHRandomColumn).first()
            if self.obj:
                self.start_urls = [self.obj.link]
                yield self.make_requests_from_url(self.obj.link)
            else:
                break
        self.session.close()
        raise CloseSpider("No column item left to crawl")

    def modify_obj(self):
        if self.obj:
            try:
                self.session.delete(self.obj)
                self.session.commit()
                self.offset = 0
            except Exception as e:
                logging.exception(e)
                self.session.rollback()
                self.session.close()
                self.session = DBSession()

    def get_zhuanlan_name(self):
        self.url_name = self.start_urls[0].strip('/').split('/')[-1]
        return self.url_name

    def generate_api_url(self, offset):
        self.get_zhuanlan_name()
        self.offset += offset
        return self.api_urls.format(self.url_name, self.offset)

    def get_client_config(self, response):
        matches = re.findall(
            r'<textarea id="clientConfig" hidden="">(.*?)</textarea>',
            response.body)
        html_parser = HTMLParser.HTMLParser()
        unescape_data = html_parser.unescape(matches[0])
        data = json.loads(unescape_data)
        return data

    def parse(self, response):
        if response.status == 404:
            # The column is gone: drop the queue item and stop here; a 404
            # page has no clientConfig block to parse.
            self.modify_obj()
            return
        data = self.get_client_config(response)
        tokens = data.get('tokens')
        headers = response.headers
        headers['referer'] = response.url
        headers['authorization'] = tokens.get('Authorization')
        headers['x-xsrf-token'] = tokens.get('X-XSRF-TOKEN')
        url = self.generate_api_url(0)
        yield Request(url, headers=headers, callback=self.parse_api_result)
        url = self.column_api_url.format(slug=self.get_zhuanlan_name())
        yield Request(url, headers=headers, callback=self.parse_column_info)
        self.modify_obj()

    def parse_column_info(self, response):
        data = json.loads(response.body)
        item = ZHColumnItem()
        slug = data.get('slug')
        self.total = int(data.get('postsCount', 0))
        item['name'] = data.get('name')
        item['link'] = 'https://zhuanlan.zhihu.com/{0}'.format(slug)
        item['hash'] = md5('{0}'.format(slug))
        item['slug'] = slug
        item['description'] = data.get('description')
        item['avatar'] = data.get('avatar').get(
            'template', 'https://pic2.zhimg.com/{id}_{size}.jpg').format(
                id=data.get('avatar').get('id'), size='l')
        self.column = item.copy()
        creator = data.get('creator')
        if creator:
            item = ZHUserItem()
            item['zuid'] = creator.get('uid')
            item['name'] = creator.get('name')
            item['link'] = creator.get('profileUrl')
            item['hash'] = creator.get('hash')
            item['slug'] = creator.get('slug')
            item['description'] = creator.get('description')
            item['headline'] = creator.get('bio')
            item['avatar'] = creator.get('avatar').get(
                'template', 'https://pic1.zhimg.com/{id}_{size}.jpg').format(
                    id=creator.get('avatar').get('id'), size='l')
            self.creator = item.copy()

    def parse_api_result(self, response):
        offset = int(response.url.split('&')[-1].split('=')[-1])
        data = json.loads(response.body)
        for article in data:
            item = ZHCombinationItem()
            author = article.get('author', None)
            link = 'https://zhuanlan.zhihu.com/p/{0}'.format(
                article.get('slug'))
            item.article['title'] = article.get('title')
            item.article['content'] = article.get('content')
            item.article['summary'] = article.get('summary')
            item.article['cover'] = article.get('titleImage')
            item.article['token'] = article.get('slug')
            item.article['link'] = link
            item.article['md5'] = md5('{0}'.format(item.article['token']))
            item.article['create_time'] = article.get('publishedTime')
            item.article['modify_time'] = article.get('publishedTime')
            # self.creator may still be None if the column-info response
            # has not arrived yet.
            if self.creator and author.get('hash') == self.creator['hash']:
                item.author = self.creator.copy()
            else:
                item.author['zuid'] = author.get('uid')
                item.author['name'] = author.get('name')
                item.author['link'] = author.get('profileUrl')
                item.author['hash'] = author.get('hash')
                item.author['slug'] = author.get('slug')
                item.author['description'] = author.get('description')
                item.author['headline'] = author.get('headline')
                item.author['avatar'] = author.get('avatar').get(
                    'template',
                    'https://pic1.zhimg.com/{id}_{size}.jpg').format(
                        id=author.get('avatar').get('id'), size='l')
            item.column = self.column
            item.creator = self.creator
            yield item
        if offset < self.total:
            url = self.generate_api_url(20)
            yield Request(url,
                          callback=self.parse_api_result,
                          headers=response.headers)
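A minimal sketch of running the column spider programmatically; a real run would normally go through `scrapy crawl zhuanlan` inside the Shadow project:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess()
process.crawl(ZhuanLanSpider)
process.start()  # blocks until the crawl finishes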