Example #1
class ProxyDataStorePipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        now = datetime.datetime.now()
        host = item['host']
        exist_proxy = self.session.query(Proxy).filter(
            Proxy.host == host).first()
        if exist_proxy:
            exist_proxy.available = True
        else:
            proxy = Proxy(host=item['host'],
                          port=item['port'],
                          create_time=now,
                          modify_time=now,
                          available=True)
            if item['protocol'].upper() == ProtocolChoice.HTTP:
                proxy.protocol = ProtocolChoice.HTTP
            else:
                proxy.protocol = ProtocolChoice.HTTPS
            self.session.add(proxy)
        return item

    def close_spider(self, spider):
        try:
            self.session.commit()
        except Exception as e:
            logger.exception(e)
            self.session.rollback()
        finally:
            self.session.close()
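
The pipeline above only becomes active once it is registered in the project's Scrapy settings. A minimal sketch of that registration, assuming the class lives at Shadow.pipelines.ProxyDataStorePipeline (the dotted path is inferred from the pipeline paths shown in Example #8 and may differ in the real project):

# settings.py (sketch) -- the dotted path below is an assumption
ITEM_PIPELINES = {
    'Shadow.pipelines.ProxyDataStorePipeline': 300,
}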
Example #2
def fix_image_in_article():
    from lg_data.db.models import ZHArticle, DBSession
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    count = 0
    for i in xrange(total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by('id').limit(limit).offset(
            offset).all()

        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            if article.cover == '/s/image/default.jpg':
                article.cover = ''
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('fix image done, fail: {0}'.format(fail_list))
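
The loop above pages with LIMIT/OFFSET, so each later page forces the database to walk past all previously skipped rows. A keyset-pagination variant that pages on the primary key avoids that cost; this is only a sketch under the assumption that ZHArticle.id is a monotonically increasing integer key:

def iter_articles_by_id(session, limit=1000):
    # Page through ZHArticle by primary key instead of OFFSET (sketch).
    last_id = 0
    while True:
        rows = (session.query(ZHArticle)
                .filter(ZHArticle.id > last_id)
                .order_by(ZHArticle.id)
                .limit(limit)
                .all())
        if not rows:
            break
        for article in rows:
            yield article
        last_id = rows[-1].id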
Example #3
class DataBaseRunMixin(object):
    # def start_requests(self):
    #     """Returns a batch of start requests from database."""
    #     req = self.next_requests()
    #     return req.next()

    def fetch_obj(self):
        pass

    def modify_obj(self, obj):
        pass

    def next_requests(self):
        while 1:
            # import pudb;pu.db
            try:
                self.user = self.fetch_obj()
            except Exception as e:
                logging.exception(e)
                self.session.rollback()
                self.session.close()
                self.session = DBSession()
                # Don't reuse a stale self.user from a previous iteration
                # after a database error; stop and wait for the next idle signal.
                self.user = None
            if not self.user:
                self.session.close()
                break
                # raise CloseSpider('No available user follow to crawl, spider exit')
            req = self.make_requests_from_url('https://zhuanlan.zhihu.com/p/20580194')
            yield req

    def schedule_next_requests(self):
        """Schedules a request if available"""
        if self.user:
            try:
                self.user = self.modify_obj(self.user)
                self.session.commit()
            except Exception as e:
                logging.exception(e)
                self.session.rollback()
                self.session.close()
                self.session = DBSession()
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        # raise DontCloseSpider

    def setup_database(self, crawler=None):
        self.session = DBSession()
        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)

        if crawler is None:
            raise ValueError("crawler is required")
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
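
DataBaseRunMixin leaves fetch_obj and modify_obj to the concrete spider and expects setup_database to be called so the spider_idle signal is connected. A minimal sketch of a spider built on it, assuming a hypothetical User model with a crawled flag (the model, its fields, and the spider name are assumptions):

class FollowSpider(DataBaseRunMixin, scrapy.Spider):
    name = 'follow'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(FollowSpider, cls).from_crawler(crawler, *args, **kwargs)
        # Create the DB session and hook the spider_idle signal.
        spider.setup_database(crawler)
        return spider

    def fetch_obj(self):
        # Hypothetical query: next user that has not been crawled yet.
        return self.session.query(User).filter(User.crawled == False).first()

    def modify_obj(self, obj):
        obj.crawled = True
        return obj

    def parse(self, response):
        pass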
Example #4
def iter_all_data(exist=None):
    from lg_data.db.models import ZHArticle, DBSession
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    if exist:
        start = exist / limit
        count = start * limit - 1
    else:
        start = 1
        count = 0
    for i in xrange(start, total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by('id').limit(limit).offset(
            offset).all()

        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            generate_keywords(article)
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('generate keywords done, fail: {0}'.format(fail_list))
Example #5
def main():
    session = DBSession()
    for queryset in query_by_pagination(session, ZHArticle):
        for article in queryset:
            fix_href(article)
        try:
            session.commit()
        except Exception as e:
            logging.exception(e)
            session.rollback()
    session.close()
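
query_by_pagination is not shown in this example; a minimal sketch of what such a helper might look like, following the LIMIT/OFFSET pattern of Examples #2 and #4 (the signature is inferred from the call site, and the chunk size is an assumption):

import math
from sqlalchemy import func

def query_by_pagination(session, model, limit=1000):
    # Yield the table in chunks of `limit` rows, ordered by primary key (sketch).
    total = session.query(func.count(model.id)).scalar()
    pages = int(math.ceil(total / float(limit)))
    for i in range(pages):
        yield session.query(model).order_by(model.id).limit(limit).offset(i * limit).all()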
Example #6
def fix_image_in_article(exist=None):
    from lg_data.db.models import ZHArticle, DBSession
    from bs4 import BeautifulSoup
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    if exist:
        start = exist / limit
        count = start * limit - 1
    else:
        start = 1
        count = 0
    for i in xrange(start, total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by('id').limit(limit).offset(
            offset).all()

        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            soup = BeautifulSoup(article.content, 'html.parser')
            finds = soup.find_all('img')
            for itm in finds:
                host_random = random.randint(1, 4)
                itm['src'] = 'https://pic{0}.zhimg.com/{1}'.format(
                    host_random, itm['src'])
            if not article.cover:
                if finds:
                    article.cover = finds[0]['src']
            article.content = soup.prettify()
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('fix image done, fail: {0}'.format(fail_list))
Example #7
class DataStorePipelineBase(object):
    commit_number = 100

    def __init__(self):
        self.now = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai'))
        self.session = None
        self.count = 0
        self.redis = redis_1
        super(DataStorePipelineBase, self).__init__()

    def get_now(self):
        self.now = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai'))
        return self.now

    def open_spider(self, spider):
        self.session = DBSession()

    def close_spider(self, spider):
        try:
            self.session.commit()
            self.session._unique_cache = None
        except Exception as e:
            logger.exception(e)
            self.session.rollback()
        finally:
            self.session.close()

    def periodic_commit(self):
        self.count += 1
        if self.count == self.commit_number:
            try:
                logger.info('Periodic commit to database')
                self.count = 0
                self.session.commit()
                self.session._unique_cache = None
            except Exception as e:
                logger.exception(e)
                self.session.rollback()
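
Example #8 registers Shadow.pipelines.ArticleDataStorePipeline, which presumably subclasses this base. A minimal sketch of such a subclass, where the ZHArticle model and the item fields are assumptions, shows how process_item stages rows and defers flushing to periodic_commit:

class ArticleDataStorePipeline(DataStorePipelineBase):
    def process_item(self, item, spider):
        # Hypothetical fields; stage the row and let periodic_commit flush
        # every `commit_number` items, with a final commit in close_spider.
        article = ZHArticle(title=item['title'],
                            content=item['content'],
                            create_time=self.get_now(),
                            modify_time=self.get_now())
        self.session.add(article)
        self.periodic_commit()
        return item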
Example #8
class ZhuanLanSpider(scrapy.Spider):
    name = 'zhuanlan'
    host = 'https://zhuanlan.zhihu.com/'
    start_urls = ['https://zhuanlan.zhihu.com/HicRhodushicsalta']
    api_urls = 'https://zhuanlan.zhihu.com/api/columns/{0}/posts?limit=20&offset={1}'
    column_api_url = 'https://zhuanlan.zhihu.com/api/columns/{slug}'
    offset = 0
    total = 0
    url_name = ''
    column = None
    creator = None

    custom_settings = {
        'ITEM_PIPELINES': {
            # 'Shadow.pipelines.CheckAvailablePipeline': 200,
            'Shadow.pipelines.ArticleDataStorePipeline': 300,
            # 'Shadow.pipelines.WechatSenderPipeline': 400,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'Shadow.middlewares.UserAgentMiddleware': 1,
            # 'Shadow.middlewares.ProxyMiddleware': 2,
        },
        'COOKIES_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'CONCURRENT_REQUESTS': 1
    }

    # def __init__(self, *args, **kwargs):
    #     session = DBSession()
    #     self.obj = session.query(ZHRandomColumn).first()
    #     if self.obj:
    #         self.start_urls = [self.obj.link]
    #         session.close()
    #     else:
    #         session.close()
    #         raise CloseSpider("No random column item to crawling")
    #     self.start_urls = ['https://zhuanlan.zhihu.com/chuapp']
    #     super(ZhuanLanSpider, self).__init__(*args, **kwargs)

    def __init__(self, *args, **kwargs):
        self.session = DBSession()
        self.obj = None
        super(ZhuanLanSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        while 1:
            self.obj = self.session.query(ZHRandomColumn).first()
            if self.obj:
                self.start_urls = [self.obj.link]
                yield self.make_requests_from_url(self.obj.link)
            else:
                break
        self.session.close()
        raise CloseSpider("No item to crawling")

    def modify_obj(self):
        if self.obj:
            try:
                self.session.delete(self.obj)
                self.session.commit()
                self.offset = 0
            except Exception as e:
                logging.exception(e)
                self.session.rollback()
                self.session.close()
                self.session = DBSession()

    def get_zhuanlan_name(self):
        self.url_name = self.start_urls[0].strip('/').split('/')[-1]
        return self.url_name

    def generate_api_url(self, offset):
        self.get_zhuanlan_name()
        self.offset += offset
        return self.api_urls.format(self.url_name, self.offset)

    def get_client_config(self, response):
        matches = re.findall(
            r'<textarea id="clientConfig" hidden="">(.*?)</textarea>',
            response.body)
        html_parser = HTMLParser.HTMLParser()
        unescape_data = html_parser.unescape(matches[0])
        data = json.loads(unescape_data)
        return data

    def parse(self, response):
        if response.status == 404:
            self.modify_obj()
            # A 404 page has no clientConfig textarea to parse, so stop here.
            return
        data = self.get_client_config(response)
        tokens = data.get('tokens')
        headers = response.headers
        headers['referer'] = response.url
        headers['authorization'] = tokens.get('Authorization')
        headers['x-xsrf-token'] = tokens.get('X-XSRF-TOKEN')
        url = self.generate_api_url(0)
        yield Request(url, headers=headers, callback=self.parse_api_result)
        url = self.column_api_url.format(slug=self.get_zhuanlan_name())
        yield Request(url, headers=headers, callback=self.parse_column_info)
        self.modify_obj()

    def parse_column_info(self, response):
        data = json.loads(response.body)
        item = ZHColumnItem()
        slug = data.get('slug')
        self.total = int(data.get('postsCount', 0))
        item['name'] = data.get('name')
        item['link'] = 'https://zhuanlan.zhihu.com/{0}'.format(slug)
        item['hash'] = md5('{0}'.format(slug))
        item['slug'] = slug
        item['description'] = data.get('description')
        item['avatar'] = data.get('avatar').get(
            'template', 'https://pic2.zhimg.com/{id}_{size}.jpg').format(
                id=data.get('avatar').get('id'), size='l')
        self.column = item.copy()
        creator = data.get('creator')
        if creator:
            item = ZHUserItem()
            item['zuid'] = creator.get('uid')
            item['name'] = creator.get('name')
            item['link'] = creator.get('profileUrl')
            item['hash'] = creator.get('hash')
            item['slug'] = creator.get('slug')
            item['description'] = creator.get('description')
            item['headline'] = creator.get('bio')
            item['avatar'] = creator.get('avatar').get(
                'template', 'https://pic1.zhimg.com/{id}_{size}.jpg').format(
                    id=creator.get('avatar').get('id'), size='l')
            self.creator = item.copy()

    def parse_api_result(self, response):
        offset = int(response.url.split('&')[-1].split('=')[-1])
        data = json.loads(response.body)
        for article in data:
            item = ZHCombinationItem()
            author = article.get('author', None)
            link = 'https://zhuanlan.zhihu.com/p/{0}'.format(
                article.get('slug'))
            item.article['title'] = article.get('title')
            item.article['content'] = article.get('content')
            item.article['summary'] = article.get('summary')
            item.article['cover'] = article.get('titleImage')
            item.article['token'] = article.get('slug')
            item.article['link'] = link
            item.article['md5'] = md5('{0}'.format(item.article['token']))
            item.article['create_time'] = article.get('publishedTime')
            item.article['modify_time'] = article.get('publishedTime')
            if self.creator and author.get('hash') == self.creator['hash']:
                item.author = self.creator.copy()
            else:
                item.author['zuid'] = author.get('uid')
                item.author['name'] = author.get('name')
                item.author['link'] = author.get('profileUrl')
                item.author['hash'] = author.get('hash')
                item.author['slug'] = author.get('slug')
                item.author['description'] = author.get('description')
                item.author['headline'] = author.get('headline')
                item.author['avatar'] = author.get('avatar').get(
                    'template',
                    'https://pic1.zhimg.com/{id}_{size}.jpg').format(
                        id=author.get('avatar').get('id'), size='l')
            item.column = self.column
            item.creator = self.creator
            yield item
        if offset < self.total:
            url = self.generate_api_url(20)
            yield Request(url,
                          callback=self.parse_api_result,
                          headers=response.headers)
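
The md5() used in parse_column_info and parse_api_result is called as a one-argument helper rather than hashlib.md5 directly; its definition is not shown here. A minimal sketch under the assumption that it returns the hex digest of the UTF-8 encoded input:

import hashlib

def md5(value):
    # Hypothetical helper: hex digest of the (UTF-8 encoded) input string.
    if isinstance(value, unicode):
        value = value.encode('utf-8')
    return hashlib.md5(value).hexdigest()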