Code example #1
import json
import logging
import time

import requests
import scrapy

# Project-local helpers (module paths assumed; see the sketches after
# each example).
import time_utils
from bmob_upload_helper import BMobUploadHelper


class JiepaiThreeAppSpider(scrapy.Spider):
    name = 'jiepai_three_app'
    allowed_domains = ['app.3ajiepai.com']
    start_urls = [
        'http://app.3ajiepai.com/thread/list?fid=170&page=1&pageSize=20'
    ]

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.cur_time = time_utils.get_jie_pai_three_m_scrapy_time()
        self.cookies_jie_pai = {}
        self.bmob_helper = BMobUploadHelper()

    def start_requests(self):
        jsession_id, jie_pai = self.get_login_info()
        self.cookies_jie_pai = {
            # Cloudflare cookie value captured from a browser session.
            '__cfduid': 'd038136efebfcd498fc25c12f2a9cbad81539412011',
            'JSESSIONID': jsession_id,
            '3ajiepai': jie_pai
        }
        for url in self.start_urls:
            yield scrapy.Request(url, cookies=self.cookies_jie_pai)

    def get_login_info(self):
        # NOTE: the WeChat OAuth "code" parameter below is single-use and
        # short-lived; it has to be refreshed before each crawl.
        login_url = "http://app.3ajiepai.com/wechat/login?code=onQGp1RAFbnzN6m4y259Qma2vMu4"
        response = requests.get(login_url)
        jsession_id = response.cookies["JSESSIONID"]
        jie_pai = response.cookies["3ajiepai"]
        logging.info("response cookies: " + str(response.cookies))
        return jsession_id, jie_pai

    def parse(self, response):
        logging.info("jiepai_three_app response: " + response.body)
        json_content = json.loads(response.body)

        data_array = json_content["data"]
        for data_item in data_array:
            # post timestamp
            data_date = data_item["dateline"]
            time_array = time.localtime(int(data_date))
            jie_pai_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
            # "%Y-%m-%d %H:%M:%S" strings sort chronologically, so a plain
            # string comparison is enough here.
            if jie_pai_time < self.cur_time:
                logging.info("time is out of date, jie_pai_time: " +
                             jie_pai_time)
                return

            # cover
            data_thumb = data_item["thumb"]
            # title
            data_subject = data_item["subject"]
            # thread id
            data_tid = data_item["tid"]

            detail_url = "http://app.3ajiepai.com/thread/" + str(data_tid)

            yield scrapy.Request(detail_url,
                                 meta={
                                     "data_thumb": data_thumb,
                                     "data_subject": data_subject,
                                 },
                                 cookies=self.cookies_jie_pai,
                                 callback=self.handle_detail)

        # All done; persist the next cutoff time.
        time_utils.save_jie_pai_three_m_scrapy_time(
            time_utils.get_next_day_time())

    def handle_detail(self, response):
        data_thumb = response.meta["data_thumb"]
        data_subject = response.meta["data_subject"]
        data_detail = json.loads(response.body)
        photos = data_detail["data"]["photos"]
        point_group_id = ""
        sub_pic_url = "https://api2.bmob.cn/1/classes/CardPicBean"
        first_photo = True
        for photo in photos:
            img_url = photo["origin"]
            # first photo: upload it as the group (cover) record
            if first_photo:
                group_content = self.bmob_helper.get_group_content(
                    img_url, data_subject)
                group_url = "https://api2.bmob.cn/1/classes/CardPicGroup"
                logging.info("parse_wei_bo_detail group data: " +
                             json.dumps(group_content, ensure_ascii=False))
                point_group_id = self.bmob_helper.upload_to_bmob(
                    group_url, group_content)
                first_photo = False
            else:
                detail_content = self.bmob_helper.get_detail_content(
                    "", img_url, point_group_id)
                logging.info("upload sub_pics json: " +
                             json.dumps(detail_content, ensure_ascii=False))
                self.bmob_helper.upload_to_bmob(sub_pic_url, detail_content)
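
Both examples lean on a project-local BMobUploadHelper that is not shown. Below is a minimal sketch of what it might look like, assuming Bmob's Parse-style REST conventions (X-Bmob-Application-Id / X-Bmob-REST-API-Key headers, a returned objectId, and the Pointer reference format). Only the method names are taken from the calls above; the record fields and credentials are placeholders.

import json

import requests


class BMobUploadHelper(object):
    # Placeholder credentials; substitute real application keys.
    HEADERS = {
        "X-Bmob-Application-Id": "YOUR_APP_ID",
        "X-Bmob-REST-API-Key": "YOUR_REST_API_KEY",
        "Content-Type": "application/json",
    }

    def get_group_content(self, cover_url, title):
        # Shape of a CardPicGroup record; field names are assumptions.
        return {"coverUrl": cover_url, "title": title}

    def get_detail_content(self, desc, img_url, group_id):
        # Shape of a CardPicBean record pointing back at its group.
        return {
            "desc": desc,
            "imgUrl": img_url,
            "group": {
                "__type": "Pointer",
                "className": "CardPicGroup",
                "objectId": group_id,
            },
        }

    def upload_to_bmob(self, url, content):
        # POST the record and return the new objectId so that
        # sub-pictures can reference their group.
        response = requests.post(url, headers=self.HEADERS,
                                 data=json.dumps(content))
        return response.json().get("objectId", "")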
Code example #2
import json
import logging
import re
import time

import scrapy
from bs4 import BeautifulSoup

# Project-local helpers (module paths assumed).
import time_utils
from bmob_upload_helper import BMobUploadHelper


class JiepaiWeiBoSpider(scrapy.Spider):
    name = 'jiepai_wei_bo'
    allowed_domains = ['weibo.com']
    # start_urls = ['http://photo.weibo.com/1304494805/talbum/index#!/mode/2/page/1']
    start_urls = [
        'https://weibo.com/u/1304494805?is_all=1',  # account: 街拍美
        'https://weibo.com/u/3757458303?is_all=1',  # account: 街拍摄美
        # 'https://weibo.com/tajiepai?is_all=1'  # account: 她街拍
    ]

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.bmob_helper = BMobUploadHelper()
        self.key_words = ["四年"]
        self.craw_count = 0

    def hit_key_word(self, word):
        # True if any filter keyword appears in the given text.
        return any(my_word in word for my_word in self.key_words)

    def parse(self, response):
        bsp = BeautifulSoup(response.body, 'lxml')
        wei_bo_group = bsp.select_one(".WB_feed.WB_feed_v3.WB_feed_v4")
        # NOTE: time.sleep() blocks Scrapy's event loop; the DOWNLOAD_DELAY
        # setting would be the non-blocking alternative.
        time.sleep(1)
        wei_bo_items = wei_bo_group.select(".WB_feed_detail.clearfix")
        cur_wei_bo_time = time_utils.get_jie_pai_wei_bo_scrapy_time()
        for wei_bo_item in wei_bo_items:

            # time
            wei_bo_time_item = wei_bo_item.select_one(".WB_from.S_txt2 > a")
            wei_bo_time = wei_bo_time_item["title"]

            if wei_bo_time < cur_wei_bo_time:
                logging.info("time is out of date, wei_bo_time: " +
                             wei_bo_time)
                continue

            # title
            wei_bo_title_item = wei_bo_item.select_one(".WB_text.W_f14")
            wei_bo_title = wei_bo_title_item.text
            wei_bo_title = wei_bo_title.replace("\n", "").strip()
            # strip any parenthesized suffix from the title
            reobj = re.compile(r"\(.*\)")
            wei_bo_title_result, number = reobj.subn("", wei_bo_title)

            # filter out some posts by keyword
            if self.hit_key_word(wei_bo_title):
                logging.info("hit_key_word title: " + wei_bo_title)
                continue
            img_urls = []
            # pictures: swap thumbnail sizes for the larger mw690 variant
            wei_bo_pics = wei_bo_item.select(".WB_pic")
            for wei_bo_pic in wei_bo_pics:
                img_item = wei_bo_pic.select_one("img")
                img_url = img_item["src"]
                final_img_url = ""
                if "thumb150" in img_url:
                    final_img_url = "http:" + img_url.replace(
                        "thumb150", "mw690")
                elif "orj360" in img_url:
                    final_img_url = "http:" + img_url.replace(
                        "orj360", "mw690")
                img_urls.append(final_img_url)

            if len(img_urls) > 0:
                # cover
                cover_url = img_urls[0]

                # upload cover
                group_content = self.bmob_helper.get_group_content(
                    cover_url, wei_bo_title_result)
                group_url = "https://api2.bmob.cn/1/classes/CardPicGroup"
                logging.info("parse_wei_bo_detail group data: " +
                             json.dumps(group_content, ensure_ascii=False))
                point_group_id = self.bmob_helper.upload_to_bmob(
                    group_url, group_content)

                # upload sub_pics
                sub_pic_url = "https://api2.bmob.cn/1/classes/CardPicBean"
                for index in range(1, len(img_urls)):
                    detail_content = self.bmob_helper.get_detail_content(
                        "", img_urls[index], point_group_id)
                    logging.info(
                        "upload sub_pics json: " +
                        json.dumps(detail_content, ensure_ascii=False))
                    self.bmob_helper.upload_to_bmob(sub_pic_url,
                                                    detail_content)

        self.craw_count += 1

        if self.craw_count == len(self.start_urls):
            # All start_urls processed; persist the next cutoff time.
            time_utils.save_jie_pai_weibo_scrapy_time(
                time_utils.get_next_day_time())
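
The time_utils module is the other unshown dependency. Both spiders compare timestamps as plain strings, which works because "%Y-%m-%d %H:%M:%S" values sort lexicographically in chronological order. A minimal sketch, assuming the cutoff is persisted in local text files (the file names and storage choice are assumptions; the function names match the calls above):

import time

_CUTOFF_FILES = {
    "three_m": "jie_pai_three_m_time.txt",  # hypothetical file names
    "wei_bo": "jie_pai_wei_bo_time.txt",
}


def _read(key):
    # A missing file means "no cutoff yet": crawl everything.
    try:
        with open(_CUTOFF_FILES[key]) as f:
            return f.read().strip()
    except FileNotFoundError:
        return "1970-01-01 00:00:00"


def _write(key, value):
    with open(_CUTOFF_FILES[key], "w") as f:
        f.write(value)


def get_jie_pai_three_m_scrapy_time():
    return _read("three_m")


def save_jie_pai_three_m_scrapy_time(value):
    _write("three_m", value)


def get_jie_pai_wei_bo_scrapy_time():
    return _read("wei_bo")


def save_jie_pai_weibo_scrapy_time(value):
    _write("wei_bo", value)


def get_next_day_time():
    # 24 hours from now, formatted like the post timestamps so plain
    # string comparison stays chronological.
    return time.strftime("%Y-%m-%d %H:%M:%S",
                         time.localtime(time.time() + 24 * 3600))

With these stubs in place, each spider runs the usual Scrapy way, e.g. scrapy crawl jiepai_three_app or scrapy crawl jiepai_wei_bo.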