def parse_more_location(self,response):
        """Parse one page of a city's location list and follow pagination.

        The response body is decoded as JSON and is read through the keys
        ``city_info`` (``id``, ``slug``), ``country_info`` (``name``),
        ``location_list`` (entries with ``id`` and ``slug``) and ``next_page``.

        Yields:
            One ``LocTagItem`` per entry of ``location_list`` and, when
            ``next_page`` is not ``None``, a ``FormRequest`` for that page
            that calls back into this method.

        When there is no next page the city is marked as crawled, a
        notification mail is sent and the completion is logged.
        """
        location_data = json.loads(response.body)

        city_id = location_data['city_info']['id']
        city_name = location_data['city_info']['slug']
        for location in location_data['location_list']:
            # Build a fresh item per location: re-using a single mutable
            # item would let later iterations overwrite the fields of items
            # that were already handed on.
            item = LocTagItem()
            item['loc_tag_name'] = location['slug']
            item['loc_id'] = location['id']
            item['loc_parent_name'] = city_name
            item['loc_parent_id'] = city_id
            item['loc_country_name'] = location_data['country_info']['name']
            item['is_crawled'] = 0
            yield item

        next_page = location_data['next_page']
        if next_page is not None:
            # Use city_id directly: reading it back from the last item
            # raises KeyError when location_list is empty.
            yield FormRequest(url='https://www.instagram.com/explore/locations/{loc_parent_id}/'.format(loc_parent_id=city_id),
                              formdata={'page': str(next_page)},
                              headers=self.more_headers,
                              callback=self.parse_more_location,
                              errback=self.report_error
                              )
        else:
            city_set_crawled(city_id)   # set is_crawled 1
            utils.send_mail('**finish the city:%s**'%(city_id), self.name, 'jason', 'crawl_city_success')
            self.log('**************************finish the city:%s**************************'%(city_id))
Example #2
0
    def process_item(self, item, spider):
        """Persist autohome items and hand the item on to later stages.

        ``CarImEntranceItem`` objects are stored as ``AiCarEntrance`` rows
        inside a ``session_scope``.  ``CarPicItem`` objects have their
        ``pic_content`` written to
        ``<img_path>homeOfcars/<car_model>/<pic_id>.jpg``; any error while
        writing is reported by mail instead of being raised.

        Args:
            item: the scraped item.
            spider: the spider that produced the item (unused here).

        Returns:
            The item, for every item type, so that subsequent pipeline
            stages never receive ``None``.
        """
        if isinstance(item, CarImEntranceItem):
            # Save the picture-entrance URL of each autohome car model.
            car_entrance = AiCarEntrance(
                entran_im_url = item['entran_im_url'],
                entran_brand = item['entran_brand'],
                entran_series=item['entran_series'],
                entran_series_id = item['entran_series_id'],
                is_crawled = 0
            )
            with session_scope(self.Session, 'insert_car_entrance') as session:
                session.add(car_entrance)

        elif isinstance(item, CarPicItem):
            # Save autohome pictures grouped by car model.
            try:
                car_folder = item['car_model']
                image_fold = img_path+'homeOfcars/'+car_folder+'/'
                image_name = image_fold + item['pic_id'] + '.jpg'
                print('****************'+'         '+image_name)
                # exist_ok avoids the check-then-create race when several
                # items for the same model are processed close together.
                os.makedirs(image_fold, exist_ok=True)
                with open(image_name, 'wb') as f:
                    f.write(item['pic_content'])
            except Exception as e:
                utils.send_mail(str(e), 'ImagespiderPipeline', 'jason', 'CarPicItem_exception')

        # Always return the item, including types this pipeline ignores.
        return item
Example #3
0
def get_citys(limt_num, country):
    """Query ``LocCityTag`` entries with ``is_crawled == 0`` for a country.

    The query is built on the module-level ``session`` and capped with
    ``limit(limt_num)``; the object returned by ``limit`` is handed back
    as-is.  If building the query raises, the error is printed and mailed
    and an empty list is returned instead.
    """
    try:
        city_query = session.query(LocCityTag)
        pending_in_country = and_(
            LocCityTag.is_crawled == 0,
            LocCityTag.city_country_name == country,
        )
        return city_query.filter(pending_in_country).limit(limt_num)
    except Exception as err:
        message = str(err)
        print(message)
        utils.send_mail(message, 'db_session_query_city', 'jason', 'crawl_exception')
        # session.rollback()
    return []
Example #4
0
def city_set_crawled(city_id, is_crawled=1):
    """Update ``is_crawled`` on ``LocCityTag`` rows matching ``city_id``.

    Rows are matched on ``loc_city_id``.  The change is committed on the
    module-level ``session``; on any error the message is printed and
    mailed, and the session is rolled back.  The session is left open.
    """
    try:
        matching_cities = session.query(LocCityTag).filter(
            LocCityTag.loc_city_id == city_id)
        matching_cities.update({LocCityTag.is_crawled: is_crawled})
        session.commit()
    except Exception as err:
        message = str(err)
        print(message)
        utils.send_mail(message, 'db_session_update_city', 'jason', 'crawl_exception')
        session.rollback()
def get_img_loc(limt_num, country):
    """Query ``ImgLocTag`` entries with ``is_crawled == 0`` for a country.

    The query is built on the module-level ``session`` and capped with
    ``limit(limt_num)``; the object returned by ``limit`` is handed back
    as-is.  If building the query raises, the error is printed and mailed
    and an empty list is returned instead.  The session is left open.
    """
    try:
        loc_query = session.query(ImgLocTag)
        pending_in_country = and_(
            ImgLocTag.is_crawled == 0,
            ImgLocTag.loc_country_name == country,
        )
        return loc_query.filter(pending_in_country).limit(limt_num)
    except Exception as err:
        message = str(err)
        print(message)
        utils.send_mail(message, 'db_session_query_loc', 'jason', 'crawl_exception')
    return []
Example #6
0
def get_ins_user(crawl_id):
    """Query ``InsUser`` entries whose ``crawl_id`` equals ``crawl_id``.

    The object returned by ``filter`` on the module-level ``session`` is
    handed back as-is.  If building the query raises, the error is printed
    and mailed and an empty list is returned instead.  The session is left
    open.
    """
    try:
        user_query = session.query(InsUser)
        return user_query.filter(InsUser.crawl_id == crawl_id)
    except Exception as err:
        message = str(err)
        print(message)
        utils.send_mail(message, 'db_session_query_InsUser', 'jason', 'crawl_exception')
    return []


# Base.metadata.create_all(bind=engine)
def loc_set_crawled(loc_id, is_crawled):
    """Update ``is_crawled`` on ``ImgLocTag`` rows matching ``loc_id``.

    The change is committed on the module-level ``session``; on any error
    the message is printed and mailed, and the session is rolled back.
    The session is left open.
    """
    try:
        matching_locs = session.query(ImgLocTag).filter(
            ImgLocTag.loc_id == loc_id)
        matching_locs.update({ImgLocTag.is_crawled: is_crawled})
        session.commit()
    except Exception as err:
        message = str(err)
        print(message)
        utils.send_mail(message, 'db_session_update_loc', 'jason', 'crawl_exception')
        session.rollback()


# Base.metadata.create_all(bind=engine)
Example #8
0
def get_config(config_name, config_lable):
    """Look up a configuration value by name and label.

    Opens a new ``DBSession``, fetches the first ``CrawlConfig`` row whose
    ``config_name`` and ``config_lable`` both match, and always closes the
    session afterwards.  A failing query is printed and reported by mail.

    Args:
        config_name: value to match against ``CrawlConfig.config_name``.
        config_lable: value to match against ``CrawlConfig.config_lable``.

    Returns:
        The matching row's ``config_value``, or ``None`` when no row was
        found or the query failed.
    """
    session = DBSession()
    config_object = None
    try:
        config_object = session.query(CrawlConfig).filter(
            and_(CrawlConfig.config_name == config_name,
                 CrawlConfig.config_lable == config_lable)).first()
    except Exception as e:
        print(str(e))
        utils.send_mail(str(e), 'db_session_query_CrawlConfig', 'jason',
                        'crawl_exception')
    finally:
        session.close()
    # config_object is still None after a failed query or when nothing
    # matched; dereferencing it would raise a misleading AttributeError.
    if config_object is None:
        return None
    return config_object.config_value


# Base.metadata.create_all(bind=engine)
def set_crawled(user_id, image_counts, crawled_time, is_crawled=1):
    """Record crawl results on ``ImgUser`` rows matching ``user_id``.

    Rows are matched on ``img_user_id`` and get ``is_crawled``,
    ``image_counts`` and ``crawled_time`` set from the arguments.  The
    change is committed on the module-level ``session``; on any error the
    message is printed and mailed, and the session is rolled back.  The
    session is left open.
    """
    try:
        matching_users = session.query(ImgUser).filter(
            ImgUser.img_user_id == user_id)
        matching_users.update({
            ImgUser.is_crawled: is_crawled,
            ImgUser.image_counts: image_counts,
            ImgUser.crawled_time: crawled_time,
        })
        session.commit()
    except Exception as err:
        message = str(err)
        print(message)
        utils.send_mail(message, 'db_session_update_user', 'jason', 'crawl_exception')
        session.rollback()


# Base.metadata.create_all(bind=engine)
Example #10
0
    def process_response(self, request, response, spider):
        """Track per-cookiejar state from response status codes.

        Only acts for spiders whose name contains ``'ins_im'``; the
        response is always returned unchanged.

        * Query URLs (containing ``query/?query_hash=``): status 429 sets
          ``is_useful`` to 1 and ``time_p`` to the current epoch second,
          200 sets ``is_useful`` to 0, 403 sends a mail and sets
          ``is_useful`` to 2 in ``spider.cookies_dict`` for the request's
          cookiejar.
        * Login URL: status 200 (re)initialises the cookiejar entry, any
          other status sends a mail.
        * Challenge URL: status 200 sends a mail.
        """
        if 'ins_im' not in spider.name:
            return response

        url = response.url

        if 'query/?query_hash=' in url:
            jar = request.meta['cookiejar']
            if response.status == 429:
                jar_state = spider.cookies_dict[jar]
                jar_state['is_useful'] = 1
                jar_state['time_p'] = int(time.time())
            elif response.status == 200:
                spider.cookies_dict[jar]['is_useful'] = 0
            elif response.status == 403:
                message = ('%s:this account:%s not along account or api change' %
                           (spider.name, jar))
                utils.send_mail(message, spider.name, 'jason', 'crawl_exception')
                spider.cookies_dict[jar]['is_useful'] = 2

        if 'https://www.instagram.com/accounts/login/ajax/' in url:
            jar = request.meta['cookiejar']
            print(jar, 'is logining')
            if response.status != 200:
                print(jar, '---->login fail')
                message = ('%s:this account:%s login fail to yanzhen' %
                           (spider.name, jar))
                utils.send_mail(message, spider.name, 'jason', 'crawl_exception')
            else:
                print(jar, '---->login success')
                spider.cookies_dict[jar] = {'is_useful': 0, 'time_p': 0}

        if 'https://www.instagram.com/challenge/?query_hash' in url:
            jar = request.meta['cookiejar']
            if response.status == 200:
                message = ('query_challenge %s:this account:%s need to yanzhen' %
                           (spider.name, jar))
                utils.send_mail(message, spider.name, 'jason', 'crawl_exception')

        return response
 def report_error(self, failure):
     """Errback: mail the repr of the failure's exception as a crawl exception."""
     error_text = repr(failure.value)
     utils.send_mail(error_text, self.name, 'jason', 'crawl_exception')
 def report_error(self, failure):
     """Errback: mail a crawl-exception report for a failed request.

     Failures whose exception carries a response with status 404 or 500
     are ignored.  Failures whose exception has no ``response`` attribute
     are reported using the exception itself.

     Args:
         failure: the failure object passed to the errback; its ``value``
             attribute holds the exception.
     """
     response = getattr(failure.value, 'response', None)
     if response is None:
         # Not every failure carries a response; report the exception
         # instead of raising AttributeError inside the errback.
         utils.send_mail(self.name+'__'+socket.gethostname() + '__' + repr(failure.value), self.name, 'jason',
                     'crawl_exception')
         return
     if response.status not in (404, 500):
         utils.send_mail(self.name+'__'+socket.gethostname() + '__' + repr(response), self.name, 'jason',
                     'crawl_exception')
Example #13
0
    def process_item(self, item, spider):
        """Persist an item according to its type and return it.

        ``CityTagItem``, ``LocTagItem`` and ``ImgUserItem`` are inserted as
        ``LocCityTag``, ``ImgLocTag`` and ``ImgUser`` rows through
        ``self.session`` (commit on success, rollback on error).
        ``ImageItem``, ``GoogleImItem`` and ``InsTagItem`` have their
        ``img_content`` written to a file below ``img_path``.

        NOTE(review): an item matching none of the branches shown here is
        not returned by the code visible in this block.
        """
        if isinstance(item, CityTagItem):
            # Copy the item fields one-to-one into a LocCityTag row.
            city_tag = LocCityTag(
                loc_city_id= item['loc_city_id'],
                loc_city_name = item['loc_city_name'],
                loc_parent_id = item['loc_parent_id'],
                loc_parent_name = item['loc_parent_name'],
                city_country_name= item['city_country_name'],
                is_crawled = item['is_crawled'],
            )
            try:
                self.session.add(city_tag)
                self.session.commit()
            except Exception as e:
                # Report the failure and roll back so the session stays usable.
                print(str(e))
                utils.send_mail(str(e), 'ImagespiderPipeline', 'jason', 'CityTagItem_exception')
                self.session.rollback()
            return item

        elif isinstance(item, LocTagItem):
            # Copy the item fields one-to-one into an ImgLocTag row.
            img_loc_tag = ImgLocTag(
                loc_tag_name = item['loc_tag_name'],
                loc_id = item['loc_id'],
                loc_parent_name = item['loc_parent_name'],
                loc_parent_id = item['loc_parent_id'],
                loc_country_name = item['loc_country_name'],
                is_crawled = item['is_crawled']
            )
            try:
                self.session.add(img_loc_tag)
                self.session.commit()
            except Exception as e:
                # Report the failure and roll back so the session stays usable.
                print(str(e))
                utils.send_mail(str(e), 'ImagespiderPipeline', 'jason', 'LocTagItem_exception')
                self.session.rollback()
            return item

        elif isinstance(item, ImgUserItem):
            # Copy the item fields one-to-one into an ImgUser row.
            img_user = ImgUser(
                img_user_name = item['img_user_name'],
                img_user_id = item['img_user_id'],
                user_country_name = item['user_country_name'],
                is_crawled = item['is_crawled'],
                user_profile_url = item['user_profile_url']
            )
            try:
                # user_list = self.session.query(ImgUser).filter(ImgUser.img_user_id == item['img_user_id']).limit(1)
                # user = ImgUser.query.filter_by(img_user_id == item['img_user_id']).first()
                # # print('user_list-->',user)
                # if not user:
                self.session.add(img_user)
                self.session.commit()
                # else:

            except Exception as e:
                # NOTE(review): every exception is printed as a duplicate-key
                # message and no mail is sent; the real error text is dropped.
                # print(str(e))
                print("*++*************Duplicate entry %s for key img_user_id*************++*'" % item['img_user_id'])
                self.session.rollback()
            return item

        elif isinstance(item, ImageItem):
            # save image to file
            try:
                # Target: <img_path><img_save_fold>/<owner_name>_<owner_id>/<img_id>.jpg
                face_folder = item['img_owner_name']+'_'+item['img_owner_id']
                # image_fold = img_path+'image_data/'+face_folder+'/'
                image_fold = img_path + item['img_save_fold'] + '/' + face_folder + '/'
                image_name = image_fold + item['img_id'] + '.jpg'
                # print('****************'+'         '+image_name)
                if not os.path.exists(image_fold):
                    # os.mkdir(img_path+img_folder+'/')
                    os.makedirs(image_fold)
                with open(image_name, 'wb') as f:
                    f.write(item['img_content'])
                    print('save imge success')
                # Replace the written content with a placeholder string
                # (presumably so the image data is not carried further -- confirm).
                item['img_content'] = 'img_content'


                # save mongodb
                # image_dict = {'url':item['img_url'], 'all_data':item['img_data']}
                # self.mg_table.insert(image_dict)
            except Exception as e:
                # Any failure while saving is reported by mail, not raised.
                utils.send_mail(str(e), 'ImagespiderPipeline', 'jason', 'ImageItem_exception')
            return item
        elif isinstance(item, GoogleImItem):
            try:
                # img_folder = item['img_name']+'_'+item['img_owner_id']
                # Pick the file extension from the URL suffix: .png, otherwise .jpg.
                image_cate = ''
                if item['img_url'].endswith('.png'):
                    image_cate='.png'
                else:
                    image_cate='.jpg'


                # Target: <img_path>google_data/<img_lable>/<img_id><extension>
                img_folder = img_path + 'google_data/'+item['img_lable'] + '/'
                image_name = img_folder + item['img_id'] + image_cate
                # print('****************'+'         '+image_name)
                if not os.path.exists(img_folder):
                    # os.mkdir(img_path+img_folder+'/')
                    os.makedirs(img_folder)
                with open(image_name, 'wb') as f:
                    f.write(item['img_content'])

                # save mongodb
                # image_dict = {'url':item['img_url'], 'all_data':item['img_data']}
                # self.mg_table.insert(image_dict)
            except Exception as e:
                # Reported under the same 'ImageItem_exception' label as ImageItem failures.
                utils.send_mail(str(e), 'ImagespiderPipeline', 'jason', 'ImageItem_exception')
            return item

        elif isinstance(item, InsTagItem):
            # save image to file
            try:
                # Target: <img_path>instagram_data/<tag_name>/<img_id>.jpg
                face_folder = item['tag_name']
                image_fold = img_path+'instagram_data/'+face_folder+'/'
                image_name = image_fold + item['img_id'] + '.jpg'
                # print('****************'+'         '+image_name)
                if not os.path.exists(image_fold):
                    # os.mkdir(img_path+img_folder+'/')
                    os.makedirs(image_fold)
                with open(image_name, 'wb') as f:
                    f.write(item['img_content'])

                # save mongodb
                # image_dict = {'url':item['img_url'], 'all_data':item['img_data']}
                # self.mg_table.insert(image_dict)
            except Exception as e:
                # Reported under the same 'ImageItem_exception' label as ImageItem failures.
                utils.send_mail(str(e), 'ImagespiderPipeline', 'jason', 'ImageItem_exception')
            return item