def parse_more_location(self, response):
    """Parse one page of a city's location list from Instagram's explore API.

    Yields one ``LocTagItem`` per location, then either requests the next
    page (POST with the ``page`` number) or, on the last page, marks the
    city crawled and sends a completion mail.
    """
    location_data = json.loads(response.body)
    city_id = location_data['city_info']['id']
    city_name = location_data['city_info']['slug']
    country_name = location_data['country_info']['name']
    for location in location_data['location_list']:
        # Build a FRESH item per location. The original reused one mutable
        # item across yields, so every later assignment could clobber the
        # state of items already handed to the engine/pipelines.
        item = LocTagItem()
        item['loc_tag_name'] = location['slug']
        item['loc_id'] = location['id']
        item['loc_parent_name'] = city_name
        item['loc_parent_id'] = city_id
        item['loc_country_name'] = country_name
        item['is_crawled'] = 0
        yield item
    if location_data['next_page'] is not None:
        # Use city_id directly: reading item['loc_parent_id'] here raised
        # KeyError whenever location_list was empty.
        yield FormRequest(
            url='https://www.instagram.com/explore/locations/{loc_parent_id}/'.format(
                loc_parent_id=city_id),
            formdata={'page': str(location_data['next_page'])},
            headers=self.more_headers,
            callback=self.parse_more_location,
            errback=self.report_error,
        )
    else:
        city_set_crawled(city_id)  # set is_crawled 1
        utils.send_mail('**finish the city:%s**' % (city_id), self.name,
                        'jason', 'crawl_city_success')
        self.log('**************************finish the city:%s**************************' % (city_id))
def process_item(self, item, spider):
    """Persist Autohome (汽车之家) items.

    ``CarImEntranceItem`` rows go to the ``AiCarEntrance`` table;
    ``CarPicItem`` images are written to disk under a per-model folder.
    Returns the item so later pipelines still see it.
    """
    if isinstance(item, CarImEntranceItem):
        # Save the image-entrance URL for each car series.
        car_entrance = AiCarEntrance(
            entran_im_url=item['entran_im_url'],
            entran_brand=item['entran_brand'],
            entran_series=item['entran_series'],
            entran_series_id=item['entran_series_id'],
            is_crawled=0,
        )
        with session_scope(self.Session, 'insert_car_entrance') as session:
            session.add(car_entrance)
        return item
    elif isinstance(item, CarPicItem):
        # Save each car picture, grouped by car model.
        try:
            car_folder = item['car_model']
            image_fold = img_path + 'homeOfcars/' + car_folder + '/'
            image_name = image_fold + item['pic_id'] + '.jpg'
            print('****************' + ' ' + image_name)
            # exist_ok=True avoids the check-then-create race the old
            # `if not os.path.exists(...)` guard had between workers.
            os.makedirs(image_fold, exist_ok=True)
            with open(image_name, 'wb') as f:
                f.write(item['pic_content'])
        except Exception as e:
            # Best-effort: report the failure by mail, don't kill the spider.
            utils.send_mail(str(e), 'ImagespiderPipeline', 'jason',
                            'CarPicItem_exception')
        return item
def get_citys(limt_num, country):
    """Return up to ``limt_num`` uncrawled city tags for ``country``.

    On query failure the error is printed and mailed, and an empty list
    is returned instead of a query object.
    """
    result = []
    try:
        result = (
            session.query(LocCityTag)
            .filter(and_(LocCityTag.is_crawled == 0,
                         LocCityTag.city_country_name == country))
            .limit(limt_num)
        )
    except Exception as e:
        print(str(e))
        utils.send_mail(str(e), 'db_session_query_city', 'jason',
                        'crawl_exception')
        # session.rollback()
    return result
def city_set_crawled(city_id, is_crawled=1):
    """Set a city row's ``is_crawled`` flag (defaults to 1 = done)."""
    try:
        query = session.query(LocCityTag).filter(
            LocCityTag.loc_city_id == city_id)
        query.update({LocCityTag.is_crawled: is_crawled})
        session.commit()
    except Exception as e:
        # Report and roll back so the session stays usable.
        print(str(e))
        utils.send_mail(str(e), 'db_session_update_city', 'jason',
                        'crawl_exception')
        session.rollback()
    finally:
        # session.close()
        pass
def get_img_loc(limt_num, country):
    """Return up to ``limt_num`` uncrawled image-location tags for ``country``."""
    found = []
    try:
        conditions = and_(ImgLocTag.is_crawled == 0,
                          ImgLocTag.loc_country_name == country)
        found = session.query(ImgLocTag).filter(conditions).limit(limt_num)
    except Exception as e:
        # Report the failure; caller gets an empty list.
        print(str(e))
        utils.send_mail(str(e), 'db_session_query_loc', 'jason',
                        'crawl_exception')
    finally:
        # session.close()
        pass
    return found
def get_ins_user(crawl_id):
    """Return the ``InsUser`` rows matching ``crawl_id`` (empty list on error)."""
    users = []
    try:
        users = session.query(InsUser).filter(InsUser.crawl_id == crawl_id)
    except Exception as e:
        # Report the failure; caller gets an empty list.
        print(str(e))
        utils.send_mail(str(e), 'db_session_query_InsUser', 'jason',
                        'crawl_exception')
    finally:
        # session.close()
        pass
    return users
    # Base.metadata.create_all(bind=engine)
def loc_set_crawled(loc_id, is_crawled):
    """Set an image-location row's ``is_crawled`` flag."""
    try:
        query = session.query(ImgLocTag).filter(ImgLocTag.loc_id == loc_id)
        query.update({ImgLocTag.is_crawled: is_crawled})
        session.commit()
    except Exception as e:
        # Report and roll back so the session stays usable.
        print(str(e))
        utils.send_mail(str(e), 'db_session_update_loc', 'jason',
                        'crawl_exception')
        session.rollback()
    finally:
        # session.close()
        pass
    # Base.metadata.create_all(bind=engine)
def get_config(config_name, config_lable):
    """Look up a single ``CrawlConfig`` value by name and label.

    Opens its own session and always closes it. Returns the row's
    ``config_value``, or ``None`` when the row is missing or the query
    fails — previously that path raised ``AttributeError`` on
    ``None.config_value``, masking the real error.
    """
    session = DBSession()
    config_object = None
    try:
        config_object = session.query(CrawlConfig).filter(
            and_(CrawlConfig.config_name == config_name,
                 CrawlConfig.config_lable == config_lable)).first()
    except Exception as e:
        print(str(e))
        utils.send_mail(str(e), 'db_session_query_CrawlConfig', 'jason',
                        'crawl_exception')
    finally:
        # Always release the session, even on failure.
        session.close()
    return config_object.config_value if config_object is not None else None
    # Base.metadata.create_all(bind=engine)
def set_crawled(user_id, image_counts, crawled_time, is_crawled=1):
    """Mark an ``ImgUser`` crawled, recording image count and crawl time."""
    new_values = {
        ImgUser.is_crawled: is_crawled,
        ImgUser.image_counts: image_counts,
        ImgUser.crawled_time: crawled_time,
    }
    try:
        session.query(ImgUser).filter(
            ImgUser.img_user_id == user_id).update(new_values)
        session.commit()
    except Exception as e:
        # Report and roll back so the session stays usable.
        print(str(e))
        utils.send_mail(str(e), 'db_session_update_user', 'jason',
                        'crawl_exception')
        session.rollback()
    finally:
        # session.close()
        pass
    # Base.metadata.create_all(bind=engine)
def process_response(self, request, response, spider):
    """Downloader-middleware hook: track per-account cookie health.

    Only acts for spiders whose name contains 'ins_im'. Each request
    carries a ``cookiejar`` name in its meta identifying the Instagram
    account it was made with; this method updates
    ``spider.cookies_dict[<cookiejar>]`` based on the response:

    * GraphQL query URLs: 429 → mark account rate-limited (is_useful=1)
      and stamp the time; 200 → mark healthy (is_useful=0); 403 → mail an
      alert and mark dead (is_useful=2).
    * Login-ajax URLs: 200 → register the account as freshly logged in;
      anything else → mail a login-failure alert.
    * Challenge URLs with 200 → mail that the account needs manual
      verification.

    The response is always returned unchanged.
    """
    if 'ins_im' in spider.name:
        if 'query/?query_hash=' in response.url:
            cookiejar_name = request.meta['cookiejar']
            if response.status == 429:
                # Rate limited: flag the account and record when, so the
                # spider can rest it before reuse.
                spider.cookies_dict[cookiejar_name]['is_useful'] = 1
                spider.cookies_dict[cookiejar_name]['time_p'] = int(
                    time.time())
            elif response.status == 200:
                spider.cookies_dict[cookiejar_name]['is_useful'] = 0
                # print(response.meta)
            elif response.status == 403:
                # Forbidden: likely a banned account or an API change.
                utils.send_mail(
                    '%s:this account:%s not along account or api change' %
                    (spider.name, request.meta['cookiejar']),
                    spider.name, 'jason', 'crawl_exception')
                spider.cookies_dict[cookiejar_name]['is_useful'] = 2
            # print('spider.cookies_dict----->',spider.cookies_dict)
        if 'https://www.instagram.com/accounts/login/ajax/' in response.url:
            cookiejar_name = request.meta['cookiejar']
            print(cookiejar_name, 'is logining')
            if response.status == 200:
                # Fresh login: register the account as healthy.
                print(cookiejar_name, '---->login success')
                spider.cookies_dict[cookiejar_name] = {
                    'is_useful': 0,
                    'time_p': 0
                }
            else:
                print(cookiejar_name, '---->login fail')
                utils.send_mail(
                    '%s:this account:%s login fail to yanzhen' %
                    (spider.name, request.meta['cookiejar']),
                    spider.name, 'jason', 'crawl_exception')
        if 'https://www.instagram.com/challenge/?query_hash' in response.url:
            cookiejar_name = request.meta['cookiejar']
            if response.status == 200:
                # Account hit a verification challenge — needs manual action.
                utils.send_mail(
                    'query_challenge %s:this account:%s need to yanzhen' %
                    (spider.name, request.meta['cookiejar']),
                    spider.name, 'jason', 'crawl_exception')
                # spider.cookies_dict[cookiejar_name] = {'is_useful': 2, 'time_p': 0}
    return response
def report_error(self, failure):
    """Errback: mail the failure's repr to the crawl-exception inbox."""
    message = repr(failure.value)
    utils.send_mail(message, self.name, 'jason', 'crawl_exception')
def report_error(self, failure):
    """Errback: mail unexpected failures, skipping routine 404/500 noise.

    Non-HTTP failures (DNS lookup errors, timeouts, connection refused)
    carry no ``response`` attribute on ``failure.value`` — previously the
    errback itself crashed with ``AttributeError`` on those; now they are
    reported too.
    """
    response = getattr(failure.value, 'response', None)
    if response is None:
        # Transport-level failure: no HTTP status to filter on.
        utils.send_mail(
            self.name + '__' + socket.gethostname() + '__' +
            repr(failure.value),
            self.name, 'jason', 'crawl_exception')
    elif response.status not in (404, 500):
        utils.send_mail(
            self.name + '__' + socket.gethostname() + '__' + repr(response),
            self.name, 'jason', 'crawl_exception')
def _save_image(self, folder, file_path, content):
    """Ensure ``folder`` exists and write the image bytes to ``file_path``.

    ``exist_ok=True`` avoids the check-then-create race the previous
    `if not os.path.exists(...)` guard had between concurrent writers.
    """
    os.makedirs(folder, exist_ok=True)
    with open(file_path, 'wb') as f:
        f.write(content)

def process_item(self, item, spider):
    """Route each scraped item to its store.

    City and location tags are inserted as DB rows (mail + rollback on
    failure); the image item types are written to disk under
    type-specific folders via ``_save_image``. Returns the item so later
    pipelines still see it.
    """
    if isinstance(item, CityTagItem):
        city_tag = LocCityTag(
            loc_city_id=item['loc_city_id'],
            loc_city_name=item['loc_city_name'],
            loc_parent_id=item['loc_parent_id'],
            loc_parent_name=item['loc_parent_name'],
            city_country_name=item['city_country_name'],
            is_crawled=item['is_crawled'],
        )
        try:
            self.session.add(city_tag)
            self.session.commit()
        except Exception as e:
            print(str(e))
            utils.send_mail(str(e), 'ImagespiderPipeline', 'jason',
                            'CityTagItem_exception')
            self.session.rollback()
        return item
    elif isinstance(item, LocTagItem):
        img_loc_tag = ImgLocTag(
            loc_tag_name=item['loc_tag_name'],
            loc_id=item['loc_id'],
            loc_parent_name=item['loc_parent_name'],
            loc_parent_id=item['loc_parent_id'],
            loc_country_name=item['loc_country_name'],
            is_crawled=item['is_crawled'],
        )
        try:
            self.session.add(img_loc_tag)
            self.session.commit()
        except Exception as e:
            print(str(e))
            utils.send_mail(str(e), 'ImagespiderPipeline', 'jason',
                            'LocTagItem_exception')
            self.session.rollback()
        return item
    elif isinstance(item, ImgUserItem):
        img_user = ImgUser(
            img_user_name=item['img_user_name'],
            img_user_id=item['img_user_id'],
            user_country_name=item['user_country_name'],
            is_crawled=item['is_crawled'],
            user_profile_url=item['user_profile_url'],
        )
        try:
            self.session.add(img_user)
            self.session.commit()
        except Exception as e:
            # img_user_id is unique; insert failure is treated as a
            # duplicate and deliberately not mailed.
            print("*++*************Duplicate entry %s for key img_user_id*************++*'" % item['img_user_id'])
            self.session.rollback()
        return item
    elif isinstance(item, ImageItem):
        # Save the image under <save_fold>/<owner_name>_<owner_id>/.
        try:
            face_folder = item['img_owner_name'] + '_' + item['img_owner_id']
            image_fold = img_path + item['img_save_fold'] + '/' + face_folder + '/'
            image_name = image_fold + item['img_id'] + '.jpg'
            self._save_image(image_fold, image_name, item['img_content'])
            print('save imge success')
            # Drop the raw bytes so downstream pipelines/logs stay small.
            item['img_content'] = 'img_content'
        except Exception as e:
            utils.send_mail(str(e), 'ImagespiderPipeline', 'jason',
                            'ImageItem_exception')
        return item
    elif isinstance(item, GoogleImItem):
        # Save Google images under google_data/<label>/, keeping the
        # source extension (.png vs default .jpg).
        try:
            image_cate = '.png' if item['img_url'].endswith('.png') else '.jpg'
            img_folder = img_path + 'google_data/' + item['img_lable'] + '/'
            image_name = img_folder + item['img_id'] + image_cate
            self._save_image(img_folder, image_name, item['img_content'])
        except Exception as e:
            utils.send_mail(str(e), 'ImagespiderPipeline', 'jason',
                            'ImageItem_exception')
        return item
    elif isinstance(item, InsTagItem):
        # Save Instagram tag images under instagram_data/<tag_name>/.
        try:
            face_folder = item['tag_name']
            image_fold = img_path + 'instagram_data/' + face_folder + '/'
            image_name = image_fold + item['img_id'] + '.jpg'
            self._save_image(image_fold, image_name, item['img_content'])
        except Exception as e:
            utils.send_mail(str(e), 'ImagespiderPipeline', 'jason',
                            'ImageItem_exception')
        return item