def get_store_deps(params):
    """Extract the departments available in a retailer store.

    Args:
        params (dict): expects keys 'external_id' (retailer store id used in
            the URL), 'ms_id' and 'store_id' (monitoring identifiers).

    Returns:
        list: department dicts produced by ``extract_info``; empty on failure.
    """
    dep_list = []
    br_stats = {}
    try:
        store_id = params['external_id']
        # Prepare request
        br = ByRequest(attempts=1)
        br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
        # BUGFIX: log message typo "Rquesting" -> "Requesting"
        logger.debug('[ByRequest] Requesting {}'.format(url_store.format(store_id)))
        response = br.get(url_store.format(store_id), return_json=True)
        br_stats = br.stats
        ws_id = stream_monitor('worker', step='start', value=1, ms_id=params['ms_id'],
                               store_id=params['store_id'], br_stats=br_stats)
        if response:
            # Add departments
            for dep in response:
                dep_list.append(extract_info(dep))
        else:
            err_st = 'Could not get response for {}'.format(url_store.format(store_id))
            logger.error(err_st)
            stream_monitor('error', ws_id=ws_id, store_id=params['store_id'],
                           code=0, reason=str(err_st))
    except Exception as e:
        err_st = "Unexpected error in get_store_deps: {}".format(e)
        # Report a worker heartbeat plus the error so monitoring sees the failure.
        ws_id = stream_monitor('worker', step='start', value=1, ms_id=params['ms_id'],
                               store_id=params['store_id'], br_stats=br_stats)
        es_id = stream_monitor('error', ws_id=ws_id, store_id=params['store_id'],
                               code=2, reason=err_st)
        logger.error(err_st)
        logger.debug(params)
    return dep_list
def process_prod(raw_prod, params):
    """Normalize a raw retailer product payload into the internal product schema.

    Args:
        raw_prod (dict): raw product as returned by the retailer API.
        params (dict): expects 'route_key', 'retailer_key', 'ms_id',
            'store_id' and 'store_uuid'.

    Returns:
        dict: the normalized product, or an empty dict if normalization failed.
    """
    prod_cl = {}
    ws_id = stream_monitor('worker', step=params.get('route_key'), value=1,
                           ms_id=params['ms_id'], store_id=params['store_id'])

    def _as_float(value):
        # Mirror the original conditional casts: None passes through unchanged.
        return float(value) if value is not None else None

    try:
        # NOTE(review): 'id' comes from raw_prod['product_id'] but the URL is
        # built from raw_prod['id'] — looks inconsistent; confirm with the API.
        prod_cl = {
            'route_key': params['route_key'],
            'retailer': params['retailer_key'],
            'name': raw_prod.get('name'),
            'id': raw_prod.get('product_id'),
            'url': url_product.format(raw_prod.get('id')),
            'gtin': raw_prod.get('ean'),
            'date': str(datetime.datetime.utcnow()),
            'description': raw_prod.get('description'),
            'brand': raw_prod.get('trademark'),
            'provider': '',
            'ingredients': [],
            'images': [url_image.format(raw_prod.get('image'))],
            'raw_attributes': [{
                'key': 'content',
                'value': raw_prod.get('quantity'),
                'unit': raw_prod.get('unit_type'),
            }],
            'raw_ingredients': '',
            'price': _as_float(raw_prod.get('price')),
            'price_original': _as_float(raw_prod.get('real_price')),
            'discount': _as_float(raw_prod.get('discount')),
            'promo': '',
            'location': {'store': [params['store_uuid']]},
        }
    except Exception as e:
        err_st = "Unexpected error in process_prod: {}".format(e)
        ws_id = stream_monitor('worker', step=params.get('route_key'), value=1,
                               ms_id=params['ms_id'], store_id=params['store_id'])
        es_id = stream_monitor('error', ws_id=ws_id, store_id=params['store_id'],
                               code=2, reason=err_st)
        logger.error(err_st)
    return prod_cl
def get_stores(params):
    """Dispatch one asynchronous ``process_zip`` task per known ZIP code.

    NOTE(review): this definition is shadowed by a later ``get_stores`` in the
    same module — confirm which implementation is meant to survive.

    Args:
        params (dict): expects 'ms_id' and 'store_id' for monitoring.

    Returns:
        bool: always True.
    """
    try:
        # Obtain Rappi stores for each ZIP
        for zip_code in get_zip():
            process_zip.apply_async(args=(zip_code,), queue=CELERY_QUEUE)
        # Dead code removed: the original kept an ``errors`` list that was
        # never appended to, so its per-error reporting branch could never run.
        stream_monitor('worker', step='store', ms_id=params['ms_id'],
                       store_id=params['store_id'])
    except Exception as e:
        ws_id = stream_monitor('worker', step='store', value=1, ms_id=params['ms_id'],
                               store_id=params['store_id'], br_stats={})
        es_id = stream_monitor('error', ws_id=ws_id, store_id=params['store_id'],
                               code=2, reason=str(e))
        logger.error("Error in get_stores: " + str(e))
    return True
def get_stores(params):
    """Resolve Rappi stores for every known ZIP code via the geolocation service.

    For each ZIP it queries the geolocation service for places, then fans out
    an async ``get_stores_from_coords`` task per place.

    Args:
        params (dict): expects 'ms_id' and 'store_id' for monitoring.

    Returns:
        bool: always True.
    """
    errors = []
    url_zip = "http://" + SRV_GEOLOCATION + "/place/get_places?zip={}"
    br_stats = {}
    # BUGFIX: ``br`` was used below but never created in this function,
    # which raised NameError on the first request.
    br = ByRequest(attempts=1)
    br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
    try:
        # Obtain Rappi stores for each ZIP
        for zip_code in get_zip():
            logger.debug('[ByRequests] Requesting {}'.format(url_zip.format(zip_code)))
            response = br.get(url_zip.format(zip_code), return_json=True)
            br_stats = br.stats
            if isinstance(response, dict):
                logger.debug('Resp is dict')
                places = response.get('places', [])
                logger.debug(places)
                for place in places:
                    # BUGFIX: the original read ``raw_st`` here, which is
                    # undefined in this scope (NameError). The place record is
                    # the only available source of a name — TODO confirm the
                    # intended field with the service owner.
                    gral_data = {
                        "state": place.get('state'),
                        "country": "México",
                        "city": place.get('city'),
                        "zip": place.get('zip'),
                        "name": 'Rappi ' + place.get('name'),
                        "retailer": get_ret_key(place.get('name'))
                    }
                    logger.debug(gral_data)  # was a stray print()
                    get_stores_from_coords.apply_async(
                        args=(place['lat'], place['lng'], gral_data),
                        queue=CELERY_QUEUE)
            else:
                err_st = 'Could not get right response from {}'.format(
                    url_zip.format(zip_code))
                errors.append(MonitorException(code=2, reason=err_st))
                logger.error(err_st)
        if errors:
            ws_id = stream_monitor('worker', step='store', ms_id=params['ms_id'],
                                   store_id=params['store_id'], br_stats=br_stats)
            for error in errors:
                stream_monitor('error', ws_id=ws_id, store_id=params['store_id'],
                               code=error.code, reason=str(error.reason))
        else:
            stream_monitor('worker', step='store', ms_id=params['ms_id'],
                           store_id=params['store_id'])
    except Exception as e:
        ws_id = stream_monitor('worker', step='store', value=1, ms_id=params['ms_id'],
                               store_id=params['store_id'], br_stats=br_stats)
        es_id = stream_monitor('error', ws_id=ws_id, store_id=params['store_id'],
                               code=2, reason=str(e))
        # BUGFIX: error message had an empty function label ("Error in : ").
        logger.error("Error in get_stores: " + str(e))
    return True
# Main method
if __name__ == '__main__':
    logger.info("Started master scraper: " + CELERY_QUEUE +
                " / scraper_type: " + str(SCRAPER_TYPE))
    if SCRAPER_TYPE and len(SCRAPER_TYPE) > 0:
        if SCRAPER_TYPE in ('price', 'item'):
            # Fetch Valid Stores
            sts_to_crawl = request_valid_stores(retailers_to_get, str(SCRAPER_TYPE))
            logger.debug(sts_to_crawl[0])
            ms_id = stream_monitor('master', params=sts_to_crawl[0],
                                   num_stores=len(sts_to_crawl))
            logger.info("Crawling {} stores!".format(len(sts_to_crawl)))
            # Queue every store for asynchronous scraping
            for store_params in sts_to_crawl:
                logger.debug("Calling to scrape")
                call_scraper(store_params, ms_id)
        elif SCRAPER_TYPE == 'store':
            logger.debug("CALLING STORES")
            ms_id = stream_monitor('master', params={})
            st_id = 1
            call_stores(ms_id, st_id)
    else:
        logger.warning(
            'Please indicate the argument type of scraping process')
def crawl_cat(dep_name, scat, params, page=1, next_id=None, run_all=True):
    """Crawl one page of a store subcategory and stream its products.

    Recursively enqueues itself (via Celery) for the next page while the API
    returns a ``next_id`` and ``run_all`` is True.

    Args:
        dep_name (str): parent department name (used for the category path).
        scat (dict): subcategory; expects 'id' and 'name'.
        params (dict): expects 'external_id', 'retailer_key', 'ms_id',
            'store_id' (and whatever ``process_prod`` needs).
        page (int): current page number, for logging.
        next_id: pagination cursor returned by the API, if any.
        run_all (bool): when True, follow pagination asynchronously.

    Returns:
        list: cleaned product dicts for this page; empty on failure.
    """
    br_stats = {}
    br = ByRequest(attempts=1)
    br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
    errors = []
    # BUGFIX: ``prods_ls`` must exist before the try block — it was first
    # assigned inside it, so an early exception (e.g. in br.get) made the
    # final ``return prods_ls`` raise NameError.
    prods_ls = []
    # Url creation
    url = url_cat.format(scat['id'], params['external_id'], LIMIT)
    if next_id is not None:
        url = url + '&next_id={}'.format(next_id)
    logger.debug('[ByRequest] Requesting {}'.format(url))
    try:
        response = br.get(url, return_json=True)
        br_stats = br.stats
        next_id = None
        prod_raw_ls = []
        cat_ls = [dep_name, scat['name']]
        # Product list extraction
        if isinstance(response, dict):
            next_id = response.get('next_id')
            result = response.get('results', [])
            for res in result:
                prod_raw_ls.extend(res.get('products', []))
        else:
            err_st = 'Could not get response from {}'.format(url)
            logger.error(err_st)
            errors.append(MonitorException(code=0, reason=err_st))
        # Check if there are more products to crawl
        n_prod = len(prod_raw_ls)
        logger.info('Found {} products, page {} for {} | {}'.format(
            str(n_prod).ljust(3), str(page).ljust(2),
            params['retailer_key'], ' | '.join(cat_ls)))
        if (next_id is not None) and run_all:
            logger.debug('Found next page...')
            crawl_cat.apply_async(args=(dep_name, scat, params, page + 1, next_id),
                                  queue=CELERY_QUEUE)
        for prod in prod_raw_ls:
            try:
                prod_clean = process_prod(prod, params)
                if prod_clean:
                    prod_clean.update({
                        'categories': cat_ls,
                    })
                    prods_ls.append(prod_clean)
                    stream_info(prod_clean)
                else:
                    err = 'Could not get product'
                    logger.error(err)
                    raise Exception(err)
            except Exception:
                # Record the failure but keep processing the rest of the page.
                err_st = 'Error with product: {}'.format(prod)
                logger.error(err_st)
                errors.append(MonitorException(code=2, reason=err_st))
        if len(errors) > 0:
            ws_id = stream_monitor('worker', step='category', value=1,
                                   ms_id=params['ms_id'],
                                   store_id=params['store_id'], br_stats=br_stats)
            for error in errors:
                stream_monitor('error', ws_id=ws_id, store_id=params['store_id'],
                               code=error.code, reason=str(error.reason))
        else:
            stream_monitor('worker', step='category', value=1,
                           ms_id=params['ms_id'], store_id=params['store_id'])
    except Exception as e:
        err_st = "Unexpected error in crawl_cat: {}".format(e)
        ws_id = stream_monitor('worker', step='category', value=1,
                               ms_id=params['ms_id'],
                               store_id=params['store_id'], br_stats=br_stats)
        es_id = stream_monitor('error', ws_id=ws_id, store_id=params['store_id'],
                               code=2, reason=err_st)
        logger.error(err_st)
    return prods_ls