def get_store_deps(params):
    """ Method to extract the departments in retailer """
    dep_list = []
    br_stats = {}
    try:
        store_id = params['external_id']
        # Prepare request
        br = ByRequest(attempts=1)
        br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
        logger.debug('[ByRequest] Requesting {}'.format(url_store.format(store_id)))
        response = br.get(url_store.format(store_id), return_json=True)
        br_stats = br.stats
        ws_id = stream_monitor('worker', step='start', value=1, ms_id=params['ms_id'],
                               store_id=params['store_id'], br_stats=br_stats)
        if response:
            # Add departments
            for dep in response:
                dep_list.append(extract_info(dep))
        else:
            err_st = 'Could not get response for {}'.format(url_store.format(store_id))
            logger.error(err_st)
            stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=0, reason=str(err_st))
    except Exception as e:
        err_st = "Unexpected error in get_store_deps: {}".format(e)
        ws_id = stream_monitor('worker', step='start', value=1, ms_id=params['ms_id'],
                               store_id=params['store_id'], br_stats=br_stats)
        stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=2, reason=err_st)
        logger.error(err_st)
        logger.debug(params)
    return dep_list
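# Illustrative usage (a sketch, not part of the original module): the params
# shape is inferred from the key accesses above; url_store and extract_info
# are assumed to be defined elsewhere, and all id values are hypothetical.
# deps = get_store_deps({
#     'external_id': '990',    # Rappi store id (hypothetical)
#     'ms_id': 'ms-123',       # monitor stream id (hypothetical)
#     'store_id': 'st-123',    # internal store id (hypothetical)
# })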
def get_stores_from_coords(lat, lng, gral_data={}):
    url_coord = "https://services.mxgrability.rappi.com/api/base-crack/principal?lat={}&lng={}&device=2"
    br = ByRequest(attempts=2)
    br.add_proxy(OXYLABS, attempts=5, name="Oxylabs")
    stores_ls = []
    errors = []
    lat = frmt_coord(lat)
    lng = frmt_coord(lng)
    logger.debug('[ByRequest] Requesting {}'.format(url_coord.format(lat, lng)))
    resp = br.get(url_coord.format(lat, lng), return_json=True)
    if isinstance(resp, list):
        logger.debug('Got response')
        stores_ls = extract_stores(resp)
    else:
        logger.error('Not a valid response, check if the site changed')
    for raw_st in stores_ls:
        try:
            for loc in raw_st.get('locations', []):
                clean_store = create_st_dict(loc)
                if isinstance(clean_store, dict):
                    clean_store.update(gral_data)
                    stream_info(clean_store)
        except Exception:
            err_st = 'Error with store {}'.format(raw_st)
            errors.append(MonitorException(code=3, reason=err_st))
            logger.error(err_st)
    logger.info('Found {} stores'.format(len(stores_ls)))
    return stores_ls
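# Illustrative call (a sketch): the coordinates and gral_data values below are
# made up, and frmt_coord / extract_stores / create_st_dict live elsewhere.
# stores = get_stores_from_coords(19.4326, -99.1332,
#                                 gral_data={'retailer': 'rappi_la_comer'})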
# -*- coding: utf-8 -*-
import datetime
import random
import time
from pprint import pformat, pprint

import pandas as pd
import requests  # requests/time/random/pformat are used by StoreCrawler below

from worker import app
from ByHelpers import applogger
from ByHelpers.rabbit_engine import (MonitorException, stream_info,
                                     stream_monitor)
from ByRequests.ByRequests import ByRequest
from config import OXYLABS, SRV_GEOLOCATION, CELERY_QUEUE

logger = applogger.get_logger()
br = ByRequest(attempts=2)
br.add_proxy(OXYLABS, attempts=5, name="Oxylabs")

stores_dict = {
    'la_comer': {
        'name': 'La Comer',
        'key': 'rappi_la_comer'
    },
    'costco': {
        'name': 'Costco',
        'key': 'rappi_costco'
    },
    'chedraui': {
        'name': 'Chedraui',
        'key': 'rappi_chedraui'
    },
}
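# Illustrative lookup (assumes the mapping continues for the other retailers):
# stores_dict['costco']  # -> {'name': 'Costco', 'key': 'rappi_costco'}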
# -*- coding: utf-8 -*-
import json
import os
import random
import sys

from ByHelpers import applogger
from ByRequests.ByRequests import ByRequest
from ByHelpers.rabbit_engine import stream_monitor
from config import *
from worker import crawl_store, start_stores, logger

br = ByRequest(attempts=2)

# URLs
geoloc_host = 'http://' + str(SRV_GEOLOCATION)
stores_endp_url = geoloc_host + '/store/retailer?key=%s'

# Variables
retailer_key = 'rappi'
retailer_name = 'Rappi'
retailers_to_get = [
    'rappi_farmazone',
    'rappi_farmacias_similares',
    'rappi_benavides',
    'rappi_farmaciasguadalajara'
]


def call_scraper(params, ms_id):
    """ Call to crawl async elements
class StoreCrawler():

    def __init__(self):
        self.br = ByRequest(attempts=1)
        self.br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
        self._access_token = None
        self.url_auth = 'https://services.mxgrability.rappi.com/api/auth/guest_access_token'
        self.url_content = 'https://services.mxgrability.rappi.com/api/dynamic/context/content'
        self.base_url = 'https://www.rappi.com.mx/'
        self.url_image = 'https://images.rappi.com.mx/products/{}'
        self.url_product = 'https://www.rappi.com.mx/product/{}'
        self.dep_list = []
        self.cat_list = []
        self.product_list = []
        self.total_products = 0

    def perform_request(self, url, headers={}, json={}, method='GET', return_json=True, require_auth=False):
        logger.debug('[ByRequest] Requesting {}'.format(url))
        proxies = {
            "http": OXYLABS,
            "https": OXYLABS,
        }
        global_headers = self.br.headers
        global_headers.update(headers)
        if require_auth:
            global_headers.update({'Authorization': f'Bearer {self.access_token}'})
        response = requests.request(method, url, headers=global_headers, json=json, proxies=proxies)
        if response.status_code >= 300:
            # The guest token may have expired; refresh it once and retry
            if require_auth:
                self.get_auth()
                global_headers.update({'Authorization': f'Bearer {self.access_token}'})
                response = requests.request(method, url, headers=global_headers, json=json, proxies=proxies)
        if response and return_json:
            try:
                response = response.json()
            except Exception:
                response = {}
        return response

    def get_auth(self):
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Content-Type': 'application/json'
        }
        body = {
            "headers": {
                "normalizedNames": {},
                "lazyUpdate": None
            },
            "grant_type": "guest"
        }
        response = self.perform_request(self.url_auth, method='POST', headers=headers, json=body)
        if response:
            access_token = response.get('access_token')
            self._access_token = access_token
            return access_token

    @property
    def access_token(self):
        if not self._access_token:
            self.get_auth()
        return self._access_token

    def get_store_departments(self, params):
        """ Method to extract the departments for given store """
        headers = {
            'language': 'es',
            'Content-Type': 'application/json',
            'app-version': 'web_4.0.6'
        }
        local_products = []
        try:
            store_id = params['external_id']
            # Prepare request
            url = self.url_content
            body = {
                "state": {
                    "lat": str(params['coords']['lat']),
                    "lng": str(params['coords']['lng']),
                    "parent_store_type": None,
                    "store_type": None
                },
                "limit": 10,
                "offset": 0,
                "context": "store_home",
                "stores": [int(store_id)]
            }
            response = self.perform_request(url, method='POST', headers=headers, json=body, require_auth=True)
            if response:
                # Add departments
                for element in response['data']['components']:
                    if element.get('name') == 'aisles_icons_carousel':
                        for cat in element['resource'].get('aisle_icons', []):
                            self.dep_list.append(self.extract_info(cat))
                        resource_products = element['resource'].get('products', [])
                        local_products.extend(resource_products)
            else:
                err_st = 'Could not get department response for {}'.format(url)
                logger.error(err_st)
            self.product_list.extend(local_products)
            logger.info('Found {} departments in {} [{}]'.format(len(self.dep_list), params['retailer_key'], store_id))
        except Exception as e:
            err_st = "Unexpected error in get_store_departments: {}".format(e)
            logger.error(err_st)
            logger.debug(params)
        return self.dep_list

    def get_store_categories(self, params):
        """ Method to extract the categories for given store """
        headers = {
            'language': 'es',
            'Content-Type': 'application/json',
            'app-version': 'web_4.0.6'
        }
        local_products = []
        try:
            store_id = params['external_id']
            # Prepare request
            url = self.url_content
            body = {
                "state": {},
                "limit": 100,
                "offset": 0,
                "context": "aisles_tree",
                "stores": [int(store_id)]
            }
            response = self.perform_request(url, method='POST', headers=headers, json=body, require_auth=True)
            if response:
                # Add categories
                for cat in response['data']['components']:
                    if cat.get('name') == 'aisles_tree':
                        self.cat_list.append(self.extract_info(cat['resource']))
                        resource_products = cat['resource'].get('products', [])
                        local_products.extend(resource_products)
            else:
                err_st = 'Could not get categories response for store {} - {}'.format(params['retailer_key'], store_id)
                logger.error(err_st)
                logger.debug(pformat(body))
            self.product_list.extend(local_products)
            logger.info('Found {} categories in {} [{}]'.format(len(self.cat_list), params['retailer_key'], store_id))
        except Exception as e:
            err_st = "Unexpected error in get_store_categories: {}".format(e)
            logger.error(err_st)
            logger.debug(params)
        return self.cat_list

    def get_category_products(self, category_dict, params):
        """ Method to extract the products for given category """
        headers = {
            'language': 'es',
            'Content-Type': 'application/json',
            'app-version': 'web_4.0.6'
        }
        more_items = True
        offset = 0
        category_products = []
        try:
            store_id = params['external_id']
            # Prepare request
            url = self.url_content
            body = {
                "state": {
                    "aisle_id": str(category_dict['id'])
                },
                "limit": 10,
                "context": "aisle_detail",
                "stores": [int(store_id)]
            }
            while more_items:
                local_products = []
                body["offset"] = offset
                response = self.perform_request(url, method='POST', headers=headers, json=body, require_auth=True)
                if response:
                    # Add products
                    for element in response['data']['components']:
                        if element.get('name') == 'aisle_detail':
                            resource_products = element['resource'].get('products', [])
                            local_products.extend(resource_products)
                else:
                    err_st = 'Could not get product response for category {} - {}'.format(category_dict['name'], category_dict['id'])
                    logger.error(err_st)
                    logger.debug(pformat(body))
                    # retry
                offset += 10
                if len(local_products) < 60:
                    more_items = False
                category_products.extend(local_products)
            logger.info('Found {} products in {} | {} [{}]'.format(len(category_products), category_dict['name'], params['retailer_key'], store_id))
            self.product_list.extend(category_products)
        except Exception as e:
            err_st = "Unexpected error in get_category_products: {}".format(e)
            logger.error(err_st)
            logger.debug(params)
        return category_products

    def extract_info(self, raw_dep_dict):
        children = raw_dep_dict.get('categories', [])
        if children:
            children = [self.extract_info(child) for child in children]
        n_dict = {
            'name': raw_dep_dict.get('name'),
            'id': raw_dep_dict.get('id'),
            'children': children,
            'children_count': raw_dep_dict.get('children_count', len(children)),
            'total_products': raw_dep_dict.get('quantity_products')
        }
        return n_dict

    def process_product(self, params, raw_prod):
        clean_product = {
            'route_key': params['route_key'],
            'retailer': params['retailer_key'],
            'name': raw_prod.get('name'),
            'id': raw_prod.get('product_id'),
            'url': self.url_product.format(raw_prod.get('id')),
            'gtin': raw_prod.get('ean'),
            'date': str(datetime.datetime.utcnow()),
            'description': raw_prod.get('description'),
            'brand': raw_prod.get('trademark'),
            'provider': '',
            'ingredients': [],
            'images': [
                self.url_image.format(raw_prod.get('image'))
            ],
            'raw_attributes': [
                {
                    'key': 'content',
                    'value': raw_prod.get('quantity'),
                    'unit': raw_prod.get('unit_type')
                }
            ],
            'raw_ingredients': '',
            'price': float(raw_prod.get('price')) if raw_prod.get('price') is not None else None,
            'price_original': float(raw_prod.get('real_price')) if raw_prod.get('real_price') is not None else None,
            'discount': float(raw_prod.get('discount')) if raw_prod.get('discount') is not None else None,
            'promo': '',
            'location': {
                'store': [
                    params['store_uuid']
                ]
            }
        }
        return clean_product

    def send_products(self, params):
        logger.info('Found {} products to send {} [{}]'.format(len(self.product_list), params['retailer_key'], params['external_id']))
        for product in self.product_list:
            clean_product = self.process_product(params, product)
            try:
                stream_info(clean_product)
                self.total_products += 1
            except Exception as e:
                err_str = 'Could not send product_id {}: {}'.format(clean_product.get('id'), e)
                logger.error(err_str)
        logger.info('Sent {} products {} [{}]'.format(self.total_products, params['retailer_key'], params['external_id']))

    def crawl_store(self, params):
        self.get_store_categories(params)
        time.sleep(0.5)
        for category_dict in self.cat_list:
            for subcategory_dict in category_dict['children']:
                self.get_category_products(subcategory_dict, params)
                time.sleep(random.randint(2, 4))
        self.send_products(params)
        return self.total_products
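# Illustrative driver (a sketch, not part of the original module): the params
# shape is inferred from the attribute accesses in StoreCrawler; every value
# below is a hypothetical placeholder.
if __name__ == '__main__':
    crawler = StoreCrawler()
    demo_params = {
        'external_id': '990',                                  # hypothetical Rappi store id
        'coords': {'lat': '19.4326', 'lng': '-99.1332'},       # hypothetical coordinates
        'retailer_key': 'rappi_la_comer',
        'route_key': 'product',                                # hypothetical route key
        'store_uuid': '00000000-0000-0000-0000-000000000000',  # hypothetical UUID
    }
    total = crawler.crawl_store(demo_params)
    logger.info('Demo crawl sent {} products'.format(total))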
def crawl_cat(dep_name, scat, params, page=1, next_id=None, run_all=True):
    br_stats = {}
    br = ByRequest(attempts=1)
    br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
    errors = []
    prods_ls = []
    # Url creation
    url = url_cat.format(scat['id'], params['external_id'], LIMIT)
    if next_id is not None:
        url = url + '&next_id={}'.format(next_id)
    logger.debug('[ByRequest] Requesting {}'.format(url))
    try:
        response = br.get(url, return_json=True)
        br_stats = br.stats
        next_id = None
        prod_raw_ls = []
        cat_ls = [dep_name, scat['name']]
        # Product list extraction
        if isinstance(response, dict):
            next_id = response.get('next_id')
            result = response.get('results', [])
            for res in result:
                prod_raw_ls.extend(res.get('products', []))
        else:
            err_st = 'Could not get response from {}'.format(url)
            logger.error(err_st)
            errors.append(MonitorException(code=0, reason=err_st))
        # Check if there are more products to crawl
        n_prod = len(prod_raw_ls)
        logger.info('Found {} products, page {} for {} | {}'.format(str(n_prod).ljust(3), str(page).ljust(2), params['retailer_key'], ' | '.join(cat_ls)))
        if (next_id is not None) and run_all:
            logger.debug('Found next page...')
            crawl_cat.apply_async(args=(dep_name, scat, params, page + 1, next_id), queue=CELERY_QUEUE)
        for prod in prod_raw_ls:
            try:
                prod_clean = process_prod(prod, params)
                if prod_clean:
                    prod_clean.update({
                        'categories': cat_ls,
                    })
                    prods_ls.append(prod_clean)
                    stream_info(prod_clean)
                else:
                    err = 'Could not get product'
                    logger.error(err)
                    raise Exception(err)
            except Exception:
                err_st = 'Error with product: {}'.format(prod)
                logger.error(err_st)
                errors.append(MonitorException(code=2, reason=err_st))
        if len(errors) > 0:
            ws_id = stream_monitor('worker', step='category', value=1, ms_id=params['ms_id'],
                                   store_id=params['store_id'], br_stats=br_stats)
            for error in errors:
                stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=error.code, reason=str(error.reason))
        else:
            stream_monitor('worker', step='category', value=1, ms_id=params['ms_id'], store_id=params['store_id'])
    except Exception as e:
        err_st = "Unexpected error in crawl_cat: {}".format(e)
        ws_id = stream_monitor('worker', step='category', value=1, ms_id=params['ms_id'],
                               store_id=params['store_id'], br_stats=br_stats)
        stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=2, reason=err_st)
        logger.error(err_st)
    return prods_ls
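# Illustrative kickoff (a sketch): the subcategory dict shape mirrors what the
# extract_info-style helpers produce ('id' and 'name' keys); the names, ids,
# and params values here are hypothetical, and url_cat / LIMIT / process_prod
# are assumed to be defined elsewhere in this module.
# crawl_cat('Despensa', {'id': '123', 'name': 'Abarrotes'}, {
#     'external_id': '990', 'retailer_key': 'rappi_la_comer',
#     'ms_id': 'ms-123', 'store_id': 'st-123',
# })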