Example #1
def get_store_deps(params):
    """
        Method to extract the departments in retailer
    """
    dep_list = []
    br_stats = {}
    try:
        store_id = params['external_id']
        # Prepare request
        br = ByRequest(attempts=1)
        br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
        logger.debug('[ByRequest] Requesting {}'.format(url_store.format(store_id)))
        response = br.get(url_store.format(store_id), return_json=True)
        br_stats = br.stats
        ws_id = stream_monitor('worker', step='start', value=1, ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)

        if response:
            # Add departments
            for dep in response:
                dep_list.append(extract_info(dep))

        else:
            err_st = 'Could not get response for {}'.format(url_store.format(store_id))
            logger.error(err_st)
            stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=0, reason=str(err_st))
                
    except Exception as e:
        err_st = "Unexpected error in get_store_deps: {}".format(e)
        ws_id = stream_monitor('worker', step='start', value=1, ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)
        es_id = stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=2, reason=err_st)
        logger.error(err_st)
        logger.debug(params)
    return dep_list
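
For reference, a minimal invocation sketch. The keys shown are the ones get_store_deps actually reads ('external_id', 'ms_id', 'store_id'); the values are hypothetical placeholders.

# Hypothetical usage sketch -- the values below are placeholders
params = {
    'external_id': 99,       # retailer-side store id, interpolated into url_store
    'ms_id': 'ms-0001',      # master run id forwarded to stream_monitor
    'store_id': 'st-0001',   # internal store id used for monitoring
}
dep_list = get_store_deps(params)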
Example #2
def process_prod(raw_prod, params):
    """Normalize a raw product payload into the clean product schema."""
    prod_cl = {}
    ws_id = stream_monitor('worker', step=params.get('route_key'), value=1, ms_id=params['ms_id'], store_id=params['store_id'])
    try:
        prod_cl = {
            'route_key': params['route_key'],
            'retailer': params['retailer_key'],
            'name': raw_prod.get('name'),
            'id': raw_prod.get('product_id'),
            'url': url_product.format(raw_prod.get('id')),
            'gtin': raw_prod.get('ean'),
            'date': str(datetime.datetime.utcnow()),
            'description': raw_prod.get('description'),
            'brand': raw_prod.get('trademark'),
            'provider': '',
            'ingredients': [],
            'images': [
                url_image.format(raw_prod.get('image'))
            ],
            'raw_attributes': [
                {
                    'key': 'content',
                    'value': raw_prod.get('quantity'),
                    'unit': raw_prod.get('unit_type')
                }
            ],
            'raw_ingredients': '',
            'price': float(raw_prod.get('price')) if raw_prod.get('price') is not None else None,
            'price_original': float(raw_prod.get('real_price')) if raw_prod.get('real_price') is not None else None,
            'discount': float(raw_prod.get('discount')) if raw_prod.get('discount') is not None else None,
            'promo': '',
            'location': {
                'store': [
                    params['store_uuid']
                ]
            }
        }
        # logger.debug(prod_cl)
    except Exception as e:
        err_st = "Unexpected error in process_prod: {}".format(e)
        ws_id = stream_monitor('worker', step=params.get('route_key'), value=1, ms_id=params['ms_id'], store_id=params['store_id'])
        es_id = stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=2, reason=err_st)
        logger.error(err_st)
    return prod_cl
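
A minimal invocation sketch for process_prod. The raw_prod keys listed are exactly the ones the function reads, and params additionally needs 'route_key', 'retailer_key', and 'store_uuid'; every value here is a hypothetical placeholder.

# Hypothetical usage sketch -- all field values are placeholders
raw_prod = {
    'product_id': 111,
    'id': 111,
    'name': 'Milk 1L',
    'ean': '0000000000000',
    'description': 'Whole milk, 1 liter',
    'trademark': 'Acme',
    'image': 'milk.jpg',
    'quantity': 1,
    'unit_type': 'L',
    'price': 19.5,
    'real_price': 21.0,
    'discount': 1.5,
}
params = {
    'route_key': 'item',
    'retailer_key': 'rappi',
    'ms_id': 'ms-0001',
    'store_id': 'st-0001',
    'store_uuid': 'uuid-0001',
}
prod_cl = process_prod(raw_prod, params)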
Example #3
def get_stores(params):
    """Queue a process_zip task for every ZIP code."""
    errors = []
    br_stats = {}
    try:
        # Obtain Rappi stores for each ZIP
        for zip_code in get_zip():
            process_zip.apply_async(args=(zip_code,), queue=CELERY_QUEUE)

        if len(errors) > 0:
            ws_id = stream_monitor('worker', step='store', ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)
            for error in errors:
                stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=error.code, reason=str(error.reason))
        else:
            stream_monitor('worker', step='store', ms_id=params['ms_id'], store_id=params['store_id'])

    except Exception as e:
        ws_id = stream_monitor('worker', step='store', value=1, ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)
        es_id = stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=2, reason=str(e))
        logger.error("Error in get_stores: " + str(e))
    return True
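
get_zip is not defined in these examples; both versions of get_stores only require that it be iterable over ZIP codes. A minimal sketch, assuming the codes come from a static list:

# Hypothetical sketch of get_zip -- where the ZIP codes come from
# (file, database, service) is an assumption; only iteration is required
def get_zip():
    zip_codes = ['01000', '06600']  # placeholder ZIP codes
    for zip_code in zip_codes:
        yield zip_code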
Example #4
def get_stores(params):
    """Fetch Rappi stores for every ZIP code via the geolocation service."""
    errors = []
    url_zip = "http://" + SRV_GEOLOCATION + "/place/get_places?zip={}"
    br_stats = {}
    # Prepare request
    br = ByRequest(attempts=1)
    br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
    try:
        # Obtain Rappi stores for each ZIP
        for zip_code in get_zip():
            logger.debug('[ByRequest] Requesting {}'.format(
                url_zip.format(zip_code)))
            response = br.get(url_zip.format(zip_code), return_json=True)
            br_stats = br.stats

            if isinstance(response, dict):
                logger.debug('Resp is dict')
                places = response.get('places', [])

                logger.debug(places)

                for place in places:
                    gral_data = {
                        "state": place.get('state'),
                        "country": "México",
                        "city": place.get('city'),
                        "zip": place.get('zip'),
                        "name": 'Rappi ' + raw_st.get('name'),
                        "retailer": get_ret_key(raw_st.get('name'))
                    }
                    logger.debug(gral_data)

                    get_stores_from_coords.apply_async(args=(place['lat'],
                                                             place['lng'],
                                                             gral_data),
                                                       queue=CELERY_QUEUE)
                    # get_stores_from_coords(place['lat'], place['lng'], gral_data)
            else:
                err_st = 'Could not get right response from {}'.format(
                    url_zip.format(zip_code))
                errors.append(MonitorException(code=2, reason=err_st))
                logger.error(err_st)

        if len(errors) > 0:
            ws_id = stream_monitor('worker',
                                   step='store',
                                   ms_id=params['ms_id'],
                                   store_id=params['store_id'],
                                   br_stats=br_stats)
            for error in errors:
                stream_monitor('error',
                               ws_id=ws_id,
                               store_id=params['store_id'],
                               code=error.code,
                               reason=str(error.reason))
        else:
            stream_monitor('worker',
                           step='store',
                           ms_id=params['ms_id'],
                           store_id=params['store_id'])

    except Exception as e:
        ws_id = stream_monitor('worker',
                               step='store',
                               value=1,
                               ms_id=params['ms_id'],
                               store_id=params['store_id'],
                               br_stats=br_stats)
        es_id = stream_monitor('error',
                               ws_id=ws_id,
                               store_id=params['store_id'],
                               code=2,
                               reason=str(e))
        logger.error("Error in : " + str(e))
    return True
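
MonitorException is also not defined here; the surrounding code only reads its code and reason attributes before forwarding them to stream_monitor. A minimal sketch consistent with that usage:

# Minimal sketch of MonitorException, assuming only the attributes
# these examples actually read
class MonitorException(Exception):
    def __init__(self, code=2, reason=''):
        super().__init__(reason)
        self.code = code      # numeric error code forwarded to stream_monitor
        self.reason = reason  # human-readable failure description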
Example #5

# Main method
if __name__ == '__main__':
    logger.info("Started master scraper: " + CELERY_QUEUE +
                " / scraper_type: " + str(SCRAPER_TYPE))
    if SCRAPER_TYPE:
        if SCRAPER_TYPE in ('price', 'item'):
            # Fetch Valid Stores
            sts_to_crawl = request_valid_stores(retailers_to_get,
                                                str(SCRAPER_TYPE))
            logger.debug(sts_to_crawl[0])
            ms_id = stream_monitor('master',
                                   params=sts_to_crawl[0],
                                   num_stores=len(sts_to_crawl))
            logger.info("Crawling {} stores!".format(len(sts_to_crawl)))
            # Call to crawl all stores async
            for store in sts_to_crawl:
                logger.debug("Calling to scrape")
                call_scraper(store, ms_id)
                # call_parallel(store, ms_id)
        elif SCRAPER_TYPE == 'store':
            logger.debug("CALLING STORES")
            ms_id = stream_monitor('master', params={})
            st_id = 1
            call_stores(ms_id, st_id)
        else:
            logger.warning(
                'Please indicate the argument type of scraping process')
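
The examples never show how SCRAPER_TYPE and CELERY_QUEUE are populated; a common pattern, and purely an assumption here, is to read them from the environment before the main block runs:

import os

# Hypothetical configuration -- the variable names appear in the examples,
# but reading them from the environment is an assumption
CELERY_QUEUE = os.environ.get('CELERY_QUEUE', 'default')
SCRAPER_TYPE = os.environ.get('SCRAPER_TYPE', '')  # 'price', 'item' or 'store'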
Example #6
def crawl_cat(dep_name, scat, params, page=1, next_id=None, run_all=True):
    """Crawl one page of a subcategory and queue the next page, if any."""
    br_stats = {}
    br = ByRequest(attempts=1)
    br.add_proxy(OXYLABS, attempts=3, name='Oxylabs')
    errors = []

    # Url creation
    url = url_cat.format(scat['id'], params['external_id'], LIMIT)
    if next_id is not None:
        url = url + '&next_id={}'.format(next_id)

    logger.debug('[ByRequest] Requesting {}'.format(url))
    
    try:
        response = br.get(url, return_json=True)
        br_stats = br.stats
        next_id = None
        prod_raw_ls = []
        prods_ls = []
        cat_ls = [dep_name, scat['name']]

        # Product list extraction
        if isinstance(response, dict):
            next_id = response.get('next_id')
            result = response.get('results', [])
            for res in result:
                prod_raw_ls.extend(res.get('products', []))
        else:
            err_st = 'Could not get response from {}'.format(url)
            logger.error(err_st)
            errors.append(MonitorException(code=0, reason=err_st))

        # Check if there are more products to crawl
        n_prod = len(prod_raw_ls)
        logger.info('Found {} products, page {} for {} | {}'.format(str(n_prod).ljust(3), str(page).ljust(2), params['retailer_key'], ' | '.join(cat_ls)))

        if (next_id is not None) and run_all:
            logger.debug('Found next page...')
            # crawl_cat(dep_name, scat, params, page=page+1, next_id=next_id)
            crawl_cat.apply_async(args=(dep_name, scat, params, page+1, next_id), queue=CELERY_QUEUE)
        for prod in prod_raw_ls:
            try:
                prod_clean = process_prod(prod, params)
                if prod_clean:
                    prod_clean.update({
                        'categories': cat_ls,
                    })
                    prods_ls.append(prod_clean)
                    stream_info(prod_clean)
                else:
                    err = 'Could not get product'
                    logger.error(err)
                    raise Exception(err)
            except Exception as exe:
                err_st = 'Error with product {}: {}'.format(prod, exe)
                logger.error(err_st)
                errors.append(MonitorException(code=2, reason=err_st))
                
        if len(errors) > 0:
            ws_id = stream_monitor('worker', step='category', value=1, ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)
            for error in errors:
                stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=error.code, reason=str(error.reason))
        else:
            stream_monitor('worker', step='category', value=1, ms_id=params['ms_id'], store_id=params['store_id'])

    except Exception as e:
        err_st = "Unexpected error in crawl_cat: {}".format(e)
        ws_id = stream_monitor('worker', step='category', value=1, ms_id=params['ms_id'], store_id=params['store_id'], br_stats=br_stats)
        es_id = stream_monitor('error', ws_id=ws_id, store_id=params['store_id'], code=2, reason=err_st)
        logger.error(err_st)
    return prods_ls
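
A minimal invocation sketch for crawl_cat. scat needs the 'id' and 'name' keys the function reads, params needs the keys used by both crawl_cat and process_prod, and run_all=False prevents follow-up pages from being queued; all values are hypothetical.

# Hypothetical usage sketch -- ids, names, and keys are placeholders
scat = {'id': 123, 'name': 'Snacks'}
params = {
    'external_id': 99,
    'route_key': 'item',
    'retailer_key': 'rappi',
    'ms_id': 'ms-0001',
    'store_id': 'st-0001',
    'store_uuid': 'uuid-0001',
}
prods_ls = crawl_cat('Groceries', scat, params, run_all=False)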