# Fragment with newlines lost in this paste (not valid Python as one physical
# line; note the embedded '#' comments out everything after
# "# if not r.from_cache:"). Apparent original logic, to be confirmed against
# the un-mangled file:
#   1) Tail of a keyword-search pagination loop: record produrl under
#      searches[kw] and stop paging once the count of unique URLs stops
#      growing (number_of_pdcts_in_kw_search tracks the previous count).
#   2) Product-page download pass: for each unique product URL whose scraped
#      name matches an MH brand per BrandMatcher, fetch the page with
#      requests, dump the raw HTML to /tmp, parse it with lxml, and update
#      the product record with a whitespace-normalised 'volume' taken from
#      the "item-meta txt" node.
# NOTE(review): the trailing dict literal is truncated mid-entry
# ('pdct_img_main_url': has no value) — the rest of this statement lives
# beyond this chunk; restore the original line breaks before running.
print(products[produrl]) searches[kw].append(produrl) if len(set(searches[kw])) == number_of_pdcts_in_kw_search: break else: number_of_pdcts_in_kw_search = len(set(searches[kw])) # if not r.from_cache: # sleep(2) print(kw, p, len(searches[kw])) # Download the pages - with selenium brm = BrandMatcher() for url in sorted(list(set(products))): d = products[url] if brm.find_brand(d['pdct_name_on_eretailer'])['brand'] in mh_brands: print(d['pdct_name_on_eretailer']) url_mod = clean_url(url, root_url=root_url) r = requests.get(url_mod, headers) with open( '/tmp/' + d['pdct_name_on_eretailer'].replace('/', "-") + '.html', 'wb') as f: f.write(r.content) tree = etree.parse(BytesIO(r.content), parser=parser) products[url].update({ 'volume': ' '.join(''.join( tree.xpath('//*[@class="item-meta txt"]//text()')).split()), 'pdct_img_main_url':
# Fragment with newlines lost in this paste (not valid Python as one physical
# line). Apparent original logic, to be confirmed against the un-mangled file:
#   1) Search pagination: walk mybottleshop.com.au catalogsearch pages 1-9
#      for keyword kw, collecting product URLs from each result grid; stop
#      when a page is empty or yields only already-seen URLs, calling
#      getproduct() on each result article along the way.
#   2) Selenium product-page pass: for MH-brand products, save the rendered
#      page via driver.save_page (skipped when the cached file already
#      exists), then parse the og:image meta tag into 'pdct_img_main_url'.
#   3) Start of an image-download loop over MH-brand products — truncated
#      here at the `if` header; its body lives beyond this chunk.
searches[kw] = [] for page in range(1, 10): r = session.get('https://www.mybottleshop.com.au/catalogsearch/result/?p={page}&q={kw}'.format( page=page, kw=quote_plus(kw))) tree = etree.parse(BytesIO(r.content), parser=parser) articles = tree.xpath('id("em-grid-mode")/ul[1]/li') aurls = [a.xpath('.//h2[@itemprop="name"]/a/@href')[0] for a in articles] if not articles or all(a in searches[kw] for a in aurls): break searches[kw] += aurls [getproduct(a) for a in articles] print(kw, len(articles), len(searches[kw])) brm = BrandMatcher() for url, product in products.items(): if brm.find_brand(product['pdct_name_on_eretailer'])['brand'] in mh_brands: print(url) fname = fpath_namer(shop_id, 'pdct', product['pdct_name_on_eretailer'], 0) if not op.exists(fname): driver.get(url) sleep(2) driver.save_page(fname, scroll_to_bottom=True) tree = etree.parse(open(fname), parser=parser) data = { 'pdct_img_main_url': tree.xpath('//meta[@property="og:image"]/@content')[0], } product.update(data) # Download images for url, pdt in products.items(): if 'pdct_img_main_url' in pdt and pdt['pdct_img_main_url'] and brm.find_brand(pdt['pdct_name_on_eretailer'])['brand'] in mh_brands:
# Fragment with newlines lost in this paste (not valid Python as one physical
# line). It begins MID-EXPRESSION — the opening generator over the
# price_amount span belongs to a `' '.join(...)` that starts before this
# chunk. Apparent original logic, to be confirmed against the un-mangled file:
#   1) Tail of search parsing: derive a numeric 'price' from 'raw_price' via
#      getprice(), assert every collected field is truthy, and throttle with
#      sleep(3) when the response was not served from cache.
#   2) Product-page download: for MH-brand products, fetch the page with a
#      cookie-bearing session, dump the HTML to /tmp (filename prefixed with
#      shop_id), parse with lxml, and REBUILD products[url] from scratch with
#      a whitespace-normalised name from the buying-controls_title heading.
# NOTE(review): the trailing dict literal is truncated mid-entry ('volume':
# has no value); also the assert would be stripped under `python -O` —
# confirm that is acceptable in the original script.
w for t in li.xpath('.//span[@class="price_amount"]/@content') for w in t.split()).strip(), } print(kw, products[produrl]) products[produrl]['price'] = getprice(products[produrl]['raw_price']) print(kw, products[produrl]) assert all(products[produrl][k] for k in products[produrl]) if not r.from_cache: sleep(3) print(kw, len(searches[kw])) # Download the pages brm = BrandMatcher() for url in sorted(products): d = products[url] if brm.find_brand(d['pdct_name_on_eretailer'])['brand'] in mh_brands: print(d['pdct_name_on_eretailer']) url_mod = clean_url(url, root_url=root_url) r = session.get(url_mod, cookies=cookies) with open( '/tmp/' + shop_id + ' ' + d['pdct_name_on_eretailer'].replace('/', "-") + '.html', 'wb') as f: f.write(r.content) tree = etree.parse(BytesIO(r.content), parser=parser) products[url] = { 'pdct_name_on_eretailer': ' '.join(w for t in tree.xpath( '//h1[@class="buying-controls_title"]//text()') for w in t.split()).strip(), 'volume':
# Fragment with newlines lost in this paste (not valid Python as one physical
# line). Apparent original logic, to be confirmed against the un-mangled file:
#   1) Search-page capture: save keyword-search page p with selenium
#      (smooth-scroll, then save with scroll_to_bottom) unless already
#      cached, then feed the saved file to kw_parsing() which accumulates
#      into `searches` and `products`.
#   2) Image download: for products with an image URL whose name matches an
#      MH brand (BrandMatcher with special_country='JP'), stream the image
#      with requests (verify=False — TLS verification deliberately off,
#      review whether that is still wanted) into a /tmp file keyed by the
#      URL's hash, then sniff the real image type with imghdr to fix the
#      extension of the final image path.
# NOTE(review): truncated mid-call (`imghdr.what(`) — the remainder lives
# beyond this chunk. Also, `abs(hash(...))` is not stable across processes
# unless PYTHONHASHSEED is fixed, and the stdlib `imghdr` module is removed
# in Python 3.13 — worth confirming the runtime this script targets.
fpath = fpath_namer(shop_id, 'search', kw, p) if not op.exists(fpath): sleep(2) driver.smooth_scroll() driver.save_page(fpath, scroll_to_bottom=True) searches, products = kw_parsing(fpath, kw, searches, products) print(kw, len(searches[kw])) ###################################### # # Download images ########### ###################################### brm = BrandMatcher() for url, pdt in products.items(): if 'pdct_img_main_url' in pdt and pdt['pdct_img_main_url'] and \ brm.find_brand(pdt['pdct_name_on_eretailer'], special_country='JP')['brand'] in mh_brands: print(pdt['pdct_name_on_eretailer'] + "." + pdt['pdct_img_main_url'].split('.')[-1]) print(pdt['pdct_img_main_url']) response = requests.get(pdt['pdct_img_main_url'], stream=True, verify=False, headers=headers) # response.raw.decode_content = True tmp_file_path = '/tmp/' + shop_id + 'mhers_tmp_{}.imgtype'.format( abs(hash(pdt['pdct_img_main_url']))) img_path = img_path_namer(shop_id, pdt['pdct_name_on_eretailer']) with open(tmp_file_path, 'wb') as out_file: shutil.copyfileobj(response.raw, out_file) if imghdr.what(tmp_file_path) is not None: img_path = img_path.split('.')[0] + '.' + imghdr.what(
# Build one copy of the product table per retail segment, tag each copy with
# its segment name and a binary `must_have` flag derived from that segment's
# Y/N column, then export the combined table to the JP reference workbook.
# (This paste had its newlines stripped; formatting reconstructed.)
# NOTE(review): `df` and `tdf` are defined earlier in the file — `tdf` is
# assumed to already hold rows that the segment copies are appended to.
segment_frames = []
for segment in ['SPECIAL', "ACCESSIBLE", "EXCLUSIVE"]:
    tmp = pd.DataFrame(df.copy())
    tmp['segment'] = segment
    tmp['must_have'] = 1 * (tmp[segment] == 'Y')  # 1 when flagged 'Y', else 0
    segment_frames.append(tmp)
# DataFrame.append() was removed in pandas 2.0; pd.concat is the supported
# equivalent and, with defaults, preserves indexes exactly as append did.
tdf = pd.concat([tdf] + segment_frames)
tdf.drop(columns=[
    'SPECIAL', "ACCESSIBLE", "EXCLUSIVE", '_merge',
    'to_delete_flagship_pdct_of_brnd'
], errors='ignore', inplace=True)
# Canonical column order of the exported reference file.
final_cols = [
    'continent', 'country', 'segment', 'ctg', 'brnd', 'brnd_query',
    'pdct_name', 'pdct_quality_name', 'pdct_query', 'pdct_family',
    'pdct_order', 'brnd_order', 'abs_pdct_order', 'ref_pdct_key_viseo',
    'flagship', 'must_have', 'source', 'priority', 'min_price', 'max_price',
    'competitor', 'competitor_query', 'competitor_brnd',
    'competitor_volume_in_ml', 'volume_in_ml', 'box', 'rose', 'vintage',
    'program', 'pdct_names_equivalents', 'words_to_include', 'tolerance05',
    'exclude_terms', 'words_to_include_05', 'pdct_img_ref_path',
    'competitor_min_price', 'competitor_max_price'
]
brm = BrandMatcher()
# `x == x` is False only for NaN, so missing competitors become ''.
# NOTE(review): the source column is taken from `df`, not `tdf` — presumably
# relying on index alignment across the duplicated rows; confirm intentional.
tdf['competitor_brnd'] = df['competitor'].apply(
    lambda x: brm.find_brand(x)['brand'] if x == x else '')
# Surface any mismatch between the expected and actual column sets before
# the selection below raises a KeyError.
print("Differences in columns", set(final_cols) ^ set(tdf.columns))
tdf[final_cols].to_excel(op.join(BASE_DIR, "ressources/pdcts_jp.xlsx"), index=None)
print(f"soffice '{op.join(BASE_DIR, 'ressources/pdcts_jp.xlsx')}'")