def save_published_id(self, published_offer_list):
    print("Saving IDs")

    # Process each category
    for category in categories:
        print("\t{0}".format(category['name']))

        # Load the list of offers saved locally
        local_offers = IOUtil.json_to_dic('offers.json', category['id'])

        count_remove = 0

        # Walk through all local offers
        for local_offer in tqdm(reversed(local_offers)):
            # Match by link - the local offer must be in the published list, unless it was rejected
            published_offer = self.find_offer(local_offer, published_offer_list, field='link')

            # If it is not published, in theory it was rejected and must leave the local base
            if published_offer is None:
                # Delete the offer locally
                self.process_delete([local_offer])
                local_offers.remove(local_offer)
                count_remove += 1
            else:
                local_offer['published_id'] = published_offer['id']

        # Save the updates
        print("Amount of removed offers:", count_remove)
        IOUtil.dic_to_json('offers.json', category['id'], local_offers)
def fill_gtin(self, offers, category):
    dic_gtin = IOUtil.json_to_dic('gtin.json', category)

    print("Requesting GTIN by offer...")

    # Helper that looks up and fills in the GTIN of a single offer
    def local_request_gtin(offer, dic_gtin):
        # Check whether the GTIN for this product is already stored locally
        if offer['id'] in dic_gtin.keys():
            # Use the stored GTIN
            gtin = dic_gtin[offer['id']]
        else:
            # Try to fetch the GTIN from the web
            gtin = request_gtin(offer['names'], offer['brand'])

            # Check whether the GTIN was found
            if gtin is None:
                # Try to reuse the GTIN from another offer of the same product
                product_id = offer['id'].split('_')[0]

                # Look for offers of the same product
                offer_id = None
                for key in dic_gtin.keys():
                    if product_id in key:
                        offer_id = key
                        break

                # Check whether a matching offer was found
                if offer_id is not None:
                    # Reuse that offer's GTIN
                    gtin = dic_gtin[offer_id]

        if gtin is not None:
            offer['gtin'] = gtin

        return offer

    # Request the GTINs in parallel
    nc = mp.cpu_count()
    offers = Parallel(n_jobs=nc)(delayed(local_request_gtin)(offer, dic_gtin)
                                 for offer in tqdm(offers))

    offers_without_gtin = []
    for offer in reversed(offers):
        # Store the GTIN
        if 'gtin' in offer.keys():
            # Keep the GTIN in the local base for future use
            dic_gtin[offer['id']] = offer['gtin']
        else:
            # Keep the offer without GTIN for a manual lookup later
            offers_without_gtin.append(offer)
            # Remove the offer so it is not published
            offers.remove(offer)

    # Update the list of GTINs
    IOUtil.dic_to_json('gtin.json', category, dic_gtin)
    # IOUtil.dic_to_json('offers_without_gtin.json', category, offers_without_gtin)
    print("Amount of offers without GTIN: ", len(offers_without_gtin))

    return offers
def request_gtin(descriptions, brand):
    response = None
    gtin = None
    try:
        for description in descriptions:
            url = "https://cosmos.bluesoft.com.br/pesquisar?utf8=%E2%9C%93&q={0}"
            url = url.format(description)

            # Retry the same request at most 5 times
            count_request = 0
            while count_request < 5:
                response = requests.get(url)
                if response.status_code == 200:
                    break
                count_request += 1

            gtin = None
            try:
                soup = BeautifulSoup(response.content, "lxml", from_encoding="utf8")
                r = soup.find('body').find('div', id='container-principal')
                r = r.find('section', class_='col-md-6 col-lg-6 col-xs-12 main')
                r = r.find('div', id='results').find('div', class_='list-group').find('ul')
                r = r.find('li', class_='product-list-item item').find('div', class_='content')

                # Check that the product belongs to the same brand
                title = r.find('h5').find('a').get_text().lower()
                if brand.lower() in title or 'moto' in title:
                    # Extract the GTIN
                    gtin = r.find('ul').find_all('a')[-1].get_text()
                    gtin = gtin.zfill(14)
                    break
            except Exception:
                pass
    except Exception:
        pass

    if response is not None and response.status_code != 200:
        print("WARNING: GTIN site status_code = {0}".format(response.status_code))
        print("URL: ", url, "Product description", descriptions[0])

    if gtin is None:
        IOUtil.save_log("WARNING: GTIN not found = ({0}, {1})".format(brand, descriptions[0]))

    return gtin
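# Hedged usage sketch for request_gtin: the product descriptions and brand below are
# made-up examples, not values from the project data. The helper only returns a
# 14-digit, zero-padded GTIN when the Cosmos Bluesoft search page lists a matching
# product of the same brand; otherwise it logs a warning and returns None.
if __name__ == "__main__":
    sample_descriptions = ["Smartphone Moto G7 Plus 64GB", "Moto G7 Plus"]  # hypothetical
    sample_brand = "Motorola"  # hypothetical
    gtin = request_gtin(sample_descriptions, sample_brand)
    if gtin is None:
        print("GTIN not found; the offer would be skipped or sent for manual lookup.")
    else:
        print("GTIN found:", gtin)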
def main(sellerApi, category):
    # Load the offers to be added
    offers = IOUtil.json_to_dic('to_add.json', category['id'])

    # Check whether there is any offer
    if len(offers) == 0:
        print("No offer to add.")
        return

    # Look up and fill in the GTIN
    if category['gtin']:
        offers = sellerApi.fill_gtin(offers, category['id'])

    # Process the offers
    print("Processing to send...")
    offers = sellerApi.process_add(offers)

    # Upload the data update to the site
    print("Updating the website...")
    upload_to_site()

    # Wait 2 minutes for the upload to reach Google
    print("Waiting for 2 minutes to upload to Google...")
    time.sleep(120)

    # Send the batch to Shopping
    insert_batch.do(offers)
def get_offers_by_product_id(self, id, results=1):
    # TEMP: workaround using API v1
    # http://bws.buscape.com.br/service/findOfferList/lomadee/65717751673178504d42633d/BR/?sourceId=36029361&productid=657637&format=json&encode=utf-8&sort=price&results=10
    url = "http://bws.buscape.com.br/service/findOfferList/lomadee/{0}/BR/?"
    url = url.format("65717751673178504d42633d")
    url = url + "{0}&{1}&".format("sourceId=36029361", "productid=" + str(id))
    url = url + "format=json&encode=utf-8&sort=price&results={0}".format(results)

    # Fetch the offers
    json_data = IOUtil.request_json_from_url(url)

    # Technical specifications
    specification = ""
    product = json_data['product'][0]['product']
    # print(product)
    if 'item' in product['specification'].keys():
        for s in product['specification']['item']:
            specification += s['item']['label'] + ": " + s['item']['value'][0] + "\n"

    names = [product['productname']]
    if 'productshortname' in product:
        names.append(product['productshortname'])

    offers = self._get_offers_from_json(json_data)
    for offer in offers:
        offer['names'].extend(names)
        offer['names'].sort(key=len, reverse=True)
        offer['specification'] = specification

    return offers
def get_products_id(self, url, category=None, page=None, max_pages=None, return_cat_id=False):
    products = []

    # Fetch one page of offers
    json_data = IOUtil.request_json_from_url(url.format(page=1 if page is None else page))
    products = self._get_product_ids_from_json(json_data, category, return_cat_id)

    # Return only the requested page
    if page is not None:
        return products

    page = 2

    # Number of pages to fetch
    total_pages = int(json_data['pagination']['totalPage'])
    if max_pages is not None:
        total_pages = min(total_pages, max_pages)

    # Fetch, process and return all products of one page
    def process_batch(url, page):
        json_data = IOUtil.request_json_from_url(url.format(page=page))
        return self._get_product_ids_from_json(json_data, category, return_cat_id)

    # Process all pages in parallel
    nc = mp.cpu_count()
    results = Parallel(n_jobs=nc)(delayed(process_batch)(url, i)
                                  for i in range(page, total_pages + 1))

    # Move the results into the return list
    for result in results:
        products.extend(result)

    return products
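# Minimal standalone sketch of the parallel pagination pattern used above, assuming a
# hypothetical fetch_page(page) callable that returns a list of product ids for one page.
# It shows how joblib fans the remaining pages out over all CPU cores once page 1 has
# been read to determine total_pages; fetch_all_pages and fetch_page are not part of
# the original code.
from joblib import Parallel, delayed
import multiprocessing as mp


def fetch_all_pages(fetch_page, total_pages):
    # Page 1 is assumed to have been fetched already to read the pagination info
    results = Parallel(n_jobs=mp.cpu_count())(
        delayed(fetch_page)(page) for page in range(2, total_pages + 1))

    # Flatten the per-page lists into a single list of products
    products = []
    for page_products in results:
        products.extend(page_products)
    return products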
def process_offers(self, new_offer_list, category):
    print("Processing the offers...")

    # Load the offers of this category and create the result lists
    local_offer_list = IOUtil.json_to_dic('offers.json', category['id'])
    offer_to_add, offer_to_delete, offer_to_update = [], [], []

    print("\tStep 1/2")
    # Products that will be removed
    for local_offer in tqdm(reversed(local_offer_list)):
        # If the local offer is no longer available, it must be removed
        if self.find_offer(local_offer, new_offer_list) is None:
            # Check whether the offer had already been published
            if 'published_id' in local_offer.keys():
                # Schedule the publication for deletion
                offer_to_delete.append(local_offer['published_id'])

            # Remove the offer from the local base
            local_offer_list.remove(local_offer)

    print("\tStep 2/2")
    # Products to be inserted and updated
    for new_offer in tqdm(new_offer_list):
        # Look for the product in the list of published offers
        published_offer = self.find_offer(new_offer, local_offer_list)

        # The product is not published yet, so it will be published and saved locally
        if published_offer is None:
            offer_to_add.append(new_offer)
            local_offer_list.append(new_offer)
        # The product is published but has updates
        elif self.has_updates(published_offer, new_offer):
            # Remove the old offer from the local base
            local_offer_list.remove(published_offer)

            # Check whether the offer has a publication id
            if 'published_id' in published_offer.keys():
                # Add the offer to the update list
                new_offer['published_id'] = published_offer['published_id']
                offer_to_update.append(new_offer)

            # Add the updated offer to the local base
            local_offer_list.append(new_offer)

    # Save the offers locally
    IOUtil.dic_to_json('offers.json', category['id'], local_offer_list)
    print("offer_to_add: ", len(offer_to_add))

    # Return the lists
    return offer_to_add, offer_to_delete, offer_to_update
def main(sellerApi, category):
    offers = IOUtil.json_to_dic('to_update.json', category['id'])
    if len(offers) == 0:
        print("No offer to update.")
        return

    sellerApi.process_delete(offers)
    offers = sellerApi.process_add(offers)

    print("Updating offers...")
    update.do(offers)
def get_categories_by_store(page=None, max_pages=None):
    categories = {}

    # Fetch one page of offers
    json_data = IOUtil.request_json_from_url(url.format(page=1 if page is None else page))
    process_json(json_data, categories)

    # Return only the requested page
    if page is not None:
        return categories

    page = 2

    # Number of pages to fetch
    total_pages = int(json_data['pagination']['totalPage'])
    if max_pages is not None:
        total_pages = min(total_pages, max_pages)

    # Fetch and process the remaining pages, including the last one
    for i in tqdm(range(page, total_pages + 1)):
        json_data = IOUtil.request_json_from_url(url.format(page=i))
        process_json(json_data, categories)

    return categories
def main(category):
    offers = IOUtil.json_to_dic('to_delete.json', category['id'])
    if len(offers) == 0:
        print("No offer to delete.")
        return

    # Upload the data update to the site
    print("Updating the website...")
    upload_to_site()

    print("Deleting {0} offers...".format(len(offers)))
    delete_batch.do(offers)
def find_offer(self, item, list_of_item, field="id"):
    def _p(s):
        return s.strip().lower()

    # List to store the matches
    result_find = []

    # Search for the item
    for i in list_of_item:
        s_1 = _p(i[field])
        s_2 = _p(item[field])
        if (s_1 in s_2) or (s_2 in s_1):
            result_find.append(i)
            IOUtil.save_log("Look at this: " + s_1 + ", " + s_2)

    # If more than one product was found, something is wrong
    if len(result_find) > 1:
        IOUtil.save_log("BUG: more than one product {0} with the same {1}: ".format(
            len(result_find), field) + item['id'])

    # Return the product
    if len(result_find) == 0:
        return None
    return result_find[0]
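# Hedged usage sketch for find_offer, assuming seller_api is an already initialized
# SellerAPI instance. The offer dicts below are hypothetical. Matching is trimmed,
# case-insensitive and substring-based in both directions, which is why a shorter id
# such as "offer_1" would also match "offer_10" when field="id" and trigger the
# "more than one product" log above.
local_offer = {'id': 'offer_1', 'link': 'https://example.com/product-a'}
published = [
    {'id': 'offer_10', 'link': 'https://example.com/product-a?src=feed'},
    {'id': 'offer_2', 'link': 'https://example.com/product-b'},
]

# Matching by link: 'product-a' is a substring of the first published link, so it matches
match = seller_api.find_offer(local_offer, published, field='link')
print(match['id'] if match is not None else "not published")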
def brands_updates(self, _categories=None, to_json=True):
    url = self._get_url('category/_id/{0}')
    dic_brands = defaultdict(list)
    # categories_filter = []

    if _categories is None:
        _categories = categories

    # Iterate over all listed categories
    print("Searching brands by category")
    for category in tqdm(_categories):
        # Fetch the category information
        # print("\tGetting Brands of Category '" + category['name'] + "'")
        json_data = IOUtil.request_json_from_url(url.format(category['id']))

        # Keep only the list of brands
        json_filters = json_data['categories'][0]['filters']

        # Check if it has filters
        if len(json_filters) == 0:
            continue
        # categories_filter.append(category)

        json_brands = json_filters[0]['options']
        list_brands = []

        # Build a list with the brands and their ids
        for brand in json_brands:
            list_brands.append(brand)

        # Save that list in a dict keyed by the category id
        dic_brands[str(category['id'])] = list_brands

    # print("Categories without brands: ", count_no_brand)

    # Generate a json with all the brands
    dic_brands = self.remove_brands(dic_brands)
    if to_json:
        IOUtil.dic_to_json('brands.json', None, dic_brands)
        # IOUtil.dic_to_json('categories_filter.json', None, categories_filter)

    return dic_brands
def get_offers(self, category):
    print("Searching Offers...")
    offers = []
    params = ["filters={filters}", "page={page}"]
    url = self._get_url('product/_category/{cat}', size=100, request_params=params)
    dict_brands = IOUtil.json_to_dic('brands.json', None)

    print("\tGetting Offers from Category: ", category['name'])

    actual_len = len(offers)
    brands = dict_brands[str(category['id'])]
    print("\tLooking in {0} Brands.".format(len(brands)))

    def __product_offers(id, brand):
        # Fetch the offers of one product
        offers_temp = self.get_offers_by_product_id(id)

        # Fill in the brand
        for offer in offers_temp:
            offer['brand'] = brand

        return offers_temp

    # Loop over the brands of this category
    n_brands = len(brands)
    for i in range(n_brands):
        brand = brands[i]
        print("\t\tBrand[{0}/{1}]: {2}".format(i, n_brands, brand['name']))

        print("\t\tGetting products ids...")
        url_cat = url.format(cat=category['id'], filters=brand['id'], page="{page}")
        products_id = self.get_products_id(url_cat, category)

        # Loop over the products of this category and brand
        print("\t\tGetting offers from products ids...")
        nc = mp.cpu_count()
        list_list_offers = Parallel(n_jobs=nc)(delayed(__product_offers)(id, brand['name'])
                                               for id in tqdm(products_id))
        for list_offers in list_list_offers:
            offers.extend(list_offers)

    print("\tAmount of '{0}' found: {1}.".format(category['name'], len(offers) - actual_len))
    print("\tAll found offers: {0}.\n".format(len(offers)))

    return offers
def main(sellerApi, category):
    timer = Timer()
    timer.start()

    if category['id'] is None:
        print("Top offers...")
        offers = sellerApi.get_top_offers()
    else:
        offers = sellerApi.get_offers(category)

    print("Time elapsed to get {0} products: ".format(len(offers)), timer.diff())

    timer.start()
    adds, deletes, updates = sellerApi.process_offers(offers, category)
    print("Time elapsed to process the products: ", timer.diff())

    print("Products to add: ", len(adds))
    print("Products to update: ", len(updates))
    print("Products to delete: ", len(deletes))

    # Save the resulting product lists locally
    IOUtil.dic_to_json('to_add.json', category['id'], adds)
    IOUtil.dic_to_json('to_update.json', category['id'], updates)
    IOUtil.dic_to_json('to_delete.json', category['id'], deletes)
from io_util import IOUtil

song_list = IOUtil.read_song_list()
print(song_list)
def save_music(music, music_name):
    path = IOUtil.get_path()

    # Write the audio bytes to the target file
    with open(path + '/' + music_name, "wb") as f:
        f.write(music)

    print('Saved: ' + music_name + ' ' + path)
def process_delete(self, offers):
    for offer in offers:
        try:
            # Remove the local image of the offer
            os.remove(local_folder + image_folder + "/" + str(offer['id']) + ".jpg")
        except OSError:
            IOUtil.save_log("WARNING: " + str(offer['id']) + ".jpg" + " not found.")
from joblib import Parallel, delayed
import multiprocessing as mp
from tqdm import tqdm
from site_api import image_folder, site_url, local_folder
from util import download_file, str_utf8
from io_util import IOUtil
import os
from GTIN import request_gtin

categories = IOUtil.json_to_dic('categories.json', None)

APP_TOKEN = '1547052034937b19ab87d'
SOURCE_ID = '36029361'


class SellerAPI:
    def __init__(self):
        self.env = "api"

    def set_test_env(self):
        self.env = "sandbox-api"

    def set_production_env(self):
        self.env = "api"  # assumed: mirrors the default environment set in __init__
from glob import glob
from io_util import IOUtil
from tqdm import tqdm
import os
from site_api import upload_to_site

DELETE_ALL = True

if DELETE_ALL:
    files = glob("./Data/*")
    print("Number of data files to clean: ", len(files))
    for file in tqdm(files):
        if 'pem' not in file and 'gtin' not in file:
            if '.json' in file:
                if 'offers' in file:
                    IOUtil.dic_to_json(file, None, [], change_path=False)
                else:
                    IOUtil.dic_to_json(file, None, {}, change_path=False)
            else:
                os.remove(file)

    files = glob("./woow/ImagesProducts/*")
    print("Number of images to remove: ", len(files))
    for file in tqdm(files):
        os.remove(file)

list_products = list.do()
print("Number of published offers: ", len(list_products))

delete_list = []
for product in list_products:
from seller_api import SellerAPI
from io_util import IOUtil

api = SellerAPI()
offers = api.get_top_offers()

IOUtil.dic_to_json('test.json', None, offers)