class DataExtractor:
    """Extract normalized notebook attributes from a Shoptime product page.

    ``response`` is queried through ``find``/``findAll`` and its results are
    used through ``.parent``, ``.text``, ``get_text()`` and item access
    (presumably a BeautifulSoup document -- confirm against the caller).
    ``Processors``, ``Brands``, ``Memory`` and ``Storages`` are defined
    elsewhere in the project; this class only calls their ``get_*`` methods
    to obtain the canonical value for whatever it recognises in the page.
    """

    # Each table pairs a case-insensitive regular expression with the name
    # of the getter to call on the matching catalogue object.  Order
    # matters: the first pattern that matches wins.
    _MEMORY_GETTERS = (
        ('16', 'get_16GB'),
        ('12', 'get_12GB'),
        ('14', 'get_14GB'),
        ('10', 'get_10GB'),
        ('8', 'get_8GB'),
        ('6', 'get_6GB'),
        ('4', 'get_4GB'),
        ('2', 'get_2GB'),
        ('1', 'get_1GB'),
    )
    _BRAND_GETTERS = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
        ('compaq', 'get_compaq'),
        ('seagate', 'get_seagate'),
        ('gigabyte', 'get_gigabyte'),
        ('hp', 'get_hp'),
        ('sony', 'get_sony'),
    )
    _PROCESSOR_GETTERS = (
        ('i3', 'get_i3'),
        ('i5', 'get_i5'),
        ('i7', 'get_i7'),
        ('Pentium', 'get_pentium_quad'),
        ('byt|baytrail', 'get_baytrail'),
        ('amd.+dual core', 'get_amd_dual'),
        ('atom', 'get_atom'),
        ('Intel.+Core.+M', 'get_core_m'),
        ('Celeron', 'get_celeron'),
        ('arm', 'get_arm_a9'),
        ('samsung', 'get_samsung'),
    )
    # (?<!\d) stops a pattern from matching the tail of a longer number
    # (e.g. "80GB" inside "480GB"); " ?" accepts "500GB" and "500 GB".
    _STORAGE_GETTERS = (
        (r'(?<!\d)2 ?TB', 'get_2tb'),
        (r'(?<!\d)1 ?TB', 'get_1tb'),
        (r'(?<!\d)750 ?GB', 'get_750'),
        (r'(?<!\d)640 ?GB', 'get_640'),
        (r'(?<!\d)500 ?GB', 'get_500'),
        (r'(?<!\d)320 ?GB', 'get_320'),
        (r'(?<!\d)256 ?GB', 'get_256'),
        (r'(?<!\d)160 ?GB', 'get_160'),
        (r'(?<!\d)128 ?GB', 'get_128'),
        (r'(?<!\d)80 ?GB', 'get_80'),
        (r'(?<!\d)64 ?GB', 'get_64'),
        (r'(?<!\d)32 ?GB', 'get_32'),
        (r'(?<!\d)16 ?GB', 'get_16'),
    )

    def __init__(self, response, url):
        """Store the parsed page, its URL and the normalization catalogues."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()
        self.storages = Storages()

    def _lookup(self, catalogue, table, raw_data):
        """Call the getter of the first pattern in *table* found in *raw_data*.

        Returns None when no pattern matches.
        """
        for pattern, getter_name in table:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(catalogue, getter_name)()
        return None

    def _spec_text(self, label_pattern):
        """Return the ``<td>`` text of the row whose ``<th>`` matches.

        Raises AttributeError when the row (or its cell) is missing, which
        the callers in ``parse`` translate into a default value.
        """
        header = self.response.find('th', text=re.compile(label_pattern))
        return header.parent.find('td').text

    def parse(self):
        """Build the product dict for the page.

        Every field falls back to a default ('' / 0.0 / False / None for
        storage) when the page does not contain it, so this method does not
        raise for incomplete pages.
        """
        data = {}
        r = self.response

        # Shoptime products
        data['store'] = 'shoptime'

        # product name, without the trailing " (Cód. 123)" marker
        try:
            data['name'] = r.find('h1', {'id': 'main-product-name'})
            data['name'] = self.validate_field(data, 'name')
            data['name'] = re.sub(u' \\(Cód\\. ([0-9])+\\)', '', data['name'])
        except (ValueError, TypeError, AttributeError):
            data['name'] = ''

        # the URL is kept on the instance
        data['url'] = self.url

        # product price
        try:
            data['price'] = r.find(
                'span', {'data-partner-value': True})['data-partner-value']
            data['price'] = self.normalize_price(data['price'])
        except (ValueError, TypeError, AttributeError):
            data['price'] = 0.0

        # availability
        try:
            data['available'] = self.set_available(r, data['price'])
        except (ValueError, TypeError, AttributeError):
            data['available'] = False

        # processor
        try:
            data['processor'] = self.normalize_processor(
                self._spec_text(r'Processador'))
        except (ValueError, TypeError, AttributeError):
            data['processor'] = ''

        # brand: recognised from the product name; the "Marca" cell (which
        # used to be read and then discarded) is the fallback.
        try:
            brand = self.normalize_brand(data['name'])
            if brand is None:
                brand = self.normalize_brand(self._spec_text(r'Marca'))
            data['brand'] = brand.strip()
        except (ValueError, TypeError, AttributeError):
            data['brand'] = ''

        # RAM (kept as the raw cell text)
        # NOTE(review): normalize_memory exists but is not applied here --
        # confirm whether the raw text is intentional.
        try:
            data['ram_memory'] = self._spec_text(u'Memória RAM')
        except (ValueError, TypeError, AttributeError):
            data['ram_memory'] = ''

        # SKU used as identifier
        try:
            data['sku'] = r.find('div', {'data-sku': True})['data-sku']
        except (ValueError, TypeError, AttributeError):
            data['sku'] = ''

        # storage (SSD/HD)
        try:
            try:
                hd = self._spec_text(r'HD')
            except (ValueError, TypeError, AttributeError):
                hd = ''
            try:
                ssd = self._spec_text(r'SSD')
            except (ValueError, TypeError, AttributeError):
                ssd = ''
            data['storage'] = self.normalize_storage(hd, ssd)
        except (ValueError, TypeError, AttributeError):
            data['storage'] = None

        # display size
        try:
            data['display_size'] = self.normalize_display_size(
                self._spec_text(r'Polegadas da Tela'))
        except (ValueError, TypeError, AttributeError):
            data['display_size'] = ''

        # product image; KeyError covers an <img> without a src attribute
        try:
            data['img_url'] = (r.findAll(
                'img', {'class': 'p-image'})[0]['src']).strip()
        except (ValueError, TypeError, AttributeError, IndexError, KeyError):
            data['img_url'] = ''

        return data

    def set_available(self, response, price):
        """Return True when the page has no "unavailable" box and a price."""
        unavailable_box = response.find('div', {'class': 'product-unavailable'})
        return unavailable_box is None and price != 0.0

    def validate_field(self, data, field):
        """Return the stripped text of ``data[field]``, or '' when absent.

        '' is returned both when the element is None and when ``len()`` of
        the element is 0.
        """
        element = data[field]
        if element is None:
            return ''
        # len() is used on purpose instead of truthiness (presumably the
        # element is a BeautifulSoup Tag, which is always truthy -- confirm).
        return element.get_text().strip() if len(element) > 0 else ''

    def normalize_display_size(self, text):
        """Turn e.g. ' 15,6" ' into '15.6"'; return '' for empty input."""
        if not text:
            return ''
        return (text.strip(" \"").replace(",", ".") + "\"").strip()

    def normalize_storage(self, hd, ssd):
        """Return the canonical capacity found in *hd*, else in *ssd*.

        Returns None when neither text contains a recognised capacity.
        """
        for text in (hd, ssd):
            if not text:
                continue
            # ".*" (not ".+") so that "1TB" -- a single digit directly
            # followed by the unit -- is matched as well.
            found = re.search(r'\d+.*[TG]B', text)
            if found is None:
                continue
            capacity = self.get_storage_capacity(found.group())
            if capacity is not None:
                return capacity
        return None

    def normalize_memory(self, raw_data):
        """Return the canonical RAM size named in *raw_data*, or None."""
        return self._lookup(self.memory, self._MEMORY_GETTERS, raw_data)

    def normalize_price(self, raw_data):
        """Convert *raw_data* to a float rounded to 2 places; 0.0 on failure.

        The value is passed to ``float`` as is, so it must already use '.'
        as the decimal separator.
        """
        try:
            return round(float(raw_data), 2)
        except (TypeError, ValueError):
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the canonical brand named in *raw_data*, or None."""
        return self._lookup(self.brands, self._BRAND_GETTERS, raw_data)

    def normalize_processor(self, raw_data):
        """Return the canonical processor named in *raw_data*, or None."""
        # drop literal "\uXXXX" sequences left behind by encoding errors
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        return self._lookup(self.processors, self._PROCESSOR_GETTERS, raw_data)

    # capacity normalization
    def get_storage_capacity(self, raw_data):
        """Return the canonical storage capacity in *raw_data*, or None."""
        return self._lookup(self.storages, self._STORAGE_GETTERS, raw_data)
class DataExtractor():
    """Extract normalized notebook attributes from a Casas Bahia product page.

    ``response`` is queried through ``findAll`` and the returned elements
    are used through ``find``, ``get_text()`` and item access (presumably a
    BeautifulSoup document -- confirm against the caller).  ``Processors``,
    ``Brands``, ``Memory`` and ``Storages`` are defined elsewhere in the
    project; this class only calls their ``get_*`` methods.
    """

    # Each table pairs a case-insensitive regular expression with the name
    # of the getter to call on the matching catalogue object; the first
    # pattern that matches wins.  (?<!\d) stops a pattern from matching the
    # tail of a longer number (e.g. "2GB" inside "32GB"), and " ?" accepts
    # the unit with or without a space.
    _MEMORY_GETTERS = (
        (r'(?<!\d)16 ?GB', 'get_16GB'),
        (r'(?<!\d)12 ?GB', 'get_12GB'),
        (r'(?<!\d)14 ?GB', 'get_14GB'),
        (r'(?<!\d)10 ?GB', 'get_10GB'),
        (r'(?<!\d)8 ?GB', 'get_8GB'),
        (r'(?<!\d)6 ?GB', 'get_6GB'),
        (r'(?<!\d)4 ?GB', 'get_4GB'),
        (r'(?<!\d)2 ?GB', 'get_2GB'),
        (r'(?<!\d)1 ?GB', 'get_1GB'),
    )
    _BRAND_GETTERS = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
        ('hp', 'get_hp'),
        ('sony', 'get_sony'),
    )
    _PROCESSOR_GETTERS = (
        ("i3", 'get_i3'),
        ("i5", 'get_i5'),
        ("i7", 'get_i7'),
        ("Pentium", 'get_pentium_quad'),
        ("byt|baytrail", 'get_baytrail'),
        ('Intel.+[Dd]ual [Cc]ore', 'get_intel_dual'),
        ('Intel.+[Qq]uad [Cc]ore', 'get_intel_quad'),
        ("amd.+dual core", 'get_amd_dual'),
        ("atom", 'get_atom'),
        ("Intel.+Core.+M", 'get_core_m'),
        ("Celeron", 'get_celeron'),
        ("arm", 'get_arm_a9'),
        ("samsung", 'get_samsung'),
    )
    _STORAGE_GETTERS = (
        (r'(?<!\d)2 ?TB', 'get_2tb'),
        (r'(?<!\d)1 ?TB', 'get_1tb'),
        (r'(?<!\d)750 ?GB', 'get_750'),
        (r'(?<!\d)640 ?GB', 'get_640'),
        (r'(?<!\d)500 ?GB', 'get_500'),
        (r'(?<!\d)320 ?GB', 'get_320'),
        (r'(?<!\d)256 ?GB', 'get_256'),
        (r'(?<!\d)160 ?GB', 'get_160'),
        (r'(?<!\d)128 ?GB', 'get_128'),
        (r'(?<!\d)80 ?GB', 'get_80'),
        (r'(?<!\d)64 ?GB', 'get_64'),
        (r'(?<!\d)32 ?GB', 'get_32'),
        (r'(?<!\d)16 ?GB', 'get_16'),
    )

    def __init__(self, response, url):
        """Store the parsed page, its URL and the normalization catalogues."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()
        self.storages = Storages()

    #{ _id, available, brand, color, display_feature, display_size, graphics_processor_name, graphics_processor, name, operating_system, price, processor, ram_memory, sku, screen_resolution, storage, storage_type, url, img_url}
    # TODO: display_feature, display_size, graphics_processor_name, graphics_processor, operating_system, screen_resolution, storage_type, img_url

    def _lookup(self, catalogue, table, raw_data):
        """Call the getter of the first pattern in *table* found in *raw_data*.

        Returns None when no pattern matches.
        """
        for pattern, getter_name in table:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(catalogue, getter_name)()
        return None

    def _dd_text(self, rows):
        """Return the text of the ``<dd>`` in the first of *rows*.

        Returns '' when *rows* is empty or the first row has no ``<dd>``.
        """
        if len(rows) == 0:
            return ''
        cell = rows[0].find('dd')
        return cell.get_text() if cell is not None else ''

    def parse(self):
        """Build the product dict for the page."""
        data = {}
        page = self.response

        # Casas Bahia products
        data['store'] = "casas_bahia"

        # product name
        data['name'] = page.findAll("b", {"itemprop": "name"})
        data['name'] = self.validate_field(data, 'name')

        # the URL is kept on the instance
        data['url'] = self.url

        # product price
        data['price'] = page.findAll("i", {"class": "sale price"})
        data['price'] = self.normalize_price(data['price'])

        # availability: on Casas Bahia a product that has a price is available
        data['available'] = data['price'] is not None and data['price'] != 0.0

        data['img_url'] = page.findAll('img', {'itemprop': 'image'})
        data['img_url'] = self.normalize_img_url(data['img_url'])

        # processor
        data['processor'] = page.findAll("", {"class": "Processador"})
        data['processor'] = self.normalize_processor(
            self.validate_field(data, 'processor'))

        # brand, recognised from the product name
        data['brand'] = self.normalize_brand(data['name'])

        # RAM
        data['ram_memory'] = page.findAll("dl", {"class": "Memoria-RAM"})
        data['ram_memory'] = self.normalize_memory(
            self.validate_field(data, 'ram_memory'))

        # SKU used as identifier: last '-'-separated piece of the URL path,
        # without query string and extension
        data['sku'] = self.url.split('?')[0].split('-')[-1].split('.')[0]

        # storage (SSD/HD)
        disks = page.findAll(
            "dl", {"class": ["Disco-rigido--HD-", "Memoria-Flash--SSD-"]})
        data['storage'] = self.normalize_storage(disks)

        # display size ('' when the row or its <dd> is missing)
        size_rows = page.findAll("dl", {"class": "Tamanho-da-tela"})
        data['display_size'] = self._dd_text(size_rows).strip()

        return data

    def validate_field(self, data, field):
        """Return the stripped text of the first element in ``data[field]``.

        Returns "" when the list is empty.
        """
        matches = data[field]
        return matches[0].get_text().strip() if len(matches) > 0 else ""

    def normalize_img_url(self, img_url):
        """Return the ``src`` of the first image, or None when unavailable."""
        if len(img_url) == 0:
            return None
        try:
            return img_url[0]['src']
        except KeyError:
            # image element without a src attribute
            return None

    def normalize_storage(self, hd):
        """Return the canonical capacity described by the first row of *hd*.

        Returns '' when there is no row (or no text in it) and None when the
        text holds no recognised capacity.
        """
        text = self._dd_text(hd)
        if not text:
            return ''
        # ".*" (not ".+") so that "1TB" -- a single digit directly followed
        # by the unit -- is matched as well.
        found = re.search(r'\d+.*[TG]B', text)
        if found is None:
            return None
        return self.get_storage_capacity(found.group())

    def normalize_memory(self, raw_data):
        """Return the canonical RAM size named in *raw_data*, or None."""
        return self._lookup(self.memory, self._MEMORY_GETTERS, raw_data)

    def normalize_price(self, raw_data):
        """Convert the first element's text from '1.000,00' style to a float.

        Returns 0.0 when *raw_data* is empty or the text is not a number.
        """
        try:
            text = raw_data[0].get_text() if len(raw_data) > 0 else ""
            # drop thousands separators, then use '.' as decimal separator
            return float(text.replace('.', '').replace(',', '.'))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the canonical brand named in *raw_data*, or None."""
        return self._lookup(self.brands, self._BRAND_GETTERS, raw_data)

    # ['Intel Core i3', 'Intel Core i5', 'Intel Core i7', 'Intem Pentium Quad Core', 'Intel Baytrail', 'AMD Dual Core', 'Item Atom', 'Intel Core M', 'Intel Celeron']
    def normalize_processor(self, raw_data):
        """Return the canonical processor named in *raw_data*, or None."""
        # drop literal "\uXXXX" sequences left by encoding errors (e.g. \u84d2)
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        return self._lookup(self.processors, self._PROCESSOR_GETTERS, raw_data)

    # capacity normalization
    def get_storage_capacity(self, raw_data):
        """Return the canonical storage capacity in *raw_data*, or None."""
        return self._lookup(self.storages, self._STORAGE_GETTERS, raw_data)
class DataExtractor():
    """Extract normalized notebook attributes from an Eletro Shopping page.

    ``response`` is queried through ``xpath`` and every result is used as a
    list of strings (presumably an lxml tree or a Scrapy selector whose
    text() queries yield strings -- confirm against the caller).
    ``Processors``, ``Brands`` and ``Memory`` are defined elsewhere in the
    project; this class only calls their ``get_*`` methods.
    """

    # Each table pairs a case-insensitive regular expression with the name
    # of the getter to call on the matching catalogue object.  Order
    # matters: the first pattern that matches wins.
    _MEMORY_GETTERS = (
        ('16', 'get_16GB'),
        ('14', 'get_14GB'),
        ('12', 'get_12GB'),
        ('10', 'get_10GB'),
        ('8', 'get_8GB'),
        ('6', 'get_6GB'),
        ('4', 'get_4GB'),
        ('2', 'get_2GB'),
        ('1', 'get_1GB'),
    )
    _BRAND_GETTERS = (
        ("dell", 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )
    _PROCESSOR_GETTERS = (
        ("i3", 'get_i3'),
        ("i5", 'get_i5'),
        ("i7", 'get_i7'),
        ("Pentium", 'get_pentium_quad'),
        ("byt|baytrail", 'get_baytrail'),
        ("amd.+dual core", 'get_amd_dual'),
        ("atom", 'get_atom'),
        ("Intel.+Core.+M", 'get_core_m'),
        ("Celeron", 'get_celeron'),
        ("arm", 'get_arm_a9'),
        ("samsung", 'get_samsung'),
    )

    def __init__(self, response, url):
        """Store the parsed page, its URL and the normalization catalogues."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    def _lookup(self, catalogue, table, raw_data):
        """Call the getter of the first pattern in *table* found in *raw_data*.

        Returns None when no pattern matches.
        """
        for pattern, getter_name in table:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(catalogue, getter_name)()
        return None

    def _disk_fields(self, namespaces):
        """Return the ``(hd, ssd)`` pair read from the "HD - ..." rows.

        Both values are '' when the page does not describe a disk.
        """
        page = self.response
        capacity_query = '//td[text()="HD - Capacidade"]/following-sibling::td//text()'
        hd = ''
        ssd = ''

        kinds = page.xpath(
            '//td[text()="HD - Tipo"]/following-sibling::td//text()',
            namespaces=namespaces)
        if len(kinds) == 0:
            return hd, ssd

        kind = kinds[0]
        has_hd = re.search('HD', kind, re.IGNORECASE) is not None
        has_ssd = re.search('SSD', kind, re.IGNORECASE) is not None

        if has_hd and has_ssd:
            capacities = page.xpath(capacity_query, namespaces=namespaces)
            if len(capacities) > 0:
                capacity = capacities[0]
                if '+' in capacity:
                    # "<hd> + <ssd>" in the capacity cell
                    pieces = capacity.split('+')
                    hd = pieces[0]
                    ssd = pieces[1]
                else:
                    # capacities embedded in the type cell: the text before
                    # "HD" and the token right before "SSD"
                    hd = kind.split('HD')[0].strip()
                    tokens = kind.split(' ')
                    position = tokens.index('SSD') if 'SSD' in tokens else 0
                    # no standalone, non-leading "SSD" token -> no SSD value
                    # (this used to raise ValueError or wrap to the last token)
                    ssd = tokens[position - 1] if position > 0 else ''
        elif has_hd:
            # NOTE(review): this stores the raw xpath result (a list), while
            # the combined branch above stores strings -- confirm what the
            # consumer of data['disco'] expects before unifying.
            hd = page.xpath(capacity_query, namespaces=namespaces)
        elif has_ssd:
            ssd = page.xpath(capacity_query, namespaces=namespaces)
        return hd, ssd

    def parse(self):
        """Build the product dict for the page."""
        namespaces = {'re': "http://exslt.org/regular-expressions"}
        page = self.response
        data = {}

        data["store"] = "eletro_shopping"

        data["name"] = page.xpath('//div[@itemprop="name"]/h1//text()')
        data["name"] = self.validate_field(data, 'name')

        data["url"] = self.url

        data["price"] = page.xpath('//span[@itemprop="lowPrice"]//text()')
        data["price"] = self.normalize_price(data["price"])

        # a product that has a price is considered available
        data["available"] = data["price"] is not None and data["price"] != 0.0

        data["processor"] = page.xpath(
            '//td[text()="Processador - Modelo"]/following-sibling::td//text()')
        data["processor"] = self.normalize_processor(
            self.validate_field(data, "processor"))

        # brand, recognised from the product name
        data["brand"] = self.normalize_brand(data["name"])

        data["ram_memory"] = page.xpath(
            '//td[contains(text(), "Mem") and contains(text(), "Capacidade")]/following-sibling::td//text()',
            namespaces=namespaces)
        data["ram_memory"] = self.normalize_memory(
            self.validate_field(data, "ram_memory"))

        # SKU: the text after the first ':' of the label, '' when the label
        # is missing or has no ':'
        sku_texts = page.xpath('//div[@itemprop="name"]/span//text()')
        sku_parts = sku_texts[0].split(':') if len(sku_texts) > 0 else []
        data["sku"] = sku_parts[1].strip() if len(sku_parts) > 1 else ''

        data["disco"] = {}
        hd, ssd = self._disk_fields(namespaces)
        data['disco']['hd'] = hd
        data['disco']['ssd'] = ssd

        data["display_size"] = page.xpath(
            '//td[text()="Tela - Tamanho"]/following-sibling::td//text()',
            namespaces=namespaces)
        data["display_size"] = self.validate_field(data, "display_size")

        return data

    def validate_field(self, data, field):
        """Return the first string of ``data[field]`` stripped, or ""."""
        matches = data[field]
        return matches[0].strip() if len(matches) > 0 else ""

    def normalize_memory(self, raw_data):
        """Return the canonical RAM size named in *raw_data*, or None."""
        return self._lookup(self.memory, self._MEMORY_GETTERS, raw_data)

    def normalize_price(self, raw_data):
        """Convert the first string from '1.000,00' style to a float.

        Returns 0.0 when *raw_data* is empty or the text is not a number.
        """
        try:
            text = raw_data[0].strip() if len(raw_data) > 0 else ""
            # drop thousands separators, then use '.' as decimal separator
            return float(text.replace(".", "").replace(",", "."))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the canonical brand named in *raw_data*, or None."""
        return self._lookup(self.brands, self._BRAND_GETTERS, raw_data)

    def normalize_processor(self, raw_data):
        """Return the canonical processor named in *raw_data*, or None."""
        # drop literal "\uXXXX" sequences left by encoding errors (e.g. \u84d2)
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        return self._lookup(self.processors, self._PROCESSOR_GETTERS, raw_data)
class DataExtractor():
    """Extract normalized notebook attributes from an Ibyte product page.

    ``response`` is queried through ``find``/``findAll`` and its results are
    used through ``.parent``, ``.text``, ``get_text()`` and item access
    (presumably a BeautifulSoup document -- confirm against the caller).
    ``Processors``, ``Brands`` and ``Memory`` are defined elsewhere in the
    project; this class only calls their ``get_*`` methods.
    """

    # Each table pairs a case-insensitive regular expression with the name
    # of the getter to call on the matching catalogue object.  Order
    # matters: the first pattern that matches wins.
    _MEMORY_GETTERS = (
        ('16', 'get_16GB'),
        ('12', 'get_12GB'),
        ('14', 'get_14GB'),
        ('10', 'get_10GB'),
        ('8', 'get_8GB'),
        ('6', 'get_6GB'),
        ('4', 'get_4GB'),
        ('2', 'get_2GB'),
        ('1', 'get_1GB'),
    )
    # ["Samsung", "Asus", "Acer", "Dell", "Apple", "Positivo", "LG", "Lenovo"]
    _BRAND_GETTERS = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )
    # ['Intel Core i3', 'Intel Core i5', 'Intel Core i7', 'Intem Pentium Quad Core', 'Intel Baytrail', 'AMD Dual Core', 'Item Atom', 'Intel Core M', 'Intel Celeron']
    _PROCESSOR_GETTERS = (
        ("i3", 'get_i3'),
        ("i5", 'get_i5'),
        ("i7", 'get_i7'),
        ("Pentium", 'get_pentium_quad'),
        ("byt|baytrail", 'get_baytrail'),
        ("amd.+dual core", 'get_amd_dual'),
        ("atom", 'get_atom'),
        ("Intel.+Core.+M", 'get_core_m'),
        ("Celeron", 'get_celeron'),
        ("arm", 'get_arm_a9'),
        ("samsung", 'get_samsung'),
    )
    # a capacity such as "1TB", "500GB" or "500 GB"
    _CAPACITY_PATTERN = r'\d+\s*[TG]B'

    def __init__(self, response, url):
        """Store the parsed page, its URL and the normalization catalogues."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    #{ _id, available, brand, color, display_feature, display_size, graphics_processor_name, graphics_processor, name, operating_system, price, processor, ram_memory, sku, screen_resolution, storage, storage_type, url, img_url}
    # TODO: display_feature, display_size, graphics_processor_name, graphics_processor, operating_system, screen_resolution, storage_type, img_url

    def _lookup(self, catalogue, table, raw_data):
        """Call the getter of the first pattern in *table* found in *raw_data*.

        Returns None when no pattern matches.
        """
        for pattern, getter_name in table:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(catalogue, getter_name)()
        return None

    def _spec_text(self, label_pattern):
        """Return the value-cell text of the row whose label cell matches.

        The value cell is the ``<td width="570">`` of the same row.  Raises
        AttributeError when the row (or its value cell) is missing, which
        the callers in ``parse`` translate into a default value.
        """
        label = self.response.find('td', text=re.compile(label_pattern))
        return label.parent.find('td', {'width': '570'}).text

    def parse(self):
        """Build the product dict for the page.

        Fields the page does not contain fall back to a default
        ('' / 0.0 / False / {} for storage).
        """
        data = {}
        r = self.response

        # Ibyte products
        data['store'] = "ibyte_computadores"

        # product name
        data['name'] = r.findAll("li", {"class": "product"})
        data['name'] = self.validate_field(data, 'name')

        # the URL is kept on the instance
        data['url'] = self.url

        # product price; a page without the price box yields 0.0 instead of
        # raising, like every other optional field below
        try:
            data['price'] = r.find("div", {
                "class": "preco-produto"
            }).parent.findAll('span', {"class": "price"})
            data['price'] = self.normalize_price(data['price'])
        except (ValueError, TypeError, AttributeError):
            data['price'] = 0.0

        # availability: a product that has a price is considered available
        data['available'] = data['price'] is not None and data['price'] != 0.0

        # processor
        try:
            data['processor'] = self.normalize_processor(
                self._spec_text(r'Processador:'))
        except (ValueError, TypeError, AttributeError):
            data['processor'] = ''

        # brand (raw cell text)
        try:
            data['brand'] = self._spec_text(r'Marca:').strip()
        except (ValueError, TypeError, AttributeError):
            data['brand'] = ''

        # RAM
        try:
            data['ram_memory'] = self.normalize_memory(
                self._spec_text(u'Memória RAM:').strip())
        except (ValueError, TypeError, AttributeError):
            data['ram_memory'] = ''

        # SKU used as identifier: last word of the heading, up to ')'
        try:
            data['sku'] = r.find('div', {
                'class': 'product-essential'
            }).parent.find('h3').text.split(' ')[-1].split(')')[0]
        except (ValueError, TypeError, AttributeError):
            data['sku'] = ''

        # storage (SSD/HD)
        try:
            try:
                hd = self._spec_text(r'HD:')
            except (ValueError, TypeError, AttributeError):
                hd = ''
            try:
                ssd = self._spec_text(r'SSD:')
            except (ValueError, TypeError, AttributeError):
                ssd = ''
            data['storage'] = self.normalize_storage(hd, ssd)
        except (ValueError, TypeError, AttributeError):
            data['storage'] = {}

        # display size
        try:
            data['display_size'] = self._spec_text(
                r'Polegadas da Tela:').strip()
        except (ValueError, TypeError, AttributeError):
            data['display_size'] = ''

        # product image; KeyError covers an <img> without a src attribute
        try:
            data['img_url'] = (r.findAll('img',
                                         {'id': 'image'})[0]['src']).strip()
        except (ValueError, TypeError, AttributeError, IndexError, KeyError):
            data['img_url'] = ''

        return data

    def validate_field(self, data, field):
        """Return the stripped text of the first element in ``data[field]``.

        Returns "" when the list is empty.
        """
        matches = data[field]
        return matches[0].get_text().strip() if len(matches) > 0 else ""

    def normalize_storage(self, hd, ssd):
        """Return a dict with the capacity found in *hd* and/or *ssd*.

        ``"HD"`` is present whenever *hd* is non-empty (None when no
        capacity was found in it).  ``"SSD"`` is only looked up when no HD
        capacity was found and *ssd* is non-empty.  Both TB and GB
        capacities are recognised.
        """
        result = {}
        if hd:
            found = re.search(self._CAPACITY_PATTERN, hd)
            result["HD"] = found.group() if found is not None else None
        # The SSD is the fallback for a missing HD capacity.  (The original
        # guard compared the dict itself with None, so it never ran.)
        if ssd and result.get("HD") is None:
            found = re.search(self._CAPACITY_PATTERN, ssd)
            result["SSD"] = found.group() if found is not None else None
        return result

    def normalize_memory(self, raw_data):
        """Return the canonical RAM size named in *raw_data*, or None."""
        return self._lookup(self.memory, self._MEMORY_GETTERS, raw_data)

    def normalize_price(self, raw_data):
        """Convert the first element's text from 'R$ 1.000,00' to a float.

        Returns 0.0 when *raw_data* is empty or the text is not a number.
        """
        try:
            text = raw_data[0].get_text() if len(raw_data) > 0 else ""
            # drop thousands separators, use '.' as decimal separator and
            # remove the currency symbol
            text = text.replace('.', '').replace(',', '.').replace('R$', '')
            return float(text)
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the canonical brand named in *raw_data*, or None."""
        return self._lookup(self.brands, self._BRAND_GETTERS, raw_data)

    def normalize_processor(self, raw_data):
        """Return the canonical processor named in *raw_data*, or None."""
        # drop literal "\uXXXX" sequences left by encoding errors (e.g. \u84d2)
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        return self._lookup(self.processors, self._PROCESSOR_GETTERS, raw_data)
class DataExtractor():
    """Extract normalized notebook data from a Mega Eletronicos product page.

    ``response`` is only used through ``find(...)``, ``.parent`` and ``.text``.
    NOTE(review): that looks like a BeautifulSoup document - confirm against
    the spider that builds it. ``url`` is the address of the scraped page.
    """

    # Full product schema:
    # { _id, available, brand, color, display_feature, display_size,
    #   graphics_processor_name, graphics_processor, name, operating_system,
    #   price, processor, ram_memory, sku, screen_resolution, storage,
    #   storage_type, url, img_url }
    # TODO: display_feature, display_size, graphics_processor_name,
    # graphics_processor, operating_system, screen_resolution, storage_type,
    # img_url

    # (regex, Brands getter name) pairs; checked in order, first match wins.
    # Known brands: Samsung, Asus, Acer, Dell, Apple, Positivo, LG, Lenovo.
    _BRAND_RULES = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )

    # (regex, Processors getter name) pairs; checked in order.
    _PROCESSOR_RULES = (
        ("i3", 'get_i3'),
        ("i5", 'get_i5'),
        ("i7", 'get_i7'),
        ("Pentium", 'get_pentium_quad'),
        ("byt|baytrail", 'get_baytrail'),
        ("amd.+dual core", 'get_amd_dual'),
        ("amd.+quad core", 'get_amd_quad'),
        ("atom", 'get_atom'),
        ("Intel.+Core.+M", 'get_core_m'),
        ("Celeron", 'get_celeron'),
        ("arm", 'get_arm_a9'),
        ("samsung", 'get_samsung'),
    )

    # Supported RAM sizes in GB, in the order the legacy digit scan used.
    _MEMORY_SIZES = (16, 12, 14, 10, 8, 6, 4, 2, 1)

    # (regex, Storages getter name) pairs; the space before the unit is
    # optional and the size must not be the tail of a longer number.
    _STORAGE_RULES = (
        (r'(?<!\d)2\s?TB', 'get_2tb'),
        (r'(?<!\d)1\s?TB', 'get_1tb'),
        (r'(?<!\d)750\s?GB', 'get_750'),
        (r'(?<!\d)640\s?GB', 'get_640'),
        (r'(?<!\d)500\s?GB', 'get_500'),
        (r'(?<!\d)320\s?GB', 'get_320'),
        (r'(?<!\d)256\s?GB', 'get_256'),
        (r'(?<!\d)160\s?GB', 'get_160'),
        (r'(?<!\d)128\s?GB', 'get_128'),
        (r'(?<!\d)80\s?GB', 'get_80'),
        (r'(?<!\d)64\s?GB', 'get_64'),
        (r'(?<!\d)32\s?GB', 'get_32'),
        (r'(?<!\d)16\s?GB', 'get_16'),
    )

    def __init__(self, response, url):
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()
        self.storages = Storages()

    def parse(self):
        """Build the product dict for the page held in ``self.response``.

        Each lookup is best-effort: when a node is missing the field falls
        back to '' (0.0 for the price) instead of aborting the product.
        """
        data = {}
        r = self.response
        lookup_errors = (ValueError, TypeError, AttributeError)

        data['store'] = "mega_eletronicos"

        # Product title: the <h1> beside the title wrapper <div>.
        try:
            title = r.find('div', {
                'class': 'col-md-12 col-sm-12'
            }).parent.find('h1').text
        except lookup_errors:
            title = None
        # The title is already text; running it through validate_field()
        # used to raise and blank the name on every product.
        data['name'] = title.strip() if title is not None else ''

        data['url'] = self.url

        # Price.
        try:
            data['price'] = self.normalize_price(
                r.find('h3', {'class': 'real'}).parent.find('span').text)
        except lookup_errors:
            data['price'] = 0.0

        # Availability: a product that shows a price is treated as available.
        data['available'] = (data['price'] is not None
                             and data['price'] != 0.0)

        # Processor.
        try:
            data['processor'] = self.normalize_processor(
                r.find('td', text=re.compile(
                    r'(Processor|Processador|Intel)')).parent.find(
                        'td', {'width': '65%'}).text)
        except lookup_errors:
            data['processor'] = ''

        # Brand, detected from the product title.
        try:
            data['brand'] = self.normalize_brand(title)
        except lookup_errors:
            data['brand'] = ''

        # RAM.
        try:
            data['ram_memory'] = self.normalize_memory(
                r.find('td', text=re.compile(
                    r'(DDR|SDRAM|RAM)')).parent.find(
                        'td', {'width': '65%'}).text)
        except lookup_errors:
            data['ram_memory'] = ''

        # SKU: last dash-separated URL token, without query string/extension.
        data['sku'] = self.url.split('?')[0].split('-')[-1].split('.')[0]

        # Storage (SSD/HD).
        try:
            data['storage'] = self.normalize_storage(
                r.find('td', text=re.compile(
                    r'(rpm|HDD|SSD)')).parent.find(
                        'td', {'width': '65%'}).text)
        except lookup_errors:
            data['storage'] = ''

        # Screen size. TODO: the value lives in the row matching (LED|LCD),
        # but the sizes could not be normalized yet, so it stays empty.
        data['display_size'] = ''

        return data

    def validate_field(self, data, field):
        """Return the stripped text of ``data[field]``, or '' when empty.

        Accepts a plain string or a sequence whose first item is either a
        string or an object exposing ``get_text()``.
        """
        value = data[field]
        if hasattr(value, 'strip'):
            return value.strip()
        if len(value) == 0:
            return ""
        first = value[0]
        if hasattr(first, 'get_text'):
            first = first.get_text()
        return first.strip()

    def normalize_storage(self, hd):
        """Map free-form disk text to a ``Storages`` value.

        Returns None when ``hd`` is empty or holds no "<number> GB/TB" text.
        """
        if not hd:
            return None
        # ".*" (not ".+") so that "1TB" with nothing between digit and unit
        # is recognised as well.
        found = re.search(r'\d+.*[TG]B', hd, re.IGNORECASE)
        if found is None:
            return None
        return self.get_storage_capacity(found.group())

    def normalize_memory(self, raw_data):
        """Map free-form RAM text to a ``Memory`` value (None if unknown).

        An explicit "<n> GB" amount wins; otherwise the legacy scan for a
        bare supported number is used.
        """
        explicit = re.search(r'(?<!\d)(\d+)\s*GB', raw_data, re.IGNORECASE)
        if explicit is not None:
            size = int(explicit.group(1))
            if size in self._MEMORY_SIZES:
                return getattr(self.memory, 'get_%dGB' % size)()
        for size in self._MEMORY_SIZES:
            if str(size) in raw_data:
                return getattr(self.memory, 'get_%dGB' % size)()
        return None

    def normalize_price(self, raw_data):
        """Convert a Brazilian price string ("1.000,00") to float.

        A leading "R$" is tolerated. Returns 0.0 when the text is not a
        number.
        """
        try:
            text = raw_data.replace('.', '').replace(',', '.')
            return float(text.replace('R$', ''))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the ``Brands`` value whose name occurs in ``raw_data``."""
        for pattern, getter in self._BRAND_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.brands, getter)()
        return None

    def normalize_processor(self, raw_data):
        """Return the ``Processors`` value matching ``raw_data`` (or None)."""
        # Drop literal "\uXXXX" residue left behind by encoding errors.
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        for pattern, getter in self._PROCESSOR_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.processors, getter)()
        return None

    def get_storage_capacity(self, raw_data):
        """Return the ``Storages`` value for a capacity string (or None)."""
        for pattern, getter in self._STORAGE_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.storages, getter)()
        return None
class DataExtractor():
    """Extract normalized notebook data from a Submarino product page.

    NOTE(review): ``parse`` calls both ``response.xpath(...)`` and
    ``response.findAll(...)``; confirm that the response object handed in by
    the spider really offers both APIs.
    """

    # Full product schema:
    # { _id, available, brand, color, display_feature, display_size,
    #   graphics_processor_name, graphics_processor, name, operating_system,
    #   price, processor, ram_memory, sku, screen_resolution, storage,
    #   storage_type, url, img_url }
    # TODO: display_feature, display_size, graphics_processor_name,
    # graphics_processor, operating_system, screen_resolution, storage_type,
    # img_url

    # XPath of the n-th row of the technical-details table.
    _SPEC_ROW = ('//*[@id="productdetails"]/div[3]/section/table/tbody/'
                 'tr[%d]/td/text()')

    # (regex, Brands getter name) pairs; checked in order, first match wins.
    # Known brands: Samsung, Asus, Acer, Dell, Apple, Positivo, LG, Lenovo.
    _BRAND_RULES = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )

    # (regex, Processors getter name) pairs; checked in order.
    _PROCESSOR_RULES = (
        ("i3", 'get_i3'),
        ("i5", 'get_i5'),
        ("i7", 'get_i7'),
        ("Pentium", 'get_pentium_quad'),
        ("byt|baytrail", 'get_baytrail'),
        ("amd.+dual core", 'get_amd_dual'),
        ("atom", 'get_atom'),
        ("Intel.+Core.+M", 'get_core_m'),
        ("Celeron", 'get_celeron'),
        ("arm", 'get_arm_a9'),
        ("samsung", 'get_samsung'),
    )

    # Supported RAM sizes in GB, in the order the legacy digit scan used.
    _MEMORY_SIZES = (16, 12, 14, 10, 8, 6, 4, 2, 1)

    # (regex, Storages getter name) pairs; the space before the unit is
    # optional and the size must not be the tail of a longer number.
    _STORAGE_RULES = (
        (r'(?<!\d)2\s?TB', 'get_2tb'),
        (r'(?<!\d)1\s?TB', 'get_1tb'),
        (r'(?<!\d)750\s?GB', 'get_750'),
        (r'(?<!\d)640\s?GB', 'get_640'),
        (r'(?<!\d)500\s?GB', 'get_500'),
        (r'(?<!\d)320\s?GB', 'get_320'),
        (r'(?<!\d)256\s?GB', 'get_256'),
        (r'(?<!\d)160\s?GB', 'get_160'),
        (r'(?<!\d)128\s?GB', 'get_128'),
        (r'(?<!\d)80\s?GB', 'get_80'),
        (r'(?<!\d)64\s?GB', 'get_64'),
        (r'(?<!\d)32\s?GB', 'get_32'),
        (r'(?<!\d)16\s?GB', 'get_16'),
    )

    def __init__(self, response, url):
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()
        # get_storage_capacity() reads self.storages; it was never created
        # here, so every storage lookup raised AttributeError.
        self.storages = Storages()

    def parse(self):
        """Build the product dict for the page held in ``self.response``.

        Missing XPath nodes yield '' for the affected field instead of
        raising IndexError.
        """
        data = {}

        data['store'] = "submarino"

        # Product name (was ``self.self.response``, an AttributeError).
        data['name'] = self._first_text(
            '/html/body/div[5]/section/div/div/div[2]/div/div[1]/h1/span/text()')

        data['url'] = self.url

        # Price.
        data['price'] = self.normalize_price(
            self.response.findAll("span", {"itemprop": "price/salesPrice"}))

        # Availability: a product that shows a price is treated as available.
        data['available'] = (data['price'] is not None
                             and data['price'] != 0.0)

        # Processor.
        data['processor'] = self.normalize_processor(
            self._first_text(self._SPEC_ROW % 6))

        # Brand, detected from the product name.
        data['brand'] = self.normalize_brand(data['name'])

        # RAM.
        data['ram_memory'] = self.normalize_memory(
            self._first_text(self._SPEC_ROW % 9))

        # SKU. NOTE(review): for an absolute URL the third '/'-separated
        # token is the host name, not a product id - confirm the intended
        # segment. Behaviour is kept as it was.
        data['sku'] = self.url.split('/')[2]

        # Storage (SSD/HD). TODO: locate the SSD row by label instead of by
        # position.
        hd = self._first_text(self._SPEC_ROW % 11)
        ssd = self._first_text(self._SPEC_ROW % 10)
        data['storage'] = self.normalize_storage(hd, ssd)

        # Screen size, taken verbatim from its table cell.
        data['display_size'] = self._first_text(self._SPEC_ROW % 4)

        return data

    def _first_text(self, expression):
        """Return the stripped first result of an XPath query, or ''."""
        return self.validate_field(
            {'value': self.response.xpath(expression)}, 'value')

    @staticmethod
    def _storage_text(value):
        """Return the text of a storage argument.

        Strings pass through; a non-empty result list is read the legacy way
        (text of the <dd> inside its first element).
        """
        if value is None or hasattr(value, 'strip'):
            return value
        if len(value) > 0:
            return value[0].find('dd').get_text()
        return value

    def validate_field(self, data, field):
        """Return the stripped text of ``data[field]``, or '' when empty.

        Accepts a plain string or a sequence whose first item is either a
        string or an object exposing ``get_text()``.
        """
        value = data[field]
        if hasattr(value, 'strip'):
            return value.strip()
        if len(value) == 0:
            return ""
        first = value[0]
        if hasattr(first, 'get_text'):
            first = first.get_text()
        return first.strip()

    def normalize_storage(self, hd, ssd):
        """Map HD/SSD text to a ``Storages`` value; the HD is tried first.

        Returns None when neither argument holds a "<number> GB/TB" text.
        """
        for candidate in (hd, ssd):
            text = self._storage_text(candidate)
            if not text:
                continue
            # ".*" (not ".+") so that "1TB" is recognised as well.
            found = re.search(r'\d+.*[TG]B', text, re.IGNORECASE)
            if found is not None:
                return self.get_storage_capacity(found.group())
        return None

    def normalize_memory(self, raw_data):
        """Map free-form RAM text to a ``Memory`` value (None if unknown).

        An explicit "<n> GB" amount wins; otherwise the legacy scan for a
        bare supported number is used.
        """
        explicit = re.search(r'(?<!\d)(\d+)\s*GB', raw_data, re.IGNORECASE)
        if explicit is not None:
            size = int(explicit.group(1))
            if size in self._MEMORY_SIZES:
                return getattr(self.memory, 'get_%dGB' % size)()
        for size in self._MEMORY_SIZES:
            if str(size) in raw_data:
                return getattr(self.memory, 'get_%dGB' % size)()
        return None

    def normalize_price(self, raw_data):
        """Convert the first price node ("1.000,00") to float.

        ``raw_data`` is a sequence of nodes exposing ``get_text()``. Returns
        0.0 when it is empty or the text is not a number.
        """
        try:
            text = raw_data[0].get_text() if len(raw_data) > 0 else ""
            return float(text.replace('.', '').replace(',', '.'))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the ``Brands`` value whose name occurs in ``raw_data``."""
        for pattern, getter in self._BRAND_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.brands, getter)()
        return None

    def normalize_processor(self, raw_data):
        """Return the ``Processors`` value matching ``raw_data`` (or None)."""
        # Drop literal "\uXXXX" residue left behind by encoding errors.
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        for pattern, getter in self._PROCESSOR_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.processors, getter)()
        return None

    def get_storage_capacity(self, raw_data):
        """Return the ``Storages`` value for a capacity string (or None)."""
        for pattern, getter in self._STORAGE_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.storages, getter)()
        return None
class DataExtractor():
    """Extract normalized notebook data from a Novo Mundo product page.

    ``response`` is queried with ``xpath(...)`` and its results are handled
    as text (NOTE(review): presumably an lxml tree - confirm with the
    spider). ``url`` is the address of the scraped page.
    """

    DOWNLOAD_DELAY = 5

    # XPath for a specification cell. The cell carries several class names,
    # so each one is tested with contains(); comparing @class for equality
    # with two different strings can never be true.
    _SPEC_CELL = ('//td[contains(@class, "value-field") and '
                  'contains(@class, "%s")]//text()')

    # (regex, Brands getter name) pairs; checked in order, first match wins.
    _BRAND_RULES = (
        ("dell", 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )

    # (regex, Processors getter name) pairs; checked in order.
    _PROCESSOR_RULES = (
        ("i3", 'get_i3'),
        ("i5", 'get_i5'),
        ("i7", 'get_i7'),
        ("Pentium", 'get_pentium_quad'),
        ("byt|baytrail", 'get_baytrail'),
        ("amd.+dual core", 'get_amd_dual'),
        ("atom", 'get_atom'),
        ("Intel.+Core.+M", 'get_core_m'),
        ("Celeron", 'get_celeron'),
        ("arm", 'get_arm_a9'),
        ("samsung", 'get_samsung'),
    )

    # Supported RAM sizes in GB, in the order the legacy digit scan used.
    _MEMORY_SIZES = (16, 14, 12, 10, 8, 6, 4, 2, 1)

    def __init__(self, response, url):
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    def parse(self):
        """Build the product dict for the page held in ``self.response``.

        Missing nodes yield '' (0.0 for the price) instead of raising.
        """
        data = {}
        data["store"] = "novo_mundo"

        data['name'] = self._first_text('//div[@class="productName"]//text()')
        data['url'] = self.url

        # Pass the whole price text on: normalize_price() strips the "R$"
        # prefix itself, so no positional split is needed.
        data['price'] = self.normalize_price(
            self.response.xpath('//strong[@class="skuBestPrice"]//text()'))
        data["available"] = (data["price"] is not None
                             and data["price"] != 0.0)

        data['processor'] = self.normalize_processor(
            self._first_text(self._SPEC_CELL % 'Processador'))
        data['brand'] = self.normalize_brand(data['name'])
        data['ram_memory'] = self.normalize_memory(
            self._first_text(self._SPEC_CELL % 'Memoria'))

        data['sku'] = self._first_text('//div[@class="skuReference"]//text()')

        # TODO: storage. The "HD" specification cell was read here but never
        # stored, and this extractor has no storage normalizer yet.

        # Screen size: second space-separated token of the first line.
        # NOTE(review): the cell layout is assumed from the original split
        # logic; when there is no second token the whole line is kept.
        display_line = self._first_text(
            self._SPEC_CELL % 'Tela').split('\n')[0]
        tokens = display_line.split(' ')
        data['display_size'] = (tokens[1].strip() if len(tokens) > 1
                                else display_line)

        return data

    def _first_text(self, expression):
        """Return the stripped first result of an XPath query, or ''."""
        return self.validate_field(
            {'value': self.response.xpath(expression)}, 'value')

    def validate_field(self, data, field):
        """Return the stripped text of ``data[field]``, or '' when empty.

        ``data[field]`` may be a plain string or a sequence of strings, of
        which the first is used.
        """
        value = data[field]
        if hasattr(value, 'strip'):
            return value.strip()
        return value[0].strip() if len(value) > 0 else ""

    def normalize_memory(self, raw_data):
        """Map free-form RAM text to a ``Memory`` value (None if unknown).

        An explicit "<n> GB" amount wins; otherwise the legacy scan for a
        bare supported number is used.
        """
        explicit = re.search(r'(?<!\d)(\d+)\s*GB', raw_data, re.IGNORECASE)
        if explicit is not None:
            size = int(explicit.group(1))
            if size in self._MEMORY_SIZES:
                return getattr(self.memory, 'get_%dGB' % size)()
        for size in self._MEMORY_SIZES:
            if str(size) in raw_data:
                return getattr(self.memory, 'get_%dGB' % size)()
        return None

    def normalize_price(self, raw_data):
        """Convert a Brazilian price ("R$ 1.000,00") to float.

        ``raw_data`` may be the price string itself or a list whose first
        item is that string. Returns 0.0 for empty or non-numeric input.
        """
        if isinstance(raw_data, (list, tuple)):
            raw_data = raw_data[0] if len(raw_data) > 0 else ""
        try:
            text = raw_data.strip().replace('R$', '')
            return float(text.replace(".", "").replace(",", "."))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the ``Brands`` value whose name occurs in ``raw_data``."""
        for pattern, getter in self._BRAND_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.brands, getter)()
        return None

    def normalize_processor(self, raw_data):
        """Return the ``Processors`` value matching ``raw_data`` (or None)."""
        # Drop literal "\uXXXX" residue left behind by encoding errors.
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        for pattern, getter in self._PROCESSOR_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.processors, getter)()
        return None
class DataExtractor():
    """Extract normalized notebook data from an Eletrosom product page.

    ``response`` is queried with ``xpath(...)`` and its results are handled
    as text (NOTE(review): presumably an lxml tree - confirm with the
    spider). ``url`` is the address of the scraped page.
    """

    # (regex, Brands getter name) pairs; checked in order, first match wins.
    _BRAND_RULES = (
        ("dell", 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )

    # (regex, Processors getter name) pairs; checked in order.
    _PROCESSOR_RULES = (
        ("i3", 'get_i3'),
        ("i5", 'get_i5'),
        ("i7", 'get_i7'),
        ("Pentium", 'get_pentium_quad'),
        ("byt|baytrail", 'get_baytrail'),
        ("amd.+dual core", 'get_amd_dual'),
        ("atom", 'get_atom'),
        ("Intel.+Core.+M", 'get_core_m'),
        ("Celeron", 'get_celeron'),
        ("arm", 'get_arm_a9'),
        ("samsung", 'get_samsung'),
    )

    # RAM sizes (GB) for which ``Memory`` has a getter.
    _MEMORY_SIZES = (1, 2, 4, 6, 8, 10, 12, 14, 16)

    def __init__(self, response, url):
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    def parse(self):
        """Build the product dict for the page held in ``self.response``.

        Missing nodes yield '' (0.0 for the price) instead of raising
        IndexError or NameError.
        """
        namespaces = {'re': "http://exslt.org/regular-expressions"}
        query = self.response.xpath
        data = {}
        data["store"] = "eletrosom"

        data['name'] = query(
            '//div[@class="about"]/div[@class="meta"]/h1//text()')
        data['name'] = self.validate_field(data, 'name')
        data['url'] = self.url

        # Price: keep what follows the currency sign ("R$ 1.299,00").
        price_nodes = query(
            '//span[@class="regular-price"]/span/strong//text()')
        price_text = price_nodes[0] if price_nodes else ''
        data['price'] = self.normalize_price(price_text.split('$')[-1])

        # Available unless the "indisponivel" banner is on the page.
        data['available'] = not query('//div[@class="indisponivel"]')

        data['processor'] = query(
            '//td[text()="Processador"]/following-sibling::td//text()')
        data["processor"] = self.normalize_processor(
            self.validate_field(data, "processor"))

        data['brand'] = self.normalize_brand(data['name'])

        data['ram_memory'] = query(
            '//td[contains(text(), "Mem") and contains(text(), "ria") or contains(text(), "RAM")]/following-sibling::td//text()',
            namespaces=namespaces)
        data["ram_memory"] = self.normalize_memory(
            self.validate_field(data, "ram_memory"))

        # SKU: text between ':' and '/' of the product-code paragraph.
        data['sku'] = ''
        sku_nodes = query('//p[@class="code"]//text()')
        if sku_nodes:
            parts = sku_nodes[0].split(':')
            if len(parts) > 1:
                data['sku'] = parts[1].split('/')[0].strip()

        # Disk: the row label tells whether the value is an SSD or an HD.
        # Both default to '' so a missing/empty label cannot leave them
        # unbound.
        disk_cell = '//td[contains(text(), "Disco") and contains(text(), "gido")]'
        hd = ''
        ssd = ''
        disk_labels = query(disk_cell + '//text()')
        if disk_labels and disk_labels[0]:
            disk_values = query(disk_cell + '/following-sibling::td//text()')
            if re.search('ssd', disk_labels[0], re.IGNORECASE) is not None:
                ssd = disk_values
            else:
                hd = disk_values
        data["disco"] = {'hd': hd, 'ssd': ssd}

        # Screen size: the row is labelled either "Tela" or "Tamanho ... Tela".
        if query('//td[text()="Tela"]'):
            data["display_size"] = query(
                '//td[text()="Tela"]/following-sibling::td//text()')
        else:
            data["display_size"] = query(
                '//td[contains(text(), "Tamanho") and contains(text(), "Tela")]/following-sibling::td//text()')
        data["display_size"] = self.validate_field(data, "display_size")

        return data

    def validate_field(self, data, field):
        """Return the stripped text of ``data[field]``, or '' when empty.

        ``data[field]`` may be a plain string or a sequence of strings, of
        which the first is used.
        """
        value = data[field]
        if hasattr(value, 'strip'):
            return value.strip()
        return value[0].strip() if len(value) > 0 else ""

    def normalize_memory(self, raw_data):
        """Map "<n>GB" / "<n> GB" text to a ``Memory`` value (None if none).

        The whole number is read before it is compared, so "16GB" is no
        longer mistaken for "6GB" (nor "12GB" for "2GB").
        """
        for found in re.finditer(r'(?<!\d)(\d+)\s?GB', raw_data,
                                 re.IGNORECASE):
            size = int(found.group(1))
            if size in self._MEMORY_SIZES:
                return getattr(self.memory, 'get_%dGB' % size)()
        return None

    def normalize_price(self, raw_data):
        """Convert a Brazilian price string ("1.000,00") to float.

        Returns 0.0 when the text is not a number.
        """
        try:
            return float(raw_data.replace(".", "").replace(",", "."))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the ``Brands`` value whose name occurs in ``raw_data``."""
        for pattern, getter in self._BRAND_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.brands, getter)()
        return None

    def normalize_processor(self, raw_data):
        """Return the ``Processors`` value matching ``raw_data`` (or None)."""
        # Drop literal "\uXXXX" residue left behind by encoding errors.
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        for pattern, getter in self._PROCESSOR_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.processors, getter)()
        return None
class DataExtractor():
    """Extract normalized notebook data from a Havan product page.

    ``response`` is queried with ``findAll(...)`` and the matches are read
    with ``get_text()`` (NOTE(review): presumably a BeautifulSoup document -
    confirm with the spider). ``url`` is the address of the scraped page.
    """

    # Full product schema:
    # { _id, available, brand, color, display_feature, display_size,
    #   graphics_processor_name, graphics_processor, name, operating_system,
    #   price, processor, ram_memory, sku, screen_resolution, storage,
    #   storage_type, url, img_url }
    # TODO: display_feature, display_size, graphics_processor_name,
    # graphics_processor, operating_system, screen_resolution, storage_type,
    # img_url

    # (regex, Brands getter name) pairs; checked in order, first match wins.
    # Known brands: Samsung, Asus, Acer, Dell, Apple, Positivo, LG, Lenovo.
    _BRAND_RULES = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )

    # (regex, Processors getter name) pairs; checked in order.
    _PROCESSOR_RULES = (
        ("i3", 'get_i3'),
        ("i5", 'get_i5'),
        ("i7", 'get_i7'),
        ("Pentium", 'get_pentium_quad'),
        ("byt|baytrail", 'get_baytrail'),
        ("amd.+dual core", 'get_amd_dual'),
        ("atom", 'get_atom'),
        ("Intel.+Core.+M", 'get_core_m'),
        ("Celeron", 'get_celeron'),
        ("arm", 'get_arm_a9'),
        ("samsung", 'get_samsung'),
    )

    # Supported RAM sizes in GB, in the order the legacy digit scan used.
    _MEMORY_SIZES = (16, 12, 14, 10, 8, 6, 4, 2, 1)

    def __init__(self, response, url):
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    def parse(self):
        """Build the product dict for the page held in ``self.response``."""
        find_all = self.response.findAll
        data = {}

        data['store'] = "havan"

        # Product name.
        data['name'] = find_all("", {"class": "product-qd-v1-name"})
        data['name'] = self.validate_field(data, 'name')

        data['url'] = self.url

        # Price.
        data['price'] = self.normalize_price(
            find_all("", {"class": "skuBestPrice"}))

        # Availability: a product that shows a price is treated as available.
        data['available'] = (data['price'] is not None
                             and data['price'] != 0.0)

        # Processor.
        data['processor'] = find_all(
            "", {"class": "value-field Modelo-do-Processador"})
        data['processor'] = self.normalize_processor(
            self.validate_field(data, 'processor'))

        # Brand, detected from the product name.
        data['brand'] = self.normalize_brand(data['name'])

        # RAM.
        data['ram_memory'] = find_all(
            "td", {"class": "value-field Memoria-Fisica-Disponivel"})
        data['ram_memory'] = self.normalize_memory(
            self.validate_field(data, 'ram_memory'))

        # SKU.
        data['sku'] = find_all("", {"class": "skuReference"})
        data['sku'] = self.validate_field(data, 'sku')

        # Storage: raw text of the hard-disk capacity cell.
        # TODO: the SSD cell uses the class
        # "value-field Unidade-de-Estado-Solida-SSD-" and is not read yet.
        data['storage'] = find_all(
            "", {"class": "value-field Capacidade-do-Disco-Rigido-HD-"})
        data['storage'] = self.validate_field(data, 'storage')

        # Screen size.
        display = find_all("td", {"class": "value-field Tamanho-da-Tela"})
        if len(display) > 0:
            data['display_size'] = self.normalize_display(display)
        else:
            data['display_size'] = ""

        return data

    def validate_field(self, data, field):
        """Return the stripped text of the first match, or '' when empty."""
        matches = data[field]
        if len(matches) > 0:
            return matches[0].get_text().strip()
        return ""

    def normalize_memory(self, raw_data):
        """Map free-form RAM text to a ``Memory`` value (None if unknown).

        An explicit "<n> GB" amount wins, so "4GB DDR3 1600MHz" is 4 GB and
        not 16 GB; otherwise the legacy scan for a bare supported number is
        used.
        """
        explicit = re.search(r'(?<!\d)(\d+)\s*GB', raw_data, re.IGNORECASE)
        if explicit is not None:
            size = int(explicit.group(1))
            if size in self._MEMORY_SIZES:
                return getattr(self.memory, 'get_%dGB' % size)()
        for size in self._MEMORY_SIZES:
            if str(size) in raw_data:
                return getattr(self.memory, 'get_%dGB' % size)()
        return None

    def normalize_price(self, raw_data):
        """Convert the first price node ("R$ 1.000,00") to float.

        Returns 0.0 when ``raw_data`` is empty or its text is not a number.
        """
        try:
            text = raw_data[0].get_text() if len(raw_data) > 0 else ""
            text = text.replace('.', '').replace(',', '.').replace('R$', '')
            return float(text)
        except ValueError:
            return 0.0

    def normalize_display(self, raw_data):
        """Return the first node's text with the inch mark (") removed."""
        try:
            text = raw_data[0].get_text() if len(raw_data) > 0 else ""
            return text.replace('"', '')
        except ValueError:
            return ""

    def normalize_brand(self, raw_data):
        """Return the ``Brands`` value whose name occurs in ``raw_data``."""
        for pattern, getter in self._BRAND_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.brands, getter)()
        return None

    def normalize_processor(self, raw_data):
        """Return the ``Processors`` value matching ``raw_data`` (or None)."""
        # Drop literal "\uXXXX" residue left behind by encoding errors.
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        for pattern, getter in self._PROCESSOR_RULES:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(self.processors, getter)()
        return None