Example #1
0
class DataExtractor:
    """Extract normalized product data from a Shoptime product page.

    ``response`` is the parsed page; the code only relies on it offering
    ``find``/``findAll`` (presumably a BeautifulSoup document -- confirm
    against the caller).  ``url`` is copied verbatim into the parsed record.
    """

    # Errors that mean "this field is missing or malformed on the page".
    _FIELD_ERRORS = (ValueError, TypeError, AttributeError)

    # Literal "\uXXXX" residue left in scraped text by encoding errors.
    # Written as a raw string: the old non-raw literal was a truncated
    # unicode escape and therefore a SyntaxError on Python 3.
    _ESCAPE_RESIDUE = r'\\u\w\w\w\w'

    # Span from the first number to the last TB/GB unit on the line.  ``.*``
    # (instead of the former ``.+``) lets compact values such as "1TB" match.
    _CAPACITY_SPAN = r'\d+.*[TG]B'

    # Ordered (pattern, getter-name) tables.  Patterns are searched
    # case-insensitively and the first hit wins, so order is significant.
    _MEMORY_RULES = (
        ('16', 'get_16GB'),
        ('12', 'get_12GB'),
        ('14', 'get_14GB'),
        ('10', 'get_10GB'),
        ('8', 'get_8GB'),
        ('6', 'get_6GB'),
        ('4', 'get_4GB'),
        ('2', 'get_2GB'),
        ('1', 'get_1GB'),
    )

    # Each brand ``x`` maps to the getter ``get_x`` on ``self.brands``.
    _BRANDS = (
        'dell', 'asus', 'apple', 'acer', 'samsung', 'positivo', 'lenovo',
        'lg', 'compaq', 'seagate', 'gigabyte', 'hp', 'sony',
    )

    _PROCESSOR_RULES = (
        ('i3', 'get_i3'),
        ('i5', 'get_i5'),
        ('i7', 'get_i7'),
        ('Pentium', 'get_pentium_quad'),
        ('byt|baytrail', 'get_baytrail'),
        ('amd.+dual core', 'get_amd_dual'),
        ('atom', 'get_atom'),
        ('Intel.+Core.+M', 'get_core_m'),
        ('Celeron', 'get_celeron'),
        ('arm', 'get_arm_a9'),
        ('samsung', 'get_samsung'),
    )

    # Every capacity accepts both "500GB" and "500 GB".
    _STORAGE_RULES = (
        ('2 ?TB', 'get_2tb'),
        ('1 ?TB', 'get_1tb'),
        ('750 ?GB', 'get_750'),
        ('640 ?GB', 'get_640'),
        ('500 ?GB', 'get_500'),
        ('320 ?GB', 'get_320'),
        ('256 ?GB', 'get_256'),
        ('160 ?GB', 'get_160'),
        ('128 ?GB', 'get_128'),
        ('80 ?GB', 'get_80'),
        ('64 ?GB', 'get_64'),
        ('32 ?GB', 'get_32'),
        ('16 ?GB', 'get_16'),
    )

    def __init__(self, response, url):
        """Store the parsed page and URL and create the lookup catalogs."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()
        self.storages = Storages()

    def parse(self):
        """Return a dict describing the product found in ``self.response``.

        Every field is extracted independently; a field that is missing or
        malformed falls back to a neutral default (``''``, ``0.0`` or
        ``False``) instead of aborting the whole parse.
        """
        r = self.response
        data = {}

        data['store'] = 'shoptime'

        # Product name, minus the trailing " (Cód. 123)" catalogue code.
        try:
            data['name'] = r.find('h1', {'id': 'main-product-name'})
            data['name'] = self.validate_field(data, 'name')
            data['name'] = re.sub(u' \\(Cód\\. ([0-9])+\\)', '', data['name'])
        except self._FIELD_ERRORS:
            data['name'] = ''

        data['url'] = self.url

        # Price comes from the span's "data-partner-value" attribute.
        try:
            price_tag = r.find('span', {'data-partner-value': True})
            data['price'] = self.normalize_price(
                price_tag['data-partner-value'])
        except self._FIELD_ERRORS:
            data['price'] = 0.0

        try:
            data['available'] = self.set_available(r, data['price'])
        except self._FIELD_ERRORS:
            data['available'] = False

        try:
            data['processor'] = self.normalize_processor(
                self._spec_text(r'Processador'))
        except self._FIELD_ERRORS:
            data['processor'] = ''

        # The brand is recognised from the product name, but the field is
        # still blanked when the page has no "Marca" row (historic behaviour).
        # NOTE(review): confirm whether the "Marca" cell itself should be used.
        try:
            self._spec_text(r'Marca')
            data['brand'] = self.normalize_brand(data['name']).strip()
        except self._FIELD_ERRORS:
            data['brand'] = ''

        # RAM is stored as the raw cell text (not passed to normalize_memory).
        try:
            data['ram_memory'] = self._spec_text(u'Memória RAM')
        except self._FIELD_ERRORS:
            data['ram_memory'] = ''

        try:
            data['sku'] = r.find('div', {'data-sku': True})['data-sku']
        except self._FIELD_ERRORS:
            data['sku'] = ''

        # Storage: HD is preferred, SSD is the fallback.
        try:
            data['storage'] = self.normalize_storage(
                self._spec_text_or_blank(r'HD'),
                self._spec_text_or_blank(r'SSD'))
        except self._FIELD_ERRORS:
            data['storage'] = self.normalize_storage('', '')

        try:
            data['display_size'] = self.normalize_display_size(
                self._spec_text(r'Polegadas da Tela'))
        except self._FIELD_ERRORS:
            data['display_size'] = ''

        # KeyError covers an <img> tag that carries no "src" attribute.
        try:
            data['img_url'] = r.findAll(
                'img', {'class': 'p-image'})[0]['src'].strip()
        except (ValueError, TypeError, AttributeError, IndexError, KeyError):
            data['img_url'] = ''

        return data

    def _spec_text(self, label):
        """Return the ``td`` text of the spec row whose ``th`` matches *label*.

        Raises AttributeError when the row (or its ``td``) is missing.
        """
        header = self.response.find('th', text=re.compile(label))
        return header.parent.find('td').text

    def _spec_text_or_blank(self, label):
        """Like :meth:`_spec_text`, but return ``''`` for a missing row."""
        try:
            return self._spec_text(label)
        except self._FIELD_ERRORS:
            return ''

    @staticmethod
    def _first_match(rules, raw_data, catalog):
        """Call the catalog getter of the first rule found in *raw_data*.

        Returns ``None`` when no pattern matches.
        """
        for pattern, getter in rules:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(catalog, getter)()
        return None

    def set_available(self, response, price):
        """Return True when the page has no "product-unavailable" div and
        *price* is non-zero."""
        marker = response.find('div', {'class': 'product-unavailable'})
        return marker is None and price != 0.0

    def validate_field(self, data, field):
        """Return the stripped text of the element stored in ``data[field]``.

        Returns ``''`` for an element of length zero and ``None`` when the
        stored value is ``None``.
        """
        element = data[field]
        if element is None:
            return None
        return element.get_text().strip() if len(element) > 0 else ''

    def normalize_display_size(self, text):
        """Format a screen size such as ``'15,6"'`` as ``'15.6"'``.

        Returns ``''`` for ``None`` or empty input.
        """
        if text is None or len(text) == 0:
            return ''
        return (text.strip(" \"").replace(",", ".") + "\"").strip()

    def normalize_storage(self, hd, ssd):
        """Return the catalog capacity for the HD text, else for the SSD text.

        The first description containing a capacity span decides the result,
        even if that span is not a known capacity.  Returns ``None`` when
        neither description contains one.
        """
        for description in (hd, ssd):
            if description is None or len(description) == 0:
                continue
            span = re.search(self._CAPACITY_SPAN, description)
            if span is not None:
                return self.get_storage_capacity(span.group())
        return None

    def normalize_memory(self, raw_data):
        """Map a RAM description to a ``self.memory`` catalog value.

        Returns ``None`` when no known size is found.
        """
        return self._first_match(self._MEMORY_RULES, raw_data, self.memory)

    def normalize_price(self, raw_data):
        """Convert a plain decimal string to a float rounded to two places.

        No thousands/decimal-separator rewriting is done here.  Returns
        ``0.0`` when the text is not a valid float (including ``''``).
        """
        try:
            return round(float(raw_data), 2)
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Map free text to a ``self.brands`` catalog value.

        Returns ``None`` when no known brand is mentioned.
        """
        for brand in self._BRANDS:
            if re.search(brand, raw_data, re.IGNORECASE) is not None:
                return getattr(self.brands, 'get_' + brand)()
        return None

    def normalize_processor(self, raw_data):
        """Map a processor description to a ``self.processors`` catalog value.

        Literal unicode-escape residue is removed first so it cannot trigger
        a false match.  Returns ``None`` for an unknown processor.
        """
        cleaned = re.sub(self._ESCAPE_RESIDUE, '', raw_data)
        return self._first_match(
            self._PROCESSOR_RULES, cleaned, self.processors)

    def get_storage_capacity(self, raw_data):
        """Map a capacity text to a ``self.storages`` catalog value.

        Returns ``None`` for an unknown capacity.
        """
        return self._first_match(self._STORAGE_RULES, raw_data, self.storages)
Example #2
0
class DataExtractor():
    """Extract normalized product data from a Casas Bahia product page.

    ``response`` is the parsed page; the code only relies on it offering
    ``findAll`` (presumably a BeautifulSoup document -- confirm against the
    caller).  ``url`` is copied into the record and also supplies the SKU.
    """

    # Record fields: { _id, available, brand, color, display_feature,
    # display_size, graphics_processor_name, graphics_processor, name,
    # operating_system, price, processor, ram_memory, sku, screen_resolution,
    # storage, storage_type, url, img_url }
    #
    # TODO: display_feature, graphics_processor_name, graphics_processor,
    # operating_system, screen_resolution, storage_type

    # Literal "\uXXXX" residue left in scraped text by encoding errors.
    # Written as a raw string: the old non-raw literal was a truncated
    # unicode escape and therefore a SyntaxError on Python 3.
    _ESCAPE_RESIDUE = r'\\u\w\w\w\w'

    # Span from the first number to the last TB/GB unit on the line.  ``.*``
    # (instead of the former ``.+``) lets compact values such as "1TB" match.
    _CAPACITY_SPAN = r'\d+.*[TG]B'

    # Ordered (pattern, getter-name) tables.  Patterns are searched
    # case-insensitively and the first hit wins, so order is significant.
    _MEMORY_RULES = (
        ('16 ?GB', 'get_16GB'),
        ('12 ?GB', 'get_12GB'),
        ('14 ?GB', 'get_14GB'),
        ('10 ?GB', 'get_10GB'),
        ('8 ?GB', 'get_8GB'),
        ('6 ?GB', 'get_6GB'),
        ('4 ?GB', 'get_4GB'),
        ('2 ?GB', 'get_2GB'),
        ('1 ?GB', 'get_1GB'),
    )

    # Each brand ``x`` maps to the getter ``get_x`` on ``self.brands``.
    _BRANDS = (
        'dell', 'asus', 'apple', 'acer', 'samsung', 'positivo', 'lenovo',
        'lg', 'hp', 'sony',
    )

    _PROCESSOR_RULES = (
        ('i3', 'get_i3'),
        ('i5', 'get_i5'),
        ('i7', 'get_i7'),
        ('Pentium', 'get_pentium_quad'),
        ('byt|baytrail', 'get_baytrail'),
        ('Intel.+[Dd]ual [Cc]ore', 'get_intel_dual'),
        ('Intel.+[Qq]uad [Cc]ore', 'get_intel_quad'),
        ('amd.+dual core', 'get_amd_dual'),
        ('atom', 'get_atom'),
        ('Intel.+Core.+M', 'get_core_m'),
        ('Celeron', 'get_celeron'),
        ('arm', 'get_arm_a9'),
        ('samsung', 'get_samsung'),
    )

    # Every capacity accepts both "500GB" and "500 GB".
    _STORAGE_RULES = (
        ('2 ?TB', 'get_2tb'),
        ('1 ?TB', 'get_1tb'),
        ('750 ?GB', 'get_750'),
        ('640 ?GB', 'get_640'),
        ('500 ?GB', 'get_500'),
        ('320 ?GB', 'get_320'),
        ('256 ?GB', 'get_256'),
        ('160 ?GB', 'get_160'),
        ('128 ?GB', 'get_128'),
        ('80 ?GB', 'get_80'),
        ('64 ?GB', 'get_64'),
        ('32 ?GB', 'get_32'),
        ('16 ?GB', 'get_16'),
    )

    def __init__(self, response, url):
        """Store the parsed page and URL and create the lookup catalogs."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()
        self.storages = Storages()

    def parse(self):
        """Return a dict describing the product found in ``self.response``.

        Missing elements yield neutral values (``''``, ``0.0``, ``None``)
        because every lookup goes through a length-checked helper.
        """
        page = self.response
        data = {}

        data['store'] = "casas_bahia"

        data['name'] = page.findAll("b", {"itemprop": "name"})
        data['name'] = self.validate_field(data, 'name')

        data['url'] = self.url

        data['price'] = self.normalize_price(
            page.findAll("i", {"class": "sale price"}))

        # On this store a product that has a price is considered available.
        data['available'] = (data['price'] is not None
                             and data['price'] != 0.0)

        data['img_url'] = self.normalize_img_url(
            page.findAll('img', {'itemprop': 'image'}))

        data['processor'] = page.findAll("", {"class": "Processador"})
        data['processor'] = self.normalize_processor(
            self.validate_field(data, 'processor'))

        # The brand is recognised from the product name.
        data['brand'] = self.normalize_brand(data['name'])

        data['ram_memory'] = page.findAll("dl", {"class": "Memoria-RAM"})
        data['ram_memory'] = self.normalize_memory(
            self.validate_field(data, 'ram_memory'))

        # SKU: last dash-separated chunk of the URL path, extension removed.
        data['sku'] = self.url.split('?')[0].split('-')[-1].split('.')[0]

        data['storage'] = self.normalize_storage(page.findAll(
            "dl", {"class": ["Disco-rigido--HD-", "Memoria-Flash--SSD-"]}))

        screens = page.findAll("dl", {"class": "Tamanho-da-tela"})
        data['display_size'] = (
            screens[0].find('dd').get_text().strip()
            if len(screens) > 0 else "")

        return data

    @staticmethod
    def _first_match(rules, raw_data, catalog):
        """Call the catalog getter of the first rule found in *raw_data*.

        Returns ``None`` when no pattern matches.
        """
        for pattern, getter in rules:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(catalog, getter)()
        return None

    def validate_field(self, data, field):
        """Return the stripped text of the first element in ``data[field]``,
        or ``""`` when the list is empty."""
        elements = data[field]
        return elements[0].get_text().strip() if len(elements) > 0 else ""

    def normalize_img_url(self, img_url):
        """Return the ``src`` of the first image, or ``None`` without images."""
        return img_url[0]['src'] if len(img_url) > 0 else None

    def normalize_storage(self, hd):
        """Return the catalog capacity described by the first ``dl`` in *hd*.

        Returns ``''`` when there is no text to inspect and ``None`` when the
        text holds no recognisable capacity.
        """
        if len(hd) > 0:
            hd = hd[0].find('dd').get_text()

        result = ''
        if hd is not None and len(hd) > 0:
            result = re.search(self._CAPACITY_SPAN, hd)
            if result is not None:
                result = self.get_storage_capacity(result.group())

        return result

    def normalize_memory(self, raw_data):
        """Map a RAM description to a ``self.memory`` catalog value.

        Returns ``None`` when no known size is found.
        """
        return self._first_match(self._MEMORY_RULES, raw_data, self.memory)

    def normalize_price(self, raw_data):
        """Convert the first price element ("1.000,00") to a float (1000.0).

        Returns ``0.0`` when the list is empty or the text is not numeric.
        """
        try:
            text = raw_data[0].get_text() if len(raw_data) > 0 else ""
            return float(text.replace('.', '').replace(',', '.'))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Map free text to a ``self.brands`` catalog value.

        Returns ``None`` when no known brand is mentioned.
        """
        for brand in self._BRANDS:
            if re.search(brand, raw_data, re.IGNORECASE) is not None:
                return getattr(self.brands, 'get_' + brand)()
        return None

    def normalize_processor(self, raw_data):
        """Map a processor description to a ``self.processors`` catalog value.

        Literal unicode-escape residue is removed first so it cannot trigger
        a false match.  Returns ``None`` for an unknown processor.
        """
        cleaned = re.sub(self._ESCAPE_RESIDUE, '', raw_data)
        return self._first_match(
            self._PROCESSOR_RULES, cleaned, self.processors)

    def get_storage_capacity(self, raw_data):
        """Map a capacity text to a ``self.storages`` catalog value.

        Returns ``None`` for an unknown capacity.
        """
        return self._first_match(self._STORAGE_RULES, raw_data, self.storages)
Example #3
0
class DataExtractor():
    """Extract normalized product data from an Eletro Shopping product page.

    ``response`` is the parsed page; the code only relies on it offering an
    ``xpath`` method that returns a list of text nodes and accepts a
    ``namespaces`` keyword (lxml/Scrapy style, presumably -- confirm against
    the caller).  ``url`` is copied verbatim into the parsed record.
    """

    # Namespace map passed along with the spec-table queries.
    _NAMESPACES = {'re': "http://exslt.org/regular-expressions"}

    _DISK_TYPE_QUERY = (
        '//td[text()="HD - Tipo"]/following-sibling::td//text()')
    _DISK_SIZE_QUERY = (
        '//td[text()="HD - Capacidade"]/following-sibling::td//text()')

    # Literal "\uXXXX" residue left in scraped text by encoding errors.
    # Written as a raw string: the old non-raw literal was a truncated
    # unicode escape and therefore a SyntaxError on Python 3.
    _ESCAPE_RESIDUE = r'\\u\w\w\w\w'

    # Ordered (pattern, getter-name) tables.  Patterns are searched
    # case-insensitively and the first hit wins, so order is significant.
    _MEMORY_RULES = (
        ('16', 'get_16GB'),
        ('14', 'get_14GB'),
        ('12', 'get_12GB'),
        ('10', 'get_10GB'),
        ('8', 'get_8GB'),
        ('6', 'get_6GB'),
        ('4', 'get_4GB'),
        ('2', 'get_2GB'),
        ('1', 'get_1GB'),
    )

    # Each brand ``x`` maps to the getter ``get_x`` on ``self.brands``.
    _BRANDS = (
        'dell', 'asus', 'apple', 'acer', 'samsung', 'positivo', 'lenovo',
        'lg',
    )

    _PROCESSOR_RULES = (
        ('i3', 'get_i3'),
        ('i5', 'get_i5'),
        ('i7', 'get_i7'),
        ('Pentium', 'get_pentium_quad'),
        ('byt|baytrail', 'get_baytrail'),
        ('amd.+dual core', 'get_amd_dual'),
        ('atom', 'get_atom'),
        ('Intel.+Core.+M', 'get_core_m'),
        ('Celeron', 'get_celeron'),
        ('arm', 'get_arm_a9'),
        ('samsung', 'get_samsung'),
    )

    def __init__(self, response, url):
        """Store the parsed page and URL and create the lookup catalogs."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    def parse(self):
        """Return a dict describing the product found in ``self.response``.

        Missing nodes yield neutral values (``''``, ``0.0``, ``None``).
        """
        page = self.response
        ns = self._NAMESPACES
        data = {}

        data["store"] = "eletro_shopping"
        data["name"] = page.xpath('//div[@itemprop="name"]/h1//text()')
        data["name"] = self.validate_field(data, 'name')
        data["url"] = self.url
        data["price"] = self.normalize_price(
            page.xpath('//span[@itemprop="lowPrice"]//text()'))
        # A product that has a price is considered available.
        data["available"] = (data["price"] is not None
                             and data["price"] != 0.0)
        data["processor"] = page.xpath(
            '//td[text()="Processador - Modelo"]'
            '/following-sibling::td//text()')
        data["processor"] = self.normalize_processor(
            self.validate_field(data, "processor"))
        # The brand is recognised from the product name.
        data["brand"] = self.normalize_brand(data["name"])
        data["ram_memory"] = page.xpath(
            '//td[contains(text(), "Mem") and contains(text(), "Capacidade")]'
            '/following-sibling::td//text()', namespaces=ns)
        data["ram_memory"] = self.normalize_memory(
            self.validate_field(data, "ram_memory"))

        # SKU text looks like "<label>: <code>"; keep the part after the
        # first colon, or '' when the node or the colon is missing.
        sku_nodes = page.xpath('//div[@itemprop="name"]/span//text()')
        sku_parts = sku_nodes[0].split(':') if len(sku_nodes) > 0 else []
        data["sku"] = sku_parts[1].strip() if len(sku_parts) > 1 else ''

        hd, ssd = self._disk_capacities()
        data["disco"] = {}
        data['disco']['hd'] = hd
        data['disco']['ssd'] = ssd

        data["display_size"] = page.xpath(
            '//td[text()="Tela - Tamanho"]/following-sibling::td//text()',
            namespaces=ns)
        data["display_size"] = self.validate_field(data, "display_size")

        return data

    def _disk_capacities(self):
        """Return ``(hd, ssd)`` capacity strings from the "HD - ..." rows.

        Both values are always strings; ``''`` means "not present".
        """
        page = self.response
        kinds = page.xpath(self._DISK_TYPE_QUERY, namespaces=self._NAMESPACES)
        if len(kinds) == 0:
            return '', ''

        kind = kinds[0]
        has_hd = re.search('HD', kind, re.IGNORECASE) is not None
        has_ssd = re.search('SSD', kind, re.IGNORECASE) is not None
        if not (has_hd or has_ssd):
            return '', ''

        sizes = page.xpath(self._DISK_SIZE_QUERY, namespaces=self._NAMESPACES)

        if has_hd and has_ssd:
            if len(sizes) == 0:
                return '', ''
            combined = sizes[0]
            if re.search('\\+', combined) is not None:
                # "<hd> + <ssd>" in the capacity cell.
                pieces = combined.split('+')
                return pieces[0], pieces[1]
            # Otherwise the sizes are read out of the type cell itself.
            hd = kind.split('HD')[0].strip()
            tokens = kind.split(' ')
            # The SSD size is the token right before a standalone "SSD";
            # there is none when "SSD" is missing or is the first token.
            position = tokens.index('SSD') if 'SSD' in tokens else 0
            ssd = tokens[position - 1] if position > 0 else ''
            return hd, ssd

        # Single-drive machines: use the first capacity text as a string
        # rather than the raw node list.
        size = sizes[0].strip() if len(sizes) > 0 else ''
        return (size, '') if has_hd else ('', size)

    @staticmethod
    def _first_match(rules, raw_data, catalog):
        """Call the catalog getter of the first rule found in *raw_data*.

        Returns ``None`` when no pattern matches.
        """
        for pattern, getter in rules:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(catalog, getter)()
        return None

    def validate_field(self, data, field):
        """Return the first text node in ``data[field]`` stripped, or ``""``
        when the list is empty."""
        nodes = data[field]
        return nodes[0].strip() if len(nodes) > 0 else ""

    def normalize_memory(self, raw_data):
        """Map a RAM description to a ``self.memory`` catalog value.

        Returns ``None`` when no known size is found.
        """
        return self._first_match(self._MEMORY_RULES, raw_data, self.memory)

    def normalize_price(self, raw_data):
        """Convert the first price text ("1.000,00") to a float (1000.0).

        Returns ``0.0`` when the list is empty or the text is not numeric.
        """
        try:
            text = raw_data[0].strip() if len(raw_data) > 0 else ""
            return float(text.replace(".", "").replace(",", "."))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Map free text to a ``self.brands`` catalog value.

        Returns ``None`` when no known brand is mentioned.
        """
        for brand in self._BRANDS:
            if re.search(brand, raw_data, re.IGNORECASE) is not None:
                return getattr(self.brands, 'get_' + brand)()
        return None

    def normalize_processor(self, raw_data):
        """Map a processor description to a ``self.processors`` catalog value.

        Literal unicode-escape residue is removed first so it cannot trigger
        a false match.  Returns ``None`` for an unknown processor.
        """
        cleaned = re.sub(self._ESCAPE_RESIDUE, '', raw_data)
        return self._first_match(
            self._PROCESSOR_RULES, cleaned, self.processors)
Example #4
0
class DataExtractor():
    """Extract normalized product data from an Ibyte product page.

    ``response`` is the parsed page; the code only relies on it offering
    ``find``/``findAll`` (presumably a BeautifulSoup document -- confirm
    against the caller).  ``url`` is copied verbatim into the parsed record.
    """

    # Record fields: { _id, available, brand, color, display_feature,
    # display_size, graphics_processor_name, graphics_processor, name,
    # operating_system, price, processor, ram_memory, sku, screen_resolution,
    # storage, storage_type, url, img_url }
    #
    # TODO: display_feature, graphics_processor_name, graphics_processor,
    # operating_system, screen_resolution, storage_type

    # Errors that mean "this field is missing or malformed on the page".
    _FIELD_ERRORS = (ValueError, TypeError, AttributeError)

    # Literal "\uXXXX" residue left in scraped text by encoding errors.
    # Written as a raw string: the old non-raw literal was a truncated
    # unicode escape and therefore a SyntaxError on Python 3.
    _ESCAPE_RESIDUE = r'\\u\w\w\w\w'

    # NOTE(review): only terabyte sizes written without a space ("1TB") are
    # recognised; gigabyte sizes are reported as None.  Confirm the intent.
    _CAPACITY = r'\d+TB'

    # Ordered (pattern, getter-name) tables.  Patterns are searched
    # case-insensitively and the first hit wins, so order is significant.
    _MEMORY_RULES = (
        ('16', 'get_16GB'),
        ('12', 'get_12GB'),
        ('14', 'get_14GB'),
        ('10', 'get_10GB'),
        ('8', 'get_8GB'),
        ('6', 'get_6GB'),
        ('4', 'get_4GB'),
        ('2', 'get_2GB'),
        ('1', 'get_1GB'),
    )

    # Each brand ``x`` maps to the getter ``get_x`` on ``self.brands``.
    _BRANDS = (
        'dell', 'asus', 'apple', 'acer', 'samsung', 'positivo', 'lenovo',
        'lg',
    )

    _PROCESSOR_RULES = (
        ('i3', 'get_i3'),
        ('i5', 'get_i5'),
        ('i7', 'get_i7'),
        ('Pentium', 'get_pentium_quad'),
        ('byt|baytrail', 'get_baytrail'),
        ('amd.+dual core', 'get_amd_dual'),
        ('atom', 'get_atom'),
        ('Intel.+Core.+M', 'get_core_m'),
        ('Celeron', 'get_celeron'),
        ('arm', 'get_arm_a9'),
        ('samsung', 'get_samsung'),
    )

    def __init__(self, response, url):
        """Store the parsed page and URL and create the lookup catalogs."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    def parse(self):
        """Return a dict describing the product found in ``self.response``.

        Apart from the product name, every field is extracted independently
        and falls back to a neutral default (``''``, ``0.0`` or ``{}``) when
        it is missing or malformed.
        """
        r = self.response
        data = {}

        data['store'] = "ibyte_computadores"

        data['name'] = r.findAll("li", {"class": "product"})
        data['name'] = self.validate_field(data, 'name')

        data['url'] = self.url

        # A page without the price block is treated as having no price
        # (and therefore as unavailable) instead of aborting the parse.
        price_box = r.find("div", {"class": "preco-produto"})
        if price_box is None:
            data['price'] = 0.0
        else:
            data['price'] = self.normalize_price(
                price_box.parent.findAll('span', {"class": "price"}))

        # A product that has a price is considered available.
        data['available'] = (data['price'] is not None
                             and data['price'] != 0.0)

        try:
            data['processor'] = self.normalize_processor(
                self._spec_text(r'Processador:'))
        except self._FIELD_ERRORS:
            data['processor'] = ''

        # The brand is stored as the raw cell text (not normalized).
        try:
            data['brand'] = self._spec_text(r'Marca:').strip()
        except self._FIELD_ERRORS:
            data['brand'] = ''

        try:
            data['ram_memory'] = self.normalize_memory(
                self._spec_text(u'Memória RAM:').strip())
        except self._FIELD_ERRORS:
            data['ram_memory'] = ''

        # SKU: last space-separated chunk of the heading, up to the ")".
        try:
            heading = r.find('div', {
                'class': 'product-essential'
            }).parent.find('h3').text
            data['sku'] = heading.split(' ')[-1].split(')')[0]
        except self._FIELD_ERRORS:
            data['sku'] = ''

        try:
            data['storage'] = self.normalize_storage(
                self._spec_text_or_blank(r'HD:'),
                self._spec_text_or_blank(r'SSD:'))
        except self._FIELD_ERRORS:
            data['storage'] = {}

        try:
            data['display_size'] = self._spec_text(
                r'Polegadas da Tela:').strip()
        except self._FIELD_ERRORS:
            data['display_size'] = ''

        # KeyError covers an <img> tag that carries no "src" attribute.
        try:
            data['img_url'] = r.findAll(
                'img', {'id': 'image'})[0]['src'].strip()
        except (ValueError, TypeError, AttributeError, IndexError, KeyError):
            data['img_url'] = ''

        return data

    def _spec_text(self, label):
        """Return the value-cell text of the spec row whose label cell
        matches *label*.

        Raises AttributeError when the row or its value cell is missing.
        """
        label_cell = self.response.find('td', text=re.compile(label))
        return label_cell.parent.find('td', {'width': '570'}).text

    def _spec_text_or_blank(self, label):
        """Like :meth:`_spec_text`, but return ``''`` for a missing row."""
        try:
            return self._spec_text(label)
        except self._FIELD_ERRORS:
            return ''

    @staticmethod
    def _first_match(rules, raw_data, catalog):
        """Call the catalog getter of the first rule found in *raw_data*.

        Returns ``None`` when no pattern matches.
        """
        for pattern, getter in rules:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                return getattr(catalog, getter)()
        return None

    def validate_field(self, data, field):
        """Return the stripped text of the first element in ``data[field]``,
        or ``""`` when the list is empty."""
        elements = data[field]
        return elements[0].get_text().strip() if len(elements) > 0 else ""

    def normalize_storage(self, hd, ssd):
        """Return a dict with the capacity found in the HD and/or SSD text.

        ``"HD"`` is present whenever *hd* is non-empty (``None`` if it holds
        no capacity).  ``"SSD"`` is looked up only when no HD capacity was
        found; the old guard compared the dict with ``None`` and so never
        ran.
        """
        result = {}

        if hd is not None and len(hd) > 0:
            found = re.search(self._CAPACITY, hd)
            result["HD"] = found.group() if found is not None else None

        if ssd is not None and len(ssd) > 0 and not result.get("HD"):
            found = re.search(self._CAPACITY, ssd)
            result["SSD"] = found.group() if found is not None else None

        return result

    def normalize_memory(self, raw_data):
        """Map a RAM description to a ``self.memory`` catalog value.

        Returns ``None`` when no known size is found.
        """
        return self._first_match(self._MEMORY_RULES, raw_data, self.memory)

    def normalize_price(self, raw_data):
        """Convert the first price element ("R$ 1.000,00") to a float.

        Returns ``0.0`` when the list is empty or the text is not numeric.
        """
        try:
            text = raw_data[0].get_text() if len(raw_data) > 0 else ""
            # Drop thousands dots, turn the decimal comma into a dot, then
            # remove the currency symbol.
            text = text.replace('.', '').replace(',', '.').replace('R$', '')
            return float(text)
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Map free text to a ``self.brands`` catalog value.

        Returns ``None`` when no known brand is mentioned.
        """
        for brand in self._BRANDS:
            if re.search(brand, raw_data, re.IGNORECASE) is not None:
                return getattr(self.brands, 'get_' + brand)()
        return None

    def normalize_processor(self, raw_data):
        """Map a processor description to a ``self.processors`` catalog value.

        Literal unicode-escape residue is removed first so it cannot trigger
        a false match.  Returns ``None`` for an unknown processor.
        """
        cleaned = re.sub(self._ESCAPE_RESIDUE, '', raw_data)
        return self._first_match(
            self._PROCESSOR_RULES, cleaned, self.processors)
Example #5
0
class DataExtractor():
    """Build a normalized product record from a "mega_eletronicos" page.

    ``response`` is queried with ``find(name, attrs)`` / ``find(name,
    text=...)`` and the results are navigated through ``.parent`` and
    ``.text`` (presumably a BeautifulSoup tree -- confirm against the
    caller).  The ``Processors``, ``Brands``, ``Memory`` and ``Storages``
    helpers supply the canonical values returned by the ``normalize_*``
    methods.
    """

    # Ordered (regex, getter name) rules; the first pattern found wins.
    _BRAND_RULES = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )
    _PROCESSOR_RULES = (
        ("i3", "get_i3"),
        ("i5", "get_i5"),
        ("i7", "get_i7"),
        ("Pentium", "get_pentium_quad"),
        ("byt|baytrail", "get_baytrail"),
        ("amd.+dual core", "get_amd_dual"),
        ("amd.+quad core", "get_amd_quad"),
        ("atom", "get_atom"),
        ("Intel.+Core.+M", "get_core_m"),
        ("Celeron", "get_celeron"),
        ("arm", "get_arm_a9"),
        ("samsung", "get_samsung"),
    )
    _STORAGE_RULES = (
        ('2TB|2 TB', 'get_2tb'),
        ('1TB|1 TB', 'get_1tb'),
        ('750GB|750 GB', 'get_750'),
        ('640GB|640 GB', 'get_640'),
        ('500GB|500 GB', 'get_500'),
        ('320GB|320 GB', 'get_320'),
        ('256GB|256 GB', 'get_256'),
        ('160GB|160 GB', 'get_160'),
        ('128GB|128 GB', 'get_128'),
        ('80GB|80 GB', 'get_80'),
        ('64GB|64 GB', 'get_64'),
        ('32GB|32 GB', 'get_32'),
        ('16GB|16 GB', 'get_16'),
    )
    # RAM sizes (in GB) with a ``get_<n>GB`` getter, in legacy scan order.
    _MEMORY_SIZES = (16, 12, 14, 10, 8, 6, 4, 2, 1)

    def __init__(self, response, url):
        """Store the parsed page and its URL and create the value catalogs."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()
        self.storages = Storages()

    # Target record fields:
    # { _id, available, brand, color, display_feature, display_size,
    #   graphics_processor_name, graphics_processor, name, operating_system,
    #   price, processor, ram_memory, sku, screen_resolution, storage,
    #   storage_type, url, img_url }

    # TODO: display_feature, display_size, graphics_processor_name,
    # graphics_processor, operating_system, screen_resolution, storage_type,
    # img_url

    def parse(self):
        """Extract every supported field from the page.

        Each lookup is best-effort: when an element is missing (the lookup
        raises ValueError, TypeError or AttributeError) the field falls back
        to ``''`` (``0.0`` for the price).

        :returns: dict with keys store, name, url, price, available,
            processor, brand, ram_memory, sku, storage and display_size.
        """
        lookup_errors = (ValueError, TypeError, AttributeError)
        page = self.response
        data = {}

        data['store'] = "mega_eletronicos"

        # Product name: the <h1> that sits beside the main column container.
        # The text is used directly; running it through validate_field()
        # (which expects a list of elements) always failed and blanked it.
        title = None
        try:
            title = page.find('div', {
                'class': 'col-md-12 col-sm-12'
            }).parent.find('h1').text
            data['name'] = title.strip()
        except lookup_errors:
            data['name'] = ''

        data['url'] = self.url

        # Price: the <span> next to the <h3 class="real"> currency marker.
        try:
            price_text = page.find('h3', {
                'class': 'real'
            }).parent.find('span').text
            data['price'] = self.normalize_price(price_text)
        except lookup_errors:
            data['price'] = 0.0

        # Availability: a product with a non-zero price counts as available.
        data['available'] = (data['price'] is not None
                             and data['price'] != 0.0)

        try:
            data['processor'] = self.normalize_processor(
                self._spec_text(r'(Processor|Processador|Intel)'))
        except lookup_errors:
            data['processor'] = ''

        # Brand is inferred from the product title.
        try:
            if title is None:
                data['brand'] = ''
            else:
                data['brand'] = self.normalize_brand(title)
        except lookup_errors:
            data['brand'] = ''

        try:
            data['ram_memory'] = self.normalize_memory(
                self._spec_text(r'(DDR|SDRAM|RAM)'))
        except lookup_errors:
            data['ram_memory'] = ''

        # SKU: last dash-separated token of the URL path, without query
        # string or file extension.
        data['sku'] = self.url.split('?')[0].split('-')[-1].split('.')[0]

        # Storage (SSD/HD).
        try:
            data['storage'] = self.normalize_storage(
                self._spec_text(r'(rpm|HDD|SSD)'))
        except lookup_errors:
            data['storage'] = ''

        # Display size is intentionally left empty: the (LED|LCD) spec row
        # was tried but its sizes could not be normalized reliably.
        data['display_size'] = ''

        return data

    def _spec_text(self, label_pattern):
        """Return the text of the value cell in the matching spec row.

        Looks up the first ``<td>`` whose text matches ``label_pattern`` and
        returns the ``.text`` of the ``width="65%"`` cell in the same row.
        A missing row or cell surfaces as AttributeError (attribute access
        on the ``None`` lookup result), which ``parse`` handles.
        """
        label_cell = self.response.find('td', text=re.compile(label_pattern))
        return label_cell.parent.find('td', {'width': '65%'}).text

    def _first_match(self, rules, text, catalog):
        """Return the catalog value of the first rule found in ``text``."""
        for pattern, getter_name in rules:
            if re.search(pattern, text, re.IGNORECASE) is not None:
                return getattr(catalog, getter_name)()
        return None

    def validate_field(self, data, field):
        """Return the stripped text of the first element in ``data[field]``.

        ``data[field]`` must be a sequence of elements exposing
        ``get_text()``; an empty sequence yields ``""``.
        """
        matches = data[field]
        if len(matches) > 0:
            return matches[0].get_text().strip()
        return ""

    def normalize_storage(self, hd):
        """Map a raw storage description onto a canonical capacity.

        :param hd: free text such as ``"HDD 5400 rpm 1 TB"``; may be empty
            or ``None``.
        :returns: the matching ``self.storages`` value, or ``None`` when no
            capacity can be recognized.
        """
        if not hd:
            return None
        # ``.*`` (not ``.+``) so that compact forms such as "1TB" match too.
        found = re.search(r'\d+.*[TG]B', hd)
        if found is None:
            return None
        return self.get_storage_capacity(found.group())

    def normalize_memory(self, raw_data):
        """Map a raw RAM description onto a canonical memory size.

        A quantity written with its unit (``"4GB"``, ``"12 GB"``) is matched
        exactly, so unrelated numbers such as ``"1600MHz"`` are ignored.
        Text without any ``GB`` quantity falls back to the legacy scan for a
        bare size number.

        :returns: the matching ``self.memory.get_<n>GB()`` value, or
            ``None`` when nothing is recognized.
        """
        saw_gb_quantity = False
        for found in re.finditer(r'(?<!\d)(\d+)\s*GB', raw_data,
                                 re.IGNORECASE):
            saw_gb_quantity = True
            size = int(found.group(1))
            if size in self._MEMORY_SIZES:
                return getattr(self.memory, 'get_%dGB' % size)()
        if saw_gb_quantity:
            # Sizes were given explicitly but none is supported.
            return None
        for size in self._MEMORY_SIZES:
            if re.search(str(size), raw_data) is not None:
                return getattr(self.memory, 'get_%dGB' % size)()
        return None

    def normalize_price(self, raw_data):
        """Convert a Brazilian-formatted price string to ``float``.

        ``"1.000,00"`` becomes ``1000.0``; unparsable text yields ``0.0``.
        """
        try:
            return float(raw_data.replace('.', '').replace(',', '.'))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the canonical brand mentioned in ``raw_data``, or ``None``.

        Known brands: Samsung, Asus, Acer, Dell, Apple, Positivo, LG, Lenovo.
        """
        return self._first_match(self._BRAND_RULES, raw_data, self.brands)

    def normalize_processor(self, raw_data):
        """Return the canonical processor named in ``raw_data``, or ``None``.

        Known families: Intel Core i3/i5/i7, Pentium, Baytrail, AMD dual and
        quad core, Atom, Intel Core M, Celeron, ARM and Samsung.
        """
        # Drop literal "\uXXXX" escape residue (e.g. "\u84d2") left by broken
        # encoding.  The raw string keeps the pattern valid on Python 3.
        cleaned = re.sub(r'\\u\w\w\w\w', '', raw_data)
        return self._first_match(self._PROCESSOR_RULES, cleaned,
                                 self.processors)

    def get_storage_capacity(self, raw_data):
        """Return the canonical capacity named in ``raw_data``, or ``None``.

        Every size is accepted with or without a space before the unit.
        """
        return self._first_match(self._STORAGE_RULES, raw_data, self.storages)
Example #6
0
class DataExtractor():
    """Build a normalized product record from a "submarino" product page.

    Most fields are read with ``response.xpath(...)`` using ``text()``
    queries; the price is read with ``response.findAll(...)``.
    NOTE(review): those two calls belong to different parsing APIs -- confirm
    what kind of object the caller passes as ``response``.
    """

    # Ordered (regex, getter name) rules; the first pattern found wins.
    _BRAND_RULES = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )
    _PROCESSOR_RULES = (
        ("i3", "get_i3"),
        ("i5", "get_i5"),
        ("i7", "get_i7"),
        ("Pentium", "get_pentium_quad"),
        ("byt|baytrail", "get_baytrail"),
        ("amd.+dual core", "get_amd_dual"),
        ("atom", "get_atom"),
        ("Intel.+Core.+M", "get_core_m"),
        ("Celeron", "get_celeron"),
        ("arm", "get_arm_a9"),
        ("samsung", "get_samsung"),
    )
    _STORAGE_RULES = (
        ('2TB|2 TB', 'get_2tb'),
        ('1TB|1 TB', 'get_1tb'),
        ('750GB|750 GB', 'get_750'),
        ('640GB|640 GB', 'get_640'),
        ('500GB|500 GB', 'get_500'),
        ('320GB|320 GB', 'get_320'),
        ('256GB|256 GB', 'get_256'),
        ('160GB|160 GB', 'get_160'),
        ('128GB|128 GB', 'get_128'),
        ('80GB|80 GB', 'get_80'),
        ('64GB|64 GB', 'get_64'),
        ('32GB|32 GB', 'get_32'),
        ('16GB|16 GB', 'get_16'),
    )
    # RAM sizes (in GB) with a ``get_<n>GB`` getter, in legacy scan order.
    _MEMORY_SIZES = (16, 12, 14, 10, 8, 6, 4, 2, 1)

    _NAME_XPATH = ('/html/body/div[5]/section/div/div/div[2]/div/div[1]'
                   '/h1/span/text()')
    # Text of the n-th row of the product specification table.
    _SPEC_ROW_XPATH = ('//*[@id="productdetails"]/div[3]/section/table/tbody'
                       '/tr[%d]/td/text()')

    def __init__(self, response, url):
        """Store the parsed page and its URL and create the value catalogs."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()
        # NOTE(review): get_storage_capacity() reads ``self.storages``, which
        # is never assigned here; it has to be provided before a storage
        # capacity can be normalized.

    # Target record fields:
    # { _id, available, brand, color, display_feature, display_size,
    #   graphics_processor_name, graphics_processor, name, operating_system,
    #   price, processor, ram_memory, sku, screen_resolution, storage,
    #   storage_type, url, img_url }

    # TODO: display_feature, display_size, graphics_processor_name,
    # graphics_processor, operating_system, screen_resolution, storage_type,
    # img_url

    def parse(self):
        """Extract every supported field from the page.

        Missing nodes yield empty text, which the ``normalize_*`` helpers
        turn into ``None`` (``0.0`` for the price).

        :returns: dict with keys store, name, url, price, available,
            processor, brand, ram_memory, sku, storage and display_size.
        """
        data = {}

        data['store'] = "submarino"

        # Product name.
        data['name'] = self.response.xpath(self._NAME_XPATH)
        data['name'] = self.validate_field(data, 'name')

        data['url'] = self.url

        # Product price.
        data['price'] = self.response.findAll(
            "span", {"itemprop": "price/salesPrice"})
        data['price'] = self.normalize_price(data['price'])

        # Availability: a product with a non-zero price counts as available.
        data['available'] = (data['price'] is not None
                             and data['price'] != 0.0)

        # Processor (spec table row 6).
        data['processor'] = self.response.xpath(self._SPEC_ROW_XPATH % 6)
        data['processor'] = self.normalize_processor(
            self.validate_field(data, 'processor'))

        # Brand is inferred from the product name.
        data['brand'] = self.normalize_brand(data['name'])

        # RAM (spec table row 9).
        data['ram_memory'] = self.response.xpath(self._SPEC_ROW_XPATH % 9)
        data['ram_memory'] = self.normalize_memory(
            self.validate_field(data, 'ram_memory'))

        # SKU.
        # NOTE(review): for an absolute "scheme://host/..." URL this yields
        # the host name -- confirm which path segment carries the SKU.
        data['sku'] = self.url.split('/')[2].split('/')[-1]

        # Storage: HD is spec row 11, SSD is spec row 10; HD takes priority.
        hd = self.response.xpath(self._SPEC_ROW_XPATH % 11)
        ssd = self.response.xpath(self._SPEC_ROW_XPATH % 10)
        data['storage'] = self.normalize_storage(hd, ssd)

        # Display size (spec table row 4), kept as raw text.
        data['display_size'] = self.response.xpath(self._SPEC_ROW_XPATH % 4)
        data['display_size'] = self.validate_field(data, 'display_size')

        return data

    def _text_of(self, value):
        """Return the stripped text carried by ``value``.

        ``value`` may be a string, an element exposing ``get_text()``, or a
        sequence whose first item is one of those.  ``None`` and empty
        sequences yield ``""``.
        """
        if value is None:
            return ""
        if hasattr(value, 'get_text'):
            return value.get_text().strip()
        if hasattr(value, 'strip'):
            return value.strip()
        if len(value) == 0:
            return ""
        return self._text_of(value[0])

    def _storage_text(self, value):
        """Return the raw storage description carried by ``value``.

        Strings (or sequences of strings) are used as they are; for a
        sequence of elements the text of the first element's ``<dd>`` child
        is used.  ``None`` and empty sequences yield ``""``.
        """
        if value is None:
            return ""
        if hasattr(value, 'strip') and not hasattr(value, 'get_text'):
            return value
        if len(value) == 0:
            return ""
        first = value[0]
        if hasattr(first, 'get_text'):
            return first.find('dd').get_text()
        return first

    def _first_match(self, rules, text, catalog):
        """Return the catalog value of the first rule found in ``text``."""
        for pattern, getter_name in rules:
            if re.search(pattern, text, re.IGNORECASE) is not None:
                return getattr(catalog, getter_name)()
        return None

    def validate_field(self, data, field):
        """Return the stripped text of ``data[field]``.

        Accepts a sequence of elements or strings (first item is used) as
        well as an already-extracted string; an empty sequence yields ``""``.
        """
        return self._text_of(data[field])

    def normalize_storage(self, hd, ssd):
        """Map raw HD / SSD descriptions onto a canonical capacity.

        The HD description is tried first, then the SSD one.

        :returns: the matching ``self.storages`` value, or ``None`` when
            neither description contains a recognizable capacity.
        """
        for description in (self._storage_text(hd), self._storage_text(ssd)):
            if not description:
                continue
            # ``.*`` (not ``.+``) so that compact forms such as "1TB" match.
            found = re.search(r'\d+.*[TG]B', description)
            if found is not None:
                return self.get_storage_capacity(found.group())
        return None

    def normalize_memory(self, raw_data):
        """Map a raw RAM description onto a canonical memory size.

        A quantity written with its unit (``"4GB"``, ``"12 GB"``) is matched
        exactly, so unrelated numbers such as ``"1600MHz"`` are ignored.
        Text without any ``GB`` quantity falls back to the legacy scan for a
        bare size number.

        :returns: the matching ``self.memory.get_<n>GB()`` value, or
            ``None`` when nothing is recognized.
        """
        saw_gb_quantity = False
        for found in re.finditer(r'(?<!\d)(\d+)\s*GB', raw_data,
                                 re.IGNORECASE):
            saw_gb_quantity = True
            size = int(found.group(1))
            if size in self._MEMORY_SIZES:
                return getattr(self.memory, 'get_%dGB' % size)()
        if saw_gb_quantity:
            # Sizes were given explicitly but none is supported.
            return None
        for size in self._MEMORY_SIZES:
            if re.search(str(size), raw_data) is not None:
                return getattr(self.memory, 'get_%dGB' % size)()
        return None

    def normalize_price(self, raw_data):
        """Convert a Brazilian-formatted price to ``float``.

        ``raw_data`` is a sequence of price elements (first one is used) or
        a price string; ``"1.000,00"`` becomes ``1000.0``.  Empty or
        unparsable input yields ``0.0``.
        """
        try:
            text = self._text_of(raw_data)
            return float(text.replace('.', '').replace(',', '.'))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the canonical brand mentioned in ``raw_data``, or ``None``.

        Known brands: Samsung, Asus, Acer, Dell, Apple, Positivo, LG, Lenovo.
        """
        return self._first_match(self._BRAND_RULES, raw_data, self.brands)

    def normalize_processor(self, raw_data):
        """Return the canonical processor named in ``raw_data``, or ``None``.

        Known families: Intel Core i3/i5/i7, Pentium, Baytrail, AMD dual
        core, Atom, Intel Core M, Celeron, ARM and Samsung.
        """
        # Drop literal "\uXXXX" escape residue (e.g. "\u84d2") left by broken
        # encoding.  The raw string keeps the pattern valid on Python 3.
        cleaned = re.sub(r'\\u\w\w\w\w', '', raw_data)
        return self._first_match(self._PROCESSOR_RULES, cleaned,
                                 self.processors)

    def get_storage_capacity(self, raw_data):
        """Return the canonical capacity named in ``raw_data``, or ``None``.

        Every size is accepted with or without a space before the unit.
        """
        return self._first_match(self._STORAGE_RULES, raw_data, self.storages)
Example #7
0
class DataExtractor():
    """Build a normalized product record from a "novo_mundo" product page.

    ``response`` is queried with ``xpath(...)`` using ``text()`` queries, so
    every lookup is expected to return a (possibly empty) sequence of
    strings.
    """

    DOWNLOAD_DELAY = 5

    # Ordered (regex, getter name) rules; the first pattern found wins.
    _BRAND_RULES = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )
    _PROCESSOR_RULES = (
        ("i3", "get_i3"),
        ("i5", "get_i5"),
        ("i7", "get_i7"),
        ("Pentium", "get_pentium_quad"),
        ("byt|baytrail", "get_baytrail"),
        ("amd.+dual core", "get_amd_dual"),
        ("atom", "get_atom"),
        ("Intel.+Core.+M", "get_core_m"),
        ("Celeron", "get_celeron"),
        ("arm", "get_arm_a9"),
        ("samsung", "get_samsung"),
    )
    # RAM sizes (in GB) with a ``get_<n>GB`` getter, in legacy scan order.
    _MEMORY_SIZES = (16, 14, 12, 10, 8, 6, 4, 2, 1)

    # Text of the specification cell tagged with a given field class.  Both
    # class tokens live in the same ``class`` attribute, so they have to be
    # tested with contains(): ``@class="a" and @class="b"`` can never match.
    _SPEC_XPATH = ('//td[contains(@class, "value-field") and '
                   'contains(@class, "%s")]//text()')

    def __init__(self, response, url):
        """Store the parsed page and its URL and create the value catalogs."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    def parse(self):
        """Extract every supported field from the page.

        Missing nodes yield empty text, which the ``normalize_*`` helpers
        turn into ``None`` (``0.0`` for the price).

        :returns: dict with keys store, name, url, price, available,
            processor, brand, ram_memory, sku and display_size.
        """
        data = {}
        data["store"] = "novo_mundo"

        data["name"] = self.response.xpath(
            '//div[@class="productName"]//text()')
        data['name'] = self.validate_field(data, 'name')

        data['url'] = self.url

        # The price text looks like "<currency> <amount>"; keep the amount.
        data['price'] = self.response.xpath(
            '//strong[@class="skuBestPrice"]//text()')
        price_tokens = self.validate_field(data, 'price').split(' ')
        if len(price_tokens) > 1:
            amount = price_tokens[1]
        else:
            amount = price_tokens[0]
        data['price'] = self.normalize_price(amount)

        # Availability: a product with a non-zero price counts as available.
        data["available"] = (data["price"] is not None
                             and data["price"] != 0.0)

        data['processor'] = self.response.xpath(
            self._SPEC_XPATH % 'Processador')
        data['processor'] = self.normalize_processor(
            self.validate_field(data, 'processor'))

        # Brand is inferred from the product name.
        data['brand'] = self.normalize_brand(data['name'])

        data['ram_memory'] = self.response.xpath(self._SPEC_XPATH % 'Memoria')
        data['ram_memory'] = self.normalize_memory(
            self.validate_field(data, 'ram_memory'))

        data['sku'] = self.response.xpath(
            '//div[@class="skuReference"]//text()')
        data['sku'] = self.validate_field(data, 'sku')

        # TODO: storage (the "HD" specification cell) is not extracted yet;
        # this class has no storage normalizer.

        # Display size: second space-separated token of the first line of
        # the "Tela" cell; left empty when the text has another shape.
        data['display_size'] = self.response.xpath(self._SPEC_XPATH % 'Tela')
        first_line = self.validate_field(data, 'display_size').split('\n')[0]
        size_tokens = first_line.split(' ')
        if len(size_tokens) > 1:
            data['display_size'] = size_tokens[1].strip()
        else:
            data['display_size'] = ''

        return data

    def _first_match(self, rules, text, catalog):
        """Return the catalog value of the first rule found in ``text``."""
        for pattern, getter_name in rules:
            if re.search(pattern, text, re.IGNORECASE) is not None:
                return getattr(catalog, getter_name)()
        return None

    def validate_field(self, data, field):
        """Return the stripped first item of ``data[field]``.

        ``data[field]`` is a sequence of strings; an empty sequence yields
        ``""``.
        """
        values = data[field]
        if len(values) > 0:
            return values[0].strip()
        return ""

    def normalize_memory(self, raw_data):
        """Map a raw RAM description onto a canonical memory size.

        A quantity written with its unit (``"4GB"``, ``"12 GB"``) is matched
        exactly, so unrelated numbers such as ``"1600MHz"`` are ignored.
        Text without any ``GB`` quantity falls back to the legacy scan for a
        bare size number.

        :returns: the matching ``self.memory.get_<n>GB()`` value, or
            ``None`` when nothing is recognized.
        """
        saw_gb_quantity = False
        for found in re.finditer(r'(?<!\d)(\d+)\s*GB', raw_data,
                                 re.IGNORECASE):
            saw_gb_quantity = True
            size = int(found.group(1))
            if size in self._MEMORY_SIZES:
                return getattr(self.memory, 'get_%dGB' % size)()
        if saw_gb_quantity:
            # Sizes were given explicitly but none is supported.
            return None
        for size in self._MEMORY_SIZES:
            if re.search(str(size), raw_data) is not None:
                return getattr(self.memory, 'get_%dGB' % size)()
        return None

    def normalize_price(self, raw_data):
        """Convert a Brazilian-formatted price to ``float``.

        ``raw_data`` is either a price string or a sequence of strings
        (first item is used); ``"1.000,00"`` becomes ``1000.0``.  Empty or
        unparsable input yields ``0.0``.
        """
        try:
            if not hasattr(raw_data, 'replace'):
                # Sequence of text nodes: use the first one, if any.
                raw_data = raw_data[0] if (len(raw_data) > 0) else ""
            raw_data = raw_data.strip()
            return float(raw_data.replace(".", "").replace(",", "."))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the canonical brand mentioned in ``raw_data``, or ``None``."""
        return self._first_match(self._BRAND_RULES, raw_data, self.brands)

    def normalize_processor(self, raw_data):
        """Return the canonical processor named in ``raw_data``, or ``None``."""
        # Drop literal "\uXXXX" escape residue (e.g. "\u84d2") left by broken
        # encoding.  The raw string keeps the pattern valid on Python 3.
        cleaned = re.sub(r'\\u\w\w\w\w', '', raw_data)
        return self._first_match(self._PROCESSOR_RULES, cleaned,
                                 self.processors)
Example #8
0
class DataExtractor():
    """Build a normalized product record from an "eletrosom" product page.

    ``response`` is queried with ``xpath(...)`` using ``text()`` queries, so
    text lookups are expected to return a (possibly empty) sequence of
    strings.
    """

    # Ordered (regex, getter name) rules; the first pattern found wins.
    _BRAND_RULES = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )
    _PROCESSOR_RULES = (
        ("i3", "get_i3"),
        ("i5", "get_i5"),
        ("i7", "get_i7"),
        ("Pentium", "get_pentium_quad"),
        ("byt|baytrail", "get_baytrail"),
        ("amd.+dual core", "get_amd_dual"),
        ("atom", "get_atom"),
        ("Intel.+Core.+M", "get_core_m"),
        ("Celeron", "get_celeron"),
        ("arm", "get_arm_a9"),
        ("samsung", "get_samsung"),
    )
    # RAM sizes (in GB) with a ``get_<n>GB`` getter.
    _MEMORY_SIZES = (1, 2, 4, 6, 8, 10, 12, 14, 16)

    # Label cell of the hard-disk specification row.
    _DISK_LABEL_XPATH = (
        '//td[contains(text(), "Disco") and contains(text(), "gido")]')

    def __init__(self, response, url):
        """Store the parsed page and its URL and create the value catalogs."""
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    def parse(self):
        """Extract every supported field from the page.

        Missing nodes yield empty text, which the ``normalize_*`` helpers
        turn into ``None`` (``0.0`` for the price, ``''`` for the SKU).

        :returns: dict with keys store, name, url, price, available,
            processor, brand, ram_memory, sku, disco (``{'hd', 'ssd'}``
            holding the raw lookup result or ``''``) and display_size.
        """
        data = {}
        data["store"] = "eletrosom"

        data['name'] = self.response.xpath(
            '//div[@class="about"]/div[@class="meta"]/h1//text()')
        data['name'] = self.validate_field(data, 'name')

        data['url'] = self.url

        # The price text carries a "$"-terminated currency prefix; keep the
        # amount that follows it.
        price_nodes = self.response.xpath(
            '//span[@class="regular-price"]/span/strong//text()')
        price_text = price_nodes[0] if (len(price_nodes) > 0) else ""
        data['price'] = self.normalize_price(price_text.split('$')[-1])

        # Available unless the page shows the "indisponivel" marker.
        data['available'] = not self.response.xpath(
            '//div[@class="indisponivel"]')

        data['processor'] = self.response.xpath(
            '//td[text()="Processador"]/following-sibling::td//text()')
        data["processor"] = self.normalize_processor(
            self.validate_field(data, "processor"))

        # Brand is inferred from the product name.
        data['brand'] = self.normalize_brand(data['name'])

        data['ram_memory'] = self.response.xpath(
            '//td[contains(text(), "Mem") and contains(text(), "ria") or contains(text(), "RAM")]/following-sibling::td//text()',
            namespaces={'re': 'http://exslt.org/regular-expressions'})
        data["ram_memory"] = self.normalize_memory(
            self.validate_field(data, "ram_memory"))

        # SKU: text between the first ":" and the following "/".
        data['sku'] = self.response.xpath('//p[@class="code"]//text()')
        sku_text = self.validate_field(data, 'sku')
        if ':' in sku_text:
            data['sku'] = sku_text.split(':')[1].split('/')[0].strip()
        else:
            data['sku'] = ''

        # Disk: the value cell is filed under "ssd" when the row label
        # mentions SSD and under "hd" otherwise.  Both default to '' so the
        # record is complete even when the row is missing.
        data["disco"] = {}
        hd = ''
        ssd = ''
        label_nodes = self.response.xpath(self._DISK_LABEL_XPATH + '//text()')
        disco = label_nodes[0] if (len(label_nodes) > 0) else ''
        if disco:
            disk_value = self.response.xpath(
                self._DISK_LABEL_XPATH + '/following-sibling::td//text()')
            if re.search('ssd', disco, re.IGNORECASE) is not None:
                ssd = disk_value
            else:
                hd = disk_value
        data['disco']['hd'] = hd
        data['disco']['ssd'] = ssd

        # Display size: prefer the exact "Tela" row, else any row whose
        # label mentions both "Tamanho" and "Tela".
        if self.response.xpath('//td[text()="Tela"]'):
            data["display_size"] = self.response.xpath(
                '//td[text()="Tela"]/following-sibling::td//text()')
        else:
            data["display_size"] = self.response.xpath(
                '//td[contains(text(), "Tamanho") and contains(text(), "Tela")]/following-sibling::td//text()'
            )
        data["display_size"] = self.validate_field(data, "display_size")

        return data

    def _first_match(self, rules, text, catalog):
        """Return the catalog value of the first rule found in ``text``."""
        for pattern, getter_name in rules:
            if re.search(pattern, text, re.IGNORECASE) is not None:
                return getattr(catalog, getter_name)()
        return None

    def validate_field(self, data, field):
        """Return the stripped first item of ``data[field]``.

        ``data[field]`` is a sequence of strings; an empty sequence yields
        ``""``.
        """
        values = data[field]
        if len(values) > 0:
            return values[0].strip()
        return ""

    def normalize_memory(self, raw_data):
        """Map a raw RAM description onto a canonical memory size.

        The first ``<n>GB`` / ``<n> GB`` quantity whose size is supported
        wins.  The whole number is compared, so ``"12GB"`` is 12 GB rather
        than a match for ``"2GB"``.

        :returns: the matching ``self.memory.get_<n>GB()`` value, or
            ``None`` when no supported quantity is present.
        """
        for found in re.finditer(r'(?<!\d)(\d+) ?GB', raw_data,
                                 re.IGNORECASE):
            size = int(found.group(1))
            if size in self._MEMORY_SIZES:
                return getattr(self.memory, 'get_%dGB' % size)()
        return None

    def normalize_price(self, raw_data):
        """Convert a Brazilian-formatted price string to ``float``.

        ``"1.000,00"`` becomes ``1000.0``; empty or unparsable text yields
        ``0.0``.
        """
        try:
            return float(raw_data.replace(".", "").replace(",", "."))
        except ValueError:
            return 0.0

    def normalize_brand(self, raw_data):
        """Return the canonical brand mentioned in ``raw_data``, or ``None``."""
        return self._first_match(self._BRAND_RULES, raw_data, self.brands)

    def normalize_processor(self, raw_data):
        """Return the canonical processor named in ``raw_data``, or ``None``."""
        # Drop literal "\uXXXX" escape residue (e.g. "\u84d2") left by broken
        # encoding.  The raw string keeps the pattern valid on Python 3.
        cleaned = re.sub(r'\\u\w\w\w\w', '', raw_data)
        return self._first_match(self._PROCESSOR_RULES, cleaned,
                                 self.processors)
Example #9
0
class DataExtractor():
    """Extract normalized product data from a parsed Havan product page.

    ``response`` must expose ``findAll(name, attrs)`` returning a sequence of
    elements that each provide ``get_text()`` (presumably a BeautifulSoup
    document -- confirm against the caller). ``Processors``, ``Brands`` and
    ``Memory`` are defined elsewhere in the project and supply the canonical
    values returned by the ``normalize_*`` methods.
    """

    # Target record schema:
    # { _id, available, brand, color, display_feature, display_size,
    #   graphics_processor_name, graphics_processor, name, operating_system,
    #   price, processor, ram_memory, sku, screen_resolution, storage,
    #   storage_type, url, img_url }

    # TODO: display_feature, display_size, graphics_processor_name,
    # graphics_processor, operating_system, screen_resolution, storage_type,
    # img_url

    # Ordered (regex, getter name) rules. The first case-insensitive match
    # wins, so the order is significant. These are plain substring searches:
    # any text containing the digits matches (e.g. "32" hits the "2" rule).
    _MEMORY_RULES = (
        ('16', 'get_16GB'),
        ('12', 'get_12GB'),
        ('14', 'get_14GB'),
        ('10', 'get_10GB'),
        ('8', 'get_8GB'),
        ('6', 'get_6GB'),
        ('4', 'get_4GB'),
        ('2', 'get_2GB'),
        ('1', 'get_1GB'),
    )

    _BRAND_RULES = (
        ('dell', 'get_dell'),
        ('asus', 'get_asus'),
        ('apple', 'get_apple'),
        ('acer', 'get_acer'),
        ('samsung', 'get_samsung'),
        ('positivo', 'get_positivo'),
        ('lenovo', 'get_lenovo'),
        ('lg', 'get_lg'),
    )

    _PROCESSOR_RULES = (
        ("i3", "get_i3"),
        ("i5", "get_i5"),
        ("i7", "get_i7"),
        ("Pentium", "get_pentium_quad"),
        ("byt|baytrail", "get_baytrail"),
        ("amd.+dual core", "get_amd_dual"),
        ("atom", "get_atom"),
        ("Intel.+Core.+M", "get_core_m"),
        ("Celeron", "get_celeron"),
        ("arm", "get_arm_a9"),
        ("samsung", "get_samsung"),
    )

    def __init__(self, response, url):
        """Store the parsed page, its URL and the normalization catalogs.

        :param response: parsed product page (see class docstring).
        :param url: URL the page was fetched from; copied into the record.
        """
        self.response = response
        self.url = url
        self.processors = Processors()
        self.brands = Brands()
        self.memory = Memory()

    def parse(self):
        """Build the product record for the page held in ``self.response``.

        :returns: dict with the keys ``store``, ``name``, ``url``, ``price``,
            ``available``, ``processor``, ``brand``, ``ram_memory``, ``sku``,
            ``storage`` and ``display_size``. Text fields missing from the
            page come back as ``""``; normalized fields with no matching
            rule come back as ``None``; a missing price is ``0.0``.
        """
        page = self.response
        data = {}

        # Products scraped by this extractor come from the Havan store.
        data['store'] = "havan"

        # Product name.
        data['name'] = page.findAll("", {"class": "product-qd-v1-name"})
        data['name'] = self.validate_field(data, 'name')

        # URL kept on the instance by the constructor.
        data['url'] = self.url

        # Product price.
        data['price'] = page.findAll("", {"class": "skuBestPrice"})
        data['price'] = self.normalize_price(data['price'])

        # Availability: a product that shows a non-zero price is considered
        # available.
        data['available'] = (data['price'] is not None
                             and data['price'] != 0.0)

        # Processor.
        data['processor'] = page.findAll(
            "", {"class": "value-field Modelo-do-Processador"})
        data['processor'] = self.normalize_processor(
            self.validate_field(data, 'processor'))

        # Brand, inferred from the product name.
        data['brand'] = self.normalize_brand(data['name'])

        # RAM.
        data['ram_memory'] = page.findAll(
            "td", {"class": "value-field Memoria-Fisica-Disponivel"})
        data['ram_memory'] = self.normalize_memory(
            self.validate_field(data, 'ram_memory'))

        # SKU used to identify the product.
        data['sku'] = page.findAll("", {"class": "skuReference"})
        data['sku'] = self.validate_field(data, 'sku')

        # Storage (only the hard-disk capacity field is read).
        data['storage'] = page.findAll(
            "", {"class": "value-field Capacidade-do-Disco-Rigido-HD-"})
        data['storage'] = self.validate_field(data, 'storage')

        # Screen size.
        display = page.findAll("td", {"class": "value-field Tamanho-da-Tela"})
        data['display_size'] = (self.normalize_display(display)
                                if display else "")

        return data

    @staticmethod
    def _first_text(elements):
        """Return the text of the first element, or "" when there is none."""
        return elements[0].get_text() if elements else ""

    @staticmethod
    def _first_match(rules, raw_data, catalog):
        """Call the catalog getter of the first rule whose regex matches.

        :param rules: ordered ``(regex, getter name)`` pairs.
        :param raw_data: text to search, case-insensitively.
        :param catalog: object providing the named getters.
        :returns: the getter's result, or ``None`` when nothing matches.
        """
        for pattern, getter_name in rules:
            if re.search(pattern, raw_data, re.IGNORECASE) is not None:
                # Resolved lazily: only the matching getter is looked up.
                return getattr(catalog, getter_name)()
        return None

    def validate_field(self, data, field):
        """Return the stripped text of the first element in ``data[field]``.

        :param data: record dict whose ``field`` entry holds a sequence of
            elements as returned by ``findAll``.
        :param field: key to read from ``data``.
        :returns: stripped text of the first element, or ``""`` when the
            sequence is empty.
        """
        return self._first_text(data[field]).strip()

    def normalize_memory(self, raw_data):
        """Map a RAM description to a canonical ``self.memory`` value.

        :param raw_data: RAM description text.
        :returns: result of the first matching ``self.memory.get_*GB()``
            getter, or ``None`` when the text contains none of the digits.
        """
        return self._first_match(self._MEMORY_RULES, raw_data, self.memory)

    def normalize_price(self, raw_data):
        """Convert the scraped price elements to a float.

        Turns a Brazilian-formatted amount such as ``R$ 1.000,00`` into
        ``1000.0``.

        :param raw_data: sequence of elements; only the first one is read.
        :returns: the price as a float, or ``0.0`` when there is no element
            or its text is not a number.
        """
        text = self._first_text(raw_data)
        # Drop thousands separators, switch the decimal comma to a dot and
        # remove the currency symbol -- in that order.
        text = text.replace('.', '').replace(',', '.').replace('R$', '')
        try:
            return float(text)
        except ValueError:
            return 0.0

    def normalize_display(self, raw_data):
        """Return the screen size text without the inch mark (``"``).

        :param raw_data: sequence of elements; only the first one is read.
        :returns: text of the first element with every ``"`` removed, or
            ``""`` when the sequence is empty.
        """
        return self._first_text(raw_data).replace('"', '')

    def normalize_brand(self, raw_data):
        """Map a product name to a canonical ``self.brands`` value.

        Known brands: Samsung, Asus, Acer, Dell, Apple, Positivo, LG, Lenovo.

        :param raw_data: text to search, typically the product name.
        :returns: result of the first matching ``self.brands.get_*()``
            getter, or ``None`` when no known brand appears in the text.
        """
        return self._first_match(self._BRAND_RULES, raw_data, self.brands)

    def normalize_processor(self, raw_data):
        """Map a processor description to a canonical processor value.

        :param raw_data: processor description text.
        :returns: result of the first matching ``self.processors.get_*()``
            getter, or ``None`` when no rule matches.
        """
        # Strip leftover escaped code points that show up as literal text
        # (e.g. the six characters "\u84d2"). The pattern must be a raw
        # string: in a normal literal "\u" followed by non-hex characters
        # is a SyntaxError on Python 3.
        raw_data = re.sub(r'\\u\w\w\w\w', '', raw_data)
        return self._first_match(self._PROCESSOR_RULES, raw_data,
                                 self.processors)