def getVolumetricWeight(self, dimension, ut):
    """Return the volumetric weight for a ``"<a> x <b> x <c>"`` dimension string.

    Each of the three components is cleaned with every pattern in
    ``self.dimensionPatternsToRemove``. A component accepted by
    ``StringUtil.try_parse('float', ...)`` is converted to float; for
    ``UnitTypes.MM`` / ``UnitTypes.CM`` it is first scaled by
    0.0393701 / 0.393701 (the mm / cm to inch factors) and rounded up.
    Any other ``ut`` leaves the parsed value untouched. The product of the
    three values is divided by 130 and rounded up.

    :param dimension: dimension text; components are separated by ``" x "``.
    :param ut: unit of the components, compared against ``UnitTypes.MM``
        and ``UnitTypes.CM``.
    :returns: the rounded-up volumetric weight, or 0 when ``dimension`` is
        None/empty, has no ``" x "`` separator, does not split into exactly
        three components, or any step of the computation raises.
    """
    # None, empty input and strings without the separator carry no usable
    # dimensions (the None case used to raise TypeError on len()).
    if not dimension or " x " not in dimension:
        return 0

    try:
        parts = dimension.split(' x ')
        if len(parts) != 3:
            return 0

        sides = []
        for raw in parts:
            cleaned = raw
            for pattern in self.dimensionPatternsToRemove:
                cleaned = StringUtil.str_cleaner(cleaned, pattern, "").strip()

            if StringUtil.try_parse('float', cleaned) is True:
                side = float(cleaned)
                if ut == UnitTypes.MM:
                    side = float(math.ceil(side * 0.0393701))
                elif ut == UnitTypes.CM:
                    side = float(math.ceil(side * 0.393701))
                sides.append(side)
            else:
                # HACK: a component that cannot be parsed as float is
                # replaced by 1. Known to be wrong; temporary solution only.
                sides.append(1)

        volume = sides[0] * sides[1] * sides[2]
        # NOTE(review): 130 is presumably a dimensional-weight divisor used
        # by the carrier - confirm the intended value and unit.
        return math.ceil(volume / 130)
    except Exception:
        # Best effort: a malformed dimension must not break the caller, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        return 0
def getProductDimension(self, instance_of, instance_val):
    """Extract a product-dimension string from a document node or raw text.

    ``instance_of`` selects how ``instance_val`` is interpreted:

    * ``"DOCUMENT"`` - ``instance_val`` is handed to
      ``self.getElementValue`` and the returned content is searched as-is.
    * ``"STRING"`` - ``instance_val`` itself is searched after being passed
      through ``StringUtil.remove_html_tags`` and two
      ``StringUtil.str_cleaner`` calls.

    The first entry of ``self.dimensionPatterns`` that is found in the
    content decides the result. A match longer than 35 characters is
    replaced by the result of ``self.getProductWeight(content)``.

    :returns: the matched text, or '' when ``self.htmlObject`` or
        ``instance_val`` is None, when no content was obtained, or when no
        pattern matched.
    """
    if self.htmlObject is None or instance_val is None:
        return ''

    content = None
    if instance_of == "DOCUMENT" and len(instance_val) > 0:
        content = self.getElementValue(instance_val)
    elif instance_of == "STRING" and len(instance_val) > 0:
        # NOTE(review): cleaning is applied to the STRING branch only -
        # confirm DOCUMENT content is not meant to be cleaned as well.
        content = StringUtil.remove_html_tags(instance_val)
        content = StringUtil.str_cleaner(content, r'\\([a-z0-9]{3})', '')
        content = StringUtil.str_cleaner(
            content, r'[^0-9a-zA-Z\s\-\(\).,"\'&]+', '')

    if content is None or len(content) == 0:
        return ''

    dimension = ''
    for pattern in self.dimensionPatterns:
        if StringUtil.str_find_str(str(content), pattern):
            dimension = StringUtil.str_search_str(str(content), pattern)
            break

    # NOTE(review): because of operator precedence the first clause can
    # never be true (a truthy value has a non-zero length), so only matches
    # longer than 35 characters fall back to the weight lookup. An empty
    # result was probably meant to fall back too - confirm before changing.
    if (dimension and len(dimension) == 0) or len(dimension) > 35:
        return self.getProductWeight(content)
    return dimension
def parse_item_page(self, response):
    """Fill ``item['keywords']`` from the text nodes of an item page.

    The item is taken from ``response.meta['item']``. The direct text of
    every node matched by the active XPath set is collected, joined with
    single spaces, passed through ``StringUtil.remove_html_tags`` and stored
    under ``'keywords'``.

    :param response: response object carrying the item in its ``meta``.
    :returns: the same item, with ``'keywords'`` set.
    """
    selector = HtmlXPathSelector(response)
    item = response.meta['item']

    # XPath set for another shop, kept for reference:
    #   6PM: "//h1[@class='title']/a|//div[@class='description']/ul/li/span"
    #        "|//div[@class='description']/ul/li/a"
    #        "|//div[@class='description']/ul/li"
    # Active set: Walmart.
    nodes = selector.select("|".join((
        "//h1[@itemprop='name']/div",
        "//div[@class='product-description-disclaimer']",
        "//div[@class='about-desc']",
    )))
    if nodes is None or len(nodes) == 0:
        nodes = []

    keywords = []
    for node in nodes:
        text = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(text) == 0:
            continue
        text = str(text.strip().encode('utf-8'))
        keywords.append(StringUtil.str_utf_encode(text))

    item['keywords'] = StringUtil.remove_html_tags(str(' '.join(keywords)))
    return item
def __getDescription(self):
    """Build a plain-text description from the name and description nodes.

    Queries ``self.getElementValues`` for the ``product-name`` heading and
    the children of the ``itemprop='description'`` container, joins the
    direct text of every node that has any with single spaces and returns
    the result of ``StringUtil.remove_html_tags`` on it.
    """
    nodes = self.getElementValues("|".join((
        "//h1[@class='product-name']",
        "//div[@itemprop='description']/*",
    )))
    # A missing or empty result simply yields an empty description.
    if nodes is None or len(nodes) == 0:
        nodes = []

    fragments = []
    for node in nodes:
        text = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(text) == 0:
            continue
        text = str(text.strip().encode('utf-8'))
        fragments.append(StringUtil.str_utf_encode(text))

    return StringUtil.remove_html_tags(str(' '.join(fragments)))
def getProductWeight(self, instance_val):
    """Search ``instance_val`` for an item/shipping weight statement.

    The text is first run through two ``StringUtil.str_cleaner`` calls (one
    with a tag pattern, one with a double-whitespace pattern, both with an
    empty replacement) and then searched with
    ``StringUtil.str_search_str`` for a weight expression such as
    "Shipping Weight: 2.5 pounds".

    :param instance_val: text to search; may be None.
    :returns: '' when ``instance_val`` is None, otherwise whatever
        ``StringUtil.str_search_str`` returns for the weight pattern.
    """
    if instance_val is None:
        return ''

    without_tags = StringUtil.str_cleaner(instance_val, r'<[^>]*>', '')
    compacted = StringUtil.str_cleaner(without_tags, r'\s\s', '')
    weight_pattern = (
        r"(item weight|Shipping Weight)(:|:\s|\s:)(\d+(\.\d{1,2})?)(\s|\S)(ounce|pound|lb\s|lbs)"
    )
    return StringUtil.str_search_str(compacted, weight_pattern)
def __getDescription(self):
    """Build a plain-text description from brand, name and overview nodes.

    Queries ``self.getElementValues`` for the brand span, the
    ``productname`` span and the overview paragraphs, joins the direct text
    of every node that has any with single spaces and returns the result of
    ``StringUtil.remove_html_tags`` on it.
    """
    nodes = self.getElementValues("|".join((
        "//span[@id='ctl00_ContentPlaceHolder1_ucTemplate_aBrand']",
        "//span[@class='productname']",
        "//dl[@id='overview']/dd/p",
    )))
    # A missing or empty result simply yields an empty description.
    if nodes is None or len(nodes) == 0:
        nodes = []

    fragments = []
    for node in nodes:
        text = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(text) == 0:
            continue
        text = str(text.strip().encode('utf-8'))
        fragments.append(StringUtil.str_utf_encode(text))

    return StringUtil.remove_html_tags(str(' '.join(fragments)))
def __getDescription(self):
    """Build a plain-text description from the ``names`` and ``info`` blocks.

    Queries ``self.getElementValues`` for the links and heading inside
    ``div#names`` and the paragraphs inside ``div#info``, joins the direct
    text of every node that has any with single spaces and returns the
    result of ``StringUtil.remove_html_tags`` on it.
    """
    nodes = self.getElementValues("|".join((
        "//div[@id='names']/span/a",
        "//div[@id='names']/h1",
        "//div[@id='info']/div/p",
    )))
    # A missing or empty result simply yields an empty description.
    if nodes is None or len(nodes) == 0:
        nodes = []

    fragments = []
    for node in nodes:
        text = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(text) == 0:
            continue
        text = str(text.strip().encode('utf-8'))
        fragments.append(StringUtil.str_utf_encode(text))

    return StringUtil.remove_html_tags(str(' '.join(fragments)))
def __getDescription(self):
    """Build a plain-text description from the title and feature bullets.

    Queries ``self.getElementValues`` for the ``productTitle`` span and the
    list-item spans of the ``fbExpandableSectionContent`` and
    ``feature-bullets`` containers, joins the direct text of every node
    that has any with single spaces and returns the result of
    ``StringUtil.remove_html_tags`` on it.
    """
    nodes = self.getElementValues("|".join((
        "//span[@id='productTitle']",
        "//div[@id='fbExpandableSectionContent']/ul/li/span",
        "//div[@id='feature-bullets']/ul/li/span",
    )))
    # A missing or empty result simply yields an empty description.
    if nodes is None or len(nodes) == 0:
        nodes = []

    fragments = []
    for node in nodes:
        text = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(text) == 0:
            continue
        text = str(text.strip().encode('utf-8'))
        fragments.append(StringUtil.str_utf_encode(text))

    return StringUtil.remove_html_tags(str(' '.join(fragments)))
def __getDescription(self):
    """Build a plain-text description from the buy block and details block.

    Queries ``self.getElementValues`` for the heading inside ``buy-block``
    and the ``div`` / list-item children of the
    ``product-details-description clearfix`` container, joins the direct
    text of every node that has any with single spaces and returns the
    result of ``StringUtil.remove_html_tags`` on it.
    """
    nodes = self.getElementValues("|".join((
        "//div[@id='buy-block']/div/h1",
        "//div[@class='product-details-description clearfix']/div",
        "//div[@class='product-details-description clearfix']/ul/li",
    )))
    # A missing or empty result simply yields an empty description.
    if nodes is None or len(nodes) == 0:
        nodes = []

    fragments = []
    for node in nodes:
        text = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(text) == 0:
            continue
        text = str(text.strip().encode('utf-8'))
        fragments.append(StringUtil.str_utf_encode(text))

    return StringUtil.remove_html_tags(str(' '.join(fragments)))
def __getDescription(self):
    """Build a plain-text description from name, details and care nodes.

    Queries ``self.getElementValues`` for the ``itemprop='name'`` heading,
    the nested blocks of the ``extended-product-details`` container, the
    ``itemprop='description'`` paragraphs and the details-and-care list
    items. The direct text of every node that has any is joined with single
    spaces and returned after ``StringUtil.remove_html_tags``.
    """
    extended = "//div[@class='extended-product-details hide-when-immersive']"
    nodes = self.getElementValues("|".join((
        "//h1[@itemprop='name']",
        extended + "/div/div",
        extended + "/div/div/span",
        # The /div/div alternative is listed twice in the query string;
        # kept so the query text stays unchanged.
        extended + "/div/div",
        "//div[@itemprop='description']/p",
        "//div[@class='product-details-and-care module-details']/ul/li",
    )))
    # A missing or empty result simply yields an empty description.
    if nodes is None or len(nodes) == 0:
        nodes = []

    fragments = []
    for node in nodes:
        text = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(text) == 0:
            continue
        text = str(text.strip().encode('utf-8'))
        fragments.append(StringUtil.str_utf_encode(text))

    return StringUtil.remove_html_tags(str(' '.join(fragments)))
def __getDescription(self):
    """Build a plain-text description from the header and main-body nodes.

    Queries ``self.getElementValues`` for the ``h1`` / ``h2`` headings of
    ``exp-product-header`` and the bold, paragraph and list-item nodes of
    ``pi-pdpmainbody``, joins the direct text of every node that has any
    with single spaces and returns the result of
    ``StringUtil.remove_html_tags`` on it.
    """
    nodes = self.getElementValues("|".join((
        "//div[@class='exp-product-header']/h1",
        "//div[@class='exp-product-header']/h2",
        "//div[@class='pi-pdpmainbody']/p/b",
        "//div[@class='pi-pdpmainbody']/p",
        "//div[@class='pi-pdpmainbody']/li",
    )))
    # A missing or empty result simply yields an empty description.
    if nodes is None or len(nodes) == 0:
        nodes = []

    fragments = []
    for node in nodes:
        text = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(text) == 0:
            continue
        text = str(text.strip().encode('utf-8'))
        fragments.append(StringUtil.str_utf_encode(text))

    return StringUtil.remove_html_tags(str(' '.join(fragments)))
def parse_item_page(self, response):
    """Fill ``item['keywords']`` and append a training row to a CSV file.

    The item is taken from ``response.meta['item']``. The direct text of
    every node matched by the active XPath set is collected, joined with
    single spaces, passed through ``StringUtil.remove_html_tags`` and stored
    under ``'keywords'``. One row ``[category, lower-cased keywords]`` is
    then appended to ``train_data.csv`` in the current working directory.

    :param response: response object carrying the item in its ``meta``.
    :returns: the same item, with ``'keywords'`` set.
    """
    selector = Selector(response)
    item = response.meta['item']

    # XPath sets for other shops, kept for reference:
    #   Walmart: "//h1[@itemprop='name']/div"
    #            "|//div[@class='product-description-disclaimer']"
    #            "|//div[@class='about-desc']"
    #   Amazon:  "//span[@id='productTitle']"
    #            "|//div[@id='fbExpandableSectionContent']/ul/li/span"
    #            "|//div[@id='feature-bullets']/ul/li/span"
    # Active set: 6PM.
    nodes = selector.xpath("|".join((
        "//h1[@class='title']/a",
        "//div[@class='description']/ul/li/span",
        "//div[@class='description']/ul/li/a",
        "//div[@class='description']/ul/li",
    )))
    if nodes is None or len(nodes) == 0:
        nodes = []

    keywords = []
    for node in nodes:
        text = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(text) == 0:
            continue
        text = str(text.strip().encode('utf-8'))
        keywords.append(StringUtil.str_utf_encode(text))

    item['keywords'] = StringUtil.remove_html_tags(str(' '.join(keywords)))

    # Append one training row per parsed page.
    # NOTE(review): binary append mode matches the Python 2 csv module;
    # under Python 3 csv.writer needs a text-mode file - confirm the target
    # interpreter before porting.
    row = [item['category'], str(item['keywords']).lower()]
    with open('train_data.csv', 'ab') as csvfile:
        csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_ALL).writerow(row)

    return item
def __getTitleInCategoryLevel(self):
    """Collect the product titles shown on a category-level page.

    Queries ``self.getElementValues`` for the ``product-v2-name`` headings.
    Every node with non-empty direct text contributes one entry built by
    ``self.listToJson(['title', 'index'], [title, position])``, where
    ``position`` counts the contributing nodes starting at 1.

    :returns: list of the ``listToJson`` results; empty when nothing matched.
    """
    nodes = self.getElementValues("//div[@class='product-v2-name']/h1")
    if nodes is None or len(nodes) == 0:
        return []

    titles = []
    position = 0
    for node in nodes:
        raw = ''.join(node.xpath("text()").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(raw) == 0:
            continue
        position += 1
        title = StringUtil.str_utf_encode(str(raw.strip().encode('utf-8')))
        titles.append(self.listToJson(['title', 'index'], [title, position]))
    return titles
def __getTitleInCategoryLevel(self):
    """Collect the product titles shown on a category-level page.

    Queries ``self.getElementValues`` for the result headings, the
    ``productTitle`` span and the carousel spans, and reads the
    ``data-attribute`` attribute of each node. Every node with a non-empty
    value contributes one entry built by
    ``self.listToJson(['title', 'index'], [title, position])``, where
    ``position`` counts the contributing nodes starting at 1.

    :returns: list of the ``listToJson`` results; empty when nothing matched.
    """
    # NOTE(review): the last alternative compares @class with the literal
    # 'class=a-carousel', which looks like a typo - confirm against the page.
    nodes = self.getElementValues("|".join((
        "//div[@class='a-row a-spacing-micro']/a/h2",
        "//div[@class='a-row a-spacing-top-mini']/a/h2",
        "//div[@class='a-row a-spacing-mini']/a/h2",
        "//div[@class='a-row a-spacing-none']/a/h2",
        "//span[@id='productTitle']",
        "//ol[@class='class=a-carousel']/li/div/a/span",
    )))
    if nodes is None or len(nodes) == 0:
        return []

    titles = []
    position = 0
    for node in nodes:
        raw = ''.join(node.xpath("@data-attribute").extract())
        # Emptiness is checked before stripping, as in the original flow.
        if len(raw) == 0:
            continue
        position += 1
        title = StringUtil.str_utf_encode(str(raw.strip().encode('utf-8')))
        titles.append(self.listToJson(['title', 'index'], [title, position]))
    return titles