def parseData(self, content, url):
    """Parse a thegioididong product-listing page into PhoneData records.

    Args:
        content: BeautifulSoup document of the listing page.
        url: page URL, used to resolve relative product links.

    Returns:
        List of successfully parsed PhoneData entries.

    Raises:
        NoProductFoundException: when the page contains no product anchors.
    """
    listMobile = []
    listProduct = content.find('ul', attrs={'class': 'homeproduct'})
    temp = listProduct.findAll('li')
    allProducts = [x.find('a', href=True) for x in temp]
    if not allProducts:
        raise NoProductFoundException
    for a in allProducts:
        try:
            # Strip struck-through (old price / promo) markup before reading text.
            image_html = ScrapEngine.hideInvalidTag(a.find('img'), ['strike'])
            name_html = ScrapEngine.hideInvalidTag(a.find('h3'), ['strike'])
            price_html = ScrapEngine.hideInvalidTag(
                a.find('div', attrs={'class': 'price'}), ['strike', 'span'])

            # Lazy-loaded images keep the real URL in 'data-original'
            # instead of 'src'.
            image_src = "NA"
            if 'src' in image_html.attrs:
                image_src = image_html['src']
            elif 'data-original' in image_html.attrs:
                image_src = image_html['data-original']

            name = ScrapEngine.processString(name_html.getText(),
                                             self.ignoreTerm)
            price = ScrapEngine.processString(price_html.getText(),
                                              self.ignoreTerm)
            href = urljoin(url, a['href'])
            try:
                listMobile.append(
                    PhoneData(brand=name,
                              model="",
                              price=price,
                              vendor="thegioididong",
                              info={
                                  "url": href,
                                  "img": image_src
                              }))
            except PhoneDataInvalidException as error:
                # Skip invalid entries but keep scraping the rest of the page.
                print("Unable to parse: " + name + ": " + price + ". Error:" +
                      str(error))
        except Exception as e:
            # Best-effort scrape: one malformed product must not abort the page.
            print("Error: " + str(e))
    print("Done with: " + url)
    print("Found {} items".format(str(len(listMobile))))
    return listMobile
def test_connecttoWebsite(self):
    """ScrapEngine should return a non-None soup for a local test page."""
    url = os.path.dirname(
        os.path.realpath(__file__)) + "/../testdata/testWebsite.html"
    ignoreTerm = ["Chính hãng", "Chính Hãng", "-"]
    try:
        result = ScrapEngine.connectToStaticWebSite(url, ignoreTerm)
    except Exception as e:
        # The original bare `except:` also swallowed the assertion failure
        # below, replacing its message with a generic one. Catch only
        # connection errors and report the cause.
        self.fail("Unable to connect to website: " + str(e))
    self.assertIsNotNone(result, "Beautiful soup result is none")
def getAllPages(self):
    """Scrape the listing page with up to three attempts.

    Returns the parsed product list on the first successful attempt;
    returns None if every attempt fails.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            page = ScrapEngine.connectToWebsiteWithBtnClick(
                self.url, self.param)
            return self.parseData(page, self.url)
        except Exception as e:
            # Log the failed attempt and retry.
            print("Attempt {}. Error scraping data: {}".format(
                str(attempt), str(e)))
def parseData(self, content, url):
    """Parse a hoanghaMobile product-listing page into PhoneData records.

    Args:
        content: BeautifulSoup document of the listing page.
        url: page URL, used to resolve relative product links.

    Returns:
        List of successfully parsed PhoneData entries.

    Raises:
        NoProductFoundException: when the page contains no product items.
    """
    listMobile = []
    listProduct = content.find('div', attrs={'class': 'product-list'})
    allProducts = listProduct.findAll('div', attrs={'class': 'list-item'})
    if not allProducts:
        raise NoProductFoundException
    for a in allProducts:
        # Strip struck-through (old price) markup before reading text.
        image_html = ScrapEngine.hideInvalidTag(a.find('img'), ['strike'])
        name_html = ScrapEngine.hideInvalidTag(
            a.find('div', attrs={'class': 'product-name'}), ['strike'])
        price_html = ScrapEngine.hideInvalidTag(
            a.find('div', attrs={'class': 'product-price'}), ['strike'])
        try:
            image_src = image_html['src']
            name = ScrapEngine.processString(name_html.getText(),
                                             self.ignoreTerm)
            price = ScrapEngine.processString(price_html.getText(),
                                              self.ignoreTerm)
            # The product link lives inside the name element on this site.
            temp = name_html.find('a', href=True)
            href = urljoin(url, temp['href'])
            try:
                listMobile.append(
                    PhoneData(brand=name,
                              model="",
                              price=price,
                              vendor="hoanghaMobile",
                              info={
                                  "url": href,
                                  "img": image_src
                              }))
            except PhoneDataInvalidException as error:
                # Skip invalid entries but keep scraping the rest of the page.
                print("Unable to parse: " + name + ": " + price + ". Error:" +
                      str(error))
        except Exception as e:
            # Best-effort scrape: one malformed product must not abort the page.
            print("Error: " + str(e))
    print("Done with: " + url)
    return listMobile
def test_hideTag(self):
    """hideInvalidTag must remove listed tags and leave others intact."""
    tag = ['a', 'li', 'strike']
    url = os.path.dirname(
        os.path.realpath(__file__)) + "/../testdata/testWebsite.html"
    # Context manager closes the fixture file (original leaked the handle).
    with open(url, encoding='utf8') as content:
        soup = BeautifulSoup(content.read(), features="html.parser")
    result = ScrapEngine.hideInvalidTag(soup, tag)
    self.assertEqual(0, len(result.findAll('a')))
    self.assertEqual(0, len(result.findAll('li')))
    # Tags not in the blocklist must survive.
    self.assertNotEqual(0, len(result.findAll('div')))

    temp = "<div class=\"product-price\"><strike>33.990.000 ₫</strike> 29.890.000 ₫</div>"
    soup = BeautifulSoup(temp, features="html.parser")
    self.assertEqual(1, len(soup.findAll('strike')))
    result = ScrapEngine.hideInvalidTag(soup, tag)
    self.assertEqual(0, len(result.findAll('strike')))
def test_processString(self):
    """processString strips ignored characters and trims whitespace."""
    raw = " hello world 213 #$%@*^)@ "
    ignored = ["!", "@", "#", "$", "%", "^", "&", "*", ")", "("]
    expected = "hello world 213"
    self.assertEqual(expected, ScrapEngine.processString(raw, ignored))
def getOnePage(self, URL):
    """Fetch one static page and return its parsed product list."""
    soup = ScrapEngine.connectToStaticWebSite(URL, self.ignoreTerm)
    return self.parseData(soup, URL)