Example #1
0
    def parseData(self, content, url):
        """Extract phone listings from a parsed thegioididong page.

        Looks up the ``ul.homeproduct`` list in ``content``, takes the first
        link of every ``li`` inside it and builds one ``PhoneData`` per link
        from its ``img``, ``h3`` and ``div.price`` children.  Entries that
        fail to parse are reported on stdout and skipped.

        Args:
            content: Parsed page exposing ``find`` (presumably a
                BeautifulSoup object -- confirm against the caller).
            url: Page address; used to resolve relative links and in the
                progress messages.

        Returns:
            List of the ``PhoneData`` objects that were built successfully.

        Raises:
            NoProductFoundException: If the product list is missing or
                contains no linked items.
        """
        listMobile = []
        listProduct = content.find('ul', attrs={'class': 'homeproduct'})
        # A missing container means there is nothing to scrape; report it
        # as "no products" rather than failing on None.findAll below.
        if listProduct is None:
            raise NoProductFoundException
        anchors = (x.find('a', href=True) for x in listProduct.findAll('li'))
        # Items without a link cannot become a product entry, and leaving
        # the None placeholders in would defeat the emptiness check.
        allProducts = [a for a in anchors if a is not None]
        if not allProducts:
            raise NoProductFoundException
        for a in allProducts:
            try:
                image_html = ScrapEngine.hideInvalidTag(
                    a.find('img'), ['strike'])
                name_html = ScrapEngine.hideInvalidTag(a.find('h3'),
                                                       ['strike'])
                price_html = ScrapEngine.hideInvalidTag(
                    a.find('div', attrs={'class': 'price'}),
                    ['strike', 'span'])
                # Prefer 'src'; fall back to 'data-original' (presumably
                # used for lazily loaded images), else keep the placeholder.
                image_src = "NA"
                if 'src' in image_html.attrs:
                    image_src = image_html['src']
                elif 'data-original' in image_html.attrs:
                    image_src = image_html['data-original']
                name = ScrapEngine.processString(name_html.getText(),
                                                 self.ignoreTerm)
                price = ScrapEngine.processString(price_html.getText(),
                                                  self.ignoreTerm)
                href = urljoin(url, a['href'])
                try:
                    listMobile.append(
                        PhoneData(brand=name,
                                  model="",
                                  price=price,
                                  vendor="thegioididong",
                                  info={
                                      "url": href,
                                      "img": image_src
                                  }))
                except PhoneDataInvalidException as error:
                    print("Unable to parse: " + name + ": " + price +
                          ". Error:" + str(error))
            except Exception as e:
                # Best effort: one malformed item must not abort the page.
                print("Error: " + str(e))
        print("Done with: " + url)
        print("Found {} items".format(str(len(listMobile))))
        return listMobile
Example #2
0
 def test_connecttoWebsite(self):
     """Loading the local HTML fixture must yield a non-None result."""
     url = os.path.dirname(
         os.path.realpath(__file__)) + "/../testdata/testWebsite.html"
     ignoreTerm = ["Chính hãng", "Chính Hãng", "-"]
     try:
         result = ScrapEngine.connectToStaticWebSite(url, ignoreTerm)
     except Exception as error:
         # Only the load itself is guarded, and the cause is kept in the
         # failure message so the real error is not hidden.
         self.fail("Unable to connect to website: {}".format(error))
     # Asserted outside the try block so an assertion failure is reported
     # as such instead of being relabelled as a connection problem.
     self.assertIsNotNone(result, "Beautiful soup result is none")
Example #3
0
 def getAllPages(self):
     """Load ``self.url`` via the button-click loader and parse it.

     Up to three attempts are made.  Any exception raised while loading or
     parsing is printed together with the zero-based attempt number and
     the next attempt starts.

     Returns:
         The value returned by ``parseData`` for the first attempt that
         succeeds, or ``None`` when all three attempts raise.
     """
     attempt = 0
     while attempt < 3:
         try:
             page = ScrapEngine.connectToWebsiteWithBtnClick(
                 self.url, self.param)
             return self.parseData(page, self.url)
         except Exception as err:
             message = "Attempt {}. Error scraping data: {}".format(
                 str(attempt), str(err))
             print(message)
         attempt += 1
     # Every attempt failed.
     return None
Example #4
0
    def parseData(self, content, url):
        """Extract phone listings from a parsed hoanghaMobile page.

        Looks up the ``div.product-list`` container in ``content`` and builds
        one ``PhoneData`` per ``div.list-item`` from its ``img``,
        ``div.product-name`` and ``div.product-price`` children.  Entries
        that fail to parse are reported on stdout and skipped.

        Args:
            content: Parsed page exposing ``find`` (presumably a
                BeautifulSoup object -- confirm against the caller).
            url: Page address; used to resolve relative links and in the
                progress message.

        Returns:
            List of the ``PhoneData`` objects that were built successfully.

        Raises:
            NoProductFoundException: If the product container is missing or
                holds no items.
        """
        listMobile = []
        listProduct = content.find('div', attrs={'class': 'product-list'})
        # A missing container means there is nothing to scrape; report it
        # as "no products" rather than failing on None.findAll below.
        if listProduct is None:
            raise NoProductFoundException
        allProducts = listProduct.findAll('div', attrs={'class': 'list-item'})
        if not allProducts:
            raise NoProductFoundException
        for a in allProducts:
            try:
                # Tag lookup happens inside the try block so an item with
                # missing markup is skipped instead of aborting the page.
                image_html = ScrapEngine.hideInvalidTag(
                    a.find('img'), ['strike'])
                name_html = ScrapEngine.hideInvalidTag(
                    a.find('div', attrs={'class': 'product-name'}),
                    ['strike'])
                price_html = ScrapEngine.hideInvalidTag(
                    a.find('div', attrs={'class': 'product-price'}),
                    ['strike'])
                image_src = image_html['src']
                name = ScrapEngine.processString(name_html.getText(),
                                                 self.ignoreTerm)
                price = ScrapEngine.processString(price_html.getText(),
                                                  self.ignoreTerm)
                # The product link is the anchor inside the name element.
                temp = name_html.find('a', href=True)
                href = urljoin(url, temp['href'])
                try:
                    listMobile.append(
                        PhoneData(brand=name,
                                  model="",
                                  price=price,
                                  vendor="hoanghaMobile",
                                  info={
                                      "url": href,
                                      "img": image_src
                                  }))
                except PhoneDataInvalidException as error:
                    print("Unable to parse: " + name + ": " + price +
                          ". Error:" + str(error))
            except Exception as e:
                # Best effort: one malformed item must not abort the page.
                print("Error: " + str(e))
        print("Done with: " + url)
        return listMobile
Example #5
0
    def test_hideTag(self):
        """hideInvalidTag must leave none of the listed tags findable.

        Checked first on the HTML fixture file (``a`` and ``li`` disappear
        while ``div`` elements remain), then on an inline price snippet
        whose ``strike`` element must disappear.
        """
        tag = ['a', 'li', 'strike']
        url = os.path.dirname(
            os.path.realpath(__file__)) + "/../testdata/testWebsite.html"
        # Read inside a context manager so the fixture file is closed even
        # when parsing or an assertion fails.
        with open(url, encoding='utf8') as content:
            soup = BeautifulSoup(content.read(), features="html.parser")
        result = ScrapEngine.hideInvalidTag(soup, tag)
        temp1 = result.findAll('a')
        temp2 = result.findAll('li')
        temp3 = result.findAll('div')
        self.assertEqual(0, len(temp1))
        self.assertEqual(0, len(temp2))
        self.assertNotEqual(0, len(temp3))

        temp = "<div class=\"product-price\"><strike>33.990.000 ₫</strike> 29.890.000 ₫</div>"
        soup = BeautifulSoup(temp, features="html.parser")
        # Sanity check: the tag is present before it is hidden.
        temp1 = soup.findAll('strike')
        self.assertEqual(1, len(temp1))
        result = ScrapEngine.hideInvalidTag(soup, tag)
        temp1 = result.findAll('strike')
        self.assertEqual(0, len(temp1))
Example #6
0
 def test_processString(self):
     """processString must drop the ignored symbols and outer whitespace."""
     raw_text = "     hello world 213 #$%@*^)@    "
     # One ignore term per character, in the same order as before.
     ignored_terms = list("!@#$%^&*)(")
     expected = "hello world 213"
     actual = ScrapEngine.processString(raw_text, ignored_terms)
     self.assertEqual(expected, actual)
Example #7
0
 def getOnePage(self, URL):
     """Load ``URL`` as a static page and return its parsed listings.

     The page is fetched with ``ScrapEngine.connectToStaticWebSite`` using
     ``self.ignoreTerm`` and handed to ``parseData`` together with ``URL``.
     Exceptions from either step propagate to the caller.
     """
     content = ScrapEngine.connectToStaticWebSite(URL, self.ignoreTerm)
     return self.parseData(content, URL)