Example #1
0
 def __init__(self, transformed_dir, loaded_dir):
     """ Bind the source and destination directories, then run the load step.

     transformed_dir -- directory name to read from; passed to Directory with
                        init=False (presumably "do not recreate" -- confirm
                        against Directory)
     loaded_dir      -- directory name for the output, under "working_dir"
     """
     source = Directory("working_dir", transformed_dir, init=False)
     destination = Directory("working_dir", loaded_dir)
     self.transformed_dir = source
     self.loaded_dir = destination
     # The constructor itself triggers the browsing of the source directory.
     self.browse()
Example #2
0
 def __init__(self, extracted_dir, transformed_dir):
     """ Bind the directories, prepare the parsing helpers, then run the transformation.

     extracted_dir   -- directory name to read from; passed to Directory with
                        init=False (presumably "do not recreate" -- confirm
                        against Directory)
     transformed_dir -- directory name for the output, under "working_dir"
     """
     self.extracted_dir = Directory("working_dir",
                                    extracted_dir,
                                    init=False)
     self.transformed_dir = Directory("working_dir", transformed_dir)
     self.reg_int = re.compile(r"\d+")  # Like an integer
     # Raw string on purpose: "\." in a plain literal is an invalid escape
     # sequence and triggers a warning on recent Python versions.
     self.reg_dec = re.compile(r"([0-9]+)\.([0-9]+)")  # Like a decimal
     # Word -> ratio string lookup used for the review rating
     self.rating = {
         'One': '1/5',
         'Two': '2/5',
         'Three': '3/5',
         'Four': '4/5',
         'Five': '5/5',
     }
     # The constructor itself triggers the browsing of the source directory.
     self.browse()
Example #3
0
 def write(self, category_name, products_dict):
     """ Fetch the image of every product of a category into the category's directory.

     category_name -- used as the sub-path under the loaded directory
     products_dict -- iterable of product records, each indexed with IMAGE_URL
     """
     # Directory is instantiated for its effect only (the category's
     # directory, per the original docstring); the instance is not kept.
     Directory(self.loaded_dir.path(category_name))
     for record in products_dict:
         source_url = record[IMAGE_URL]
         # The image's file name is whatever follows the last '/'
         file_name = source_url.split('/')[-1]
         # Full destination path of the downloaded file
         destination = self.loaded_dir.path(category_name, file_name)
         request.urlretrieve(source_url, destination)
Example #4
0
class Load:
    """ Read the transformed data and hand each category over to write() """
    def __init__(self, transformed_dir, loaded_dir):
        """ Bind the source and destination directories, then run the load step.

        transformed_dir -- directory name to read from; passed to Directory
                           with init=False (presumably "do not recreate" --
                           confirm against Directory)
        loaded_dir      -- directory name for the output, under "working_dir"
        """
        self.transformed_dir = Directory("working_dir",
                                         transformed_dir,
                                         init=False)
        self.loaded_dir = Directory("working_dir", loaded_dir)
        self.browse()

    def browse(self):
        """ For each file of the transformed directory, read its json and call write() """
        for file_name in self.transformed_dir.listdir():
            json_path = self.transformed_dir.path(file_name)
            # Keep the file open only for the read, so the handle is released
            # before write() does its own (possibly long) work.
            with open(json_path, 'r', encoding='utf-8') as json_file:
                category_dict = json.load(json_file)
            # Category's name is the first key of the loaded object
            category_name = list(category_dict)[0]
            self.write(category_name, category_dict[category_name])

    def write(self, category_name, products_dict):
        """ Output hook called once per category; does nothing in this class.

        category_name -- first key of the category's json object
        products_dict -- value stored under that key
        """
        pass
Example #5
0
class ExtractMain(Extract):
    """ Work on the first page: set up the main directory and walk the categories """
    def create_dir(self):
        """ Swap the stored directory name for a Directory under "working_dir". """
        main_dir_name = self.extracted_dir
        self.extracted_dir = Directory("working_dir", main_dir_name)

    def browse(self):
        """ Start one ExtractCategory per category link found in the page """
        # Each selected anchor carries the category's label (text) and url (href)
        links = self.soup.select(
            'div.side_categories ul.nav.nav-list li ul li a')
        for link in links:
            category_url = self.urlpath(link['href'])
            label = link.get_text().strip()
            # ExtractCategory is instantiated for its effect; nothing is kept
            ExtractCategory(category_url, self.extracted_dir.path(label))
Example #6
0
class Transform:
    """ Transform contents of a product's page """
    def __init__(self, extracted_dir, transformed_dir):
        """ Bind the directories, prepare the parsing helpers, then run the transformation.

        extracted_dir   -- directory name to read from; passed to Directory
                           with init=False (presumably "do not recreate" --
                           confirm against Directory)
        transformed_dir -- directory name for the output, under "working_dir"
        """
        self.extracted_dir = Directory("working_dir",
                                       extracted_dir,
                                       init=False)
        self.transformed_dir = Directory("working_dir", transformed_dir)
        self.reg_int = re.compile(r"\d+")  # Like an integer
        # Raw string on purpose: "\." in a plain literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        self.reg_dec = re.compile(r"([0-9]+)\.([0-9]+)")  # Like a decimal
        # Word -> ratio string lookup used for the review rating
        self.rating = {
            'One': '1/5',
            'Two': '2/5',
            'Three': '3/5',
            'Four': '4/5',
            'Five': '5/5',
        }
        self.browse()

    def browse(self):
        """ Browse categories and their products; write one json file per category.

        Each file is named "<category>.json" and holds a single-key object:
        the category's name mapped to the list of its transformed products.
        """
        for category_name in self.extracted_dir.listdir():
            category_extracted_dir = self.extracted_dir.path(category_name)
            # NOTE(review): path() is given a value it already produced;
            # whether that yields the intended file path depends on how
            # Directory.path joins its arguments -- confirm.
            products = [
                self.transform_product(
                    self.extracted_dir.path(category_extracted_dir,
                                            product_name),
                    category_name, product_name)
                for product_name in listdir(category_extracted_dir)
            ]

            category_file = self.transformed_dir.path(category_name + ".json")
            with open(category_file, 'w', encoding='utf-8') as json_file:
                json.dump({category_name: products}, json_file)

    def transform_product(self, product_file, category_name, product_name):
        """ Parse a saved product page and return its pertinent data as a dict.

        product_file  -- path of the html file to read
        category_name -- stored as-is under CATEGORY
        product_name  -- file name, turned back into the product page's url
        """
        with open(product_file, "r", encoding='utf-8') as f:
            contents = f.read()
        # Parsing only needs the text, so it happens once the file is closed
        soup = BeautifulSoup(contents, "html.parser")

        elements = {}
        elements[TITLE] = soup.find('h1').get_text()
        # File name gives the url once SEPARATOR is turned back into '/'
        elements[PRODUCT_PAGE_URL] = 'http:' + product_name.replace(
            SEPARATOR, '/')
        # The values below are read from fixed row positions of the page's
        # <tr> elements (rows 0, 2, 3 and 5).
        trs = soup.find_all('tr')
        elements[UNIVERSAL_PRODUCT_CODE] = trs[0].td.get_text()
        elements[CATEGORY] = category_name
        # [0] on a match is the whole matched text
        elements[PRICE_INCLUDING_TAX] = self.reg_dec.search(
            trs[2].td.get_text())[0]
        elements[PRICE_EXCLUDING_TAX] = self.reg_dec.search(
            trs[3].td.get_text())[0]
        elements[NUMBER_AVAILABLE] = self.reg_int.search(
            trs[5].td.get_text())[0]
        elements[IMAGE_URL] = urljoin(URL_TO_SCRAP, soup.find('img')['src'])
        elements[PRODUCT_DESCRIPTION] = soup.find_all('p')[3].get_text()
        # The second class of the star-rating paragraph is the key looked up
        # in self.rating
        star_rating = soup.find('p', attrs={
            'class': 'star-rating'
        }).attrs['class'][1]
        # Transform star rating in ratio
        elements[REVIEW_RATING] = self.rating[star_rating]

        return elements
Example #7
0
 def create_dir(self):
     """ Replace the stored category path with a Directory built from it. """
     category_path = self.extracted_dir
     self.extracted_dir = Directory(category_path)
Example #8
0
 def create_dir(self):
     """ Swap the stored directory name for a Directory under "working_dir". """
     main_dir_name = self.extracted_dir
     self.extracted_dir = Directory("working_dir", main_dir_name)