def write(self, category_name, products_dict):
    """ Create the category's directory, browse its products and, for each, write the image """
    category_images_dir = self.loaded_dir.path(category_name)
    Directory(category_images_dir)
    for product in products_dict:
        image_url = product[IMAGE_URL]
        # Keep the last path segment (name of the image + '.jpg')
        image_file = image_url.split('/')[-1]
        # Absolute path of the file
        image_file = self.loaded_dir.path(category_name, image_file)
        # requires: from urllib import request
        request.urlretrieve(image_url, image_file)
class Load:
    """ Read the transformed data and make csv files, one for each category """

    def __init__(self, transformed_dir, loaded_dir):
        """ (Re)create main directory """
        self.transformed_dir = Directory("working_dir", transformed_dir, init=False)
        self.loaded_dir = Directory("working_dir", loaded_dir)
        self.browse()

    def browse(self):
        """ For each category, read the .json file, create a .csv file """
        for category_file in self.transformed_dir.listdir():
            category_file = self.transformed_dir.path(category_file)
            with open(category_file, 'r', encoding='utf-8') as json_file:
                category_dict = json.load(json_file)
                # Category's name is the first key
                category_name = list(category_dict)[0]
                products_dict = category_dict[category_name]
                self.write(category_name, products_dict)

    def write(self, category_name, products_dict):
        pass
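# Sketch (assumption): Load.write above is left as a stub; a CSV-writing
# subclass could look like the following. The name LoadCsv and the field
# order are illustrative, not taken from the source.
import csv

class LoadCsv(Load):
    def write(self, category_name, products_dict):
        """ Write one .csv file per category, one row per product """
        category_file = self.loaded_dir.path(category_name + ".csv")
        with open(category_file, 'w', encoding='utf-8', newline='') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=list(products_dict[0]))
            writer.writeheader()
            writer.writerows(products_dict)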
class ExtractMain(Extract):
    """ Extract contents of the first page and browse the categories """

    def create_dir(self):
        """ Remove / (re)create the main directory: extracted. """
        self.extracted_dir = Directory("working_dir", self.extracted_dir)

    def browse(self):
        """ Browse categories """
        # Extract categories from the page: label and url
        categories = self.soup.select(
            'div.side_categories ul.nav.nav-list li ul li a')
        for result in categories:
            url_category = self.urlpath(result['href'])
            category = result.get_text().strip()
            extracted_dir = self.extracted_dir.path(category)
            ExtractCategory(url_category, extracted_dir)
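# Minimal sketch of what the selector above matches, run on a hand-written
# html snippet (illustrative, not the real page): only the links nested one
# level below the top entry are selected.
from bs4 import BeautifulSoup

_html = """
<div class="side_categories">
  <ul class="nav nav-list">
    <li><a href="catalogue/category/books_1/index.html">Books</a>
      <ul>
        <li><a href="catalogue/category/books/travel_2/index.html"> Travel </a></li>
      </ul>
    </li>
  </ul>
</div>
"""
_links = BeautifulSoup(_html, "html.parser").select(
    'div.side_categories ul.nav.nav-list li ul li a')
# -> one match: href 'catalogue/category/books/travel_2/index.html', text 'Travel'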
class Transform:
    """ Transform contents of a product's page """

    def __init__(self, extracted_dir, transformed_dir):
        self.extracted_dir = Directory("working_dir", extracted_dir, init=False)
        self.transformed_dir = Directory("working_dir", transformed_dir)
        self.reg_int = re.compile(r"\d+")  # Like an integer
        self.reg_dec = re.compile(r"([0-9]+)\.([0-9]+)")  # Like a decimal
        self.rating = {
            'One': '1/5',
            'Two': '2/5',
            'Three': '3/5',
            'Four': '4/5',
            'Five': '5/5'
        }
        self.browse()

    def browse(self):
        """ Browse categories and their products; write one .json file of
        transformed products per category """
        for category_name in self.extracted_dir.listdir():
            category = {}
            products = []
            category[category_name] = products
            category_extracted_dir = self.extracted_dir.path(category_name)
            for product_name in listdir(category_extracted_dir):
                product_file = self.extracted_dir.path(category_extracted_dir,
                                                       product_name)
                products.append(
                    self.transform_product(product_file, category_name,
                                            product_name))
            category_file = self.transformed_dir.path(category_name + ".json")
            with open(category_file, 'w', encoding='utf-8') as json_file:
                json.dump(category, json_file)

    def transform_product(self, product_file, category_name, product_name):
        """ Parse a product's html contents and return the pertinent data """
        with open(product_file, "r", encoding='utf-8') as f:
            contents = f.read()
        soup = BeautifulSoup(contents, "html.parser")
        elements = {}
        elements[TITLE] = soup.find('h1').get_text()
        # The file name encodes the product page url
        elements[PRODUCT_PAGE_URL] = 'http:' + product_name.replace(SEPARATOR, '/')
        trs = soup.find_all('tr')
        elements[UNIVERSAL_PRODUCT_CODE] = trs[0].td.get_text()
        elements[CATEGORY] = category_name
        elements[PRICE_INCLUDING_TAX] = self.reg_dec.search(trs[2].td.get_text())[0]
        elements[PRICE_EXCLUDING_TAX] = self.reg_dec.search(trs[3].td.get_text())[0]
        elements[NUMBER_AVAILABLE] = self.reg_int.search(trs[5].td.get_text())[0]
        elements[IMAGE_URL] = urljoin(URL_TO_SCRAP, soup.find('img')['src'])
        elements[PRODUCT_DESCRIPTION] = soup.find_all('p')[3].get_text()
        # Transform the star rating ('One'..'Five') into a ratio ('1/5'..'5/5')
        star_rating = soup.find('p', attrs={'class': 'star-rating'}).attrs['class'][1]
        elements[REVIEW_RATING] = self.rating[star_rating]
        return elements
def create_dir(self):
    """ Create category's directory. """
    self.extracted_dir = Directory(self.extracted_dir)
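# Pipeline sketch (assumption): the classes seem designed to chain on
# directory names. Only "working_dir" is confirmed by the Directory(...)
# calls; the directory names below and the ExtractMain entry point are
# illustrative.
if __name__ == '__main__':
    # ExtractMain(URL_TO_SCRAP, "extracted")  # Extract.__init__ is not shown here
    Transform("extracted", "transformed")     # html files -> one .json per category
    Load("transformed", "loaded")             # .json files -> handed to write(), stubbed above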