def extract(self, target):
    """Collect the sub-category links found on the page at ``target``.

    The page is fetched through ``self.get_page``; when that returns
    ``None`` the method returns immediately without logging or writing.
    Otherwise every ``<a>`` inside each ``li`` matched by
    ``find_all('li', 'category-item')`` contributes one ``(name, url)`` row,
    and all rows are handed to ``write_csv`` together with
    ``SUB_CATEGORY_CSV`` and mode ``'ab'``. ``write_csv`` is called even
    when no rows were collected.

    Args:
        target: Page locator passed to ``self.get_page`` and echoed in the
            log messages.

    Returns:
        None.
    """
    page = self.get_page(target)
    if page is None:
        return

    logger.info('start to extract sub category in %s', target)

    category_list = []
    for category_item in page.find_all('li', 'category-item'):
        for anchor in category_item.find_all('a'):
            name = to_unicode(anchor.string)
            if name is None:
                # No single string is available for this anchor (presumably
                # it has several child nodes -- confirm against the parser's
                # docs). Build the name from all of the anchor's text
                # fragments. The previous code read ``name.strings`` here,
                # which dereferenced None and raised AttributeError.
                name = to_unicode(''.join(anchor.strings))
            # Drop everything from the first non-breaking space onwards.
            name = re.sub(r'\xa0.+', '', name)
            # NOTE(review): plain subscript -- an anchor without an href
            # will likely raise here (KeyError under Beautiful Soup); confirm
            # whether such anchors can occur on these pages.
            url = to_unicode(anchor['href'])
            category_list.append((name, url))

    write_csv(SUB_CATEGORY_CSV, 'ab', category_list)
    logger.info('finished extracting sub category in %s', target)
def extract_top_category(self):
    """Collect the top-level category links from the ``/browse`` page.

    The page is fetched through ``self.get_page``; when that returns
    ``None`` the method returns immediately without logging or writing.
    For each ``li`` matched by ``find_all('li', 'category-item')`` only the
    element's ``.a`` anchor is considered, and a ``(name, url)`` row is kept
    only when both values are truthy. If no rows were kept an error is
    logged and nothing is written; otherwise the rows are handed to
    ``write_csv`` together with ``TOP_CATEGORY_CSV`` and mode ``'wb'``.

    Returns:
        None.
    """
    target = '/browse'
    page = self.get_page(target)
    if page is None:
        return

    logger.info('start to extract top category in %s', target)

    rows = []
    for item in page.find_all('li', 'category-item'):
        link = item.a
        if link is not None:
            label = to_unicode(link.string)
            href = to_unicode(link['href'])
            # Keep the row only when both the label and the link are non-empty.
            if label and href:
                rows.append((label, href))

    if not rows:
        logger.error('no category found')
        return

    write_csv(TOP_CATEGORY_CSV, 'wb', rows)
    logger.info("finished extracting top category into '%s'", TOP_CATEGORY_CSV)