def getCoursesCards(soup, isSetCategories=False):
    """Extract the course cards listed on a courses page.

    Args:
        soup: Parsed page exposing ``select`` (CSS selector lookup).
        isSetCategories: When true, every category page returned by
            ``getCategoriesURLs(soup)`` is fetched, and each card of this
            page that also appears there (matched on ``'publicId'``) gets
            the category name added to its ``'categories'`` list.

    Returns:
        A list with one ``extractCourseCard`` result per card element, in
        page order. Empty when the page has no cards.
    """
    data = [
        extractCourseCard(card)
        for card in soup.select('div.row.course-list.list > div')
    ]

    # Without cards there is nothing to tag, so skip the category crawl
    # entirely (callers paging until an empty page would otherwise fetch
    # every category page once more for no result).
    if not (isSetCategories and data):
        return data

    s = Scraper()
    for category in getCategoriesURLs(soup):
        name = category.get('name', '')
        s.get(category.get('url'))
        categoryCards = getCoursesCards(s.html_soup())
        # Set membership keeps the matching pass linear in the card count.
        # NOTE(review): assumes 'publicId' values are hashable (e.g. str).
        ids = {c['publicId'] for c in categoryCards}
        for course in data:
            if course['publicId'] not in ids:
                continue
            if course.get('categories'):
                course['categories'].append(name)
            else:
                # Missing, None or empty: start a fresh list.
                course['categories'] = [name]
    return data
def getTheGoodZoneDataAndMyPathes():
    """Scrape the site and return the scraped items grouped with their paths.

    Fetches the ``/courses`` and ``/blog`` pages, then gathers courses,
    coaching data and blogs through ``getCourses``, ``getCoaches`` and
    ``getBlogs``.

    Returns:
        A list of dicts, each holding an ``'items'`` collection plus a
        ``'setPath'`` string and, for all but the course-description entry,
        a ``'delPath'`` string.
        NOTE(review): the paths look like API routes used by the caller to
        delete/store the items; confirm against the consumer.
    """
    s = Scraper()
    s.get(f'{protocol}://{domain}/courses')
    coursesSoup = s.html_soup()
    s.get(f'{protocol}://{domain}/blog')
    blogsSoup = s.html_soup()

    # Evaluate in the same order as before: courses, coaching data, blogs.
    courses = getCourses()
    # One call serves all four coaching-derived entries instead of
    # recomputing the same result once per key.
    coaching = getCoaches(coursesSoup)

    return [
        {
            'delPath': '/delete/courses/',
            'setPath': '/set/course/',
            'items': courses,
        },
        {
            'delPath': '/delete/coaches/',
            'setPath': '/set/coach/',
            'items': coaching['coach'],
        },
        {
            'delPath': '/delete/instructors/',
            'setPath': '/set/instructor/',
            'items': coaching['course'],
        },
        {
            'delPath': '/delete/live-events/',
            'setPath': '/set/live-event/',
            'items': coaching['live event'],
        },
        {
            'setPath': '/set-course-description/',
            'items': coaching['description'],
        },
        {
            'delPath': '/delete/blogs/',
            'setPath': '/set/blog/',
            'items': getBlogs(blogsSoup),
        },
    ]
def extraBlogData(url):
    """Fetch a blog post page and return its public id and thumbnail.

    Args:
        url: Blog post URL. Its fifth ``'/'``-separated segment is used as
            the public id (for ``'https://host/blog/<id>'`` that is
            ``'<id>'``).

    Returns:
        A dict with ``'publicId'`` and ``'thumbnail'``. The thumbnail is the
        ``src`` attribute of the element matched by ``'.post-content img'``,
        or ``None`` when no image matches or the image has no ``src``.

    Raises:
        IndexError: If ``url`` has fewer than five ``'/'``-separated parts.
    """
    s = Scraper()
    s.get(url)
    soup = s.html_soup()

    thumbnailElm = soup.select_one('.post-content img')
    return {
        'publicId': url.split('/')[4],
        # .get() instead of ['src']: an <img> without a src attribute
        # (e.g. lazy-loaded) yields None rather than raising KeyError.
        'thumbnail': thumbnailElm.get('src') if thumbnailElm else None,
    }
def getCoachingCards(soup):
    """Collect the coaching cards of a listing page, grouped by card type.

    Each card element is converted with ``extractCoachingCard``; the card's
    own page (its ``'url'``) is then fetched and the text of its
    ``.course-block.custom_html`` element is fed to ``parseCoaching``. Cards
    whose page lacks that element, or whose text mentions none of the known
    type keywords, are dropped.

    Args:
        soup: Parsed listing page exposing ``select``.

    Returns:
        A dict with the keys ``'coach'``, ``'live event'``, ``'course'`` and
        ``'description'``, each mapping to the list of cards whose ``'type'``
        (as produced by ``handleCoachingCardData``) equals that key.
    """
    data = {
        # Scratch list of raw cards; removed again before returning.
        'total': [
            extractCoachingCard(card)
            for card in soup.select('div.row.services-row.list > div')
        ],
        'coach': [],
        'live event': [],
        'course': [],
        'description': [],
    }

    keywords = ('description', 'course', 'coach', 'live event')
    scraper = Scraper()
    for card_data in data['total']:
        scraper.get(card_data.get('url'))
        detail_soup = scraper.html_soup()
        description_elm = detail_soup.select_one('.course-block.custom_html')
        if not description_elm:
            continue

        description_text = description_elm.text
        if not any(keyword in description_text for keyword in keywords):
            continue

        card_data.update(parseCoaching(description_text))
        parsed = handleCoachingCardData(card_data)
        # The card's 'type' picks its bucket and is removed from the card.
        data[parsed.pop('type')].append(parsed)

    del data['total']
    return data
def getCourses():
    """Return the course cards of every page of the paginated course list.

    Pages are requested as ``/courses?page=N`` starting at ``N = 1`` until a
    page yields no cards; each page is parsed with
    ``getCoursesCards(..., isSetCategories=True)``.

    Returns:
        A single list with the cards of all non-empty pages, in page order.
    """
    scraper = Scraper()
    collected = []
    page_num = 1
    while True:
        scraper.get(f'{protocol}://{domain}/courses?page={page_num}')
        page_courses = getCoursesCards(
            scraper.html_soup(), isSetCategories=True)
        if not page_courses:
            # First empty page marks the end of the listing.
            break
        collected.extend(page_courses)
        page_num += 1
    return collected