def get_auth(self, force=False): """ Retrieve sessionid to retrieve content (using our own account) Is cached via memcached for 1day @param boolean force - [False] Force cache refresh @return string - sessionid """ cache_key = "login" session = mc.get(cache_key) if force or session == None: with mc.lock(cache_key) as retries: # Check if we set memcached while we were waiting for the lock if retries: session = mc.get(cache_key) if session: return session['sessionid'] print('GET ' + cache_key) session = self.login("66b1d91e8e", "66b1d91e8e66b1d91e8e!") mc.set(cache_key, session, time=60 * 60 * 24) return session['sessionid']
def courses(self, lang, page=1, cat="", query=""): """ Retrieve the list of courses for the given language, category, query string and page Is cached via memcached for 24hours (except if query != "") @param string lang @param integer[optional] page - [1] @param string[optional] cat - [""] @param string[optional] query - [""] @return string - Retrieved JSON """ if not isinstance(page, int) and not page.isdigit(): page = 0 # Check cache if query != "": cache_key = False courses = None else: cache_key = lang + '_courses_' + str(page) + '_' + cat courses = mc.get(cache_key) # Query memrise if courses == None: with mc.lock(cache_key) as retries: # Check if we set memcached while we were waiting for the lock if retries: courses = mc.get(cache_key) if courses: return courses if cache_key: print('GET ' + cache_key) url = 'https://app.memrise.com/ajax/browse/?s_cat=' + lang if cat != "": url += "&cat=" + cat if query != "": url += "&q=" + query url += '&page=' + str(page) + '&_=' + get_time() courses = requests.get(url, headers={ "Accept-Language": "fr;q=0.8,en-US;q=0.5,en;q=0.3" }).text if cache_key: mc.set(cache_key, courses, time=60 * 60 * 24) return courses
def level_multimedia(self, urlCourse, lvl): """ Retrieve the content of a multimedia level Is cached via memcached for 24hours @throws requests.exceptions.HTTPError @param string urlCourse - ex "/course/43238/durham-university-medicine-year-one/" @param integer lvl @return dict - Retrieved JSON """ pattern = re.search("/course/(\d+)/", urlCourse) if pattern: idCourse = pattern.group(1) else: return False cache_key = "course_" + idCourse + "_" + lvl + "_multimedia" data = mc.get(cache_key) if data == None: with mc.lock(cache_key) as retries: # Check if we set memcached while we were waiting for the lock if retries: data = mc.get(cache_key) if data: return data url = "https://app.memrise.com" + urlCourse + lvl + "/" response = requests.get(url) response.raise_for_status() # Get response html = response.text.encode('utf-8').strip() DOM = BeautifulSoup(html, "html5lib", from_encoding='utf-8') data = False # Look for value of js variable "level_multimedia" scripts = DOM.html.body.find_all("script", recursive=False) for script in scripts: text = script.text.strip() if text and text.startswith("var level_multimedia = "): data = text[23:].strip(';') break mc.set(cache_key, data, time=60 * 60 * 24) return data
def categories(self, lang): """ Retrieve the list of categories that have courses for the given language Is cached via memcached for 24hours @param string lang @return dict - {<idCourse>: True} """ cache_key = lang + "_categories" categories = mc.get(cache_key) # Query memrise if categories == None: with mc.lock(cache_key) as retries: # Check if we set memcached while we were waiting for the lock if retries: categories = mc.get(cache_key) if categories: return categories print('GET ' + cache_key) html = requests.get("https://app.memrise.com/fr/courses/" + lang + "/").text.encode('utf-8').strip() # Parse HTML DOM = BeautifulSoup(html, "html5lib", from_encoding='utf-8') ul = DOM.find_all('ul', {'class': 'categories-list'}).pop() def parseCategories(ul): for li in ul.findChildren(): if not 'data-category-id' in li.attrs: continue id = li.attrs['data-category-id'] categories[id] = True if li.ul: parseCategories(li.ul) categories = {} parseCategories(ul) mc.set(cache_key, categories, time=60 * 60 * 24) return categories
def user_courses(self, tab, username): """ Retrieve the courses of an user Is cached via memcached for 1hour @throws requests.exceptions.HTTPError @param string tab - teaching | learning @param string username @return dict - {content, nbCourse} """ cache_key = "user_" + username + "_" + tab courses = mc.get(cache_key) if courses == None: with mc.lock(cache_key) as retries: # Check if we set memcached while we were waiting for the lock if retries: courses = mc.get(cache_key) if courses: return courses print('GET ' + cache_key) response = requests.get("https://app.memrise.com/user/" + username + "/courses/" + tab + "/") response.raise_for_status() html = response.text.encode('utf-8').strip() DOM = BeautifulSoup(html, "html5lib", from_encoding='utf-8') courses = {"nbCourse": 0, "content": []} # Get list of courses div = DOM.find(id="content") if div != "None": content = div.find_all("div", {"class": "course-box-wrapper"}) for wrapper in content: courses["content"].append(str(wrapper)) courses["nbCourse"] += 1 mc.set(cache_key, courses, time=60 * 60) return courses
def leaderboard(self, idCourse, period): """ Retrieve the learderboard of a course (50 first) Is cached via memcached for 1hour @throws requests.exceptions.HTTPError @param integer idCourse @param string period - month, week, alltime @return dict - Retrieved JSON """ cache_key = "course_" + idCourse + "_learderboard_" + period ldboard = mc.get(cache_key) if ldboard == None: with mc.lock(cache_key) as retries: # Check if we set memcached while we were waiting for the lock if retries: ldboard = mc.get(cache_key) if ldboard: return ldboard sessionid = self.get_auth() print('GET ' + cache_key) url = "https://app.memrise.com/ajax/leaderboard/course/" + idCourse + "/?period=" + period + "&how_many=50" response = requests.get(url, cookies={"sessionid_2": sessionid}) # Try reauthenticate if response.status_code == 403: sessionid = self.get_auth(True) response = requests.get(url, cookies={"sessionid_2": sessionid}) response.raise_for_status() ldboard = response.json() mc.set(cache_key, ldboard, time=60 * 60 * 24) return ldboard
def _user_mempals(self, mempals, username, page=1): """ Retrieve the list of followers of a user or followed users Is cached via memcached for 1hour @throws requests.exceptions.HTTPError @param string mempals - followers following @param string username @param integer page - [1] @return dict - {page, lastpage, users} """ if not isinstance(page, int): if page.isdigit(): page = int(page) else: page = 1 cache_key = "user_" + username + "_" + mempals cache_paging = True data = mc.get(cache_key) # Check we dont cache the last page multiple times if data != None: cache_paging = False if page > data: page = data data = mc.get(cache_key + "_" + str(page)) # Get the given page cache_key_page = cache_key + "_" + str(page) if data == None: with mc.lock(cache_key_page) as retries: # Check if we set memcached while we were waiting for the lock if retries: data = mc.get(cache_key_page) if data: return data print('GET ' + cache_key_page) response = requests.get("https://app.memrise.com/user/" + username + "/mempals/" + mempals + "/?page=" + str(page)) response.raise_for_status() html = response.text.encode('utf-8').strip() DOM = BeautifulSoup(html, "html5lib", from_encoding='utf-8') data = {"page": page, "lastpage": 0, "users": []} # Get list of followers div = DOM.find(id="content") if div != None: users = div.find_all(attrs={'class': 'user-box'}) for user in users: username = user.find(attrs={'class': 'username'}) img = user.find('img') if username == None: continue item = { "name": username.text.strip(), "photo": img.attrs['src'] if img else "" } data["users"].append(item) # Get current page + max page number div = DOM.find('ul', {'class': 'pagination'}) currentPage = page lastpage = 0 if div != None: for child in div.children: if not isinstance(child, Tag): continue text = child.text.strip() if not re.match('[0-9]+', text): continue lastpage = int(text) if 'class' in child.attrs and 'active' in child.attrs[ 'class']: currentPage = lastpage data['page'] = currentPage data['lastpage'] = lastpage if cache_paging: mc.set(cache_key, data['lastpage'], time=60 * 60) mc.set(cache_key + '_' + str(currentPage), data, time=60 * 60) data['has_next'] = data['page'] < data['lastpage'] return data
def user(self, username, force=False): """ Retrieve the info about a user Is cached via memcached for 1hour @throws requests.exceptions.HTTPError @param string username @param boolean[optional] force - [false] Get data from Memrise even if already cached @return dict - {username, photo, rank, stats} """ cache_key = "user_" + username user = None if force else mc.get(cache_key) if user == None: with mc.lock(cache_key) as retries: # Check if we set memcached while we were waiting for the lock if retries: user = mc.get(cache_key) if user: return user print('GET ' + cache_key) response = requests.get("https://app.memrise.com/user/" + username + "/courses/teaching/") response.raise_for_status() html = response.text.encode('utf-8').strip() DOM = BeautifulSoup(html, "html5lib", from_encoding='utf-8') user = { "username": username, "photo": "", "points": 0, "rank": 0, "stats": {} } div = DOM.find(id="page-head") if div != None: # Get avatar item = div.find('img', {'class': 'avatar'}) if item != None: user['photo'] = item.attrs['src'] # Get ponts item = div.find('img', {'class': 'profile-stats'}) if item != None: print(div.children) # Get stats (num followers, following, words, points) div = div.find(attrs={'class': 'profile-stats'}) for child in div.children: if not isinstance(child, Tag): continue text = child.text.strip() result = re.search('([0-9,]+)([\n\w ]*)', text) if result: tab = result.group(2).strip().lower() # force plural if tab == "follower": tab = "followers" elif tab == "word": tab = "words" user["stats"][tab] = result.group(1) if "points" in user["stats"]: points = int(user["stats"]["points"].replace(",", "")) print(points) rank = 0 for i, threshold in enumerate(levels): if threshold < points: rank = i else: break user["rank"] = rank + 1 div = DOM.find(id="content") if div != None: # Get nb courses item = div.find('div', {'class', 'btn-group'}) if item != None: for child in item.children: if not isinstance(child, Tag): continue result = re.search('\(([0-9,]+)\)', child.text) if result: tab = child.attrs['href'].strip('/').split( '/')[-1] user["stats"][tab] = result.group(1) mc.set(cache_key, user, time=60 * 60) return user
def level(self, idCourse, slugCourse, lvl, slug="preview", sessionid=False): """ Retrieve the list of items of a level (wont work for multimedia) Is cached via memcached for 24hours if sessionid isn't provided @throws requests.exceptions.HTTPError @param integer idCourse @param integer|string lvl - index | "all" @param string slug @param string session @return dict - Retrieved JSON """ if slug == "speed_review": slug = "classic_review" if sessionid: user_session = True cache_key = False level = None else: user_session = False cache_key = "course_" + idCourse + "_" + lvl + "_" + slug level = mc.get(cache_key) if level == None: with mc.lock(cache_key) as retries: # Check if we set memcached while we were waiting for the lock if retries: level = mc.get(cache_key) if level: return level if not sessionid: sessionid = self.get_auth() print('GET ' + cache_key) url = "https://app.memrise.com/ajax/session/?course_id=" + idCourse if lvl != "all": url += "&level_index=" + lvl url += "&session_slug=" + slug if slug != "preview": url += "&_=" + get_time() response = requests.get(url, cookies={"sessionid_2": sessionid}) # Try reauthenticate if user_session == False and response.status_code == 403: sessionid = self.get_auth(True) response = requests.get(url, cookies={"sessionid_2": sessionid}) response.raise_for_status() level = response.json() if user_session and slug != "preview": url = "https://app.memrise.com/course/" + idCourse + "/" + slugCourse + "/garden/" + slug + "/" response = requests.head( url, cookies={"sessionid_2": sessionid}) response.raise_for_status() level['referer'] = url level['csrftoken'] = response.cookies.get('csrftoken') if cache_key: mc.set(cache_key, level, time=60 * 60 * 24) return level
def course(self, id, sessionid=False): """ Retrieve the info about a course Is cached via memcached for 24hours @throws requests.exceptions.HTTPError @param integer id @return dict - {id, title, url, author, description, photo, levels, breadcrumb} """ if sessionid: cache_key = False course = None else: cache_key = "course_" + id course = mc.get(cache_key) if course == None: with mc.lock(cache_key) as retries: # Check if we set memcached while we were waiting for the lock if retries: course = mc.get(cache_key) if course: return course if sessionid: response = requests.get("https://app.memrise.com/course/" + id, cookies={"sessionid_2": sessionid}) else: print('GET ' + cache_key) sessionid = self.get_auth() response = requests.get("https://app.memrise.com/course/" + id, cookies={"sessionid_2": sessionid}) response.raise_for_status() html = response.text.encode('utf-8').strip() # Parse HTML DOM = BeautifulSoup(html, "html5lib", from_encoding='utf-8') course = { "id": id, "title": "", "url": "", "author": "", "description": "", "photo": "", "levels": {}, "breadcrumb": [] } div = DOM.find('div', {'class', 'course-wrapper'}) if div != None: # Title item = div.find(itemprop="name") if item != None: course['title'] = item.text # Description item = div.find(itemprop="about") if item != None: course['description'] = item.text # Author (only when logged in :/) item = div.find(itemprop="author") if item != None: course['author'] = item.find( itemprop="additionalName").text # Categories item = div.find('div', {'class', 'course-breadcrumb'}) if item != None: for child in item.find_all('a'): cat = child.attrs['href'].strip('/').split( '/').pop() if cat in categories_code: course["breadcrumb"].append({ "id": categories_code[cat], "name": cat }) # Photo + url item = div.find('a', {'class', 'course-photo'}) if item != None: course["url"] = item.attrs['href'] course["photo"] = item.img.attrs['src'] # List of levels div = DOM.find('div', {'class': 'levels'}) if div != None: for child in div.children: if not isinstance(child, Tag): continue name = child.find('div', { 'class': 'level-title' }).text.strip() idx = child.find('div', { 'class': 'level-index' }).text.strip() ico = child.find(attrs={ 'class': 'level-ico' }).attrs['class'].pop() course["levels"][idx] = { "name": name, "type": (2 if ico == 'level-ico-multimedia-inactive' or ico == 'level-ico-multimedia' else 1) } if sessionid: status = child.find('div', {'class': 'level-status'}) if status != None: course["levels"][idx]["status"] = re.sub( "\s+", " ", str(status)) if sessionid: stats = self._course_progress(DOM) if stats != None: course['stats'] = stats if cache_key: mc.set(cache_key, course, time=60 * 60 * 24) return course