def parse(self, data): soup = BeautifulSoup(data, 'html.parser') obj = soup.find('span', {'class': "header-profile-login"}) if not obj: raise Parser.IncorrectFormat(data) name = obj.text.strip() object_list = soup.find_all('a') if not object_list: raise Parser.IncorrectFormat(data) num_books = 0 for obj in object_list: m = re.fullmatch(r"Книги\s*(\d+)\s*", obj.text) if m: num_books = int(m.group(1)) break object_list = soup.find_all('div', {'class': "group-row-title"}) if not object_list: raise Parser.IncorrectFormat(data) birth = None death = None for obj in object_list: if not birth: m = re.fullmatch(r"(?:Родился|Родилась):\s*(.*)", obj.text) if m: birth = m.group(1) if not death: m = re.fullmatch(r"(?:Умер|Умерла):\s*(.*)", obj.text) if m: death = m.group(1) if birth and death: break birth_date, birth_place = self.parseDate(birth) birth_place = re.sub(r'\s+', ' ', birth_place).strip() death_date, death_place = self.parseDate(death) death_place = re.sub(r'\s+', ' ', death_place).strip() obj = soup.find('span', { 'class': "stats-item marg-right", 'title': 'Почитатели творчества' }) adepts = int(obj.text if obj is not None else 0) obj = soup.find('span', { 'class': "stats-item marg-right", 'title': 'Читателей' }) readers = int(obj.text if obj is not None else 0) return [ name, birth_date, birth_place, death_date, death_place, num_books, adepts, readers]
def parse(self, data): """ Parses html text and extracts field values :param data: html text (page) :return: a list of urls with author data plus continuation flag """ soup = BeautifulSoup(data, 'html.parser') # extract href from # <a class=\"arow-name c-black\" href=\"\/author\/30230\">...</a> object_list = soup.find_all('a', {'class': 'arow-name c-black'}) if not object_list: raise Parser.IncorrectFormat(data) return [x.get('href') for x in object_list]