def _get_text(tag: bs4.element.Tag) -> str:
    """
    Return the text of an example tag after cleaning it with
    ``clean_text_up``.

    It is assumed that every example tag actually contains text.
    """
    # get_text() is used on purpose: per the original author's note, a
    # 'findall'-based extraction strips punctuation marks.
    # clean_text_up() is expected to collapse duplicate spaces.
    return clean_text_up(tag.get_text())
def is_content_entry(tag: bs4.element.Tag) -> bool:
    """
    Tell whether ``tag`` is a ``<span>`` that should be treated as content.

    The tag qualifies only when all of the following hold:

    * it is a ``bs4.element.Tag`` whose name is ``span``;
    * none of its CSS classes starts with ``bg``;
    * at least one of the classes ``W`` or ``C`` is present;
    * its text is not empty and not made of space characters only.

    A ``<span>`` without a ``class`` attribute is reported as not being a
    content entry (``False``) instead of raising ``KeyError``.

    :param tag: node to check; anything that is not a ``Tag`` gives ``False``.
    :return: ``True`` if every condition above is met, ``False`` otherwise.
    """
    if not isinstance(tag, bs4.element.Tag) or tag.name != 'span':
        return False

    # 'class' is optional in HTML, so fall back to "no classes" when absent.
    classes = tag.attrs.get('class', [])

    if any(cls.startswith('bg') for cls in classes):
        return False
    if 'W' not in classes and 'C' not in classes:
        return False

    # Only plain space characters count as "blank" here.
    return not re.fullmatch(' *', tag.get_text())
async def get_lessons_from_soup(self, element: bs4.element.Tag,
                                groups_list: List[str]):
    """
    Turn one timetable cell into lesson rows, one row per covered group.

    The ``title`` attribute of the cell is split on spaces and is expected
    to contain:

    * the group name wrapped in question marks (``?name?``);
    * a token containing ``||`` whose second ``||``-separated part is the
      number of groups the lesson applies to (``0`` is treated as ``1``);
    * a token shaped like ``DD.MM~N``: the date and the lesson number.

    :param element: the cell; its ``title``, ``colspan`` and ``rowspan``
        attributes and its text are read.
    :param groups_list: group names; consecutive entries starting at the
        cell's own group receive the lesson.
    :return: a list of rows
        ``[day_week, lesson_num, week, group, subgroup, text, lesson_kind,
        teacher_id]``, or ``None`` when the cell has no ``title``, no group
        marker, a group that ``select_group`` does not find, a missing or
        malformed ``||`` / ``~`` token, or no text.
    :raises KeyError: if ``colspan`` or ``rowspan`` is missing, or the
        weekday name is not a member of ``Week``.
    :raises ValueError: if ``colspan`` is not an integer, the day/month do
        not form a valid date, or the group is absent from ``groups_list``.
    """
    try:
        title = element['title']
    except KeyError:
        return
    title_list = title.split(" ")

    try:
        group = str(re.findall(r'\?.+?\?', title)[0]).replace("?", '')
    except IndexError:
        return
    # A title such as "???" yields an empty name: no point in a lookup.
    if not group:
        return
    group_db = await select_group(group)
    if not group_db:
        return

    # Malformed or missing tokens are handled like the other unusable-title
    # cases above: the cell is skipped.
    quantity_token = next((s for s in title_list if "||" in s), None)
    date_token = next((s for s in title_list if "~" in s), None)
    if quantity_token is None or date_token is None:
        return
    try:
        quantity = int(quantity_token.split("||")[1])
        date, lesson_part = date_token.split("~")[:2]
        lesson_num = int(lesson_part)
        # Unpacking raises ValueError when the date has no "." separator.
        day_part, month_part = date.split(".")[:2]
        day = int(day_part)
        month = int(month_part)
    except ValueError:
        return
    quantity = quantity or 1

    # The title carries no year, so the current one is used. An impossible
    # date still raises ValueError here.
    # NOTE(review): '%A' is locale dependent; this lookup presumably relies
    # on English weekday names being the members of Week - confirm.
    day_week = Week[
        datetime(datetime.now().year, month, day).strftime('%A').lower()
    ]

    raw_text = element.get_text()
    teacher_id = await self.is_prepod_in_db(raw_text)
    if teacher_id:
        # Tabs are removed and the last line of the cell is dropped
        # (presumably the teacher's name - confirm against the markup).
        text = "".join(raw_text.replace("\t", "").strip().split("\n")[:-1])
    else:
        text = " ".join(raw_text.split())
    if not text:
        return
    text = " ".join(text.split())

    subgroup: int = 0
    lesson_kind: LessonKind = LessonKind.lec
    colspan = int(element['colspan'])
    group_idx = groups_list.index(group)

    if colspan > group_db.subgroups:
        # The cell spans several groups: walk the following groups and count
        # how many fit entirely into the span. This overrides the quantity
        # taken from the title.
        covered = 0
        quantity = 0
        for name in groups_list[group_idx:]:
            info = await select_group(name)
            covered += info.subgroups
            if covered > colspan:
                break
            quantity += 1
    elif 0 < colspan < group_db.subgroups:
        # Narrower than the whole group: a lesson for a single subgroup.
        lesson_kind = LessonKind.lab
        subgroup = 1 if 'pr' in title else 2
    else:
        lesson_kind = LessonKind.lec if 'l1' in title else LessonKind.prac

    if element['rowspan'] == '#':
        week = UnderAboveWeek.under if 'tp' in title else UnderAboveWeek.above
    else:
        week = UnderAboveWeek.all

    # Slicing clamps the range at the end of groups_list, matching the
    # bounds check of the colspan branch above.
    result: List[List] = [
        [day_week, lesson_num, week, name, subgroup, text, lesson_kind,
         teacher_id]
        for name in groups_list[group_idx:group_idx + quantity]
    ]
    return result