def _parse_one_block(self, block: bs4.element.Tag) -> Dict:
    """Parse one match block into a summary dict.

    Args:
        block (bs4.element.Tag): One match entry from the battle log page.

    Returns:
        Dict: match result, stage, map, rewards, type, time and players.
    """
    outcome = block.find("div", class_=self._class_res).text.strip()
    # Medium-font divs carry stage name and reward amount, in that order.
    stage, rewards = (
        div.text.strip()
        for div in block.find_all("div", attrs={"style": self._style_font_m})
    )
    # Small-font divs carry match type, time and map, in that order.
    match_type, match_time, match_map = (
        div.text.strip()
        for div in block.find_all("div", attrs={"style": self._style_font_s})
    )
    players = [
        self._parse_one_person(person, stage, idx)
        for idx, person in enumerate(block.find_all("a"))
    ]
    players = self._assign_teammates(players, stage)
    return {
        "match": outcome,
        "stage": stage,
        "map": match_map,
        "rewards": int(rewards),
        "type": match_type,
        "time": match_time,
        "players": players,
    }
def calculate_score(self, star_display: bs4.element.Tag) -> float:
    """Calculate numerical score from number of star-display elements.

    Args:
        star_display (bs4.element.Tag): RT website class for star-display's

    Raises:
        TypeError: Raised if input is not of type bs4.element.Tag
        TypeError: Raised if tag is not of class 'star-display'

    Returns:
        float: Numerical score (half stars contribute 0.5, so the value
        is fractional whenever a half star is present — the previous
        ``-> int`` annotation was wrong).
    """
    if not isinstance(star_display, bs4.element.Tag):
        raise TypeError("Input must be of type bs4.element.Tag")
    if star_display["class"][0] != "star-display":
        raise TypeError("Tag must be of class 'star-display'")
    full_star_count = len(
        star_display.find_all(class_="star-display__filled", recursive=False)
    )
    half_star_count = len(
        star_display.find_all(class_="star-display__half", recursive=False)
    )
    # NOTE: Not too sure if BS4 has inbuilt function for this
    score = full_star_count + (half_star_count * 0.5)
    return score
def dblp_contribs(
    self, elem: bs4.element.Tag
) -> List[fatcat_openapi_client.ReleaseContrib]:
    """
    - author (multiple; each a single string)
      => may have HTML entities
      => may have a number at the end, to aid with identifier creation
      => orcid
    - editor (same as author)
      => orcid

    BUG FIX: the loops previously rebound the ``elem`` parameter, so the
    editor pass searched inside the *last author tag* instead of the
    record element — editors were silently dropped whenever authors
    existed. Distinct loop variables restore the intended behavior.
    """
    contribs = []
    for index, author_elem in enumerate(elem.find_all("author")):
        contrib = self.dblp_contrib_single(author_elem)
        contrib.role = "author"
        contrib.index = index
        contribs.append(contrib)
    for editor_elem in elem.find_all("editor"):
        contrib = self.dblp_contrib_single(editor_elem)
        # NOTE(review): editors carry no index in the original code;
        # preserved as-is — confirm whether that is intentional.
        contrib.role = "editor"
        contribs.append(contrib)
    return contribs
def extracting_extension_data(self, soup: bs4.element.Tag): """ 拡張機能により追記したDOM要素を除き、別の変数に格納する 専用HTMLに情報を載せることにした """ # classがRequestとDownloadの要素を集める request_elements: list[ResultSet] = soup.find_all( 'p', attrs={'class': 'Request'}) script_elements: Iterable[ResultSet] = soup.find_all( 'p', attrs={'class': 'Script'}) download_elements: list[ResultSet] = soup.find_all( 'p', attrs={'class': 'Download'}) history_elements: list[ResultSet] = soup.find_all( 'p', attrs={'class': 'History'}) # リクエストURLを集合に追加し、同じサーバ内のURLはまるまる保存、それ以外はホスト名だけ保存 self.request_url_from_ex: Set[str] = set( [elm.get_text() for elm in request_elements]) # type: ignore # 拡張機能から Script のリクエストURLを取得する self.script_url_from_ex: Set[str] = set([ fix_request_url(elm.get_text()) for elm in script_elements ] # type: ignore ) # downloadのURLを辞書のリストにし、soupの中身から削除する # download_info["数字"] = { URL, FileName, Mime, FileSize, TotalBytes, Danger, StartTime, Referrer } それぞれ辞書型 download_info: dict[str, dict[str, str]] = dict() for elm in download_elements: # type: ignore under: int = elm["id"].find("_") # type: ignore key: str = elm["id"][under + 1:] if key not in download_info: download_info[key] = dict() if elm["id"][0:under] == "JsonData": try: json_data = loads(elm.get_text()) # type: ignore except Exception as e: print("webpage.py" + str(e), flush=True) download_info[key].update({ "FileSize": "None", "TotalBytes": "None", "StartTime": "None", "Danger": "None" }) # 要素を追加しておかないと、参照時にKeyエラーが出る else: download_info[key].update(json_data) else: download_info[key][elm["id"] [0:under]] = elm.get_text() # type: ignore self.download_info = deepcopy(download_info) # URL遷移が起きた場合、記録する url_history: list[str] = [ history_element.get_text() for history_element in history_elements ] # type: ignore if len(url_history) < 2: url_history = list() self.among_url = url_history.copy()
def extract_abstract(meta: bs4.element.Tag, conf_year: int) -> str:
    """Extract the abstract text from an OpenReview-style note block.

    Args:
        meta: Note metadata element containing field/value span pairs.
        conf_year: Conference year; only the 2020/2021 layout is handled.

    Returns:
        Flattened abstract text for supported years; ``None`` otherwise
        (implicit, preserved from the original behavior).

    Raises:
        ValueError: If no field labelled 'abstract' is present.
    """
    if conf_year in [2020, 2021]:
        titles = meta.find_all('strong', {'class': 'note-content-field'})
        # BUG FIX: previously, when no title contained 'abstract', the
        # stale loop index silently selected the *last* field (or was
        # unbound for an empty list). Fail loudly instead.
        abstract_idx = None
        for idx, title in enumerate(titles):
            if title.text.lower().find('abstract') >= 0:
                abstract_idx = idx
                break
        if abstract_idx is None:
            raise ValueError("no 'abstract' field found in note metadata")
        values = meta.find_all('span', {'class': 'note-content-value'})
        return flatten_content_list(values[abstract_idx].contents)
def extract_team_data(team: bs.element.Tag, st_home: bool) -> dict:
    """Extract wiki link, name, nickname and state for one team cell.

    Args:
        team: Table cell containing the team and state anchors.
        st_home: True for the home side; selects which anchor is the
            team link vs. the state link (bool indexes as 0/1).

    Returns:
        dict with keys suffixed ``_HOME`` or ``_VISITOR``.
    """
    home_or_visitor = 'HOME' if st_home else 'VISITOR'
    # Query the anchors once instead of four times.
    anchors = team.find_all('a')
    team_anchor = anchors[not st_home]
    state_anchor = anchors[st_home]
    return {
        "team_wiki_" + home_or_visitor: team_anchor.get('href'),
        "team_name_" + home_or_visitor: team_anchor.get('title'),
        "team_nick_" + home_or_visitor: team_anchor.text,
        "team_state_" + home_or_visitor:
            state_anchor.get('title').replace(' (estado)', ''),
    }
def _parse_example(self, tag: bs4.element.Tag) -> Any:
    """Parse a pair: original – translation to Example."""
    # The example type is expected to accept default arguments.
    example = self.ex_type()
    lang_cells = tag.find_all('td', {'class': "para-lang"})
    text_items = tag.find_all('li')
    for lang_cell, text_item in zip(lang_cells, text_items):
        language = lang_cell.text.strip()
        example += self._parse_text(language, text_item)
    return example
def get_form_details(form: bs4.element.Tag) -> dict:
    """Return a dict containing details about the given `form`:
    its name, action, method, and the attributes of every input,
    select and textarea it contains."""
    inputs = [
        {
            "type": tag.attrs.get("type"),
            "name": tag.attrs.get("name"),
            "value": tag.attrs.get("value"),
        }
        for tag in form.find_all("input")
    ]
    selects = [
        {
            "type": tag.attrs.get("type"),
            "name": tag.attrs.get("name"),
            "value": tag.attrs.get("value", ""),
        }
        for tag in form.find_all("select")
    ]
    textareas = [
        {"name": tag.attrs.get("name"), "value": tag.attrs.get("value")}
        for tag in form.find_all("textarea")
    ]
    # TODO add line number of form
    return {
        "name": form.attrs.get("name"),
        "action": form.attrs.get("action"),
        "method": form.attrs.get("method", "get").lower(),
        "inputs": inputs,
        "selects": selects,
        "textareas": textareas,
    }
def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
    """
    Replace all references in element with special tokens
    :param sp: parsed document (used to create replacement strings)
    :param el: element whose <cit>/<ref> children are rewritten in place
    :param ref_map: known reference ids, used to resolve 'uid' targets
    :return: the same element, mutated
    """
    # replace all citations with cite keyword
    for cite in el.find_all('cit'):
        try:
            target = cite.ref.get('target').replace('bid', 'BIBREF')
            cite.replace_with(sp.new_string(f" {target} "))
        except AttributeError:
            print('Attribute error: ', cite)
            continue

    # Candidate prefixes a 'uid' target may resolve to, tried in order.
    uid_prefixes = ('FIGREF', 'TABREF', 'EQREF', 'FOOTREF', 'SECREFU')

    # replace all non citation references
    for rtag in el.find_all('ref'):
        try:
            raw_target = rtag.get('target')
            # 'bid' targets (and missing ones) are left untouched,
            # as in the original control flow.
            if not raw_target or raw_target.startswith('bid'):
                continue
            if raw_target.startswith('cid'):
                target = raw_target.replace('cid', 'SECREF')
            elif raw_target.startswith('uid'):
                # First prefix whose rewritten id exists in ref_map wins.
                for prefix in uid_prefixes:
                    candidate = raw_target.replace('uid', prefix)
                    if candidate in ref_map:
                        target = candidate
                        break
                else:
                    target = raw_target.upper()
            else:
                print('Weird ID!')
                target = raw_target.upper()
            rtag.replace_with(sp.new_string(f" {target} "))
        except AttributeError:
            print('Attribute error: ', rtag)
            continue
    return el
def table_to_data(self, table_tag: bs4.element.Tag):
    """Parse a crop wiki table: buy price, sell price, harvest data and
    per-day gold values.

    Side effects: calls parse_buy_price_cell / parse_sell_price_cell /
    parse_harvest_data, may set ``self.regrowth`` and appends to
    ``self.gold_per_day``.
    """
    table_data = []
    rows = table_tag.find_all("tr", recursive=False)
    # Second row's first cell holds the buy price.
    self.parse_buy_price_cell(rows[1].find_next("td"))
    ix = 0

    def get_sell_col_num():
        # Locate the "Sells For" column in the header row; a 2-wide
        # "Harvest" colspan shifts it right by one and marks regrowth.
        col_tags = rows[0].find_all("th", recursive=False)
        cols_text = [col.text.strip() for col in col_tags]
        padding = 0
        for col_tag in col_tags:
            col_text = col_tag.text.strip()
            if col_text == "Harvest" \
                    and 'colspan' in col_tag.attrs \
                    and int(col_tag['colspan']) == 2:
                padding = 1
                self.regrowth = True
                break
        return cols_text.index("Sells For") + padding

    sell_column = get_sell_col_num()
    for row in rows[1:]:  # skip header
        cols = row.find_all("td", recursive=False)
        # First data row: drop the leading cell and everything past the
        # sell column (skip last 2).
        if ix == 0:
            cols = cols[1:sell_column + 1]
            ix += 1
        # NOTE(review): this copy is an identity pass — the "stripped
        # text" the original comment promised is never applied; the raw
        # tags are stored (and later code relies on .text of the tags).
        cols = [elem for elem in cols]
        table_data.append([elem for elem in cols])
    self.parse_sell_price_cell(table_data[0][sell_column - 1])
    self.parse_harvest_data(table_data[1][sell_column - 2])
    sell_tag = table_data[1][sell_column - 1]
    # Pull every numeric price match out of the sell cell text.
    for sell_match in price_re.finditer(sell_tag.text):
        self.gold_per_day.append(float(sell_match.group(1)))
def _get_movie_spec_rating(self, movie: bs4.element.Tag) -> Union[float, None]:
    """Private method to retrieve the movie rating according to the spectators.

    Args:
        movie (bs4.element.Tag): Parser results with the movie informations.

    Returns:
        Union[float, None]: The movie rating according to the spectators,
        or None when no "Spectateurs" rating item is present.
    """
    for rating_item in movie.find_all("div", class_="rating-item"):
        if "Spectateurs" not in rating_item.text:
            continue
        note_text = rating_item.find("span", {
            "class": "stareval-note"
        }).text
        # French decimal comma -> dot before float conversion.
        return float(re.sub(",", ".", note_text))
    return None
def __init__(self, tag: bs4.element.Tag):
    """Populate submission fields from one row of a judge status table."""
    # Numeric submission id.
    self.id: int = int(tag.find('td', {'class': 'id-cell'}).text)
    self.author: str = tag.find('td', {
        'class': 'status-party-cell'
    }).text.strip().rstrip()
    # The two 'status-small' cells hold submission date and problem name.
    parameters = tag.find_all('td', {'class': 'status-small'})
    self.date: datetime = datetime.strptime(
        parameters[0].text.strip().rstrip(), '%d.%m.%Y %H:%M')
    self.problem: str = parameters[1].text.strip().rstrip()
    # Time/memory cells may be absent (e.g. while judging); default ''.
    # NOTE(review): the `tag is not None` guards are redundant — tag was
    # already dereferenced above.
    if tag is not None and tag.find(
            'td', {'class': 'time-consumed-cell'}) is not None:
        self.time: str = tag.find('td', {
            'class': 'time-consumed-cell'
        }).text.strip().rstrip()
    else:
        self.time: str = ''
    if tag is not None and tag.find(
            'td', {'class': 'memory-consumed-cell'}) is not None:
        self.memory: str = tag.find('td', {
            'class': 'memory-consumed-cell'
        }).text.strip().rstrip()
    else:
        self.memory: str = ''
    tmp = tag.find('td', {'class': 'status-cell'})
    # Verdict: WAITING when no status cell/span, else the span attribute.
    self.status: str = 'WAITING' if tmp is None or tmp.span is None else tmp.span[
        'submissionverdict']
    # NOTE(review): if tmp is None this line raises AttributeError /
    # TypeError on tmp['waiting'] — presumably tmp is always present when
    # span is missing; confirm against the site markup.
    if tmp.span is None and tmp['waiting'] == 'false':
        self.status = 'UNKNOWN'
    # Last judged test number, -1 when not available.
    self.test: int = -1 if tmp is None or \
        tmp.find('span', {'class': 'verdict-format-judged'}) is None else \
        int(tmp.find('span', {'class': 'verdict-format-judged'}).text)
def _parse_column(gatherer_column: bs4.element.Tag) -> GathererCard:
    """Parse a single gatherer page 'rightCol' entry."""
    # Map each row's label text (colon stripped) to its value element.
    label_to_values = {}
    for row in gatherer_column.find_all("div", class_="row"):
        label = row.find("div", class_="label").getText(strip=True).rstrip(":")
        label_to_values[label] = row.find("div", class_="value")

    card_name = label_to_values["Card Name"].getText(strip=True)
    card_types = label_to_values["Types"].getText(strip=True)

    flavor_lines = []
    flavor_value = label_to_values.get("Flavor Text")
    if flavor_value is not None:
        flavor_lines = [
            box.getText(strip=True)
            for box in flavor_value.find_all("div", class_="flavortextbox")
        ]

    text_lines = []
    text_value = label_to_values.get("Card Text")
    if text_value is not None:
        text_lines = [
            _replace_symbols(box).getText().strip()
            for box in text_value.find_all("div", class_="cardtextbox")
        ]

    return GathererCard(
        card_name=card_name,
        original_types=card_types,
        original_text="\n".join(text_lines).strip() or None,
        flavor_text="\n".join(flavor_lines).strip() or None,
    )
def get_hh_page_vacansies(vacans: bs4.element.Tag) -> list:
    """Collect vacancy dicts from one hh.ru search-results fragment.

    Args:
        vacans: Container element holding vacancy-serp entries.

    Returns:
        list of dicts with vacancy name, link, company, address,
        description text, parsed salary (only when present) and a
        'resource' tag. (The previous ``-> []`` annotation was an
        expression, not a type — fixed to ``list``.)
    """
    vacancies = []
    for vacancy in vacans.find_all('div', {'data-qa': 'vacancy-serp__vacancy'}):
        # First bloko-link anchor is the vacancy title/link.
        title_link = vacancy.find_all('a', {'class': 'bloko-link'})[0]
        vac = {
            'vacancy_name': title_link.getText(),
            'link': title_link['href'],
            'vacancy_company': vacancy.find_all(
                'div', {'class': 'vacancy-serp-item__meta-info'})[0].find_all(
                    'a', {'class': 'bloko-link'})[0].getText(),
            'vacancy_address': vacancy.find_all(
                'span', {'data-qa': 'vacancy-serp__vacancy-address'})[0].getText(),
        }
        describe_divs = vacancy.find_all(
            'div', {'class': 'g-user-content'})[0].find_all('div')
        # join instead of quadratic += concatenation in a loop.
        vac['vacancy_describe_text'] = ''.join(
            div.getText() for div in describe_divs)
        vacancy_money = vacancy.find(
            'span', {'data-qa': 'vacancy-serp__vacancy-compensation'})
        if vacancy_money is not None:
            vac['vacancy_money'] = decoder(vacancy_money.getText())
        vac['resource'] = 'hh.ru'
        vacancies.append(vac)
    return vacancies
def class_tr_to_str(tr: bs4.element.Tag) -> str:
    """Render one class-doc table row as a markdown line:
    a backtick header (name, optional type and mode) plus, when
    present, the converted description."""
    # Make every link absolute against the API base URL.
    for anchor in tr.find_all("a"):
        anchor["href"] = BASE_API_URL + anchor["href"]

    header_td = tr.find("td", class_="header")
    description_td = tr.find("td", class_="description")
    name_span = header_td.find("span", class_="element-name")

    # Attribute rows ("param") carry a type and an access mode.
    if header_td.find("span", class_="attribute-type") is not None:
        param_type = header_td.find("span", class_="param-type").text.strip()
        attribute_mode = header_td.find("span", class_="attribute-mode").text
        header = f"`{name_span.text} :: {param_type}` {attribute_mode}"
    else:
        header = f"`{name_span.text}`"

    contents = [item for item in description_td.contents if item != " "]
    if not contents:
        return header
    if len(contents) > 1 and "\n" not in contents[0]:
        description = tomd.convert(
            f"<p>{''.join(str(item) for item in contents[:-1])}</p>"
        ).strip()
    else:
        description = contents[0].strip()
    return f"{header} - {description}"
def parse_manually(self, parse_object: bs4.element.Tag) -> dict:
    """
    Method which is dedicated to manually parse broken html
    Input:  parse_object = object which we would parse
    Output: dict mapping known column names to their cleaned values
    """
    # Bold tags act as column headers; '</a>' marks the leading status.
    list_column_names = [str(v) for v in parse_object.find_all('b')]
    parse_object = str(parse_object)
    list_column_names.insert(0, '</a>')
    list_split = []
    # BUG FIX: dropped the dead `if types in list_column_names` test —
    # the loop iterates that very list, so it was always true.
    for column_name in list_column_names:
        list_split.append(column_name)
        # Replace each header with a sentinel so we can split on it.
        parse_object = parse_object.replace(column_name, self.rand)
    parse_split = parse_object.split(self.rand)
    if '</a>' in list_split:
        list_split[0] = sp.status_iasa
    # Clean the header names: strip tags, special chars and whitespace.
    list_split = [self.remove_tags(x) for x in list_split]
    list_split = [self.remove_special(x) for x in list_split]
    list_split = [self.remove_spaces(x) for x in list_split]
    list_split = [v for v in list_split if v]
    value_dict = {}
    # parse_split[0] is the text before the first header; skip it.
    if len(parse_split) > 1:
        for column_value, value in zip(list_split, parse_split[1:]):
            value_dict.update(
                self.make_further_check(sp.rechange_iasa[column_value],
                                        value, sp.rechange_phrase))
    return value_dict
def _parse_column(gatherer_column: bs4.element.Tag) -> GathererCard:
    """Parse a single gatherer page 'rightCol' entry."""

    def _row_label(row):
        # Label text with the trailing colon removed.
        return row.find("div", class_="label").getText(strip=True).rstrip(":")

    label_to_values = {
        _row_label(row): row.find("div", class_="value")
        for row in gatherer_column.find_all("div", class_="row")
    }

    card_name = label_to_values["Card Name"].getText(strip=True)
    card_types = label_to_values["Types"].getText(strip=True)

    if "Flavor Text" in label_to_values:
        flavor_lines = [
            flavorbox.getText(strip=True)
            for flavorbox in label_to_values["Flavor Text"].find_all(
                "div", class_="flavortextbox")
        ]
    else:
        flavor_lines = []

    if "Card Text" in label_to_values:
        text_lines = [
            _replace_symbols(textbox).getText().strip()
            for textbox in label_to_values["Card Text"].find_all(
                "div", class_="cardtextbox")
        ]
    else:
        text_lines = []

    return GathererCard(
        card_name=card_name,
        original_types=card_types,
        original_text="\n".join(text_lines).strip() or None,
        flavor_text="\n".join(flavor_lines).strip() or None,
    )
def extract_stadium_data(stadium: bs.element.Tag) -> dict:
    """Extract stadium name, nickname and city from a stadium cell.

    Args:
        stadium: Element whose first anchor is the stadium (title = full
            name, text = nickname) and second anchor is the city.

    Returns:
        dict with stadium_name, stadium_nick and stadium_city.
    """
    # Query the anchors once instead of three times.
    anchors = stadium.find_all('a')
    return {
        "stadium_name": anchors[0].get("title"),
        "stadium_nick": anchors[0].text,
        "stadium_city": anchors[1].text,
    }
def parse_facility_header(tabulka: bs4.element.Tag) -> dict:
    """Parse metadata about facility from html table.

    Returns a dict with the facility id, name, address parts
    ('psc', 'obec', 'ulice', 'cp', 'okres', 'kraj' — None when the
    address is missing) and any cadastre key/value rows present.
    """
    d = {}
    rows = tabulka.find_all('tr')
    divs = rows[0].find_all('div')
    # BUG FIX: str.strip('Evidenční číslo: ') treats its argument as a
    # *character set*, not a prefix — it only worked because no digit
    # appears in that set. Take everything after the colon instead.
    d['id'] = int(divs[0].get_text(strip=True).split(':')[-1].strip())
    d['nazev'] = divs[1].get_text(strip=True)
    soucasti_adresy = ('psc', 'obec', 'ulice', 'cp', 'okres', 'kraj')
    try:
        adresa = divs[2].get_text(strip=True)
        rozdelena_adresa = address.rozdel_adresu(address.uprav_adresu(adresa))
        for k, v in zip(soucasti_adresy, rozdelena_adresa):
            d[k] = v
    # Address is missing entirely — fill all parts with None.
    except IndexError:
        d.update({soucast: None for soucast in soucasti_adresy})
    if len(rows) > 2:
        # Cadastre info present: header row (th) then value row (td).
        for th, td in zip(rows[1].find_all('th'), rows[2].find_all('td')):
            d[prepare_key(th.get_text(strip=True))] = td.get_text(strip=True)
    return d
def process_page(page: bs4.element.Tag) -> str:
    """Join the page's <p> elements into paragraph text, merging a
    paragraph into the previous one when the previous one ends
    mid-sentence (letter, hyphen or comma) or the new one starts
    lowercase. Returns paragraphs separated by blank lines."""
    paragraphs = []
    for p_tag in page.find_all('p'):
        text = PDFDoc2Txt.consolidate_paragraph(p_tag.text)
        text = PDFDoc2Txt.normalize_footnote_citations(text)
        if not text:
            continue
        prev_end = paragraphs[-1][-1] if paragraphs and paragraphs[-1] else ''
        continues_previous = bool(prev_end) and (
            re.search(r'[a-zA-Z\-\,]', prev_end) is not None
            or text[0].islower()
        )
        if continues_previous:
            paragraphs[-1] = paragraphs[-1] + ' ' + text
        else:
            paragraphs.append(text)
    return '\n\n'.join(paragraphs)
def _get_article(self, article: newspaper.article.Article,
                 li: bs4.element.Tag) -> tuple:
    """Get the article body text and top image, falling back to the
    linked Naver News page when the parsed article lacks them.

    Args:
        article: Pre-parsed newspaper article (may be None).
        li: Search-result list item that may contain a Naver News link.

    Returns:
        (text, top_image) tuple; either element may be None.
        (The previous ``-> str`` annotation was wrong for a tuple.)
    """
    text, top_image = None, None
    if article is not None:
        text = article.text
        # BUG FIX: was assigned to a stray local `top_img`, so the
        # article's top image was always discarded.
        top_image = article.top_image
    links = li.find_all("a", {"class": "info"})
    naver_news_url = [link for link in links if link.text == "네이버뉴스"]
    if naver_news_url:
        naver_news_url = naver_news_url[0]
        # BUG FIX: was the no-op replace("&", "&"); the href comes from
        # HTML, so unescape the ampersand entity.
        naver_news_url = naver_news_url["href"].replace("&amp;", "&")
    else:
        naver_news_url = None
    if text is None and naver_news_url is not None:
        try:
            a = Article(naver_news_url, language="ko")
            a.download()
            a.parse()
            text = a.text
        except Exception:
            # Best effort: leave text as None on download/parse failure.
            pass
    elif top_image is None and naver_news_url is not None:
        # NOTE(review): because of the `elif`, the image fallback never
        # runs when the text fallback did — preserved as-is; confirm.
        try:
            a = Article(naver_news_url, language="ko")
            a.download()
            a.parse()
            top_image = a.top_image
        except Exception:
            pass
    return text, top_image
def _get_datetime(
    self,
    article: newspaper.article.Article,
    li: bs4.element.Tag,
) -> datetime:
    """Resolve the publish date: prefer the parsed article's date, else
    parse the relative/absolute date shown in the search-result item.

    Returns a (naive, local-time) datetime, or None when nothing could
    be parsed.
    """
    publish_date = None
    if article is not None:
        publish_date = article.publish_date
    if publish_date is None:
        date_list = li.find_all("span", {"class": "info"})
        date = None
        # The date is the last "info" span when several are present.
        if len(date_list) > 1:
            date = date_list[-1].text
        else:
            date = li.find("span", {"class": "info"}).text
        date = date.split()[0]
        # Relative dates: "분" = minutes ago, "시간" = hours ago,
        # "일" = days ago; strip everything but the digits.
        if "분" in date:
            minutes = re.sub(r"[^\d+]", "", date)
            publish_date = datetime.now() - timedelta(minutes=int(minutes))
        elif "시간" in date:
            hours = re.sub(r"[^\d+]", "", date)
            publish_date = datetime.now() - timedelta(hours=int(hours))
        elif "일" in date:
            days = re.sub(r"[^\d+]", "", date)
            publish_date = datetime.now() - timedelta(days=int(days))
        else:
            # Absolute form, e.g. "2021.03.05." — anything else is
            # silently ignored and None is returned.
            try:
                publish_date = datetime.strptime(date, "%Y.%m.%d.")
            except:
                pass
    return publish_date
def _parse_one_person(self, person: bs4.element.Tag, _stage: str,
                      i: int) -> Dict:
    """Parse one player's anchor element into a player dict; grouping
    and MVP fields depend on the stage type."""
    # The xs-font divs carry trophy, (unused), level and name in order.
    trophy_text, _, level_text, player_name = (
        div.text.strip()
        for div in person.find_all("div", attrs={"style": self._style_font_xs})
    )
    result = {
        "trophy": int(trophy_text),
        "level": int(level_text),
        "name": player_name,
        "hero": self.hero_map[person.find("img").get("src")],
        "playerId": person.get("href").split("/")[-1],
        "isTeammate": False,
    }
    if _stage == "Duo Showdown":
        # Pairs of consecutive players form a duo group.
        result["group"] = i // 2
        result["is_mvp"] = np.nan
    elif _stage == "Showdown":
        result["group"] = i
        result["is_mvp"] = np.nan
    else:
        result["group"] = np.nan
        result["is_mvp"] = person.find(
            "img", attrs={"src": self._img_mvp}) is not None
    return result
def process_list_el(sp: BeautifulSoup, list_el: bs4.element.Tag,
                    section_info: List, bib_map: Dict, ref_map: Dict):
    """
    Process list element
    :param sp:
    :param list_el:
    :param section_info:
    :param bib_map:
    :param ref_map:
    :return: list of Paragraph objects, one per list item
    """
    # TODO: currently parsing list as a list of paragraphs (append numbers to start of each entry in ordered lists)

    def _shift_spans(spans, offset, extra_keys=()):
        # One shifting routine replaces three copy-pasted loops: move
        # every span right by `offset`, copying any extra keys through.
        shifted = []
        for span in spans:
            new_span = {
                "start": span['start'] + offset,
                "end": span['end'] + offset,
                "text": span['text'],
            }
            for key in extra_keys:
                new_span[key] = span[key]
            shifted.append(new_span)
        return shifted

    list_items = []
    for item in list_el.find_all('item'):
        stripped = item.text.strip()
        # skip itemize settings
        if stripped.startswith('[') and stripped.endswith(']'):
            continue
        # try processing as paragraph
        list_num = item.get('id-text', None)
        item_as_para = process_paragraph(sp, item, section_info, bib_map,
                                         ref_map)
        # append list number if ordered; all spans shift by its length
        if list_num:
            list_num_str = f'{list_num}. '
            offset = len(list_num_str)
            new_para = Paragraph(
                text=list_num_str + item_as_para.text,
                cite_spans=_shift_spans(item_as_para.cite_spans, offset),
                ref_spans=_shift_spans(item_as_para.ref_spans, offset),
                eq_spans=_shift_spans(item_as_para.eq_spans, offset,
                                      ('latex', 'ref_id')),
                section=item_as_para.section,
            )
        else:
            new_para = item_as_para
        list_items.append(new_para)
    return list_items
def _analyse_version(version: bs4.element.Tag, season: int, episode: int):
    """Scan a version block's rows for a completed English subtitle and
    try to download it; returns True on a successful download, else
    falls through (implicit None)."""
    for row in version.find_all("div", attrs={"class": "row"}):
        # Only analyse English rows (GB flag present).
        if not row.find_all("span", attrs={"class": "flag-icon flag-icon-gb"}):
            continue
        # Only analyse subtitles which are complete.
        if not row.find_all(text="Completed"):
            continue
        if _download_srt(row, season, episode):
            return True
def _unplayed_games(M: int, N: int, table: bs4.element.Tag) -> pd.DataFrame:
    """Build an M x N boolean frame marking unplayed games: a cell is
    unplayed when it is empty or carries the 'unplayed' class. Rows are
    taken from the table starting at the fifth <tr>, last N <td>s each."""

    def _is_unplayed(td):
        return not td.text or (td.has_attr('class')
                               and td['class'][0] == 'unplayed')

    game_rows = table.find_all('tr')[4:4 + M]
    data = [[_is_unplayed(td) for td in row.find_all('td')[-N:]]
            for row in game_rows]
    columns = pd.MultiIndex.from_tuples([('Unplayed', str(n + 1))
                                         for n in range(N)])
    return pd.DataFrame(data=data, columns=columns)
def find_courses_from_section(self, section: bs4.element.Tag):
    """Return the course name strings found in `section`, or an empty
    list when the section is absent."""
    if section is None:
        return []
    # Each course row's first <span> holds the course name.
    name_spans = [
        row.find('span')
        for row in section.find_all(class_='m-single-course-top-row')
    ]
    return [span.string for span in name_spans]
def row_to_transaction(row: bs4.element.Tag) -> Transaction:
    """
    Convert HTML string of one <tr> with multiple <td> entries into named tuple.

    :param row: HTML string of the entire <tr> tag, including <td> tags
    :return: Transaction named tuple with corresponding entries
    """
    cell_texts = (cell.text.strip() for cell in row.find_all('td'))
    return Transaction(*cell_texts)
def _parse_doc(self, doc: bs4.element.Tag) -> List:
    """Parse one document: collect every example table and register its
    found wordforms along the way."""
    examples = []
    for table in doc.find_all('table', {'class': 'para'}):
        example = self._parse_example(table)
        examples.append(example)
        self._add_wordforms(example.found_wordforms)
    return examples
def parse_calendar_table(self, course_code, table: bs4.element.Tag) -> dict:
    """Parse the calendar table into {day: row_data}, one entry per
    body row (the header row is skipped)."""
    body_rows = table.find_all('tr', recursive=False)[1:]
    # parse_calendar_row yields (day, row_data) pairs.
    return dict(
        self.parse_calendar_row(course_code, row) for row in body_rows)
def process_match_details(details: bs.element.Tag) -> dict:
    """Merge the match hour, both teams' goal times and the extra info
    from the five <td> cells of a match-details row."""
    hour_td, home_td, _, visitor_td, info_td = details.find_all('td')
    result = {"hour": hour_td.text.replace('h', ':').strip()}
    result.update(extract_goals_time(home_td, "goals_home_team"))
    result.update(extract_goals_time(visitor_td, "goals_visitor_team"))
    result.update(extract_more_info(info_td))
    return result
def play_items_from_log_entry(entry: bs4.element.Tag) -> bs4.element.ResultSet:
    """Return every <td> cell of a play-log row."""
    return entry.find_all("td")


def player(play_text_: str) -> str:
    """Return the first two whitespace-separated tokens of the play
    text, i.e. the player's name."""
    tokens = play_text_.split()
    return ' '.join(tokens[:2])