Ejemplo n.º 1
0
 def _parse_one_block(self, block: bs4.element.Tag) -> Dict:
     """Parse one battle-log block into a match-summary dict.

     Args:
         block: Tag wrapping a single match entry.

     Returns:
         Dict with keys: match, stage, map, rewards (int), type, time,
         players (list of per-player dicts).
     """
     # match result text, located via the configured result CSS class
     _result = block.find("div", class_=self._class_res).text.strip()
     # medium-font divs are expected to hold exactly two fields:
     # stage name and reward count (ValueError if the layout changes)
     _stage, _rewards = [
         i.text.strip()
         for i in block.find_all("div", attrs={"style": self._style_font_m})
     ]
     # small-font divs are expected to hold exactly three fields
     _type, _time, _map = [
         i.text.strip()
         for i in block.find_all("div", attrs={"style": self._style_font_s})
     ]
     # each anchor is one player; position i is passed for grouping logic
     people = block.find_all("a")
     _people = [
         self._parse_one_person(person, _stage, i)
         for i, person in enumerate(people)
     ]
     _people = self._assign_teammates(_people, _stage)
     return {
         "match": _result,
         "stage": _stage,
         "map": _map,
         "rewards": int(_rewards),
         "type": _type,
         "time": _time,
         "players": _people,
     }
Ejemplo n.º 2
0
    def calculate_score(self, star_display: bs4.element.Tag) -> float:
        """Calculate a numerical score from a 'star-display' element.

        Each full star counts 1.0 and each half star counts 0.5, so the
        result may be fractional (e.g. 3 full + 1 half -> 3.5).

        Args:
            star_display (bs4.element.Tag): Tag whose first CSS class is
                'star-display'.

        Raises:
            TypeError: If the input is not a bs4.element.Tag.
            TypeError: If the tag's first class is not 'star-display'.

        Returns:
            float: Numerical score (a multiple of 0.5).
        """
        if not isinstance(star_display, bs4.element.Tag):
            raise TypeError("Input must be of type bs4.element.Tag")
        if star_display["class"][0] != "star-display":
            raise TypeError("Tag must be of class 'star-display'")

        # Count only direct children so nested widgets are not included.
        full_star_count = len(
            star_display.find_all(class_="star-display__filled", recursive=False)
        )
        half_star_count = len(
            star_display.find_all(class_="star-display__half", recursive=False)
        )

        # BUG FIX: the original annotated the return as int, but a half star
        # makes the score fractional; the annotation now matches reality.
        return full_star_count + half_star_count * 0.5
Ejemplo n.º 3
0
    def dblp_contribs(
            self, elem: bs4.element.Tag
    ) -> List[fatcat_openapi_client.ReleaseContrib]:
        """
        Extract author and editor contribs from a dblp record element.

        - author (multiple; each a single string)
            => may have HTML entities
            => may have a number at the end, to aid with identifier creation
            => orcid
        - editor (same as author)
            => orcid
        """
        contribs = []
        # BUG FIX: the original reused ``elem`` as the loop variable, so the
        # subsequent find_all("editor") searched inside the *last author tag*
        # (or skipped entirely), silently dropping all editors.
        for index, author_tag in enumerate(elem.find_all("author")):
            contrib = self.dblp_contrib_single(author_tag)
            contrib.role = "author"
            contrib.index = index
            contribs.append(contrib)

        for editor_tag in elem.find_all("editor"):
            contrib = self.dblp_contrib_single(editor_tag)
            contrib.role = "editor"
            contribs.append(contrib)

        return contribs
Ejemplo n.º 4
0
    def extracting_extension_data(self, soup: bs4.element.Tag):
        """
        Separate out the DOM elements appended by the browser extension and
        store them in dedicated attributes. (The extension writes its data
        into a dedicated HTML page as <p> elements.)
        """
        # Collect the <p> elements whose class marks extension-written data
        request_elements: list[ResultSet] = soup.find_all(
            'p', attrs={'class': 'Request'})
        script_elements: Iterable[ResultSet] = soup.find_all(
            'p', attrs={'class': 'Script'})
        download_elements: list[ResultSet] = soup.find_all(
            'p', attrs={'class': 'Download'})
        history_elements: list[ResultSet] = soup.find_all(
            'p', attrs={'class': 'History'})

        # Set of request URLs reported by the extension. (Original note
        # claimed same-server URLs are stored in full and others as hostname
        # only — that filtering is not visible here; presumably done upstream.)
        self.request_url_from_ex: Set[str] = set(
            [elm.get_text() for elm in request_elements])  # type: ignore

        # Script request URLs reported by the extension, normalized
        self.script_url_from_ex: Set[str] = set([
            fix_request_url(elm.get_text()) for elm in script_elements
        ]  # type: ignore
                                                )

        # Build download info keyed by the numeric suffix of each element id:
        # download_info["N"] = { URL, FileName, Mime, FileSize, TotalBytes,
        # Danger, StartTime, Referrer } (each stored as dict entries)
        download_info: dict[str, dict[str, str]] = dict()
        for elm in download_elements:  # type: ignore
            # element ids look like "<Field>_<N>": split on the underscore
            under: int = elm["id"].find("_")  # type: ignore
            key: str = elm["id"][under + 1:]
            if key not in download_info:
                download_info[key] = dict()
            if elm["id"][0:under] == "JsonData":
                try:
                    json_data = loads(elm.get_text())  # type: ignore
                except Exception as e:
                    print("webpage.py" + str(e), flush=True)
                    # Insert placeholders so later lookups don't raise KeyError
                    download_info[key].update({
                        "FileSize": "None",
                        "TotalBytes": "None",
                        "StartTime": "None",
                        "Danger": "None"
                    })
                else:
                    download_info[key].update(json_data)
            else:
                download_info[key][elm["id"]
                                   [0:under]] = elm.get_text()  # type: ignore
        self.download_info = deepcopy(download_info)

        # Record URL transitions (redirect history), if any occurred
        url_history: list[str] = [
            history_element.get_text() for history_element in history_elements
        ]  # type: ignore
        if len(url_history) < 2:
            # fewer than two entries means no actual transition happened
            url_history = list()
        self.among_url = url_history.copy()
Ejemplo n.º 5
0
def extract_abstract(meta: bs4.element.Tag, conf_year: int) -> str:
    """Extract the abstract text from an OpenReview note-content block.

    Only 2020/2021 layouts are handled; other years fall through and
    return None (pre-existing behavior kept for compatibility).

    Raises:
        ValueError: If no field labelled 'abstract' is present.
    """
    if conf_year in [2020, 2021]:
        titles = meta.find_all('strong', {'class': 'note-content-field'})
        for i, t in enumerate(titles):
            if t.text.lower().find('abstract') >= 0:
                break
        else:
            # BUG FIX: previously, when no field was labelled "abstract",
            # `i` was stale (last index) or unbound (no fields at all), so
            # an arbitrary field's value — or a NameError — resulted.
            raise ValueError("no 'abstract' field found in note content")
        return flatten_content_list(
            meta.find_all('span', {'class': 'note-content-value'})[i].contents)
Ejemplo n.º 6
0
def extract_team_data(team: bs.element.Tag, st_home: bool) -> dict:
    """Build a dict describing one team (wiki link, name, nickname, state).

    Keys are suffixed HOME/VISITOR according to ``st_home``; the team
    anchor and state anchor are picked by boolean indexing as before.
    """
    side = 'HOME' if st_home else 'VISITOR'
    anchors = team.find_all('a')
    team_anchor = anchors[not st_home]
    state_anchor = anchors[st_home]
    return {
        f"team_wiki_{side}": team_anchor.get('href'),
        f"team_name_{side}": team_anchor.get('title'),
        f"team_nick_{side}": team_anchor.text,
        f"team_state_{side}": state_anchor.get('title').replace(' (estado)', ''),
    }
Ejemplo n.º 7
0
    def _parse_example(self,
                       tag: bs4.element.Tag) -> Any:
        """Parse one original–translation pair into an Example object."""
        # the example type is expected to be constructible with defaults
        example = self.ex_type()

        lang_cells = tag.find_all('td', {'class': "para-lang"})
        text_items = tag.find_all('li')
        # pair each language label with its text item, in document order
        for lang_cell, text_item in zip(lang_cells, text_items):
            example += self._parse_text(lang_cell.text.strip(), text_item)
        return example
Ejemplo n.º 8
0
def get_form_details(form: bs4.element.Tag) -> dict:
    """Return a dict containing details about the given `form`.

    Collects the form's name/action/method plus the type, name and value
    of every <input>, <select> and <textarea> inside it.
    """
    def attr(tag, name, default=None):
        """Read one attribute of a tag, with an optional fallback."""
        return tag.attrs.get(name, default)

    inputs = [
        {"type": attr(t, "type"), "name": attr(t, "name"),
         "value": attr(t, "value")}
        for t in form.find_all("input")
    ]
    selects = [
        {"type": attr(t, "type"), "name": attr(t, "name"),
         "value": attr(t, "value", "")}
        for t in form.find_all("select")
    ]
    textareas = [
        {"name": attr(t, "name"), "value": attr(t, "value")}
        for t in form.find_all("textarea")
    ]

    # TODO add line number of form
    return {
        "name": attr(form, "name"),
        "action": attr(form, "action"),
        # HTTP method defaults to GET and is normalized to lowercase
        "method": attr(form, "method", "get").lower(),
        "inputs": inputs,
        "selects": selects,
        "textareas": textareas,
    }
Ejemplo n.º 9
0
def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
    """
    Replace all references in element with special tokens
    :param sp: soup used to create replacement string nodes
    :param el: element whose <cit>/<ref> children are rewritten
    :param ref_map: known reference ids, used to resolve 'uid' targets
    :return: the element, mutated in place
    """
    # replace all citations with cite keyword
    for cite in el.find_all('cit'):
        try:
            target = cite.ref.get('target').replace('bid', 'BIBREF')
            cite.replace_with(sp.new_string(f" {target} "))
        except AttributeError:
            print('Attribute error: ', cite)
            continue

    # candidate token prefixes for 'uid' targets, tried in priority order
    uid_prefixes = ('FIGREF', 'TABREF', 'EQREF', 'FOOTREF', 'SECREFU')

    # replace all non citation references
    for rtag in el.find_all('ref'):
        try:
            # IMPROVEMENT: the original re-evaluated rtag.get('target') up to
            # a dozen times per tag; hoist it and fold the elif chain into a
            # loop over candidate prefixes (behavior unchanged).
            raw_target = rtag.get('target')
            if raw_target and not raw_target.startswith('bid'):
                if raw_target.startswith('cid'):
                    target = raw_target.replace('cid', 'SECREF')
                elif raw_target.startswith('uid'):
                    # first candidate present in ref_map wins; otherwise fall
                    # back to the upper-cased raw id
                    for prefix in uid_prefixes:
                        candidate = raw_target.replace('uid', prefix)
                        if candidate in ref_map:
                            target = candidate
                            break
                    else:
                        target = raw_target.upper()
                else:
                    print('Weird ID!')
                    target = raw_target.upper()
                rtag.replace_with(sp.new_string(f" {target} "))
        except AttributeError:
            print('Attribute error: ', rtag)
            continue

    return el
Ejemplo n.º 10
0
    def table_to_data(self, table_tag: bs4.element.Tag):
        """Parse a crop table: buy price, sell price, harvest data, gold/day.

        Side effects: calls the parse_* helpers, may set self.regrowth, and
        appends to self.gold_per_day.
        """
        table_data = []
        rows = table_tag.find_all("tr", recursive=False)
        # buy price lives in the first cell after the second row's start
        self.parse_buy_price_cell(rows[1].find_next("td"))

        def get_sell_col_num():
            """Index of the 'Sells For' column; a two-column 'Harvest'
            header shifts it by one and flags regrowth."""
            col_tags = rows[0].find_all("th", recursive=False)
            cols_text = [col.text.strip() for col in col_tags]
            padding = 0
            for col_tag in col_tags:
                if col_tag.text.strip() == "Harvest" \
                        and 'colspan' in col_tag.attrs \
                        and int(col_tag['colspan']) == 2:
                    padding = 1
                    self.regrowth = True
                    break
            return cols_text.index("Sells For") + padding

        sell_column = get_sell_col_num()
        # IMPROVEMENT: replaced a manual `ix` counter with enumerate and
        # dropped two no-op list comprehensions whose comment falsely
        # claimed to strip text (they copied the list unchanged).
        for ix, row in enumerate(rows[1:]):  # skip header row
            cols = row.find_all("td", recursive=False)
            if ix == 0:
                # first data row carries an extra leading cell; trim to the
                # cells up to and including the sell column
                cols = cols[1:sell_column + 1]
            table_data.append(cols)
        self.parse_sell_price_cell(table_data[0][sell_column - 1])
        self.parse_harvest_data(table_data[1][sell_column - 2])
        sell_tag = table_data[1][sell_column - 1]
        for sell_match in price_re.finditer(sell_tag.text):
            self.gold_per_day.append(float(sell_match.group(1)))
Ejemplo n.º 11
0
    def _get_movie_spec_rating(self,
                               movie: bs4.element.Tag) -> Union[float, None]:
        """Private method to retrieve the movie rating according to the spectators.

        Args:
            movie (bs4.element.Tag): Parser results with the movie informations.

        Returns:
            Union[float, None]: The movie rating according to the spectators.
        """

        # get all the available ratings
        movie_ratings = movie.find_all("div", class_="rating-item")

        for ratings in movie_ratings:

            if "Spectateurs" in ratings.text:
                return float(
                    re.sub(
                        ",", ".",
                        ratings.find("span", {
                            "class": "stareval-note"
                        }).text))

        return None
Ejemplo n.º 12
0
    def __init__(self, tag: bs4.element.Tag):
        """Parse one submission row (a <tr>) into its fields.

        Extracts id, author, date, problem, time/memory consumed, verdict
        status and the judged-test number; missing cells yield '' / -1.
        """
        self.id: int = int(tag.find('td', {'class': 'id-cell'}).text)
        self.author: str = tag.find('td', {
            'class': 'status-party-cell'
        }).text.strip().rstrip()
        parameters = tag.find_all('td', {'class': 'status-small'})
        self.date: datetime = datetime.strptime(
            parameters[0].text.strip().rstrip(), '%d.%m.%Y %H:%M')
        self.problem: str = parameters[1].text.strip().rstrip()
        if tag.find('td', {'class': 'time-consumed-cell'}) is not None:
            self.time: str = tag.find('td', {
                'class': 'time-consumed-cell'
            }).text.strip().rstrip()
        else:
            self.time: str = ''
        if tag.find('td', {'class': 'memory-consumed-cell'}) is not None:
            self.memory: str = tag.find('td', {
                'class': 'memory-consumed-cell'
            }).text.strip().rstrip()
        else:
            self.memory: str = ''

        tmp = tag.find('td', {'class': 'status-cell'})
        self.status: str = 'WAITING' if tmp is None or tmp.span is None else tmp.span[
            'submissionverdict']
        # BUG FIX: the original accessed tmp.span / tmp['waiting'] here
        # without the tmp-is-None guard used just above, crashing whenever
        # the status cell is absent.
        if tmp is not None and tmp.span is None and tmp['waiting'] == 'false':
            self.status = 'UNKNOWN'
        self.test: int = -1 if tmp is None or \
                               tmp.find('span', {'class': 'verdict-format-judged'}) is None else \
            int(tmp.find('span', {'class': 'verdict-format-judged'}).text)
Ejemplo n.º 13
0
def _parse_column(gatherer_column: bs4.element.Tag) -> GathererCard:
    """Parse a single gatherer page 'rightCol' entry."""
    # Map each row's label text (trailing colon removed) to its value Tag.
    label_to_values = {}
    for row in gatherer_column.find_all("div", class_="row"):
        label = row.find("div", class_="label").getText(strip=True).rstrip(":")
        label_to_values[label] = row.find("div", class_="value")

    card_name = label_to_values["Card Name"].getText(strip=True)
    card_types = label_to_values["Types"].getText(strip=True)

    # Flavor text is optional: one entry per flavortextbox div.
    flavor_lines = []
    if "Flavor Text" in label_to_values:
        flavor_lines = [
            box.getText(strip=True)
            for box in label_to_values["Flavor Text"].find_all(
                "div", class_="flavortextbox")
        ]

    # Rules text is optional: symbols are substituted before extraction.
    text_lines = []
    if "Card Text" in label_to_values:
        text_lines = [
            _replace_symbols(box).getText().strip()
            for box in label_to_values["Card Text"].find_all(
                "div", class_="cardtextbox")
        ]

    # Empty joined strings collapse to None.
    return GathererCard(
        card_name=card_name,
        original_types=card_types,
        original_text="\n".join(text_lines).strip() or None,
        flavor_text="\n".join(flavor_lines).strip() or None,
    )
Ejemplo n.º 14
0
def get_hh_page_vacansies(vacans: bs4.element.Tag) -> list:
    """Parse an hh.ru search-results fragment into a list of vacancy dicts.

    Each dict carries: vacancy_name, link, vacancy_company, vacancy_address,
    vacancy_describe_text, vacancy_money (run through decoder) and resource.

    FIXES: the original annotated the return as the invalid literal `[]`,
    compared with `!= None`, re-ran identical find_all queries, and built
    the description with quadratic string concatenation.
    """
    vacancies = []
    for vacancy in vacans.find_all('div',
                                   {'data-qa': 'vacancy-serp__vacancy'}):
        vac = {}
        # the first bloko-link anchor carries both the title and the href
        title_link = vacancy.find_all('a', {'class': 'bloko-link'})[0]
        vac['vacancy_name'] = title_link.getText()
        vac['link'] = title_link['href']
        vac['vacancy_company'] = vacancy.find_all(
            'div', {'class': 'vacancy-serp-item__meta-info'})[0].find_all(
                'a', {'class': 'bloko-link'})[0].getText()
        vac['vacancy_address'] = vacancy.find_all(
            'span', {'data-qa': 'vacancy-serp__vacancy-address'})[0].getText()
        describe_divs = vacancy.find_all(
            'div', {'class': 'g-user-content'})[0].find_all('div')
        vac['vacancy_describe_text'] = ''.join(
            div.getText() for div in describe_divs)

        vacancy_money = vacancy.find(
            'span', {'data-qa': 'vacancy-serp__vacancy-compensation'})
        if vacancy_money is not None:
            vacancy_money = vacancy_money.getText()
        # decoder also receives None when no compensation is listed,
        # matching the original behavior
        vac['vacancy_money'] = decoder(vacancy_money)
        vac['resource'] = 'hh.ru'
        vacancies.append(vac)
    return vacancies
Ejemplo n.º 15
0
def class_tr_to_str(tr: bs4.element.Tag) -> str:
    """Render one API-doc table row (<tr>) as a markdown-ish line.

    Produces either an attribute header ``` `name :: type` mode ``` or a
    function header ``` `name` ```, followed by " - description" when the
    description cell has content.
    """
    # make every link in the row absolute
    for a in tr.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    data = (tr.find("td", class_="header"), tr.find("td",
                                                    class_="description"))
    nameSpan = data[0].find("span", class_="element-name")
    # NOTE(review): presence is tested on "attribute-type" but the type text
    # is read from "param-type" — presumably matches this site's markup, but
    # confirm against the source HTML.
    if data[0].find("span", class_="attribute-type") is not None:
        accessType = "param"
        type_ = data[0].find("span", class_="param-type").text.strip()
    else:
        accessType = "func"
    if accessType == "param":
        attributeMode = data[0].find("span", class_="attribute-mode").text
        header = f"`{nameSpan.text} :: {type_}` {attributeMode}"
    else:
        header = f"`{nameSpan.text}`"

    # drop bare single-space text nodes from the description cell
    contents = [item for item in data[1].contents if item != " "]
    if len(contents) > 0:
        if len(contents) > 1 and "\n" not in contents[0]:
            # rich multi-node description: convert its HTML to markdown,
            # excluding the last node
            description = tomd.convert(
                f"<p>{''.join([str(item) for item in contents[:-1]])}</p>"
            ).strip()
        else:
            description = contents[0].strip()
        return f"{header} - {description}"
    else:
        return header
Ejemplo n.º 16
0
 def parse_manually(self, parse_object: bs4.element.Tag) -> dict:
     """
     Method which is dedicated to manually parse broken html
     Input:  parse_object = object which we would parse
     Output: dict
     """
     # the <b> tags act as column-name markers inside the raw html
     list_column_names = [str(v) for v in parse_object.find_all('b')]
     parse_object = str(parse_object)
     list_column_names.insert(0, '</a>')
     # BUG FIX: the original re-tested `types in list_column_names` inside a
     # loop over list_column_names itself — a tautology; the check is gone.
     list_split = []
     for marker in list_column_names:
         list_split.append(marker)
         # replace each marker with the sentinel so the text can be split
         parse_object = parse_object.replace(marker, self.rand)
     parse_split = parse_object.split(self.rand)
     if '</a>' in list_split:
         list_split[0] = sp.status_iasa
     # normalize the extracted column names
     list_split = [self.remove_tags(x) for x in list_split]
     list_split = [self.remove_special(x) for x in list_split]
     list_split = [self.remove_spaces(x) for x in list_split]
     list_split = [v for v in list_split if v]
     value_dict = {}
     if len(parse_split) > 1:
         # first split chunk precedes any marker; pair the rest with names
         for column_value, value in zip(list_split, parse_split[1:]):
             value_dict.update(
                 self.make_further_check(sp.rechange_iasa[column_value],
                                         value, sp.rechange_phrase))
     return value_dict
Ejemplo n.º 17
0
def _parse_column(gatherer_column: bs4.element.Tag) -> GathererCard:
    """Parse a single gatherer page 'rightCol' entry."""
    # Map each row's label text (trailing colon removed) to its value Tag.
    label_to_values = {
        row.find("div", class_="label").getText(strip=True).rstrip(":"):
        row.find("div", class_="value")
        for row in gatherer_column.find_all("div", class_="row")
    }

    card_name = label_to_values["Card Name"].getText(strip=True)
    card_types = label_to_values["Types"].getText(strip=True)

    # Flavor text is optional; one line per flavortextbox div.
    flavor_lines = []
    if "Flavor Text" in label_to_values:
        for flavorbox in label_to_values["Flavor Text"].find_all(
                "div", class_="flavortextbox"):
            flavor_lines.append(flavorbox.getText(strip=True))

    # Rules text is optional; symbols are substituted before extraction.
    text_lines = []
    if "Card Text" in label_to_values:
        for textbox in label_to_values["Card Text"].find_all(
                "div", class_="cardtextbox"):
            text_lines.append(_replace_symbols(textbox).getText().strip())

    # Empty joined strings collapse to None.
    return GathererCard(
        card_name=card_name,
        original_types=card_types,
        original_text="\n".join(text_lines).strip() or None,
        flavor_text="\n".join(flavor_lines).strip() or None,
    )
Ejemplo n.º 18
0
def extract_stadium_data(stadium: bs.element.Tag) -> dict:
    """Collect stadium name, nickname and city from the stadium cell.

    The first anchor carries the stadium title/nickname; the second
    anchor's text is the city.
    """
    anchors = stadium.find_all('a')
    main_anchor = anchors[0]
    return {
        "stadium_name": main_anchor.get("title"),
        "stadium_nick": main_anchor.text,
        "stadium_city": anchors[1].text,
    }
Ejemplo n.º 19
0
def parse_facility_header(tabulka: bs4.element.Tag) -> dict:
    """Parse metadata about facility from html table.

    Returns a dict with the facility id and name, the address parts
    ('psc', 'obec', 'ulice', 'cp', 'okres', 'kraj'; None when the address
    is missing) and any cadastre key/value pairs from the optional rows.
    """
    d = {}

    rows = tabulka.find_all('tr')
    divs = rows[0].find_all('div')
    # BUG FIX: str.strip('Evidenční číslo: ') strips a *character set* from
    # both ends, not the prefix string; take everything after the colon
    # instead (int() tolerates the leading space).
    d['id'] = int(divs[0].get_text(strip=True).partition(':')[2])
    d['nazev'] = divs[1].get_text(strip=True)

    soucasti_adresy = ('psc', 'obec', 'ulice', 'cp', 'okres', 'kraj')
    try:
        adresa = divs[2].get_text(strip=True)
        rozdelena_adresa = address.rozdel_adresu(address.uprav_adresu(adresa))
        for k, v in zip(soucasti_adresy, rozdelena_adresa):
            d[k] = v
    # Address is missing entirely (no third div)
    except IndexError:
        d.update({soucast: None for soucast in soucasti_adresy})

    if len(rows) > 2:  # cadastre info is present
        for th, td in zip(rows[1].find_all('th'), rows[2].find_all('td')):
            d[prepare_key(th.get_text(strip=True))] = td.get_text(strip=True)

    return d
Ejemplo n.º 20
0
    def process_page(page: bs4.element.Tag) -> str:
        """Join a page's <p> elements into normalized paragraphs.

        A new chunk is merged into the previous paragraph when the previous
        one ends in a letter, hyphen or comma, or the new chunk starts
        lowercase — i.e. when a page/column break likely split a sentence.

        Returns the paragraphs joined by blank lines.
        """
        paragraphs = []

        for p in page.find_all('p'):
            paragraph = PDFDoc2Txt.consolidate_paragraph(p.text)
            paragraph = PDFDoc2Txt.normalize_footnote_citations(paragraph)
            if not paragraph:
                continue

            # last character of the previous paragraph ('' when none yet)
            prev_paragraph_end = paragraphs[-1][
                -1] if paragraphs and paragraphs[-1] else ''

            # .isalpha():
            if prev_paragraph_end and (re.search(r'[a-zA-Z\-\,]',
                                                 prev_paragraph_end) or
                                       (paragraph[0].islower())):
                # continuation of the previous paragraph: merge in place
                paragraph = paragraphs[-1] + ' ' + paragraph
                paragraphs[-1] = paragraph
            else:
                paragraphs.append(paragraph)

        # for p in paragraphs:
        #     doc = self.nlp(p)
        #     self.sentences.extend(list(doc.sents))

        paragraphs = '\n\n'.join(paragraphs)
        return paragraphs
Ejemplo n.º 21
0
    def _get_article(self, article: newspaper.article.Article,
                     li: bs4.element.Tag) -> tuple:
        """Return (text, top_image) for one news list item.

        When the pre-parsed article is missing text (or image), falls back
        to downloading the linked Naver News article, best-effort.

        FIXES: the original assigned article.top_image to an unused local
        `top_img`, so top_image stayed None and the image fallback branch
        never saw the real value; return annotation corrected from str to
        tuple; bare excepts narrowed to Exception.
        """
        text, top_image = None, None
        if article is not None:
            text = article.text
            top_image = article.top_image

        links = li.find_all("a", {"class": "info"})
        naver_links = [link for link in links if link.text == "네이버뉴스"]
        if naver_links:
            naver_news_url = naver_links[0]["href"].replace("&amp;", "&")
        else:
            naver_news_url = None

        if text is None and naver_news_url is not None:
            try:
                a = Article(naver_news_url, language="ko")
                a.download()
                a.parse()

                text = a.text
            except Exception:
                # best-effort: keep None on any download/parse failure
                pass
        elif top_image is None and naver_news_url is not None:
            try:
                a = Article(naver_news_url, language="ko")
                a.download()
                a.parse()

                top_image = a.top_image
            except Exception:
                pass
        return text, top_image
Ejemplo n.º 22
0
    def _get_datetime(
        self,
        article: newspaper.article.Article,
        li: bs4.element.Tag,
    ) -> datetime:
        publish_date = None
        if article is not None:
            publish_date = article.publish_date

        if publish_date is None:
            date_list = li.find_all("span", {"class": "info"})
            date = None
            if len(date_list) > 1:
                date = date_list[-1].text
            else:
                date = li.find("span", {"class": "info"}).text
                date = date.split()[0]

            if "분" in date:
                minutes = re.sub(r"[^\d+]", "", date)
                publish_date = datetime.now() - timedelta(minutes=int(minutes))
            elif "시간" in date:
                hours = re.sub(r"[^\d+]", "", date)
                publish_date = datetime.now() - timedelta(hours=int(hours))
            elif "일" in date:
                days = re.sub(r"[^\d+]", "", date)
                publish_date = datetime.now() - timedelta(days=int(days))
            else:
                try:
                    publish_date = datetime.strptime(date, "%Y.%m.%d.")
                except:
                    pass
        return publish_date
Ejemplo n.º 23
0
    def _parse_one_person(self, person: bs4.element.Tag, _stage: str,
                          i: int) -> Dict:
        """Parse one player anchor from a battle-log block.

        Args:
            person: Anchor tag wrapping a single player entry.
            _stage: Game-mode name; controls the group / is_mvp fields.
            i: Player's position within the block, used for grouping.

        Returns:
            Dict with trophy, level, name, hero, playerId, isTeammate plus
            mode-dependent group and is_mvp values.
        """
        # the xs-font divs yield exactly four fields; the second is unused.
        # NOTE: the comprehension's `i` shadows the index parameter only
        # inside the comprehension (Py3 scoping), so the parameter survives.
        _trophy, _, _level, _name = [
            i.text.strip()
            for i in person.find_all("div",
                                     attrs={"style": self._style_font_xs})
        ]
        res = {
            "trophy": int(_trophy),
            "level": int(_level),
            "name": _name,
            # hero resolved from the portrait image source
            "hero": self.hero_map[person.find("img").get("src")],
            # last path segment of the player profile link
            "playerId": person.get("href").split("/")[-1],
            "isTeammate": False,
        }

        if _stage == "Duo Showdown":
            # consecutive pairs of players form one group
            res["group"] = i // 2
            res["is_mvp"] = np.nan
        elif _stage == "Showdown":
            # solo mode: one group per player
            res["group"] = i
            res["is_mvp"] = np.nan
        else:
            # team modes: MVP is flagged by a dedicated image
            res["group"] = np.nan
            res["is_mvp"] = person.find("img",
                                        attrs={"src":
                                               self._img_mvp}) is not None
        return res
Ejemplo n.º 24
0
def process_list_el(sp: BeautifulSoup, list_el: bs4.element.Tag, section_info: List, bib_map: Dict, ref_map: Dict):
    """
    Process list element
    :param sp:
    :param list_el:
    :param section_info:
    :param bib_map:
    :param ref_map:
    :return: list of Paragraph objects, one per list item
    """
    # TODO: currently parsing list as a list of paragraphs (append numbers to start of each entry in ordered lists)
    def _shift_spans(spans, offset):
        """Shift start/end of each span dict by offset; other keys (text,
        latex, ref_id, ...) are carried over unchanged. Replaces three
        near-identical copy loops in the original."""
        return [{**span,
                 "start": span["start"] + offset,
                 "end": span["end"] + offset} for span in spans]

    list_items = []
    for item in list_el.find_all('item'):
        # skip itemize settings
        if item.text.strip().startswith('[') and item.text.strip().endswith(']'):
            continue
        # try processing as paragraph
        list_num = item.get('id-text', None)
        item_as_para = process_paragraph(sp, item, section_info, bib_map, ref_map)
        # append list number if ordered
        if list_num:
            list_num_str = f'{list_num}. '
            offset = len(list_num_str)
            new_para = Paragraph(
                text=list_num_str + item_as_para.text,
                cite_spans=_shift_spans(item_as_para.cite_spans, offset),
                ref_spans=_shift_spans(item_as_para.ref_spans, offset),
                eq_spans=_shift_spans(item_as_para.eq_spans, offset),
                section=item_as_para.section
            )
        else:
            new_para = item_as_para
        list_items.append(new_para)
    return list_items
Ejemplo n.º 25
0
def _analyse_version(version: bs4.element.Tag, season: int, episode: int):
    """Find the first completed English subtitle row in a version block
    and download it; returns True on the first successful download."""
    for row in version.find_all("div", attrs={"class": "row"}):
        # English rows only
        if not row.find_all("span", attrs={"class": "flag-icon flag-icon-gb"}):
            continue
        # completed subtitles only
        if not row.find_all(text="Completed"):
            continue
        if _download_srt(row, season, episode):
            return True
Ejemplo n.º 26
0
def _unplayed_games(M: int, N: int, table: bs4.element.Tag) -> pd.DataFrame:
    """Build an M-row boolean DataFrame marking unplayed games.

    A cell is True when its <td> is empty or carries the 'unplayed' CSS
    class. Rows come from table rows 4..4+M (the offset presumably skips
    header rows — confirm against the page layout); columns are the last
    N <td> cells of each row, labelled ('Unplayed', '1'..'N').
    """
    return pd.DataFrame(data=[[
        not td.text or td.has_attr('class') and td['class'][0] == 'unplayed'
        for td in tr.find_all('td')[-N:]
    ] for tr in table.find_all('tr')[4:4 + M]],
                        columns=pd.MultiIndex.from_tuples([
                            ('Unplayed', str(n + 1)) for n in range(N)
                        ]))
    def find_courses_from_section(self, section: bs4.element.Tag):
        """Return course-name strings found in a section element.

        Args:
            section: Container tag, or None.

        Returns:
            List of course names ([] when section is None).
        """
        if section is None:
            return []
        # IDIOM FIX: replaced nested map(lambda ...) chains with a single
        # comprehension (same elements, same order).
        course_sections = section.find_all(class_='m-single-course-top-row')
        return [row.find('span').string for row in course_sections]
Ejemplo n.º 28
0
def row_to_transaction(row: bs4.element.Tag) -> Transaction:
    """
    Convert HTML string of one <tr> with multiple <td> entries into named tuple.

    :param row: HTML string of the entire <tr> tag, including <td> tags
    :return: Transaction named tuple with corresponding entries
    """
    cell_texts = [cell.text.strip() for cell in row.find_all('td')]
    return Transaction(*cell_texts)
Ejemplo n.º 29
0
 def _parse_doc(self,
                doc: bs4.element.Tag) -> List:
     """Parse one document: collect every 'para' table as an example,
     registering each example's word forms along the way."""
     examples = []
     for table in doc.find_all('table', {'class': 'para'}):
         parsed = self._parse_example(table)
         examples.append(parsed)
         self._add_wordforms(parsed.found_wordforms)
     return examples
Ejemplo n.º 30
0
    def parse_calendar_table(self, course_code,
                             table: bs4.element.Tag) -> dict:
        """Map each day to its parsed row data for the given course."""
        # direct child rows only; the first row is the header
        body_rows = table.find_all('tr', recursive=False)[1:]
        # parse_calendar_row yields (day, row_data) pairs
        return dict(
            self.parse_calendar_row(course_code, row) for row in body_rows
        )
Ejemplo n.º 31
0
def process_match_details(details: bs.element.Tag) -> dict:
    """Merge the match hour, both teams' goal times and the extra info
    cell into a single flat dict."""
    hour_cell, home_cell, _, visitor_cell, info_cell = details.find_all('td')
    result = {"hour": hour_cell.text.replace('h', ':').strip()}
    result.update(extract_goals_time(home_cell, "goals_home_team"))
    result.update(extract_goals_time(visitor_cell, "goals_visitor_team"))
    result.update(extract_more_info(info_cell))
    return result
Ejemplo n.º 32
0
def play_items_from_log_entry(entry: bs4.element.Tag) -> bs4.element.ResultSet:
    """Return all <td> cells (the play items) of one log-entry row."""
    return entry.find_all("td")


def player(play_text_: str) -> str:
    """Return the first two whitespace-separated tokens joined by a space
    (the player's name at the start of a play description)."""
    tokens = play_text_.split()
    return ' '.join(tokens[:2])