def parse_thread_page(el: bs4.element.Tag) -> AttrDict:
    """Extract one forum post from a thread page.

    :param el: tag wrapping a single post
    :return: AttrDict with ``user``, ``body_html``, ``body_text``, ``date``
    """
    out = AttrDict()
    # Query '.content' once instead of running the identical CSS
    # selector twice for body_html and body_text.
    content = el.select('.content')[0]
    out.user = el.select('.postprofile dt')[0].text.strip()
    out.body_html = str(content).strip()
    out.body_text = content.text.strip()
    out.date = el.select('.postbody .author')[0].text.strip()
    return out
def parse_link(link: bs4.element.Tag, domain: str) -> AttrDict:
    """Extract a thread-listing entry (title, counters, date, URL).

    :param link: tag wrapping one listing row
    :param domain: site root prepended to the relative thread href
    :return: AttrDict with ``title``, ``views``, ``answers``, ``date``, ``url``
    """
    out = AttrDict()
    # The first anchor carries both the title and the relative URL;
    # select it once instead of twice.
    first_anchor = link.select('a:nth-of-type(1)')[0]
    out.title = first_anchor.text
    out.views = link.select('.views')[0].text.replace('Zugriffe', '').strip()
    out.answers = link.select('.posts')[0].text.replace('Antworten', '').strip()
    out.date = link.select('a:nth-of-type(3)')[0].text
    out.url = domain + first_anchor.attrs['href'].replace('./', '/')
    return out
Example #3
0
def _movie_item(item_soup: bs4.element.Tag) -> MovieItem:
    """Build a MovieItem from one listing entry ('litpic' + 'title' divs)."""
    litpic = item_soup.find('div', attrs={'class': 'litpic'})
    title_div = item_soup.find('div', attrs={'class': 'title'})
    paragraphs = title_div.find_all('p')

    return MovieItem(
        litpic.a.img['src'],          # litpic_url
        _url(litpic.a['href']),       # subject_url
        paragraphs[0].b.getText(),    # title
        paragraphs[1].a.getText(),    # other
        paragraphs[2].getText(),      # info
        paragraphs[3].getText(),      # star
    )
Example #4
0
def _parse_column(gatherer_column: bs4.element.Tag) -> GathererCard:
    """Parse a single gatherer page 'rightCol' entry."""
    # Map each row's label (trailing ':' removed) to its 'value' div.
    values = {}
    for row in gatherer_column.find_all("div", class_="row"):
        label = row.find("div", class_="label").getText(strip=True).rstrip(":")
        values[label] = row.find("div", class_="value")

    name = values["Card Name"].getText(strip=True)
    types = values["Types"].getText(strip=True)

    if "Flavor Text" in values:
        flavor_lines = [
            box.getText(strip=True)
            for box in values["Flavor Text"].find_all("div", class_="flavortextbox")
        ]
    else:
        flavor_lines = []

    if "Card Text" in values:
        text_lines = [
            _replace_symbols(box).getText().strip()
            for box in values["Card Text"].find_all("div", class_="cardtextbox")
        ]
    else:
        text_lines = []

    return GathererCard(
        card_name=name,
        original_types=types,
        original_text="\n".join(text_lines).strip() or None,
        flavor_text="\n".join(flavor_lines).strip() or None,
    )
Example #5
0
File: yahoo.py Project: zdict/zdict
 def get_pronounce(p: bs4.element.Tag):
     return list(
         map(
             lambda x: re.match(r'(.*)(\[.*\])', x).groups(),
             p.find('ul').text.strip().split()
         )
     )
Example #6
0
def get_first_detail_annuncio(annuncio: bs4.element.Tag) -> tuple:
    """Return the listing's code and its detail-page link.

    Returns:
    --------
    code_annuncio: str
        the listing's identifying code (text of the first <dd>)
    link_annuncio: str
        href of the last anchor, pointing at the detail page
    """
    code_annuncio = annuncio.find('dd').text
    link_annuncio = annuncio.find_all('a')[-1]['href']
    return code_annuncio, link_annuncio
Example #7
0
 def _get_graphic_url(content: bs4.element.Tag) -> str or None:
     """ Get URL to the graphic. """
     a = content.find('a', {'target': '_blank'})
     try:
         link = a['href']
     except (KeyError, TypeError, AttributeError):
         return
     return f"{BASE_RNC_URL}/{link}"
Example #8
0
def _unplayed_games(M: int, N: int, table: bs4.element.Tag) -> pd.DataFrame:
    return pd.DataFrame(data=[[
        not td.text or td.has_attr('class') and td['class'][0] == 'unplayed'
        for td in tr.find_all('td')[-N:]
    ] for tr in table.find_all('tr')[4:4 + M]],
                        columns=pd.MultiIndex.from_tuples([
                            ('Unplayed', str(n + 1)) for n in range(N)
                        ]))
Example #9
0
def define_tr_to_str(tr: bs4.element.Tag) -> str:
    """Render one reference-table row as a one-liner.

    Produces "`<header>` - <description>" when the description cell has
    content, otherwise just "`<header>`".
    """
    # Rewrite relative anchors to absolute ones before any HTML-to-text
    # conversion happens below.
    for a in tr.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    data = (tr.find("td",
                    class_="header").string, tr.find("td",
                                                     class_="description"))
    contents = data[1].contents
    if len(contents) > 0:
        if len(contents) > 1 and "\n" not in contents[0]:
            # Mixed markup: convert everything but the last node to
            # markdown via tomd.
            description = tomd.convert(
                f"<p>{''.join([str(item) for item in contents[:-1]])}</p>"
            ).strip()
        else:
            # Plain node: keep only its first line.
            # NOTE(review): assumes contents[0] supports str.split here
            # (i.e. is a text node) — confirm against the scraped pages.
            description = contents[0].split('\n')[0].strip()
        return f"`{data[0]}` - {description}"
    else:
        return f"`{data[0]}`"
Example #10
0
def _analyse_version(version: bs4.element.Tag, season: int, episode: int):
    """Download the first complete English subtitle row of this version;
    True on success, implicit None otherwise."""
    for row in version.find_all("div", attrs={"class": "row"}):
        # Only analyse English rows whose subtitles are complete.
        is_english = row.find_all("span", attrs={"class": "flag-icon flag-icon-gb"})
        if is_english and row.find_all(text="Completed"):
            if _download_srt(row, season, episode):
                return True
    def find_courses_from_section(self, section: bs4.element.Tag):
        """Return the course names (first <span> string of each course row)
        found in ``section``; an empty list when the section is missing."""
        if section is None:
            return []
        rows = section.find_all(class_='m-single-course-top-row')
        return [row.find('span').string for row in rows]
Example #12
0
def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
    """
    Replace all references in element with special tokens
    :param sp:
    :param el:
    :param ref_map:
    :return:
    """
    # Citations become BIBREF tokens.
    for cite in el.find_all('cit'):
        try:
            token = cite.ref.get('target').replace('bid', 'BIBREF')
            cite.replace_with(sp.new_string(f" {token} "))
        except AttributeError:
            print('Attribute error: ', cite)
            continue

    # Non-citation references become SECREF/FIGREF/... tokens.
    for rtag in el.find_all('ref'):
        try:
            raw = rtag.get('target')
            if not raw or raw.startswith('bid'):
                continue
            if raw.startswith('cid'):
                token = raw.replace('cid', 'SECREF')
            elif raw.startswith('uid'):
                # Try each known reference kind in priority order; fall
                # back to the uppercased raw id when none is in ref_map.
                for kind in ('FIGREF', 'TABREF', 'EQREF', 'FOOTREF', 'SECREFU'):
                    candidate = raw.replace('uid', kind)
                    if candidate in ref_map:
                        token = candidate
                        break
                else:
                    token = raw.upper()
            else:
                print('Weird ID!')
                token = raw.upper()
            rtag.replace_with(sp.new_string(f" {token} "))
        except AttributeError:
            print('Attribute error: ', rtag)
            continue

    return el
Example #13
0
 def link_to_folder(link: bs4.element.Tag) -> str:
     raw_url: str = link.get("href", default="")
     url: ParseResult = urlparse(raw_url)
     if url.scheme or url.netloc:
         return ""
     url_path: str = posixpath.normpath(url.path)
     if "/" in url_path or url_path == "." or url_path == "..":
         return ""
     return url_path
Example #14
0
 def get_text_author(quote: bs4.element.Tag) -> Optional[str]:
     """Return author from html tag class as string"""
     try:
         author = quote.find(class_="authorOrTitle").text
         author = author.replace(",", "")
         author = author.replace("\n", "")
         return author.strip()
     except AttributeError:
         return None
Example #15
0
def clean_tags(el: bs4.element.Tag):
    """
    Replace all tags with lowercase version
    :param el:
    :return:
    """
    for tag_name in SUBSTITUTE_TAGS:
        lowered = tag_name.lower()
        for matching_el in el.find_all(tag_name):
            matching_el.name = lowered
Example #16
0
def _download_srt(row: bs4.element.Tag, season: int, episode: int):
    """Fetch the first 'nofollow' subtitle link in ``row`` and save it as
    FOLDER/NAME.SxxEyy.srt; returns True once a file has been written."""
    for link in row.find_all("a", attrs={"rel": "nofollow"}):
        response = requests.get(f"{BASE_URL}/{link['href']}")
        # Zero-pad single-digit season/episode numbers.
        season_str = str(season) if season >= 10 else f"0{season}"
        episode_str = str(episode) if episode >= 10 else f"0{episode}"
        with open(f"{FOLDER}/{NAME}.S{season_str}E{episode_str}.srt", "wb") as f:
            f.write(response.content)
        return True
Example #17
0
 def get_text_tags(quote: bs4.element.Tag) -> Optional[list]:
     """Return tags from html tag class as string"""
     try:
         tags = quote.find(class_="greyText smallText left").text
         tags = [x.strip() for x in tags.split(',')]
         tags = tags[1:]
         return tags.strip()
     except AttributeError:
         return None
Example #18
0
 def get_text_title(quote: bs4.element.Tag) -> Optional[str]:
     """Return title from html tag class as string"""
     try:
         title = quote.find(class_="authorOrTitle")
         title = title.nextSibling.nextSibling.text
         title = title.replace("\n", "")
         return title.strip()
     except AttributeError:
         return None
Example #19
0
 def from_td_tag(cls, quality: Quality, tag: bs4.element.Tag):
     try:
         s = tag.find('table').find('tr').find_all('td')[1].find('div').text
         matches = _price_regex.match(s)
         return PriceData(quality=quality,
                          quantity=int(matches[1]),
                          price=float(matches[2]))
     except Exception:
         pass
Example #20
0
def row_to_transaction(row: bs4.element.Tag) -> Transaction:
    """
    Convert HTML string of one <tr> with multiple <td> entries into named tuple.

    :param row: HTML string of the entire <tr> tag, including <td> tags
    :return: Transaction named tuple with corresponding entries
    """
    cells = [cell.text.strip() for cell in row.find_all('td')]
    return Transaction(*cells)
Example #21
0
    def parse_calendar_table(self, course_code,
                             table: bs4.element.Tag) -> dict:
        """Map each day to its parsed row data for one calendar table;
        the first row (header) is skipped."""
        rows = table.find_all('tr', recursive=False)[1:]
        # parse_calendar_row yields (day, row_data) pairs.
        return dict(self.parse_calendar_row(course_code, row) for row in rows)
Example #22
0
 def _parse_doc(self,
                doc: bs4.element.Tag) -> List:
     """ Parse one document. """
     res = []
     for example in doc.find_all('table', {'class': 'para'}):
         new_ex = self._parse_example(example)
         res += [new_ex]
         self._add_wordforms(new_ex.found_wordforms)
     return res
Example #23
0
File: yahoo.py Project: zdict/zdict
 def get_explain(e: bs4.element.Tag):
     def f(ks):
         return (
             'pos' if 'pos_button' in ks else
             'explain' if 'dictionaryExplanation' in ks else
             '?')
     return [
         (f(m.attrs['class']), m.text)
         for n in e.select('ul > li') for m in n.select('div')]
Example #24
0
def process_match_details(details: bs.element.Tag) -> dict:
    """Merge the hour, both teams' goal times and the extra info of one
    match-details row into a single dict."""
    hour_cell, home_cell, _, visitor_cell, info_cell = details.find_all('td')

    merged = {"hour": hour_cell.text.replace('h', ':').strip()}
    merged.update(extract_goals_time(home_cell, "goals_home_team"))
    merged.update(extract_goals_time(visitor_cell, "goals_visitor_team"))
    merged.update(extract_more_info(info_cell))
    return merged
Example #25
0
def downstream_points(ts: int, table1: bs4.element.Tag, table2: bs4.element.Tag) \
        -> typ.Generator[InfluxPoint, None, None]:
    """Yield one 'downstream' InfluxPoint per channel.

    :param ts: timestamp attached to every emitted point
    :param table1: table whose rows unpack as channel, frequency, snr,
        modulation and power (first row discarded)
    :param table2: table whose last two rows are correctable and
        uncorrectable codeword totals (first three rows discarded)

    NOTE(review): mutates module-level ``last_correct``/``last_uncorrect``,
    so the interval_*_codewords fields are deltas against the previous
    call; a channel seen for the first time contributes 0.
    """
    # Totals remembered from the previous invocation, keyed by channel id.
    global last_correct, last_uncorrect

    # Read upper table with power levels.
    _, channel, frequency, snr, modulation, power = \
            table1.find('tbody')('tr', recursive=False)

    channel_ids = [int(td.text) for td in datacells(channel)]
    snrs = [float(td.text.split(' ')[0]) for td in datacells(snr)]
    modulations = [td.text.strip() for td in datacells(modulation)]
    power_levels = [float(td.text.split(' ')[0]) for td in datacells(power)]

    # Read lower table with codeword counts.
    _, _, _, correctable, uncorrectable = table2.find('tbody')('tr',
                                                               recursive=False)

    correctables = [int(td.text) for td in datacells(correctable)]
    # .get(..., v) makes an unseen channel's delta 0 rather than its
    # full running counter.
    int_correctables = [
        v - last_correct.get(channel_ids[i], v)
        for i, v in enumerate(correctables)
    ]
    uncorrectables = [int(td.text) for td in datacells(uncorrectable)]
    int_uncorrectables = [
        v - last_uncorrect.get(channel_ids[i], v)
        for i, v in enumerate(uncorrectables)
    ]

    # Remember the current totals for the next interval.
    last_correct = {channel_ids[i]: v for i, v in enumerate(correctables)}
    last_uncorrect = {channel_ids[i]: v for i, v in enumerate(uncorrectables)}

    def field_set(i: int) -> InfluxSet:
        # Field values for the i-th channel column.
        return {
            'snr_db': snrs[i],
            'modulation': modulations[i],
            'power_dbmv': power_levels[i],
            'interval_correctable_codewords': int_correctables[i],
            'interval_uncorrectable_codewords': int_uncorrectables[i]
        }

    yield from (InfluxPoint(measurement='downstream',
                            tag_set={'channel_id': cid},
                            field_set=field_set(i),
                            timestamp=ts) for i, cid in enumerate(channel_ids))
Example #26
0
def extract_match_data(tb_match: bs.element.Tag) -> dict:
    """Parse one match table into a combined header+details dict;
    an empty dict when the table has no rows."""
    rows = tb_match.find_all('tr')
    if not rows:
        return {}

    header_row, details_row = rows[:2]
    return {
        **process_match_header(header=header_row),
        **process_match_details(details=details_row),
    }
Example #27
0
def extract_more_info(more_info: bs.element.Tag) -> dict:
    """Pull the audience and income figures out of the extra-info cell.

    Each figure is the text node following the matching <b> label; a
    missing label yields "".
    """
    bold_tags = more_info.find_all('b')

    audience_labels = [b for b in bold_tags if b.text == 'Público:']
    if audience_labels:
        audience = (audience_labels[0].nextSibling.strip()
                    .replace(' ', '').replace('.', ''))
    else:
        audience = ""

    income_labels = [b for b in bold_tags if b.text == 'Renda:']
    if income_labels:
        income = income_labels[0].nextSibling.strip().replace(',', '.')
    else:
        income = ""

    return {"audience": audience, "income": income}
Example #28
0
def process_match_header(header: bs.element.Tag) -> dict:
    """Flatten the five header cells of a match row into one dict."""
    day_month_cell, home_cell, result_cell, visitor_cell, stadium_cell = \
        header.find_all('td')

    merged = {"day_month": day_month_cell.text.strip()}
    merged.update(extract_team_data(team=home_cell, st_home=True))
    merged["result"] = result_cell.text.strip()
    merged.update(extract_team_data(team=visitor_cell, st_home=False))
    merged.update(extract_stadium_data(stadium=stadium_cell))
    return merged
Example #29
0
    def get_noti_id(self, bs4_item_tag: bs4.element.Tag):
        """Extract the post number (nttSn query parameter) from the item's
        <link> URL; raises ValueError when it is absent."""
        noti_url = bs4_item_tag.find('link').get_text()
        # Everything after '?' is the query string.
        query = noti_url.split('?')[1]
        for param in query.split('&'):
            if 'nttSn' in param:
                return int(param.split('=')[1])

        raise ValueError('Could not find post number (sttSn) in Link')
Example #30
0
def get_other_page_urls_from_overview_page_stepbridge_my_results(page_soup: bs4.element.Tag) -> list:
    """Collect the pagination hrefs of a 'my results' overview page in
    order of first appearance, without duplicates; [] when the page has
    no pagination block."""
    try:
        pagination = page_soup.find('ul', {'class': 'pagination'})
        items = pagination.find_all('li', {'class': 'page-item'})
        anchors = [item.find('a') for item in items]
        urls = [anchor['href'] for anchor in anchors if anchor is not None]
        # OrderedDict.fromkeys deduplicates while keeping first-seen order.
        return list(OrderedDict.fromkeys(urls))
    except AttributeError:
        return []
Example #31
0
def scale_an_image_with_css(img: bs4.element.Tag, height: int) -> None:
    """Force the image's CSS max-height to ``height`` pixels.

    Creates the style attribute when missing, appends a max-height rule
    when the existing style has none, and rewrites the rule otherwise.
    """
    assert (height >= 0)
    rule = 'max-height:{height:d}px;'.format(height=height)

    if 'style' not in img.attrs:
        img.attrs['style'] = rule
        return None

    style = img.attrs['style']
    if not re.search(r'max-height:[^;]*', style):
        # No max-height yet: terminate the style and append the rule.
        if not style.endswith(';'):
            style += ';'
        img.attrs['style'] = style + rule
        return None

    # Rewrite the value of the existing max-height rule in place.
    img.attrs['style'] = re.sub(
        r'(?P<prefix>.*max-height:)[^;]*(.*)',
        r'\g<prefix>' + '{height:d}'.format(height=height) + r'px\2', style)
Example #32
0
def _parse_previous_price(product: bs4.element.Tag) -> decimal.Decimal:
    """
    Parse the fragment with the previous product price.
    If such fragment doesn't exist, assume there is no price reduction.
    """
    tag = product.find("span", class_="as-price-previousprice")
    if not tag:
        # No reduction: the previous price equals the current one.
        return _parse_current_price(product)
    return _extract_price(tag.get_text())
Example #33
0
    def contains_more_details(self, element:  bs4.element.Tag):
        """True when the element's 'TJUuge' block contains a span reading
        'Докладніше' ("More details")."""
        details_block = element.find("div", class_="TJUuge")
        if details_block is None:
            return False
        return any(span.text == "Докладніше"
                   for span in details_block.findAll("span"))
Example #34
0
def _expected(pid: int, table: bs4.element.Tag) -> pd.DataFrame:
    """Read ``table`` into a DataFrame tagged with ``pid`` plus an
    'Unplayed' flag column.

    NOTE(review): assumes pandas.read_html returns this table as its first
    match and that data rows 4+ mark unplayed games with an 'unplayed'
    class on their last cell — confirm against the source page layout.
    """
    return (
        pd.read_html(str(table), header=[1, 2])[0].assign(pid=pid)
        # make 'pid' the first column
        .pipe(lambda df: df.
              loc[:, df.columns.to_list()[-1:] + df.columns.to_list()[:-1]]).
        assign(Unplayed=pd.Series([
            td['class'][0] == 'unplayed' if td.has_attr('class') else np.nan
            for tr in table.find_all('tr')[3:] for td in tr.find_all('td')[-1:]
        ])))
Example #35
0
    def _get_text(tag: bs4.element.Tag) -> str:
        """ Return the example's text with duplicate spaces removed.

        Here it is assumed that all examples have text.
        """
        # 'get_text' flattens the tag; clean_text_up squeezes the spaces.
        return clean_text_up(tag.get_text())
Example #36
0
    def _parse_media(self,
                     media: bs4.element.Tag) -> Tuple[str, str]:
        """ Get link to the media file and filepath. """
        # The href embeds the target file name after '?name='.
        href = media.find('a')['href']
        media_link, filename = href.split('?name=')
        return media_link, self.MEDIA_FOLDER / filename
Example #37
0
File: yahoo.py Project: zdict/zdict
 def get_grammar(d: bs4.element.Tag):
     s = ('div#web ol.searchCenterMiddle '
          'div.dictionaryWordCard > ul > li')
     return list(map(text, d.select(s)))
Example #38
0
def play_items_from_log_entry(entry: bs4.element.Tag) -> bs4.element.ResultSet:
    """All <td> cells of one log-entry row."""
    return entry.find_all("td")


def player(play_text_: str) -> str:
    """First two whitespace-separated words of the play text."""
    words = play_text_.split()
    return ' '.join(words[:2])