Exemple #1
0
def class_tr_to_str(tr: bs4.element.Tag) -> str:
    """Render one class-member table row as a single Markdown line.

    Every anchor in the row that has an ``href`` gets ``BASE_API_URL``
    prepended to it (the row is modified in place).

    The header is ``` `name :: type` mode ``` when the header cell contains
    an ``attribute-type`` span, otherwise just ``` `name` ```. When the
    description cell has content, `` - description`` is appended.

    Args:
        tr: Table row with a ``td.header`` and a ``td.description`` cell.

    Returns:
        The formatted line.
    """
    # Anchors without an href cannot be rewritten; indexing them would
    # raise KeyError, so they are skipped.
    for a in tr.find_all("a", href=True):
        a["href"] = BASE_API_URL + a["href"]

    header_cell = tr.find("td", class_="header")
    description_cell = tr.find("td", class_="description")

    name_span = header_cell.find("span", class_="element-name")
    if header_cell.find("span", class_="attribute-type") is not None:
        # Attribute row: show its type and access mode next to the name.
        type_ = header_cell.find("span", class_="param-type").text.strip()
        attribute_mode = header_cell.find("span", class_="attribute-mode").text
        header = f"`{name_span.text} :: {type_}` {attribute_mode}"
    else:
        header = f"`{name_span.text}`"

    # Single-space children are dropped before inspecting the description.
    contents = [item for item in description_cell.contents if item != " "]
    if not contents:
        return header

    if len(contents) > 1 and "\n" not in contents[0]:
        # Several children: convert all but the last one via tomd.
        description = tomd.convert(
            f"<p>{''.join(str(item) for item in contents[:-1])}</p>"
        ).strip()
    else:
        description = contents[0].strip()
    return f"{header} - {description}"
Exemple #2
0
def process_product(tag: bs4.element.Tag) -> models.GoodwillItem:
    """Build a ``models.GoodwillItem`` from one product listing tag.

    Reads the title (with an optional trailing ``Bids: N``), the item
    number, the price and the countdown end date from ``div`` children of
    *tag*.

    Any exception raised while parsing is reported with ``print`` and
    ``None`` is returned instead of an item.
    """
    try:
        title_parts = tag.find("div", "title").text.split("Bids: ")
        # Whatever follows the "Bids: " marker, when present, is the count.
        bids = int(title_parts[1].strip()) if len(title_parts) > 1 else 0
        title = title_parts[0].strip()

        item_number = tag.find("div", "product-number").contents[1]

        raw_price = tag.find("div", "price").text
        raw_price = raw_price.replace("Buy It Now", "").strip()
        # Keep only digits and dots before converting to Decimal.
        price = Decimal(re.sub(r"[^\d.]", "", raw_price))

        countdown = tag.find("div", "timer countdown product-countdown")
        end_date = dt.datetime.strptime(countdown.get("data-countdown"),
                                        "%m/%d/%Y %I:%M:%S %p")

        return models.GoodwillItem(
            title=title,
            item_number=item_number,
            price=price,
            end_date=end_date,
            bids=bids,
        )
    except Exception as e:
        print(f"could not create GoodwillItem due to {e} for tag {tag}")
        return None
    def _parse_one_person(self, person: bs4.element.Tag, _stage: str,
                          i: int) -> Dict:
        """Turn one player anchor into a dict of player attributes.

        The four ``div`` elements styled with ``self._style_font_xs`` are
        read as trophy count, an ignored value, level and name.

        ``group`` / ``is_mvp`` depend on *_stage*:
        ``"Duo Showdown"`` -> ``i // 2`` / NaN, ``"Showdown"`` -> ``i`` / NaN,
        anything else -> NaN / whether an image with ``self._img_mvp`` as
        its ``src`` is present.
        """
        font_xs_divs = person.find_all("div",
                                       attrs={"style": self._style_font_xs})
        _trophy, _, _level, _name = [div.text.strip() for div in font_xs_divs]

        res = {
            "trophy": int(_trophy),
            "level": int(_level),
            "name": _name,
            "hero": self.hero_map[person.find("img").get("src")],
            "playerId": person.get("href").split("/")[-1],
            "isTeammate": False,
        }

        if _stage == "Duo Showdown":
            # Consecutive players form a pair.
            res.update(group=i // 2, is_mvp=np.nan)
        elif _stage == "Showdown":
            res.update(group=i, is_mvp=np.nan)
        else:
            mvp_img = person.find("img", attrs={"src": self._img_mvp})
            res.update(group=np.nan, is_mvp=mvp_img is not None)
        return res
Exemple #4
0
def extract_stadium_data(stadium: bs.element.Tag) -> dict:
    """Collect name, nickname and city of a stadium from its anchors.

    The first anchor supplies the name (its ``title`` attribute) and the
    nickname (its text); the second anchor's text is the city.
    """
    first_link = stadium.find('a')
    name = first_link.get("title")
    nick = first_link.text
    city = stadium.find_all('a')[1].text
    return {
        "stadium_name": name,
        "stadium_nick": nick,
        "stadium_city": city,
    }
Exemple #5
0
 def from_tag(cls, tag: bs4.element.Tag):
     """Create an instance from one ``cassetteitem`` listing tag.

     Reads category, title and address text, the transportation lines and
     the age / floors pair, then passes them to ``cls`` (age and floors go
     through ``parse_age`` and ``parse_floors`` first).
     """
     category = tag.find("div", class_="cassetteitem_content-label").text
     title = tag.find("div", class_="cassetteitem_content-title").text
     address = tag.find("li", class_="cassetteitem_detail-col1").text

     # A tuple (not a list) keeps the value hashable, which avoids an
     # unhashable error during pandas.drop_duplicates.
     transport_divs = tag.select("li.cassetteitem_detail-col2 div")
     transportation = tuple(div.text for div in transport_divs)

     # The third detail column must hold exactly two divs: age and floors.
     age, floors = [
         div.text for div in tag.select("li.cassetteitem_detail-col3 div")
     ]
     return cls(category, title, address, transportation,
                parse_age(age), parse_floors(floors))
Exemple #6
0
    def get_bloger(self, tag: bs4.element.Tag):
        """Build a ``User`` from the anchor inside the ``span.ell2`` element.

        The user name is the anchor text and the user id is the last path
        segment of its ``href``. If the span or anchor is missing, the
        ``AttributeError`` is printed and both values are ``None``.
        """
        user_name = user_id = None
        try:
            anchor = tag.find("span", attrs={"class": "ell2"}).a
            name = anchor.text
            profile_id = anchor["href"].split("/")[-1]
        except AttributeError as e:
            print(e)
        else:
            user_name, user_id = name, profile_id

        return User(user_name, user_id)
Exemple #7
0
def _movie_item(item_soup: bs4.element.Tag) -> MovieItem:
    """Assemble a ``MovieItem`` from one listing entry.

    The poster image URL and subject link come from ``div.litpic``; title,
    secondary title, info and star text come from the first four ``<p>``
    elements of ``div.title``.
    """
    poster_div = item_soup.find('div', attrs={'class': 'litpic'})
    title_div = item_soup.find('div', attrs={'class': 'title'})
    paragraphs = title_div.find_all('p')

    poster_link = poster_div.a
    return MovieItem(
        poster_link.img['src'],
        _url(poster_link['href']),
        paragraphs[0].b.getText(),
        paragraphs[1].a.getText(),
        paragraphs[2].getText(),
        paragraphs[3].getText(),
    )
Exemple #8
0
def offers_iter(element: bs4.element.Tag, offers: list) -> list:
    ''' Read one offer and append it to *offers* when it matches the target.

    Name, price, shop and URL are extracted from *element*; the offer is
    appended only if ``same_product(target, offer)`` is true. The same
    *offers* list is returned.
    '''
    name = element.h4.text.strip()
    price_span = element.find('div', {'class': 'row-price'}).span
    price = get_price_fr_text(price_span.text)
    shop = element.find('div', {'class': 'shopname'}).text
    url = element.a['href']
    logger.debug('\nName: %s \nPrice: %s \nShop: %s', name, price, shop)

    candidate = dict(name=name, price=price, shop=shop, url=url)
    if same_product(target, candidate):
        offers.append(candidate)
    return offers
Exemple #9
0
def _movie_item(item_soup: bs4.element.Tag) -> MovieItem:
    """Create a ``MovieItem`` out of a single movie entry.

    ``div.litpic`` provides the poster image ``src`` and the subject link
    (passed through ``_url``); the first four paragraphs inside
    ``div.title`` provide title, other title, info and star text.
    """
    litpic = item_soup.find('div', attrs={'class': 'litpic'})
    paragraphs = item_soup.find('div', attrs={'class': 'title'}).find_all('p')
    title_p, other_p, info_p, star_p = paragraphs[:4]

    fields = (
        litpic.a.img['src'],
        _url(litpic.a['href']),
        title_p.b.getText(),
        other_p.a.getText(),
        info_p.getText(),
        star_p.getText(),
    )
    return MovieItem(*fields)
Exemple #10
0
    def _get_ambiguation(tag: bs4.element.Tag) -> str:
        """ Get pretty ambiguation from example.

        Looks for a ``span`` with class ``off`` and falls back to one with
        class ``on``.

        :return: 'disambiguated' or 'not disambiguated' or 'Not found'.
        """
        marker = tag.find('span', {'class': 'off'})
        if not marker:
            marker = tag.find('span', {'class': 'on'})
        if not marker:
            return 'Not found'

        # TODO: use regexp here
        # The text looks like '[...]': drop the first and last character,
        # then trim the whitespace left inside.
        return marker.text.strip()[1:-1].strip()
Exemple #11
0
 def get_pronounce(p: bs4.element.Tag):
     """Split the text of the first ``<ul>`` into (word, bracket) pairs.

     The text is split on whitespace and each token is matched against
     ``(.*)(\\[.*\\])``; the two captured groups form one tuple.
     """
     tokens = p.find('ul').text.strip().split()
     return [re.match(r'(.*)(\[.*\])', token).groups() for token in tokens]
Exemple #12
0
def get_lecture(li: bs4.element.Tag) -> dict:
    """Extract lecture metadata and its download URL from one list item.

    The title is the text of the item's ``span``; the date is the last
    space-separated token of the title, prefixed with the year 2021. The
    third and fourth arguments of the button's ``onclick`` call become
    ``data_seq`` and ``part_seq``, which are put into the download URL.

    A ``date,remote_url,title`` line is printed as a side effect.
    """
    title = li.span.text

    month_day = title[title.rfind(" ") + 1:]
    # A dash at index 1 means a one-digit month, which gets a leading zero.
    year_prefix = "2021-0" if month_day.find("-") == 1 else "2021-"
    date = year_prefix + month_day

    onclick = li.find("button")["onclick"]
    # Keep only what sits between the first "(" and the following ")".
    arguments = onclick[onclick.find("(") + 1:]
    arguments = arguments[:arguments.find(")")]
    fields = arguments.split(",")
    # Each argument is wrapped in one quote character on either side.
    data_seq = fields[2].strip()[1:-1]  # identifier per material category
    part_seq = fields[3].strip()[1:-1]  # parent identifier of the material

    remote_url = (
        "https://www.bookdonga.com/utility/download.donga"
        "?type=EXTRADATAFILE&fieldname=listen_flnm"
        f"&data_seq={data_seq}&part_seq={part_seq}"
    )

    print(f"{date},{remote_url},{title}")
    return dict(
        title=title,
        date=date,
        data_seq=data_seq,
        part_seq=part_seq,
        remote_url=remote_url,
    )
Exemple #13
0
def extract_detail(item: bs4.element.Tag) -> DetailData:
    """Extract data about a product.

  Args:
    item (bs4.element.Tag): product information

  Returns:
    Data about the product: review count, star text and link.
  """
    review_tags = item.findAll(**NUM_REVIEWS_PATTERN_ARGS)
    if len(review_tags) != 1:
        # Several (or no) candidates: keep only the purely numeric ones.
        review_tags = [
            candidate for candidate in review_tags
            if candidate.text.strip().isdecimal()
        ]
    review_text = review_tags[0].text.strip()

    # The last match is used; with a single match that is the only one.
    star_text = item.findAll(**STAR_PATTERN_ARGS)[-1].text.strip()

    review_count = int(review_text.replace(',', ''))
    star = star_text.split(' ')[-1]
    link = item.find(**LINK_PATTERN_ARGS).get('href')
    return DetailData(review_count, star, link)
Exemple #14
0
    def _get_datetime(
        self,
        article: newspaper.article.Article,
        li: bs4.element.Tag,
    ) -> datetime:
        publish_date = None
        if article is not None:
            publish_date = article.publish_date

        if publish_date is None:
            date_list = li.find_all("span", {"class": "info"})
            date = None
            if len(date_list) > 1:
                date = date_list[-1].text
            else:
                date = li.find("span", {"class": "info"}).text
                date = date.split()[0]

            if "분" in date:
                minutes = re.sub(r"[^\d+]", "", date)
                publish_date = datetime.now() - timedelta(minutes=int(minutes))
            elif "시간" in date:
                hours = re.sub(r"[^\d+]", "", date)
                publish_date = datetime.now() - timedelta(hours=int(hours))
            elif "일" in date:
                days = re.sub(r"[^\d+]", "", date)
                publish_date = datetime.now() - timedelta(days=int(days))
            else:
                try:
                    publish_date = datetime.strptime(date, "%Y.%m.%d.")
                except:
                    pass
        return publish_date
 def _parse_one_block(self, block: bs4.element.Tag) -> Dict:
     """Parse one match block into a dict describing the match.

     The result text comes from the ``div`` with class ``self._class_res``.
     Stage and rewards come from the divs styled ``self._style_font_m``;
     type, time and map come from the divs styled ``self._style_font_s``.
     Every anchor in the block is parsed as a player and the list is then
     passed through ``self._assign_teammates``.
     """

     def texts_with_style(style):
         # Stripped text of every div carrying exactly this style attribute.
         return [
             div.text.strip()
             for div in block.find_all("div", attrs={"style": style})
         ]

     _result = block.find("div", class_=self._class_res).text.strip()
     _stage, _rewards = texts_with_style(self._style_font_m)
     _type, _time, _map = texts_with_style(self._style_font_s)

     players = [
         self._parse_one_person(anchor, _stage, index)
         for index, anchor in enumerate(block.find_all("a"))
     ]
     players = self._assign_teammates(players, _stage)

     return dict(
         match=_result,
         stage=_stage,
         map=_map,
         rewards=int(_rewards),
         type=_type,
         time=_time,
         players=players,
     )
Exemple #16
0
def _group_activity_standings_results(
        eid: int, gid: int, table: bs4.element.Tag) -> Tuple[pd.DataFrame]:
    """Derive the group, activity, standings and results frames of a table.

    The cross table is built first. If the third column of its last row
    starts with ``'Results from: '``, that row is dropped and its text is
    parsed into file origin, date and name; otherwise those three values
    are NaN.

    Returns:
        A 4-tuple ``(group, activity, standings, results)``.
    """
    cross_table = _cross_table(eid, gid, table)
    footer = cross_table.tail(1)
    footer_text = str(footer.iloc[0, 2])
    sep = 'Results from: '

    file_from = file_date = file_name = np.nan
    if footer_text.startswith(sep):
        # The footer row is metadata, not a player row.
        cross_table.drop(footer.index, inplace=True)
        file_from, file_date, file_name = _results_from(
            footer_text.split(sep)[1])

    # Elo (1978) notation:
    # M = number of players
    # N = number of games (here: number of rounds)
    M, N = cross_table.filter(regex='Results').shape
    cross_table = cross_table.join(_unplayed_games(M, N, table))

    header_rows = table.find('thead').find_all('tr')
    group = _group(eid, gid, header_rows).assign(
        M=M,
        N=N,
        file_from=file_from,
        file_date=file_date,
        file_name=file_name,
    )
    return (
        group,
        _activity(cross_table),
        _standings(cross_table),
        _results(cross_table),
    )
Exemple #17
0
def define_tr_to_str(tr: bs4.element.Tag) -> str:
    """Render one "define" table row as a single Markdown line.

    Every anchor in the row that has an ``href`` gets ``BASE_API_URL``
    prepended to it (the row is modified in place).

    Args:
        tr: Table row with a ``td.header`` and a ``td.description`` cell.

    Returns:
        ``` `name` - description ```, or just ``` `name` ``` when the
        description cell is empty.
    """
    # Anchors without an href cannot be rewritten; indexing them would
    # raise KeyError, so they are skipped.
    for a in tr.find_all("a", href=True):
        a["href"] = BASE_API_URL + a["href"]

    name = tr.find("td", class_="header").string
    contents = tr.find("td", class_="description").contents
    if not contents:
        return f"`{name}`"

    if len(contents) > 1 and "\n" not in contents[0]:
        # Several children: convert all but the last one via tomd.
        description = tomd.convert(
            f"<p>{''.join(str(item) for item in contents[:-1])}</p>"
        ).strip()
    else:
        # Only the first line of the leading text is used.
        description = contents[0].split('\n')[0].strip()
    return f"`{name}` - {description}"
Exemple #18
0
 def _get_graphic_url(content: bs4.element.Tag) -> str or None:
     """ Get URL to the graphic.

     The link is the ``href`` of the first anchor with
     ``target="_blank"``, appended to ``BASE_RNC_URL``. ``None`` is
     returned when the anchor or its ``href`` is unavailable.
     """
     anchor = content.find('a', {'target': '_blank'})
     try:
         link = anchor['href']
     except (KeyError, TypeError, AttributeError):
         return None
     else:
         return f"{BASE_RNC_URL}/{link}"
Exemple #19
0
    def __init__(self, tag: bs4.element.Tag):
        """Populate the submission fields from one status-table row.

        Required cells (``id-cell``, ``status-party-cell`` and two
        ``status-small`` cells) raise if missing. Time and memory default
        to ``''`` when their cell is absent.

        Status resolution from the ``status-cell``:
          * a ``<span>`` is present -> its ``submissionverdict`` attribute;
          * no span and ``waiting="false"`` on the cell -> ``'UNKNOWN'``;
          * otherwise (including a missing cell) -> ``'WAITING'``.

        ``test`` is the number in the ``verdict-format-judged`` span, or
        ``-1`` when that span (or the status cell) is absent.

        Args:
            tag: The table row describing one submission.
        """

        def optional_cell_text(css_class: str) -> str:
            # Stripped text of an optional cell; '' when it is absent.
            cell = tag.find('td', {'class': css_class})
            return '' if cell is None else cell.text.strip()

        self.id: int = int(tag.find('td', {'class': 'id-cell'}).text)
        self.author: str = tag.find('td', {
            'class': 'status-party-cell'
        }).text.strip()

        parameters = tag.find_all('td', {'class': 'status-small'})
        self.date: datetime = datetime.strptime(
            parameters[0].text.strip(), '%d.%m.%Y %H:%M')
        self.problem: str = parameters[1].text.strip()

        self.time: str = optional_cell_text('time-consumed-cell')
        self.memory: str = optional_cell_text('memory-consumed-cell')

        status_cell = tag.find('td', {'class': 'status-cell'})
        if status_cell is None:
            # A missing status cell counts as still waiting with no test.
            self.status: str = 'WAITING'
            self.test: int = -1
            return

        verdict_span = status_cell.span
        if verdict_span is not None:
            self.status = verdict_span['submissionverdict']
        elif status_cell.get('waiting') == 'false':
            self.status = 'UNKNOWN'
        else:
            self.status = 'WAITING'

        judged = status_cell.find('span', {'class': 'verdict-format-judged'})
        self.test = -1 if judged is None else int(judged.text)
Exemple #20
0
 def get_text_title(quote: bs4.element.Tag) -> Optional[str]:
     """Return the title found two siblings after the ``authorOrTitle`` node.

     Newlines are removed and the text is stripped. ``None`` is returned
     when any step of the lookup raises ``AttributeError``.
     """
     try:
         author_node = quote.find(class_="authorOrTitle")
         # The title sits two siblings after the author element.
         title_node = author_node.nextSibling.nextSibling
         return title_node.text.replace("\n", "").strip()
     except AttributeError:
         return None
Exemple #21
0
 def get_text_author(quote: bs4.element.Tag) -> Optional[str]:
     """Return the text of the ``authorOrTitle`` node as the author name.

     Commas and newlines are removed and the result is stripped. ``None``
     is returned when the lookup raises ``AttributeError``.
     """
     try:
         raw_author = quote.find(class_="authorOrTitle").text
         return raw_author.replace(",", "").replace("\n", "").strip()
     except AttributeError:
         return None
Exemple #22
0
 def from_td_tag(cls, quality: Quality, tag: bs4.element.Tag):
     """Build a ``PriceData`` from a price cell, or return ``None``.

     The text is taken from the ``div`` in the second ``td`` of the first
     row of the nested table and matched with ``_price_regex``; group 1 is
     the quantity and group 2 the price. Any exception yields ``None``.
     """
     try:
         first_row = tag.find('table').find('tr')
         price_text = first_row.find_all('td')[1].find('div').text
         groups = _price_regex.match(price_text)
         quantity, price = int(groups[1]), float(groups[2])
         return PriceData(quality=quality, quantity=quantity, price=price)
     except Exception:
         # Best effort: a cell that cannot be parsed yields no price data.
         return None
Exemple #23
0
 def get_text_tags(quote: bs4.element.Tag) -> Optional[list]:
     """Return the tags of a quote as a list of stripped strings.

     The text of the ``greyText smallText left`` element is split on
     commas and every piece is stripped; the first piece is discarded.
     NOTE(review): the first piece is presumably just a label -- confirm
     against the page markup, since a tag sharing that piece is dropped.

     Returns:
         The list of tags, or ``None`` when the element is missing.
     """
     try:
         raw_tags = quote.find(class_="greyText smallText left").text
     except AttributeError:
         return None
     # The pieces are already stripped; the list itself has no strip().
     return [piece.strip() for piece in raw_tags.split(',')][1:]
def downstream_points(ts: int, table1: bs4.element.Tag, table2: bs4.element.Tag) \
        -> typ.Generator[InfluxPoint, None, None]:
    """Yield one ``downstream`` InfluxPoint per channel.

    *table1* supplies channel id, SNR, modulation and power rows;
    *table2* supplies the correctable / uncorrectable codeword counters.
    The codeword fields are the difference to the counters stored in the
    module-level ``last_correct`` / ``last_uncorrect`` dicts (zero for a
    channel not stored there), and both dicts are replaced with the
    current counters before any point is yielded.
    """
    global last_correct, last_uncorrect

    # Read upper table with power levels.
    upper_rows = table1.find('tbody')('tr', recursive=False)
    _, channel, _, snr, modulation, power = upper_rows

    channel_ids = [int(cell.text) for cell in datacells(channel)]
    snrs = [float(cell.text.split(' ')[0]) for cell in datacells(snr)]
    modulations = [cell.text.strip() for cell in datacells(modulation)]
    power_levels = [
        float(cell.text.split(' ')[0]) for cell in datacells(power)
    ]

    def interval_counts(totals, previous):
        # Difference to the previous reading; a channel without a previous
        # reading falls back to its own total, giving 0.
        return [
            total - previous.get(channel_ids[index], total)
            for index, total in enumerate(totals)
        ]

    # Read lower table with codeword counts.
    lower_rows = table2.find('tbody')('tr', recursive=False)
    _, _, _, correctable, uncorrectable = lower_rows

    correctables = [int(cell.text) for cell in datacells(correctable)]
    int_correctables = interval_counts(correctables, last_correct)
    uncorrectables = [int(cell.text) for cell in datacells(uncorrectable)]
    int_uncorrectables = interval_counts(uncorrectables, last_uncorrect)

    last_correct = dict(zip(channel_ids, correctables))
    last_uncorrect = dict(zip(channel_ids, uncorrectables))

    for index, cid in enumerate(channel_ids):
        yield InfluxPoint(
            measurement='downstream',
            tag_set={'channel_id': cid},
            field_set={
                'snr_db': snrs[index],
                'modulation': modulations[index],
                'power_dbmv': power_levels[index],
                'interval_correctable_codewords': int_correctables[index],
                'interval_uncorrectable_codewords': int_uncorrectables[index]
            },
            timestamp=ts)
Exemple #25
0
def _parse_previous_price(product: bs4.element.Tag) -> decimal.Decimal:
    """
    Return the previous price of the product.

    The price is extracted from the ``as-price-previousprice`` span. When
    that span is absent there is no price reduction, so the current price
    is returned instead.
    """
    previous_span = product.find("span", class_="as-price-previousprice")
    if not previous_span:
        return _parse_current_price(product)
    return _extract_price(previous_span.get_text())
Exemple #26
0
    def _parse_media(self,
                     media: bs4.element.Tag) -> Tuple[str, str]:
        """ Get link to the media file and filepath.

        The ``href`` of the first anchor is split at ``?name=`` into the
        link and the file name; the file name is joined onto
        ``self.MEDIA_FOLDER``. Lookup errors propagate to the caller.
        """
        href = media.find('a')['href']
        media_link, filename = href.split('?name=')
        return media_link, self.MEDIA_FOLDER / filename
Exemple #27
0
    def contains_more_details(self, element:  bs4.element.Tag):
        """Tell whether the element offers a "Докладніше" expander.

        Returns ``True`` when the ``div.TJUuge`` child exists and one of
        its spans has exactly that text, ``False`` otherwise.
        """
        more_detail_div = element.find("div", class_="TJUuge")
        if more_detail_div is None:
            return False

        return any(
            span.text == "Докладніше"
            for span in more_detail_div.findAll("span")
        )
Exemple #28
0
    def get_noti_id(self, bs4_item_tag: bs4.element.Tag):
        """Return the post number taken from the item's ``<link>`` URL.

        The URL's query string is scanned for the first parameter whose
        text contains ``nttSn``; the part after its ``=`` is returned as
        an int.

        Args:
            bs4_item_tag: Item tag that contains a ``<link>`` element.

        Returns:
            The post number.

        Raises:
            ValueError: If no ``nttSn`` parameter is present.
        """
        noti_url = bs4_item_tag.find('link').get_text()
        # Extract the query parameters (the text after the first '?').
        url_params = noti_url.split('?')[1].split('&')

        for param in url_params:
            if 'nttSn' in param:
                return int(param.split('=')[1])

        # The message names the parameter actually searched for.
        raise ValueError('Could not find post number (nttSn) in Link')
def get_other_page_urls_from_overview_page_stepbridge_my_results(page_soup: bs4.element.Tag) -> list:
    """Return the distinct pagination URLs of an overview page.

    The ``href`` of the anchor in every ``li.page-item`` of
    ``ul.pagination`` is collected; duplicates are removed while keeping
    first-seen order. An empty list is returned when the lookup raises
    ``AttributeError``.
    """
    try:
        pagination_tag = page_soup.find('ul', {'class': 'pagination'})
        page_urls = []
        for page_item in pagination_tag.find_all('li', {'class': 'page-item'}):
            link_item = page_item.find('a')
            if link_item is not None:
                page_urls.append(link_item['href'])
        # Dict keys de-duplicate while preserving insertion order.
        return list(OrderedDict.fromkeys(page_urls))
    except AttributeError:
        return []
Exemple #30
0
def get_event_description(div: bs4.element.Tag) -> str:
    """Convert the paragraphs of an event block into Markdown text.

    Every anchor in *div* that has an ``href`` gets ``BASE_API_URL``
    prepended to it (the block is modified in place). Each
    ``div.element-content > p`` paragraph is converted with ``tomd``,
    HTML-unescaped, stripped and has its newlines removed; paragraphs that
    consist of a single whitespace-only string are skipped.

    Args:
        div: The event block.

    Returns:
        The converted paragraphs joined with newlines ('' if there are
        none).
    """
    # Anchors without an href cannot be rewritten; indexing them would
    # raise KeyError, so they are skipped.
    for a in div.find_all("a", href=True):
        a["href"] = BASE_API_URL + a["href"]

    paragraphs = []
    for p in div.select("div.element-content > p"):
        contents = p.contents
        # Only a lone *string* child can be blank; a lone tag child has no
        # usable strip() and is always kept.
        is_blank = (len(contents) == 1
                    and isinstance(contents[0], str)
                    and not contents[0].strip())
        if not is_blank:
            paragraphs.append(html.unescape(tomd.convert(str(p))))
    return "\n".join(p.strip().replace("\n", "") for p in paragraphs)
Exemple #31
0
    def _parse_doc(self,
                   doc: bs4.element.Tag) -> List[Any]:
        """ Parse the documents to examples.

        The media cell (``td[valign=top]``) and the snippet cell
        (``td.murco-snippet``) are parsed and combined into one example of
        type ``self.ex_type``. Found words are marked with ``self.marker``
        and the example's word forms are registered.

        Returns a one-element list, or ``[]`` if locating the cells raises
        ``ValueError``.
        """
        try:
            media = doc.find('td', {'valign': 'top'})
            example = doc.find('td', {'class': 'murco-snippet'})
        except ValueError:
            return []

        media_url, filename = self._parse_media(media)
        example_fields = self._parse_example(example)

        new_ex = self.ex_type(*example_fields, media_url, filename)
        new_ex.mark_found_words(self.marker)
        self._add_wordforms(new_ex.found_wordforms)
        return [new_ex]
Exemple #32
0
    def __parse_author(self, review_item_el: bs4.element.Tag):
        """Return the ``href`` of the review's author link, if any.

        The author link is the first ``a.AMrStc`` element; ``None`` is
        returned when it is missing or has no ``href`` attribute.
        """
        author_el = review_item_el.find("a", class_="AMrStc")
        if author_el is None or "href" not in author_el.attrs:
            return None
        return author_el.attrs['href']