예제 #1
0
def scrape_tables(table: bs4.element.Tag) -> dict:
    """Scrape one HTML table into a summary dict.

    The first ``tr`` supplies the category text, the second ``tr`` supplies
    the column names, and every ``tr`` from index 3 onward is handed to
    ``_multi_process_scrape`` (the row at index 2 is not processed here).

    Args:
        table: The table element whose ``tr`` rows are scraped.

    Returns:
        A dict with the keys ``grand_total_beds``, ``grand_occupied_beds``,
        ``grand_available_beds``, ``category``, ``columns`` (a JSON string),
        ``records`` (a JSON string) and ``hid``.

    Raises:
        IndexError: If the table has fewer than two ``tr`` rows.
        ValueError: If ``records`` contains NaN/Infinity values
            (``allow_nan=False``).
    """
    # Look the rows up once instead of re-searching the table for every use.
    rows = table.findAll("tr")

    category = sanitize_str(rows[0].text.strip())
    columns = json.dumps(
        [sanitize_str(text) for text in rows[1].stripped_strings],
        allow_nan=False,
    )

    # One-element lists are passed so the callee can update the values in
    # place -- presumably running totals; confirm in _multi_process_scrape.
    grand_total = [0]
    grand_available = [0]
    grand_occupied = [0]
    records = []
    doc = {"category": category, "columns": columns}

    # A plain loop: the calls are made for their side effects on the
    # accumulators above, so there is no result list worth building.
    for row in rows[3:]:
        _multi_process_scrape(
            doc,
            records,
            category,
            grand_total,
            grand_available,
            grand_occupied,
            row,
        )

    return {
        "grand_total_beds": grand_total[0],
        "grand_occupied_beds": grand_occupied[0],
        "grand_available_beds": grand_available[0],
        "category": category,
        "columns": columns,
        "records": json.dumps(records, allow_nan=False),
        # NOTE(review): hash() of a str is salted per interpreter process
        # unless PYTHONHASHSEED is fixed, so this id is only stable within a
        # single run. Left unchanged to keep the returned value as-is.
        "hid": hash(doc["category"]),
    }
예제 #2
0
def extract_detail(item: bs4.element.Tag) -> DetailData:
    """Extract the detail data for a single product.

    Args:
        item (bs4.element.Tag): Product information element.

    Returns:
        The data extracted for the product.
    """
    review_tags = item.findAll(**NUM_REVIEWS_PATTERN_ARGS)
    if len(review_tags) != 1:
        # With anything other than exactly one match, keep only the
        # candidates whose stripped text is purely decimal digits.
        review_tags = [
            tag for tag in review_tags if tag.text.strip().isdecimal()
        ]
    review_text = review_tags[0].text.strip()

    # The last match is used; for a single match that is also the first one.
    star_tags = item.findAll(**STAR_PATTERN_ARGS)
    star_text = star_tags[-1].text.strip()

    review_count = int(review_text.replace(',', ''))
    star_value = star_text.split(' ')[-1]
    link = item.find(**LINK_PATTERN_ARGS).get('href')
    return DetailData(review_count, star_value, link)
예제 #3
0
def parse_song_from_block(block: bs4.element.Tag) -> Song:
    """Build a ``Song`` from the links found inside *block*.

    The title is the ``title`` attribute of the last ``<a>`` element that
    has one; the URL is the ``href`` of the first ``<a>`` element with the
    class ``-download-zip``.

    Args:
        block: Element containing the song's links.

    Returns:
        A ``Song`` constructed from the title and the download URL.

    Raises:
        ValueError: If no ``<a>`` element in *block* has a ``title``
            attribute.
        IndexError: If *block* contains no ``-download-zip`` link.
        KeyError: If the ``-download-zip`` link has no ``href`` attribute.
    """
    titles = [
        link.attrs["title"]
        for link in block.findAll("a")
        if "title" in link.attrs
    ]
    if not titles:
        # Previously this case fell through to an unbound local variable.
        raise ValueError("no <a> element with a title attribute in block")
    title = titles[-1]

    url = block.findAll("a", {"class": "-download-zip"})[0].attrs["href"]

    return Song(title, url)
예제 #4
0
def _parse_column(gatherer_column: bs4.element.Tag) -> GathererCard:
    """Turn one gatherer page 'rightCol' entry into a ``GathererCard``.

    Each ``div.row`` is read as a label/value pair. The label text has
    surrounding whitespace and any trailing colons removed and is used as
    the lookup key for the row's ``div.value`` element.
    """
    values_by_label = {}
    for row in gatherer_column.findAll("div", class_="row"):
        label = row.find("div", class_="label").getText(strip=True).rstrip(":")
        values_by_label[label] = row.find("div", class_="value")

    name = values_by_label["Card Name"].getText(strip=True)
    types = values_by_label["Types"].getText(strip=True)

    # Flavor text and card text are optional rows.
    flavor_parts = []
    if "Flavor Text" in values_by_label:
        flavor_boxes = values_by_label["Flavor Text"].findAll(
            "div", class_="flavortextbox")
        flavor_parts = [box.getText(strip=True) for box in flavor_boxes]

    text_parts = []
    if "Card Text" in values_by_label:
        text_boxes = values_by_label["Card Text"].findAll(
            "div", class_="cardtextbox")
        text_parts = [
            _replace_symbols(box).getText().strip() for box in text_boxes
        ]

    # An empty joined string is reported as None rather than "".
    original_text = "\n".join(text_parts).strip() or None
    flavor_text = "\n".join(flavor_parts).strip() or None

    return GathererCard(
        card_name=name,
        original_types=types,
        original_text=original_text,
        flavor_text=flavor_text,
    )
예제 #5
0
 def tag2list(tag: bs4.element.Tag):
     """Return ``str()`` of the first child of every ``p`` element in *tag*.

     Indexing ``contents[0]`` raises ``IndexError`` for a ``p`` element
     that has no children.
     """
     return [str(paragraph.contents[0]) for paragraph in tag.findAll("p")]