Ejemplo n.º 1
0
def extract_anime_names(sel: Selector) -> Iterable[AnimeName]:
    """Yield the primary title of the anime, then its alternative titles.

    The primary name is read from the ``<h1>`` heading and is yielded first
    with ``is_primary=True``. Alternative names are read from the sidebar
    entries selected by ``xpath_slice_between`` using the
    "Alternative Titles" heading and ``./br`` as bounds; each is yielded
    with ``is_primary=False``.
    """
    primary_name = get_all_text(sel.xpath("//h1/span[@itemprop='name']"))
    yield AnimeName(name=primary_name, is_primary=True)

    title_rows = xpath_slice_between(
        select_sidebar(sel),
        lower_xpath="./h2[. = 'Alternative Titles']",
        upper_xpath="./br",
    )
    for title_row in title_rows:
        # The text nodes following the label <span> hold the row's value.
        raw_names = get_all_text(
            title_row.xpath("./span/following-sibling::text()"))
        # A row may hold several names. Split only at a comma that is
        # followed by whitespace and an uppercase-then-lowercase letter
        # pair (presumably so commas inside a single title are kept).
        split_names = re.split(r"\b\s*,\s+(?=[A-Z][a-z])", raw_names)
        yield from (
            AnimeName(name=alt_name, is_primary=False)
            for alt_name in split_names
        )
Ejemplo n.º 2
0
def extract_anime_info_fields(sel: Selector) -> Iterable[KeyValuesPair]:
    """Yield one ``KeyValuesPair`` per entry of the "Information" sidebar.

    The key is the entry's ``<span>`` label with whitespace collapsed and the
    trailing colon removed. The values are the rest of the entry's text,
    split on commas that are surrounded by whitespace.
    """
    info_rows = xpath_slice_between(
        select_sidebar(sel),
        lower_xpath="./h2[. = 'Information']",
        upper_xpath="./br",
    )
    for info_row in info_rows:
        label_text = get_all_text(info_row.xpath("./span"))
        key = re.sub(r"\s+", " ", label_text).strip().rstrip(":")

        # Drop the first occurrence of the key from the row's full text;
        # what is left (minus a leading colon) is the value part.
        remainder = get_all_text(info_row).replace(key, "", 1)
        value_text = re.sub(r"\s+", " ", remainder).lstrip(":").strip()

        yield KeyValuesPair(
            key=key,
            values=re.split(r"\b\s+,\s+\b", value_text),
        )
Ejemplo n.º 3
0
def extract_manga_roles(sel):
    """Collect the manga entries listed in the "Mangaography" table.

    Returns a list of dicts with the keys ``name``, ``picture`` and ``role``.
    A role text containing "main" (case-insensitive) becomes ``"main"``; one
    containing "support", "supporting" or "secondary" becomes
    ``"secondary"``; any other text is kept exactly as extracted.

    Raises ValueError if a table row does not contain exactly two ``<td>``
    cells, and KeyError if the image has no ``src`` attribute.
    """
    manga_table = sel.xpath(
        "//div[text() = 'Mangaography']/following-sibling::table[1]")

    entries = []
    for table_row in manga_table.xpath(".//tr"):
        # Rows are unpacked as exactly two cells: the image, then the text.
        image_cell, text_cell = table_row.xpath(".//td")
        picture_url = image_cell.xpath(".//img").attrib["src"]
        manga_title = get_all_text(text_cell.xpath("./a"))
        role_text = get_all_text(text_cell.xpath("./div"))

        if re.search(r"main", role_text, flags=re.IGNORECASE) is not None:
            role = "main"
        elif re.search(r"support(ing)?|secondary", role_text,
                       flags=re.IGNORECASE) is not None:
            role = "secondary"
        else:
            role = role_text

        entries.append({
            "name": manga_title,
            "picture": picture_url,
            "role": role,
        })
    return entries
Ejemplo n.º 4
0
def extract_info_fields(sel):
    """Return the character's hair color as a one-element list of fields.

    The single entry is a dict with the key ``"Hair Color"`` and the value
    taken from the basic-stats ``<div>`` whose text matches "hair color"
    (case-insensitive), passed through ``strip_field_name``.
    """
    hair_color_sel = select_basic_stats_section(sel).xpath(
        "div[re:test(., 'hair color', 'i')]")
    hair_color = strip_field_name(get_all_text(hair_color_sel))
    return [{"key": "Hair Color", "value": hair_color}]
Ejemplo n.º 5
0
def extract_anime_description(sel: Selector) -> str:
    """Return the anime description with whitespace normalized.

    The text nodes of the ``itemprop='description'`` span are joined with
    newlines. The last node is left out when it is at most 30 characters
    long and contains "written", "from", "source" or "author"
    (case-insensitive) -- presumably a source attribution rather than part
    of the description.
    """
    source_note_max_len = 30
    text_nodes = sel.xpath("//span[@itemprop='description']/text()")

    kept_paragraphs = []
    for text_node, is_last in lookahead(text_nodes):
        content = get_all_text(text_node)
        looks_like_source_note = (
            is_last
            and re.search(r"written|from|source|author", content, flags=re.I)
            and len(content) <= source_note_max_len
        )
        if not looks_like_source_note:
            kept_paragraphs.append(content)
    return normalize_whitespace("\n".join(kept_paragraphs))
Ejemplo n.º 6
0
def extract_character_names(sel: Selector) -> Iterable[AnimeCharacter]:
    """Extract the characters listed in the anime's "Characters" section.

    Looks at every image-less anchor inside the first ``<div>`` following the
    "Characters" heading and yields an ``AnimeCharacter`` for each one whose
    ``href`` ends in ``/character/<digits>/<name>``. Anchors that do not
    match, including anchors without an ``href`` attribute, are skipped.

    The name is taken from the last URL segment with underscores and
    whitespace runs replaced by single spaces. A role text containing "main"
    becomes ``"main"``; one containing "support", "supporting" or
    "secondary" becomes ``"secondary"``; any other text is kept as extracted.
    """
    # pylint: disable=line-too-long
    maybe_name_anchors = \
        sel.xpath("(//h2[contains(., 'Characters')]/following-sibling::div)[1]//a[not(./img)]")
    for maybe_name_anchor in maybe_name_anchors:
        # An anchor without an href cannot be a character link. The empty
        # fallback never matches the pattern below, so such anchors are
        # skipped instead of aborting the whole generator with a KeyError.
        href = maybe_name_anchor.attrib.get("href", "")
        match = re.search(r"/character/\d+/(?P<name>[^/]+)$", href)
        if not match:
            continue
        name = re.sub(r"_+|\s+", " ", match.group("name")).strip()
        role = get_all_text(
            maybe_name_anchor.xpath("./following-sibling::div/small"))
        if re.search(r"main", role, flags=re.IGNORECASE):
            role = "main"
        elif re.search(r"support(ing)?|secondary", role, flags=re.IGNORECASE):
            role = "secondary"
        yield AnimeCharacter(name=name, url=href, role=role)
Ejemplo n.º 7
0
def extract_en_jp_name(sel):
    """Extract the character's English and Japanese names.

    Returns a dict of the form ``{"en": [english_name], "jp": [japanese_name]}``.
    The English name is the first text node of the name element, stripped of
    surrounding whitespace. The Japanese name is the text of its ``<span>``
    child with leading/trailing parentheses removed.
    """
    name_sel = select_name(sel)
    # ``get()`` yields None when no text node matched. Fall back to an empty
    # string so a missing name does not raise AttributeError on ``.strip()``.
    english_name = name_sel.css("::text").get(default="").strip()
    japanese_name = get_all_text(name_sel.xpath("span")).strip("()")
    return {"en": [english_name], "jp": [japanese_name]}
Ejemplo n.º 8
0
def extract_heart_off_number(sel):
    """Extract the number of "heart off" emojis the character has received.

    Returns the digits found in the ``<h3>`` that contains the ``heartOff``
    span, as a string with every non-digit character removed.
    """
    heading_text = get_all_text(
        sel.xpath("//h3[./span[@class = 'heartOff']]"))
    digits_only = re.sub(r"[^\d]|#", "", heading_text)
    return digits_only
Ejemplo n.º 9
0
def extract_top_hated_rank(sel):
    """Extract the character's top hated rank.

    Returns the digits of the basic-stats link whose ``href`` matches
    "top-hated", as a string with every non-digit character removed.
    """
    rank_link = select_basic_stats_section(sel).xpath(
        "div/a[re:test(@href, 'top-hated')]")
    rank_text = get_all_text(rank_link)
    return re.sub(r"[^\d]|#", "", rank_text)
Ejemplo n.º 10
0
def extract_nicknames(sel):
    """Return the character's nickname as a one-element list.

    The nickname is the text of the ``<h2 class="aka">`` element passed
    through ``strip_field_name``.
    """
    aka_text = get_all_text(sel.xpath("//h2[@class = 'aka']"))
    nickname = strip_field_name(aka_text)
    return [nickname]
Ejemplo n.º 11
0
def extract_names(sel):
    """Return the character's full name as a one-element list.

    The name is the text of the ``<h1 itemprop="name">`` element.
    """
    heading_sel = sel.xpath("//h1[@itemprop = 'name']")
    full_name = get_all_text(heading_sel)
    return [full_name]
Ejemplo n.º 12
0
def extract_descriptions(sel):
    """Return the character's description as a one-element list.

    The description is the text of the ``<div itemprop="description">``
    element.
    """
    description_sel = sel.xpath("//div[@itemprop = 'description']")
    description = get_all_text(description_sel)
    return [description]