def parse_thread_page(el: bs4.element.Tag) -> AttrDict:
    """Extract user, post body (HTML and plain text) and date from one forum post."""
    profile = el.select('.postprofile dt')[0]
    content = el.select('.content')[0]
    author = el.select('.postbody .author')[0]

    post = AttrDict()
    post.user = profile.text.strip()
    post.body_html = str(content).strip()
    post.body_text = content.text.strip()
    post.date = author.text.strip()
    return post
def parse_link(link: bs4.element.Tag, domain: str) -> AttrDict:
    """Extract title, view/answer counts, date and absolute URL from a topic row."""
    first_anchor = link.select('a:nth-of-type(1)')[0]

    result = AttrDict()
    result.title = first_anchor.text
    # Strip the German labels so only the numbers remain.
    result.views = link.select('.views')[0].text.replace('Zugriffe', '').strip()
    result.answers = link.select('.posts')[0].text.replace('Antworten', '').strip()
    result.date = link.select('a:nth-of-type(3)')[0].text
    result.url = domain + first_anchor.attrs['href'].replace('./', '/')
    return result
Esempio n. 3
0
 def from_tag(cls, tag: bs4.element.Tag):
     """Build an instance from one listing item tag."""
     # Tuples keep the record hashable so pandas.drop_duplicates works.
     transportation = tuple(
         node.text for node in tag.select("li.cassetteitem_detail-col2 div"))
     age_text, floors_text = (
         node.text for node in tag.select("li.cassetteitem_detail-col3 div"))
     return cls(
         tag.find("div", class_="cassetteitem_content-label").text,
         tag.find("div", class_="cassetteitem_content-title").text,
         tag.find("li", class_="cassetteitem_detail-col1").text,
         transportation,
         parse_age(age_text),
         parse_floors(floors_text),
     )
Esempio n. 4
0
        def get_explain(e: bs4.element.Tag):
            """Collect (label, text) pairs from every div under each list item."""
            def classify(css_classes):
                # 'pos_button' marks a part-of-speech tag and
                # 'dictionaryExplanation' an explanation body.
                if 'pos_button' in css_classes:
                    return 'pos'
                if 'dictionaryExplanation' in css_classes:
                    return 'explain'
                return '?'

            pairs = []
            for item in e.select('ul > li'):
                for div in item.select('div'):
                    pairs.append((classify(div.attrs['class']), div.text))
            return pairs
Esempio n. 5
0
def parse_review(review: bs4.element.Tag) -> dict:
    """
    INPUT:
    review: HTML segment that contains all relevant review information

    OUTPUT:
    d: dictionary of relevant review information
    """

    d = {}
    # Query the rating node once (it used to be select_one'd both in the
    # test and again in the assignment).
    rating = review.select_one("div.rating-10 span")
    if rating:
        d['rating'] = int(rating.text)
    d['headline'] = review.select_one("h2.text_header").text
    try:
        # Sub-header looks like "Name (Country) date"; take the text between
        # the parentheses.
        d['country'] = review.select_one('h3.text_sub_header').text\
            .replace(')', '(').split('(')[1]
    except IndexError:
        # No parenthesised country present.
        d['country'] = 'None'
    d['body'] = review.select_one("div.text_content").text.strip()
    for row in review.select('tr'):
        cells = row.select('td')  # hoisted: was re-queried for the comparison
        if cells[1].attrs['class'][0] == 'review-rating-stars':
            # The highest filled star wins (each fill overwrites the last).
            for star in row.select('span'):
                try:
                    if star.attrs['class'] == ['star', 'fill']:
                        d[row.td.attrs['class'][1]] = int(star.text)
                except KeyError:
                    # Span without a class attribute; skip it.
                    continue
        else:
            d[row.td.attrs['class'][1]] = cells[1].text
    return d
Esempio n. 6
0
def parse_event(event: bs4.element.Tag):
    """Extract an Event from one list-item element.

    Returns an Event with its id parsed out of the URL, the title, start/end
    datetimes (both read from the same <time> tag), community/owner and the
    venue string.
    """
    url = event.select_one('.events-list-item-title h3 a').get('href')
    community = event.select_one('.events-list-item-group a')
    community = community.text if community else None
    # BUGFIX: the thumbnail extraction was commented out but the
    # `if re.search(r'/no_image_', thumbnail)` check remained, raising
    # NameError on every call.  The value's only consumer was also commented
    # out, so the dead check is removed entirely.
    when = datetime.strptime(
        event.select_one('time').get('datetime'),
        '%Y-%m-%dT%H:%M:%S%z')  # 2019-10-12T13:00:00+09:00
    return Event(
        id=int(re.match(r'.+/(\d+)/?', url)[1]),
        title=event.select_one('.events-list-item-title h3 a span').text,
        url=url,
        dt_start=when,
        # NOTE(review): dt_end reuses the same <time> value — the markup
        # apparently exposes only a start time; confirm this is intended.
        dt_end=when,
        community=community,
        owner=community,
        place=''.join(
            map(lambda x: x.text,
                event.select('.events-list-item-venue > span'))))
Esempio n. 7
0
File: yahoo.py Progetto: zdict/zdict
 def get_explain(e: bs4.element.Tag):
     """Collect (label, text) pairs from every div under each list item."""
     def label_for(class_list):
         # 'pos_button' marks a part-of-speech tag, 'dictionaryExplanation'
         # an explanation body; anything else is unknown.
         if 'pos_button' in class_list:
             return 'pos'
         if 'dictionaryExplanation' in class_list:
             return 'explain'
         return '?'

     pairs = []
     for item in e.select('ul > li'):
         for div in item.select('div'):
             pairs.append((label_for(div.attrs['class']), div.text))
     return pairs
Esempio n. 8
0
    def __init__(self, item: bs4.element.Tag):
        """Build an industry node from its menu element.

        Leaf items (no sub-menu) resolve their own link and id from the
        single anchor; branch items append one child per sub-menu entry.
        """
        self.name = item.select_one('span.txt').text

        sub_menu = item.select('li.listItem a')
        if len(sub_menu) == 0:
            # Leaf node: no children, just its own link/id.
            self.link, self.id = solve_link(item.select_one('a')['href'])
            return

        # BUGFIX(readability): the loop variable previously shadowed the
        # `item` parameter; renamed so the parameter stays intact.
        for entry in sub_menu:
            self.append(sub_industry(entry))
Esempio n. 9
0
def __find_link(article: bs4.element.Tag) -> str:
    """Pick the most plausible article URL.

    Preference order: the first link inside an h1/h2/h3 header; then the
    most frequently repeated href, if it beats the first link's count and
    looks like a real path or URL; otherwise the first href seen.  Returns
    '' when the article contains no links.
    """
    # A header link is the strongest signal — return it immediately.
    for header in ['h1', 'h2', 'h3']:
        header_link = article.select(f'{header} a[href]')
        if header_link:
            return header_link[0].attrs.get('href')

    links: Counter = Counter()
    first_link = ''
    for a_element in article.select('a[href]'):
        href = a_element.attrs.get('href')
        if not first_link:
            first_link = href
        links[href] += 1

    if not links:
        return ''

    # Compute most_common() once — it used to be recomputed three times.
    most_common, count = links.most_common()[0]
    looks_like_url = most_common.startswith(('/', 'http'))
    if count > links.get(first_link, 0) and looks_like_url:
        return most_common
    return first_link
Esempio n. 10
0
def get_event_description(div: bs4.element.Tag) -> str:
    """Convert an event detail <div> to a single-line-per-paragraph markdown string.

    Relative hrefs are first rewritten to absolute URLs against BASE_API_URL,
    then each non-empty paragraph is converted to markdown and joined.
    """
    for a in div.find_all("a"):
        a["href"] = BASE_API_URL + a["href"]
    # BUGFIX(dead code): the old code also stored
    # div.find("div", class_="detail-content") in a tuple and never used it;
    # the dead query has been removed.
    paragraphs = []
    for p in div.select("div.element-content > p"):
        contents = p.contents
        # Skip paragraphs holding a single, whitespace-only node.
        if not (len(contents) == 1 and len(contents[0].strip()) == 0):
            paragraphs.append(html.unescape(tomd.convert(str(p))))
    return "\n".join([p.strip().replace("\n", "") for p in paragraphs])
Esempio n. 11
0
def get_event_contents(div: bs4.element.Tag) -> List[str]:
    """Render each detail row as a backtick-quoted name, plus its value if present."""
    rendered = []
    for row in div.select("div.detail-content > div"):
        # Collapse double spaces so the propEx regex matches cleanly.
        groups = propEx.search(row.text.replace("  ", " ")).groups()
        if groups[2] is None:
            rendered.append(f"`{groups[0]}`")
        else:
            rendered.append(f"`{groups[0]}` - {groups[2]}")
    return rendered
Esempio n. 12
0
 def empirical_dispersion_parser(parsed_html: bs4.element.Tag) -> dict:
     """Collect keyword items, grouping EmpiricalDispersion values.

     Returns a dict mapping each non-dispersion keyword to an empty list,
     and 'EmpiricalDispersion' to the list of values found after '='.

     BUGFIX: the return annotation said ``list`` but the function has
     always returned a dict.
     """
     EMPIRICAL_DISPERSION = 'EmpiricalDispersion'
     result = {EMPIRICAL_DISPERSION: []}
     for item in [
             str(span.text)
             for span in parsed_html.select('span[class="kwit"]')
     ]:
         if EMPIRICAL_DISPERSION in item:
             # Only 'EmpiricalDispersion=VALUE' items carry a value.
             if '=' in item:
                 result[EMPIRICAL_DISPERSION].append(item.split('=')[-1])
         else:
             result[item] = []
     return result
Esempio n. 13
0
    def extract_all_tags(self, tag: str, node: bs4.element.Tag):
        '''
        Return every element matching selector @tag inside @node, using the
        select operator defined by BeautifulSoup.  Any node can be the root.
        e.g.
        xml_code="<xml><function><if>if <condition>(<expr><name>var</name></expr>)</condition></if></function></xml>"
        parser=SrcmlParser(xml_code)
        tags=parser.extract_all_tags("if", parser.soup)
        print(len(tags))
        '''
        return node.select(tag)
Esempio n. 14
0
    def pure_functionals_parser(parsed_html: bs4.element.Tag) -> list:
        """Build the list of pure functionals from the parsed keyword page.

        Exchange and correlation functionals are combined pairwise
        (exchange + correlation); standalone pure functionals are appended
        unchanged.

        BUGFIX(clarity): the inner parsers declared a ``parsed_html``
        parameter but silently read the enclosing loop's ``section``
        variable, leaving the passed argument unused; they now take
        ``section`` explicitly.
        """
        def exchange_functionals(section: bs4.element.Tag):
            # Exchange part: a single <ul> follows the section header.
            ul = section.find_next_sibling('ul')
            for item in ul.select('span[class="kwit"]'):
                yield item.text

        def correlation_functionals(section: bs4.element.Tag):
            # Correlation part has two sections
            ul = section.find_next_sibling('ul')
            for item in ul.select('span[class="kwit"]'):
                yield item.text
            ul = ul.find_next_sibling('ul')
            for item in ul.select('span[class="kwit"]'):
                yield item.text

        def standalone_functionals(section: bs4.element.Tag):
            # Standalone part: a single <ul> follows the section header.
            ul = section.find_next_sibling('ul')
            for item in ul.select('span[class="kwit"]'):
                yield item.text

        exchange = []
        correlation = []
        standalone = []

        mapper = {
            'Exchange Functionals': (exchange_functionals, exchange),
            'Correlation Functionals': (correlation_functionals, correlation),
            'Standalone Pure Functionals': (standalone_functionals, standalone)
        }

        for section in parsed_html.select('h3[class="ksection"]'):
            func, store = mapper[str(section.text)]
            store.extend(func(section))

        # Every exchange functional pairs with every correlation functional.
        result = [exchange_item + correlation_item
                  for exchange_item in exchange
                  for correlation_item in correlation]
        # XXX we have to consider those stand-alone exchange functionals, like
        # S/HFS, XA/XAlpha, B/HFB, etc.
        result.extend(standalone)

        return result
Esempio n. 15
0
def extract_birthdays(ultag: bs4.element.Tag) -> list:
    '''Return every birthday parsed from the <li> items inside the given <ul> tag.'''
    birthdays = []
    for litag in ultag.select("li"):
        brtdy: birthday.Birthday = birthday.parse_birthday(litag.get_text())
        if brtdy is None:
            # The line did not contain a recognisable birthday entry.
            continue

        # Fall back to the raw year string when the year is unknown.
        if brtdy.year == birthday.YEAR_UNKNOWN:
            yearstr = brtdy.yearstr
        else:
            yearstr = str(brtdy.year)

        print(yearstr + ": " + brtdy.name + "    (" + brtdy.occupation + ")")
        birthdays.append(brtdy)

    return birthdays
Esempio n. 16
0
    def from_html(cls, tr: bs4.element.Tag) -> "NsdiLandUsingInfo":
        """Build a NsdiLandUsingInfo record from one <tr> of the listing table.

        Columns 0-4 hold text fields; column 5 holds a <button> whose content
        describes the downloadable table data.
        """
        td_list = tr.select("td")
        texts = [td.text.strip() for td in td_list[:5]]
        button = td_list[5].select("button")[0]
        return cls(data_type=texts[0],
                   city_type=texts[1],
                   name_type=texts[2],
                   base_date=texts[3],
                   file_size=texts[4],
                   table_data=NsdiLandUsingInfo.NsdiTableData.from_html(button),
                   raw_data=str(tr))
Esempio n. 17
0
File: yahoo.py Progetto: zdict/zdict
 def get_grammar(d: bs4.element.Tag):
     """Return the text of every grammar list item in the dictionary word card."""
     selector = ('div#web ol.searchCenterMiddle '
                 'div.dictionaryWordCard > ul > li')
     return [text(node) for node in d.select(selector)]
Esempio n. 18
0
 def get_grammar(d: bs4.element.Tag):
     """Extract grammar entries (one per list item) from the word card."""
     query = ('div#web ol.searchCenterMiddle '
              'div.dictionaryWordCard > ul > li')
     items = d.select(query)
     return [text(item) for item in items]
Esempio n. 19
0
 def hybrid_functionals_parser(parsed_html: bs4.element.Tag) -> list:
     """List hybrid functional keywords, skipping IOp pseudo-entries."""
     names = []
     for span in parsed_html.select('span[class="kwit"]'):
         if 'IOp' not in span.text:
             names.append(str(span.text))
     return names