def get_meaning_with_etymology(self, header: BeautifulSoup) -> Dict[str, Any]:
    result = {'etymology': None, 'values': []}
    next_sibling: BeautifulSoup = header.find_next_sibling()
    # p is etymology details, capture it
    while next_sibling.name == 'p':
        p: BeautifulSoup = next_sibling
        etymology = result.get('etymology')
        result['etymology'] = p.get_text().strip() if etymology is None \
            else etymology + '\n' + p.get_text().strip()
        next_sibling = next_sibling.find_next_sibling()
    # Skip pronunciation headers
    while WiktionaryTrEnScraper.is_pronunciation_header(
            next_sibling) or next_sibling.name == 'ul':
        next_sibling = next_sibling.find_next_sibling()
    # h4 is the header for parts of speech
    if WiktionaryTrEnScraper.is_part_of_speech_header(next_sibling):
        span: BeautifulSoup = next_sibling.find('span')
        if span:
            result['part_of_speech'] = span.get_text().strip().lower()
        if next_sibling.name == 'h3':
            self.processed_headers.append(str(next_sibling))
        next_sibling = next_sibling.find_next_sibling()
        WiktionaryTrEnScraper.process_meaning_values(next_sibling, result)
    return result
def get_pronunciation(header: BeautifulSoup) -> List[Dict[str, Any]]:
    results = []
    if header.find_next_sibling().name == 'ul':
        ul: BeautifulSoup = header.find_next_sibling()
        WiktionaryTrEnScraper.get_pronunciation_type(
            ul, results, 'IPA', 'IPA')
        WiktionaryTrEnScraper.get_pronunciation_type(
            ul, results, 'Hyphenation', 'Latn')
    return results
def crawl_pwca(self, task):
    date = datetime.strptime(task.date, '%Y-%m-%d')
    tag = re.compile(f'T.*{datetime.strftime(date, "%a %d %b %y")}')
    results_page = 'results/' if datetime.now().year == date.year else f'results_{date.year}/'
    URL = FLIGHT_PROVIDERS['PWCA']['BASE_URL'] + results_page
    page = requests.get(URL + 'results.htm')
    event = BeautifulSoup(page.text, 'lxml').find('b', string=tag)
    tracks_link = event.find_next_sibling('a').attrs.get('href', None)
    tracks = requests.get(URL + tracks_link, stream=True)
    file_size = int(tracks.headers.get('content-length', 0))
    with open(f'/tmp/{task.date}.zip', 'wb') as f:
        downloaded = 0
        with tqdm(total=file_size, desc='downloading_tracks',
                  disable=self._progress != 'gui') as pbar:
            for data in tracks.iter_content(32 * 1024):
                f.write(data)
                if self._progress == 'ratio':
                    downloaded += len(data)
                    print(f'{downloaded/file_size:.0%}', file=sys.stderr, flush=True)
                else:
                    pbar.update(len(data))
    tracks_dir = f'/tmp/{task.date}'
    z = zipfile.ZipFile(f'/tmp/{task.date}.zip')
    z.extractall(tracks_dir)
    os.remove(f'/tmp/{task.date}.zip')
    return tracks_dir
def _():
    response = request(plugin.pathquery)
    document = BeautifulSoup(response.text, 'html.parser').find(
        'ul', {'class': ['list-episode-item', 'list-star']})
    items = []
    if document is not None:
        if plugin.path == '/list-star.html' or (
                'type' in plugin.query and 'stars' in plugin.query['type']):
            for li in document.find_all('li', recursive=False):
                plot = li.find('ul')
                item = ListItem(li.find('img').attrs['alt'])
                item.setArt({'poster': li.find('img').attrs['data-original']})
                item.setInfo('video', {'plot': '' if plot is None else plot.text})
                items.append(
                    (plugin.url_for(li.find('a').attrs['href']), item, True))
        elif plugin.path in ('/most-popular-drama', '/search'):
            idb.connect()
            for a in document.find_all('a'):
                path = a.attrs['href']
                drama = drama_detail(path)
                item = ListItem(drama['title'])
                item.setArt({'poster': drama.pop('poster')})
                item.setInfo('video', drama)
                items.append((plugin.url_for(path), item, True))
            idb.close()
        else:
            for a in document.find_all('a'):
                item = ListItem(u'[{}] {} {}'.format(
                    a.find('span', {'class': 'type'}).text,
                    a.find('h3').text,
                    a.find('span', {'class': 'ep'}).text))
                item.setArt({'poster': a.find('img').attrs['data-original']})
                item.setInfo('video', {})
                item.setProperty('IsPlayable', 'true')
                items.append((plugin.url_for(a.attrs['href']), item, False))
        document = document.find_next_sibling()
        if document is not None:
            for li in document.find_all('li', {'class': ['next', 'previous']}):
                item = ListItem(
                    localized_str(33600 if li.text == 'Next >' else 33601))
                items.append(
                    (plugin.url_for(plugin.path + li.find('a').attrs['href']),
                     item, True))
                print(' ----- ' + plugin.url_for(plugin.path + li.find('a').attrs['href']))
    xbmcplugin.setContent(plugin.handle, 'videos')
    xbmcplugin.addDirectoryItems(plugin.handle, items, len(items))
    xbmcplugin.endOfDirectory(plugin.handle)
def is_part_of_speech_header(header: BeautifulSoup) -> bool:
    next_sibling: BeautifulSoup = header.find_next_sibling()
    while next_sibling and next_sibling.name == 'table':
        next_sibling = next_sibling.find_next_sibling()
    if next_sibling and next_sibling.name == 'p':
        next_sibling = next_sibling.find_next_sibling()
    if next_sibling.name == 'ol':
        return True
    return False
def get_meaning_without_etymology(
        header: BeautifulSoup) -> Dict[str, Any]:
    result = {'etymology': None, 'values': []}
    span: BeautifulSoup = header.find('span')
    if span:
        result['part_of_speech'] = span.get_text().strip().lower()
    next_sibling: BeautifulSoup = header.find_next_sibling()
    while next_sibling.name == 'table':
        next_sibling = next_sibling.find_next_sibling()
    WiktionaryTrEnScraper.process_meaning_values(next_sibling, result)
    return result
def getip(url):
    index = requests.get(url)
    i = BeautifulSoup(index.content, 'lxml')
    # matches images whose title is "非常危险" ("very dangerous")
    iplist = i.find_all("img", {"title": "非常危险"})
    ipall = []
    for i in iplist:
        u = i.find_next_sibling('a')
        if u is None:
            continue
        ipall.append(u.getText())
    return ipall
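# A small sketch of the markup shape getip() appears to expect: an <img title="非常危险">
# followed by an <a> sibling holding the IP text. The sample HTML below is made up for
# illustration; the real page layout may differ.
from bs4 import BeautifulSoup

sample = '''
<div><img title="非常危险"/><a>1.2.3.4:8080</a></div>
<div><img title="安全"/><a>5.6.7.8:3128</a></div>
'''
doc = BeautifulSoup(sample, 'html.parser')
ips = []
for img in doc.find_all('img', {'title': '非常危险'}):
    a = img.find_next_sibling('a')
    if a is not None:
        ips.append(a.getText())
print(ips)  # ['1.2.3.4:8080']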
def init_tab(self):
    driver.switch_to.window(self.tab)
    print(f'{self.title}: Waiting the page to load')
    driver.get(self.link)  # visit the link
    while True:
        try:
            if 'Stock' in self.title:
                driver.execute_script('$("#stocksFilter").val("#all");')
                driver.execute_script("doStocksFilter('select',this)")
            if 'Crypto' in self.title:
                desired_quanity = BeautifulSoup(driver.page_source, 'html.parser')
                desired_quanity = desired_quanity.find(
                    'span', text='Number of Currencies')
                desired_quanity = int(
                    desired_quanity.find_next_sibling().get_text().replace(
                        ',', ''))
            else:
                desired_quanity = self.object_.objects.count()
            break
        except Exception as e:
            print(e)
    while True:
        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', class_=self.table_class)
            len_table = int(len(table.find_all('tr')) * 1.1)
            if len_table < desired_quanity:
                print(
                    f'{self.title}: Waiting more... {len_table}/{desired_quanity}'
                )
                sleep(1)
                continue
            break
        except AttributeError:
            sleep(1)
    # removing unnecessary elements
    classes_to_remove = ['generalOverlay', 'signupWrap', 'midHeader']
    driver.execute_script("$('header').remove()")
    driver.execute_script("$('footer').remove()")
    driver.execute_script("$('#rightColumn').remove()")
    for cl in classes_to_remove:
        # for skipping TimeoutException: Message: script timeout
        try:
            driver.execute_script(f"$('.{cl}').remove()")
        except:
            pass
    print(f'{self.title}: Tab is initializated!')
def scrape_student_office():
    """Get info about the student service office"""
    scraped_info = {}
    student_office_url = "http://www.univaq.it/section.php?id=607"
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5)",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
        "accept-encoding": "gzip,deflate,sdch",
        "accept-language": "en-US,en;q=0.8",
    }
    request = requests.get(student_office_url, headers=headers)
    if request.status_code != 200:
        print("Error! Status " + str(request.status_code))
        return
    first_row = BeautifulSoup(request.text, "html.parser").find(string="AREA SCIENTIFICA")\
        .parent.parent.find_next_sibling().find("tr")
    address = first_row.find(class_="address_table_description").text
    phone = first_row.find_next_sibling().find(
        class_="address_table_description").text
    email = first_row.find_next_sibling().find_next_sibling()\
        .find(class_="address_table_description").text
    hours = first_row.find_next_sibling().find_next_sibling().find_next_sibling()\
        .find(class_="address_table_description").text.replace('\n', '')\
        .replace("13", "13, ")
    scraped_info.update({
        "indirizzo": address,
        "telefono": phone,
        "e-mail": email,
        "orari": hours
    })
    utils.write_json(scraped_info, "../json/student_office.json")
def _link_to_responce(self, link):
    """
    We only want to scrape Canada's public health pages. Many other links
    go to responses for financial aid and other public sites.
    """
    if link[0] != '/':
        return None
    try:
        html = requests.get('https://www.canada.ca' + link)
        soup = BeautifulSoup(html.content, 'lxml').find(['h2', 'h3'], {
            'id': link.split('#')[1]
        }).find_next_sibling()
        responce = str(soup)
        while soup.find_next_sibling() is not None and \
                soup.find_next_sibling().name not in ['h2', 'h3', 'div']:
            soup = soup.find_next_sibling()
            responce += " " + str(soup)
        return responce
    except BaseException:
        return None
def extrairPartesProcesso(self, sopa):
    try:
        self.PARTES_PROCESSO = []
        partes_processo = sopa.find(id='tableTodasPartes').prettify()
        soupa = BeautifulSoup(partes_processo, 'html.parser').tr
        filho = BeautifulSoup(soupa.prettify(), 'html.parser').td
        filhos = filho.find_next_sibling()
        irmaos = soupa.find_next_siblings()
        junta = ''
        junta += filho.span.text.replace(":", ":|").strip()
        junta += filhos.text.strip().split('\n')[0].strip() + "|"
        for indice in range(1, len(filhos.text.strip().split('\n'))):
            conteudo = filhos.text.strip().split('\n')
            if conteudo[indice]:
                junta += (conteudo[indice].strip() + "'")
        tratamento = junta.replace("''", "','")
        self.PARTES_PROCESSO.append(tratamento)
        for irmao in irmaos:
            filho = BeautifulSoup(irmao.prettify(), 'html.parser').td
            filhos = filho.find_next_sibling()
            junta = ''
            junta += filho.span.text.replace(":", ":|").strip()
            junta += filhos.text.strip().split('\n')[0].strip() + "|"
            for indice in range(1, len(filhos.text.strip().split('\n'))):
                conteudo = filhos.text.strip().split('\n')
                if conteudo[indice]:
                    junta += (conteudo[indice].strip() + "'")
            tratamento = junta.replace("''", "','").replace("'", '')
            self.PARTES_PROCESSO.append(tratamento)
    except Exception as e:
        self.STATUS = False
        # "Could not collect the parties to the case"
        print("Nao foi possivel coletar partes do processo")
        print(e)
def find_next_sibling(self, soup: BeautifulSoup, *args, **kwargs):
    return soup.find_next_sibling(*args, **kwargs)
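# A minimal sketch of how a thin wrapper like the one above might be exercised;
# the SoupHelper class name and the sample markup are assumptions made up for
# illustration.
from bs4 import BeautifulSoup

class SoupHelper:
    def find_next_sibling(self, soup: BeautifulSoup, *args, **kwargs):
        return soup.find_next_sibling(*args, **kwargs)

markup = '<ul><li>one</li><li>two</li><li>three</li></ul>'
first_li = BeautifulSoup(markup, 'html.parser').find('li')
helper = SoupHelper()
print(helper.find_next_sibling(first_li))        # <li>two</li>
print(helper.find_next_sibling(first_li, 'li'))  # <li>two</li>, filtered by tag name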
def pagination():
    if plugin.path == '/search' and 'keyword' not in plugin.query:
        keyboard = Keyboard()
        keyboard.doModal()
        if keyboard.isConfirmed():
            response = request(plugin.pathqs + '&keyword=' + keyboard.getText())
        else:
            return
    else:
        response = request(plugin.pathqs)
    document = BeautifulSoup(response.text, 'html.parser').find(
        'ul', {'class': ['list-episode-item', 'list-star']})
    items = []
    if document is not None:
        if plugin.path in ('/list-star.html', '/most-popular-drama', '/search'):
            if plugin.path == '/list-star.html' or (
                    'type' in plugin.query and 'stars' in plugin.query['type']):
                for li in document.find_all('li', recursive=False):
                    plot = li.find('ul')
                    item = ListItem(li.find('img').attrs['alt'])
                    item.setArt(
                        {'poster': li.find('img').attrs['data-original']})
                    item.setInfo('video', {'plot': '' if plot is None else plot.text})
                    items.append((plugin.url_for(li.find('a').attrs['href']),
                                  item, True))
            else:
                InternalDatabase.connect()
                for a in document.find_all('a'):
                    path = a.attrs['href']
                    drama = drama_detail(path)
                    item = ListItem(drama['title'])
                    item.setArt({'poster': drama.pop('poster')})
                    item.setInfo('video', drama)
                    items.append((plugin.url_for(path), item, True))
                InternalDatabase.close()
        else:
            for a in document.find_all('a'):
                item = ListItem(u'[{}] {} {}'.format(
                    a.find('span', {'class': 'type'}).text,
                    a.find('h3').text,
                    a.find('span', {'class': 'ep'}).text))
                item.setArt({'poster': a.find('img').attrs['data-original']})
                item.setInfo('video', {})
                item.setProperty('IsPlayable', 'true')
                items.append((plugin.url_for(a.attrs['href']), item, False))
        document = document.find_next_sibling()
        if document is not None:
            for li in document.find_all('li', {'class': ['next', 'previous']}):
                item = ListItem(li.text)
                items.append(
                    (plugin.url_for(plugin.path + li.find('a').attrs['href']),
                     item, True))
    xbmcplugin.setContent(plugin.handle, 'videos')
    xbmcplugin.addDirectoryItems(plugin.handle, items, len(items))
    xbmcplugin.endOfDirectory(plugin.handle)
def live_on(self):
    driver.switch_to.window(self.tab)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', class_=self.table_class)
    if 'Crypto' in self.title:
        desired_quanity = BeautifulSoup(driver.page_source, 'html.parser')
        desired_quanity = desired_quanity.find('span',
                                               text='Number of Currencies')
        desired_quanity = int(
            desired_quanity.find_next_sibling().get_text().replace(',', ''))
    else:
        desired_quanity = self.object_.objects.count()
    while True:
        try:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table', class_=self.table_class)
            len_table = int(len(table.find_all('tr')) * 1.1)
            if len_table < desired_quanity:
                print(
                    f'{self.title}: Waiting more... {len_table}/{desired_quanity}'
                )
                sleep(1)
                continue
            break
        except AttributeError:
            sleep(1)
    link_list = self.link_list
    for tr in table.find_all('tr')[1:]:
        if tr.find('a') is None:
            continue
        link = 'https://www.investing.com' + tr.find('a')['href']
        if link not in link_list:
            continue
        tds = []
        if self.type_ == 'crncy':
            for td in tr.find_all('td')[2:]:
                tds.append(td.get_text().strip())
        elif self.type_ == 'crptcrncy':
            for td in tr.find_all('td')[4:]:
                tds.append(td.get_text().strip())
        elif self.type_ == 'etf' or self.type_ == 'fnd':
            for td in tr.find_all('td')[3:-1]:
                tds.append(td.get_text().strip())
        else:
            for td in tr.find_all('td')[2:-1]:
                tds.append(td.get_text().strip())
        now = timezone.now()
        is_closed = True
        if self.type_ == 'crncy' and len(
                tr.find_all('td')[-1].get_text()) <= 5:
            is_closed = True
        elif self.type_ == 'crncy' or self.type_ == 'crptcrncy' or \
                'greenClockIcon' in tr.find_all('td')[-1].span['class']:
            is_closed = False
        elif 'redClockIcon' in tr.find_all('td')[-1].span['class']:
            is_closed = True
        if not is_closed:
            # if the market is open, collect the live data
            live_data = {}
            l = []
            for key, value in live_fields.items():
                l += value
            all_live_fields = list(set(l))
            for field in all_live_fields:
                live_data[field] = None
            # Overriding necessary fields
            for key, value in zip(self.live_fields, tds):
                if value in ' -N/A':
                    live_data[key] = None
                else:
                    live_data[key] = value
            if self.type_ != 'bnd':
                if live_data['Last'] == self.last_price:
                    # if the market is closed but the clock icon hasn't changed,
                    # check the market condition by checking whether the price is moving:
                    # 1. Navigate the current tab to the blank page
                    # 2. call init_tab()
                    # 3. break - go to the next tab
                    driver.get('about:blank')
                    self.__class__.init_tab(self)
                    break
            if self.type_ not in ['bnd', 'crptcrncy']:
                try:
                    live_data['Prev. Close'] = round(
                        float(live_data['Last'].replace(',', '')) -
                        float(live_data['Chg.']), 2)
                except:
                    pass
            models.AllAssetsLive.objects.filter(link=link).delete()
            try:
                if self.type_ == 'crptcrncy':
                    time_ = now
                elif len(live_data['Time']) <= 5:
                    time_ = datetime.datetime.strptime(
                        str(now.year) + str(live_data['Time']), '%Y%d/%m')
                else:
                    time_ = datetime.datetime.strptime(
                        timezone.now().date().strftime('%Y:%m:%d:') +
                        str(live_data['Time']), '%Y:%m:%d:%H:%M:%S')
                time_ = timezone.make_aware(time_)
            except:
                time_ = None
            try:
                if self.type_ == 'cmdty':
                    if live_data['Month'] is None:
                        pass
                    elif live_data['Month'] in ' ':
                        live_data['Month'] = None
                    else:
                        live_data['Month'] = datetime.datetime.strptime(
                            live_data['Month'], '%b %y')
            except:
                live_data['Month'] = None
            if '%' not in live_data['Chg. %']:
                live_data['Chg. %'] += '%'
            models.AllAssetsLive(
                Type=self.type_,
                link=link,
                prev_close=validate_price(live_data['Prev. Close']),
                last_price=validate_price(live_data['Last']),
                month=validate_price(live_data['Month']),
                Open=validate_price(live_data['Open']),
                high=validate_price(live_data['High']),
                low=validate_price(live_data['Low']),
                change=validate_price(live_data['Chg.']),
                change_7d=validate_price(live_data['Chg. (7D)']),
                change_perc=validate_price(live_data['Chg. %']),
                volume=validate_price(live_data['Vol.']),
                market_cap=validate_price(live_data['Market Cap']),
                Yield=validate_price(live_data['Yield']),
                total_vol=validate_price(live_data['Total Vol.']),
                total_assets=validate_price(live_data['Total Assets']),
                time=time_).save(force_insert=True)
            self.last_price = live_data['Last']
            print(f'{self.title}: saved Live')
            for time_frame, hist_model in self.__class__.hist_objects.items():
                if time_frame[-1] == 'D':
                    if self.last_obj_count[time_frame] == 0 or \
                            now - datetime.timedelta(
                                minutes=self.__class__.minutes_[time_frame]
                            ) > self.last_obj[time_frame].date:
                        # if there's no data at all or the latest data is older
                        # (smaller) than needed, send (save) the data
                        hist_model(
                            Type=self.type_,
                            link=link,
                            date=now,
                            price=validate_price(live_data['Last']),
                            Open=validate_price(live_data['Open']),
                            high=validate_price(live_data['High']),
                            low=validate_price(live_data['Low']),
                            volume=validate_price(live_data['Vol.']),
                        ).save(force_insert=True)
                        print(f'{self.title}: saved HISTORICAL{time_frame}')
                else:
                    # 6M1M, 1Y, 5Y, Max: update the last value
                    if self.last_obj[time_frame]:
                        if now.date == self.last_obj[time_frame].date:
                            hist_model.objects.filter(link=link).reverse(
                            ).first().delete()  # remove the newest model
                    hist_model(
                        Type=self.type_,
                        link=link,
                        date=now,
                        price=validate_price(live_data['Last']),
                        Open=validate_price(live_data['Open']),
                        high=validate_price(live_data['High']),
                        low=validate_price(live_data['Low']),
                        volume=validate_price(live_data['Vol.']),
                    ).save(force_insert=True)
                    print(f'{self.title}: saved HISTORICAL{time_frame}')
                if time_frame != 'Max':
                    # delete outdated data
                    if self.last_obj_count[time_frame]:
                        data1 = self.last_obj[time_frame].date
                        if time_frame[-1] == 'D':
                            data2 = now
                        else:
                            data2 = now.date()
                        diff = data2 - data1
                        days, seconds = diff.days, diff.seconds
                        hours = days * 24 + seconds // 3600
                        if hours > self.__class__.hours_[time_frame]:
                            hist_model.objects.filter(link=link).first(
                            ).delete()  # remove the oldest model
        elif is_closed:
            # check whether "after live data" for today is available
            last_obj_after_count = models.AllAssetsAfterLive.objects.filter(
                link=link).count()
            if last_obj_after_count > 0:
                last_obj_after = models.AllAssetsAfterLive.objects.filter(
                    link=link).order_by('-id')[0]
            if last_obj_after_count == 0 or (
                    (now.date() - last_obj_after.date).days >= 1):
                after_live_threads.append({
                    'link': link,
                    'type': self.type_,
                    'after fields': self.after_fields,
                    'title': self.title
                })
        else:
            print('Time Icon is not found/recognized')
# coding=utf-8
from bs4 import BeautifulSoup
import requests
import pyquery

content = requests.get('http://www.baidu.com').text
soup = BeautifulSoup(content, 'lxml')
soup.find_next_sibling()
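# The snippet above calls find_next_sibling() on the BeautifulSoup object itself,
# which always returns None because the document root has no siblings. A minimal
# sketch of calling it on a tag instead (the markup below is made up for illustration):
from bs4 import BeautifulSoup

html = '<div><h1>Title</h1><p>First paragraph</p><p>Second paragraph</p></div>'
doc = BeautifulSoup(html, 'html.parser')
h1 = doc.find('h1')
print(h1.find_next_sibling())       # <p>First paragraph</p>
print(h1.find_next_sibling('p'))    # the same element, filtered to <p> tags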
import re
import requests
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
soup = BeautifulSoup(r.text, "html.parser")
print(soup.prettify())

# Print every tag name in the document:
for tag in soup.find_all(True):
    print(tag.name)

# Find all <a> tags
for i in soup.find_all("a"):
    print(i.attrs["href"])

# Find <a> and <b> tags
for i in soup.find_all(["a", "b"]):
    print(i)

# Find all tags whose names start with "b"
for i in soup(re.compile("b")):
    print(i.name)

# Find data whose tag is <p> and attribute value is "title"
for i in soup.find_all("a", "py1"):
    print(i)

# Find tags with class="py1"
for i in soup.find_all(id="link1"):
    print(i.name)

print(soup.find_next_sibling("p", "title"))
print("*" * 50)

# Find the parent tags of the <p> tag
for i in soup.p.find_parents():
    print(i.name)

print(soup.p.find_next_sibling())
print("*" * 20)
print(soup.p.string)

for i in soup.find_all("p", "course")[0].children:
    print(i)