def request_url(ticker, action='getcompany', owner='exclude', type=None,
                dateb=None, start=None, output=None, count=None, **kwargs):
    cik = get_cik(ticker)
    params = {'action': action, 'CIK': cik, 'owner': owner}
    # The optional query parameters are independent, so each gets its own check;
    # an if/elif chain would silently drop every option after the first one given.
    if type is not None:
        params['type'] = type
    if dateb is not None:
        params['dateb'] = dateb
    if start is not None:
        params['start'] = start
    if output is not None:
        params['output'] = output
    if count is not None:
        params['count'] = count
    response = _re.get(url=_base_url, params=params)
    soup = _bs(response.content, 'html.parser')
    print('Request Successful')
    #print(response.url)
    return response.url, soup
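# A minimal usage sketch (assumes get_cik, _base_url, and the _re/_bs aliases
# referenced above are defined elsewhere in this module; the ticker and filing
# type are purely illustrative):
#
#     filings_url, filings_soup = request_url('AAPL', type='10-K', count=10)
#     print(filings_url)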
def get_project_data(project_input):
    project_url = get(project_input)
    content = project_url.content
    parsed_content = _bs(content, 'lxml')
    # Cast rows alternate between the 'odd' and 'even' classes inside .cast_list.
    cast_list = (parsed_content.find(class_="cast_list").find_all(class_=['odd', 'even'])
                 if parsed_content.select('[class="cast_list"]') else False)
    poster = (parsed_content.find(class_="poster").find('img')
              if parsed_content.select('[class="poster"]') else False)
    actors = []
    if poster and cast_list:
        for actor in cast_list:
            actors.append({
                "name": actor.find('a').getText(strip=True) if actor.select('a') else 'not found',
                "character": (actor.find(class_="character").getText(strip=True)
                              if actor.select('[class="character"]') else 'not found'),
                "image": actor.find('img')['src'],
                "actor_url": ('https://www.imdb.com/' + actor.find('a')['href']
                              if actor.select('a[href]') else 'not found')
            })
        return {
            'image': poster['src'],
            'actors': actors,
            'storyline': parsed_content.find(class_="summary_text").getText(strip=True),
        }
    else:
        return {
            'image': 'not found',
            'actors': 'not found',
            'storyline': 'not found'
        }
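# A minimal usage sketch (assumes `get` is requests.get as imported in this
# script and the _bs alias above; the IMDB title URL is only an example):
#
#     project = get_project_data('https://www.imdb.com/title/tt0111161/')
#     print(project['storyline'])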
def parse(html):
    # Output container: {season: {episode number: row dict}}.
    show_dict = {}
    bs = _bs(html, 'lxml')
    # Collect the season numbers from the 'Season N' headings in the article.
    seasons = list(map(
        lambda x: re.search(r'[Ss]eason (\d+)', x.text).group(1),
        bs.find_all(lambda tag: 'season' in tag.text.lower(), {'class': 'mw-headline'})))
    # Episode tables on Wikipedia commonly carry the 'wikiepisodetable' class,
    # although that convention is not guaranteed on every article.
    tables = bs.find_all('table', {'class': 'wikiepisodetable'})
    # Pair each season heading with its episode table.
    for season, table in zip(seasons, tables):
        show_dict[season] = {}
        # The first row of the table holds the column headers.
        _headers = table.find('tr').find_all('th')
        headers = list(map(lambda x: repl.sub('', replace_with_char(x, ' ')), _headers))
        # Every remaining row with the 'vevent' class is one episode.
        for row in table.find_all('tr', {'class': 'vevent'}):
            _row = row
            row = list(map(lambda x: repl.sub('', replace_with_char(x)), row.find_all(['th', 'td'])))
            row = dict(zip(headers, row))
            # Special case: some rows embed several episodes (their values contain
            # newlines), which spec_ops splits back into individual rows.
            if any('\n' in v for v in row.values()):
                for row in spec_ops(row):
                    show_dict[season][row['No. in season']] = row
            else:
                show_dict[season][row['No. in season']] = row
    return show_dict
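# A minimal usage sketch (assumes `requests` is installed and that `repl`,
# `replace_with_char`, and `spec_ops` referenced above are defined elsewhere in
# this module; the Wikipedia article is only an example with episode tables):
#
#     import requests
#     html = requests.get('https://en.wikipedia.org/wiki/List_of_Breaking_Bad_episodes').text
#     episodes = parse(html)
#     print(episodes['1'])  # rows for season 1, keyed by 'No. in season'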
async def growth(self, ctx):
    """ Fetches some growth stats from the SCStat website. """
    async with session.get(orgstats_url) as response:
        soup = _bs(await response.text(), features='lxml')
    orgs_raw = soup.tbody.find_all('tr')
    orgs_cleaner = {}
    for org in orgs_raw:
        org_entry = []
        # Keep numeric cells as ints so they can be compared and sorted later.
        for line in org.stripped_strings:
            try:
                to_append = int(line)
            except ValueError:
                to_append = line
            org_entry.append(to_append)
        orgs_cleaner[org_entry[1]] = org_entry
    print(orgs_cleaner)
    # Drop the header row, which is keyed by the 'Spectrum ID' column label.
    del orgs_cleaner['Spectrum ID']
    corp = orgs_cleaner['CORP']
    # Sort by monthly new members (column index 3), highest first.
    orgs_clean = sorted(orgs_cleaner.items(), key=lambda x: x[1][3], reverse=True)
    print(corp)
    to_send = 'Top orgs by monthly new members:\n```\n'
    for org in orgs_clean[:5]:
        if org[0] == 'TEST':
            org[1][2] = 'Test Squadron'
        to_send += f'{org[1][0]}) {org[1][2]} ({org[0]}) + {org[1][3]} new members in the past month.\n'
    if corp[0] > 5:
        to_send += (f'{corp[0]}) {corp[2]} ({corp[1]}) + {corp[3]} new members in the past month.\n```\n'
                    f'We should be higher on the list!')
    else:
        to_send += "```\nGood work, but let's see if we can get to #1!"
    await ctx.send(to_send)
    print(to_send)
def fetch_citizen(name, url=DEFAULT_RSI_URL, endpoint='/citizens', skip_orgs=False):
    result = {}
    url = url.rstrip('/')
    citizen_url = "{}/{}/{}".format(url, endpoint.strip('/'), name)
    orgapiurl = '{}/{}'.format(url, 'api/orgs/getOrgMembers')
    page = _requests.get(citizen_url, headers=None)
    print(page)
    if page.status_code == 404:
        print(f'Received a 404 Error Code from query to {citizen_url}.')
    if page.status_code == 200:
        soup = _bs(page.text, features='lxml')
        # The first three .info .value entries are username, handle, and title.
        _ = [_.text for _ in soup.select(".info .value")[:3]]
        result['username'] = get_item(_, 0, '')
        result['handle'] = get_item(_, 1, '')
        result['title'] = get_item(_, 2, '')
        result['title_icon'] = get_item(soup.select(".info .icon img"), 0, '')
        if result['title_icon']:
            result['title_icon'] = '{}/{}'.format(url, result['title_icon']['src'])
        result['avatar'] = "{}/{}".format(
            url, soup.select('.profile .thumb img')[0]['src'].lstrip('/'))
        result['url'] = citizen_url
        if soup.select('.profile-content .bio'):
            result['bio'] = soup.select('.profile-content .bio')[0].text.strip('\nBio').strip()
        else:
            result['bio'] = ''
        result['citizen_record'] = soup.select('.citizen-record .value')[0].text
        try:
            result['citizen_record'] = int(result['citizen_record'][1:])
        except ValueError:
            print('Encountered unexpected citizen_record. Making citizen_record 1000000000.')
            result['citizen_record'] = 1000000000
        # Map each left-column entry label (e.g. 'Enlisted') to its cleaned value.
        _ = {
            _.select_one('span').text:
                _re.sub(r'\s+', ' ', _.select_one('.value').text.strip()).replace(' ,', ',')
            for _ in soup.select('.profile-content > .left-col .entry')
        }
        result['enlisted'] = get_item(_, 'Enlisted', '')
        result['location'] = get_item(_, 'Location', '')
        result['languages'] = get_item(_, 'Fluency', '')
        result['languages'] = result['languages'].replace(',', '').split()
        if not skip_orgs:
            orgs_page = _requests.get("{}/organizations".format(citizen_url))
            if orgs_page.status_code == 200:
                orgsoup = _bs(orgs_page.text, features='lxml')
                result['orgs'] = []
                for org in orgsoup.select('.orgs-content .org'):
                    orgname, sid, rank = [
                        _.text for _ in org.select('.info .entry .value')
                    ]
                    # A non-breaking space in the name marks a redacted membership.
                    if orgname[0] == '\xa0':
                        orgname = sid = rank = 'REDACTED'
                    roles = []
                    # Query the org-members API for the citizen's roles in this org.
                    r = _requests.post(orgapiurl, data={'symbol': sid, 'search': name})
                    if r.status_code == 200:
                        r = r.json()
                        if r['success'] == 1:
                            apisoup = _bs(r['data']['html'], features='lxml')
                            roles = [_.text for _ in apisoup.select('.rolelist .role')]
                    orgdata = {
                        'name': orgname,
                        'sid': sid,
                        'rank': rank,
                        'roles': roles,
                    }
                    try:
                        orgdata['icon'] = '{}/{}'.format(
                            url, org.select('.thumb img')[0]['src'].lstrip('/'))
                    except IndexError:
                        pass
                    result['orgs'].append(orgdata)
    return result
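# A minimal usage sketch (assumes DEFAULT_RSI_URL, get_item, and the _requests,
# _re, and _bs aliases referenced above are defined elsewhere in this module;
# the handle below is purely illustrative):
#
#     citizen = fetch_citizen('SomeHandle', skip_orgs=True)
#     print(citizen.get('handle'), citizen.get('enlisted'))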
def parse(url):
    "Parse the given markup into a BeautifulSoup object (expects page content, not a URL to fetch)."
    return _bs(url, "lxml")
def BeautifulSoup(html, *args, **kwargs):
    """Automatically provide the parser to the BeautifulSoup constructor."""
    return _bs(html, 'html.parser', *args, **kwargs)
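# A minimal usage sketch of the wrapper above (the markup is just an example):
#
#     soup = BeautifulSoup('<p>hello <b>world</b></p>')
#     print(soup.b.text)  # -> world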
from bs4 import BeautifulSoup as _bs
from requests import get
from IPython import embed
import pprint
from create_file import create_actor_file

pp = pprint.PrettyPrinter(indent=1)

# Change this input to the IMDB URL of the actor you want data for.
input = 'https://www.imdb.com/name/nm3836977/?ref_=tt_cl_t1'

url = get(input)
content = url.content
parsed = _bs(content, 'lxml')
projects = parsed.find_all(class_="filmo-row")
header = parsed.find(class_="header")

# The initial actor data dictionary, which gets updated as the file runs.
actor_data = {
    "name": header.find(class_="itemprop").getText(strip=True),
    "age": 0,
    "image": parsed.find(id="name-poster")['src'] if parsed.find(id="name-poster") else '',
    "projects": {},
    "project_references": []
}

# This function acquires the data for a given project (movie, show, etc.).
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup as _bs
BeautifulSoup = lambda x: _bs(x, 'lxml')

from session import session

WATER_ZONE_URL = 'http://bt.neu6.edu.cn/forum-4-1.html'


def check_new_topic(html):
    soup = BeautifulSoup(html)
    # Thread rows are <tbody> elements whose id starts with 'normalthread_'.
    for topic in soup.find_all('tbody', id=re.compile(r'^normalthread_(\d+)')):
        # Only topics tagged '[六维茶话]' are of interest.
        if topic.find('em').text == '[六维茶话]':
            a = topic.find('a', class_='s xst', style=None)
            if a:
                print(urljoin(WATER_ZONE_URL, a['href']), a.text)


def reply_new(b):
    """
    Reply to a new topic
    :return:
    """


def reply_comment():
    """
    Reply to comments others left on my posts