def request_url(ticker, action='getcompany', owner='exclude', type=None,
                dateb=None, start=None, output=None, count=None, **kwargs):
    cik = get_cik(ticker)
    params = {'action': action, 'CIK': cik, 'owner': owner}
    # The optional query parameters are independent, so each gets its own check;
    # an if/elif chain would silently drop every option after the first one given.
    if type is not None:
        params['type'] = type
    if dateb is not None:
        params['dateb'] = dateb
    if start is not None:
        params['start'] = start
    if output is not None:
        params['output'] = output
    if count is not None:
        params['count'] = count
    response = _re.get(url=_base_url, params=params)
    soup = _bs(response.content, 'html.parser')
    print('Request Successful')
    #print(response.url)
    return response.url, soup
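# A minimal usage sketch (assumes get_cik, _base_url, and the _re/_bs aliases
# referenced above are defined elsewhere in this module; the ticker and filing
# type are purely illustrative):
#
#     filings_url, filings_soup = request_url('AAPL', type='10-K', count=10)
#     print(filings_url)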
def get_project_data(project_input):
    project_url = get(project_input)
    content = project_url.content
    parsed_content = _bs(content, 'lxml')
    # Cast rows alternate between the 'odd' and 'even' classes inside .cast_list.
    cast_list = (parsed_content.find(class_="cast_list").find_all(class_=['odd', 'even'])
                 if parsed_content.select('[class="cast_list"]') else False)
    poster = (parsed_content.find(class_="poster").find('img')
              if parsed_content.select('[class="poster"]') else False)
    actors = []
    if poster and cast_list:
        for actor in cast_list:
            actors.append({
                "name": actor.find('a').getText(strip=True) if actor.select('a') else 'not found',
                "character": (actor.find(class_="character").getText(strip=True)
                              if actor.select('[class="character"]') else 'not found'),
                "image": actor.find('img')['src'],
                "actor_url": ('https://www.imdb.com/' + actor.find('a')['href']
                              if actor.select('a[href]') else 'not found')
            })
        return {
            'image': poster['src'],
            'actors': actors,
            'storyline': parsed_content.find(class_="summary_text").getText(strip=True),
        }
    else:
        return {
            'image': 'not found',
            'actors': 'not found',
            'storyline': 'not found'
        }
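# A minimal usage sketch (assumes `get` is requests.get as imported in this
# script and the _bs alias above; the IMDB title URL is only an example):
#
#     project = get_project_data('https://www.imdb.com/title/tt0111161/')
#     print(project['storyline'])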
def parse(html):
    # Output container: {season: {episode number: row dict}}.
    show_dict = {}
    bs = _bs(html, 'lxml')
    # Collect the season numbers from the 'Season N' headings in the article.
    seasons = list(map(
        lambda x: re.search(r'[Ss]eason (\d+)', x.text).group(1),
        bs.find_all(lambda tag: 'season' in tag.text.lower(), {'class': 'mw-headline'})))
    # Episode tables on Wikipedia commonly carry the 'wikiepisodetable' class,
    # although that convention is not guaranteed on every article.
    tables = bs.find_all('table', {'class': 'wikiepisodetable'})
    # Pair each season heading with its episode table.
    for season, table in zip(seasons, tables):
        show_dict[season] = {}
        # The first row of the table holds the column headers.
        _headers = table.find('tr').find_all('th')
        headers = list(map(lambda x: repl.sub('', replace_with_char(x, ' ')), _headers))
        # Every remaining row with the 'vevent' class is one episode.
        for row in table.find_all('tr', {'class': 'vevent'}):
            _row = row
            row = list(map(lambda x: repl.sub('', replace_with_char(x)), row.find_all(['th', 'td'])))
            row = dict(zip(headers, row))
            # Special case: some rows embed several episodes (their values contain
            # newlines), which spec_ops splits back into individual rows.
            if any('\n' in v for v in row.values()):
                for row in spec_ops(row):
                    show_dict[season][row['No. in season']] = row
            else:
                show_dict[season][row['No. in season']] = row
    return show_dict
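# A minimal usage sketch (assumes `requests` is installed and that `repl`,
# `replace_with_char`, and `spec_ops` referenced above are defined elsewhere in
# this module; the Wikipedia article is only an example with episode tables):
#
#     import requests
#     html = requests.get('https://en.wikipedia.org/wiki/List_of_Breaking_Bad_episodes').text
#     episodes = parse(html)
#     print(episodes['1'])  # rows for season 1, keyed by 'No. in season'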
async def growth(self, ctx):
    """ Fetches some growth stats from the SCStat website. """
    async with session.get(orgstats_url) as response:
        soup = _bs(await response.text(), features='lxml')
    orgs_raw = soup.tbody.find_all('tr')
    orgs_cleaner = {}
    for org in orgs_raw:
        org_entry = []
        # Keep numeric cells as ints so they can be compared and sorted later.
        for line in org.stripped_strings:
            try:
                to_append = int(line)
            except ValueError:
                to_append = line
            org_entry.append(to_append)
        orgs_cleaner[org_entry[1]] = org_entry
    print(orgs_cleaner)
    # Drop the header row, which is keyed by the 'Spectrum ID' column label.
    del orgs_cleaner['Spectrum ID']
    corp = orgs_cleaner['CORP']
    # Sort by monthly new members (column index 3), highest first.
    orgs_clean = sorted(orgs_cleaner.items(), key=lambda x: x[1][3], reverse=True)
    print(corp)
    to_send = 'Top orgs by monthly new members:\n```\n'
    for org in orgs_clean[:5]:
        if org[0] == 'TEST':
            org[1][2] = 'Test Squadron'
        to_send += f'{org[1][0]}) {org[1][2]} ({org[0]}) + {org[1][3]} new members in the past month.\n'
    if corp[0] > 5:
        to_send += (f'{corp[0]}) {corp[2]} ({corp[1]}) + {corp[3]} new members in the past month.\n```\n'
                    f'We should be higher on the list!')
    else:
        to_send += "```\nGood work, but let's see if we can get to #1!"
    await ctx.send(to_send)
    print(to_send)
def fetch_citizen(name, url=DEFAULT_RSI_URL, endpoint='/citizens', skip_orgs=False):
    result = {}
    url = url.rstrip('/')
    citizen_url = "{}/{}/{}".format(url, endpoint.strip('/'), name)
    orgapiurl = '{}/{}'.format(url, 'api/orgs/getOrgMembers')
    page = _requests.get(citizen_url, headers=None)
    print(page)
    if page.status_code == 404:
        print(f'Received a 404 Error Code from query to {citizen_url}.')
    if page.status_code == 200:
        soup = _bs(page.text, features='lxml')
        # The first three .info .value entries are username, handle, and title.
        _ = [_.text for _ in soup.select(".info .value")[:3]]
        result['username'] = get_item(_, 0, '')
        result['handle'] = get_item(_, 1, '')
        result['title'] = get_item(_, 2, '')
        result['title_icon'] = get_item(soup.select(".info .icon img"), 0, '')
        if result['title_icon']:
            result['title_icon'] = '{}/{}'.format(url, result['title_icon']['src'])
        result['avatar'] = "{}/{}".format(
            url, soup.select('.profile .thumb img')[0]['src'].lstrip('/'))
        result['url'] = citizen_url
        if soup.select('.profile-content .bio'):
            result['bio'] = soup.select('.profile-content .bio')[0].text.strip('\nBio').strip()
        else:
            result['bio'] = ''
        result['citizen_record'] = soup.select('.citizen-record .value')[0].text
        try:
            result['citizen_record'] = int(result['citizen_record'][1:])
        except ValueError:
            print('Encountered unexpected citizen_record. Making citizen_record 1000000000.')
            result['citizen_record'] = 1000000000
        # Map each left-column entry label (e.g. 'Enlisted') to its cleaned value.
        _ = {
            _.select_one('span').text:
                _re.sub(r'\s+', ' ', _.select_one('.value').text.strip()).replace(' ,', ',')
            for _ in soup.select('.profile-content > .left-col .entry')
        }
        result['enlisted'] = get_item(_, 'Enlisted', '')
        result['location'] = get_item(_, 'Location', '')
        result['languages'] = get_item(_, 'Fluency', '')
        result['languages'] = result['languages'].replace(',', '').split()
        if not skip_orgs:
            orgs_page = _requests.get("{}/organizations".format(citizen_url))
            if orgs_page.status_code == 200:
                orgsoup = _bs(orgs_page.text, features='lxml')
                result['orgs'] = []
                for org in orgsoup.select('.orgs-content .org'):
                    orgname, sid, rank = [
                        _.text for _ in org.select('.info .entry .value')
                    ]
                    # A non-breaking space in the name marks a redacted membership.
                    if orgname[0] == '\xa0':
                        orgname = sid = rank = 'REDACTED'
                    roles = []
                    # Query the org-members API for the citizen's roles in this org.
                    r = _requests.post(orgapiurl, data={'symbol': sid, 'search': name})
                    if r.status_code == 200:
                        r = r.json()
                        if r['success'] == 1:
                            apisoup = _bs(r['data']['html'], features='lxml')
                            roles = [_.text for _ in apisoup.select('.rolelist .role')]
                    orgdata = {
                        'name': orgname,
                        'sid': sid,
                        'rank': rank,
                        'roles': roles,
                    }
                    try:
                        orgdata['icon'] = '{}/{}'.format(
                            url, org.select('.thumb img')[0]['src'].lstrip('/'))
                    except IndexError:
                        pass
                    result['orgs'].append(orgdata)
    return result
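# A minimal usage sketch (assumes DEFAULT_RSI_URL, get_item, and the _requests,
# _re, and _bs aliases referenced above are defined elsewhere in this module;
# the handle below is purely illustrative):
#
#     citizen = fetch_citizen('SomeHandle', skip_orgs=True)
#     print(citizen.get('handle'), citizen.get('enlisted'))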
def parse(url):
    "Parse the given markup into a BeautifulSoup object (expects page content, not a URL to fetch)."
    return _bs(url, "lxml")
def BeautifulSoup(html, *args, **kwargs):
    """Automatically provide the parser to the BeautifulSoup constructor."""
    return _bs(html, 'html.parser', *args, **kwargs)
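# A minimal usage sketch of the wrapper above (the markup is just an example):
#
#     soup = BeautifulSoup('<p>hello <b>world</b></p>')
#     print(soup.b.text)  # -> world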
from bs4 import BeautifulSoup as _bs
from requests import get
from IPython import embed
import pprint
from create_file import create_actor_file

pp = pprint.PrettyPrinter(indent=1)

# Change this input to the IMDB URL of the actor you want data for.
input = 'https://www.imdb.com/name/nm3836977/?ref_=tt_cl_t1'

url = get(input)
content = url.content
parsed = _bs(content, 'lxml')
projects = parsed.find_all(class_="filmo-row")
header = parsed.find(class_="header")

# The initial actor data dictionary, which gets updated as the file runs.
actor_data = {
    "name": header.find(class_="itemprop").getText(strip=True),
    "age": 0,
    "image": parsed.find(id="name-poster")['src'] if parsed.find(id="name-poster") else '',
    "projects": {},
    "project_references": []
}

# This function acquires the data for a given project (movie, show, etc.).
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup as _bs
BeautifulSoup = lambda x: _bs(x, 'lxml')

from session import session

WATER_ZONE_URL = 'http://bt.neu6.edu.cn/forum-4-1.html'


def check_new_topic(html):
    soup = BeautifulSoup(html)
    # Thread rows are <tbody> elements whose id starts with 'normalthread_'.
    for topic in soup.find_all('tbody', id=re.compile(r'^normalthread_(\d+)')):
        # Only topics tagged '[六维茶话]' are of interest.
        if topic.find('em').text == '[六维茶话]':
            a = topic.find('a', class_='s xst', style=None)
            if a:
                print(urljoin(WATER_ZONE_URL, a['href']), a.text)


def reply_new(b):
    """
    Reply to a new topic
    :return:
    """


def reply_comment():
    """
    Reply to comments others left on my posts