def _get_reviews(self, query, conn, cursor, num_scrolls=1, sleep=5, proxies=None):
    if proxies:
        handler = urllib.request.ProxyHandler(proxies)
        opener = urllib.request.build_opener(handler)
        urllib.request.install_opener(opener)

    id_ = self._get_id(query)
    url = f'https://www.imdb.com/title/{id_}/reviews/?ref_=tt_ql_urv'
    req = urllib.request.Request(url, headers=self.headers)
    resp = urllib.request.urlopen(req).read()
    bs4_page = _BeautifulSoup(resp, 'lxml')

    try:
        page_key = bs4_page.find('div', {'class': 'load-more-data'})['data-key']
    except TypeError:
        print("ERROR: Excessive requests. Increase the sleep time or use proxies for longer scraping")
        exit(1)

    for review in bs4_page.findAll('div', {'class': 'review-container'}):
        # Sometimes the rating doesn't render appropriately
        try:
            rating = review.find('span', {'class': 'rating-other-user-rating'}).find('span').text
        except Exception:
            rating = None
        cursor.execute("INSERT INTO REVIEWS VALUES (?,?,?,?,?)",
                       (None,
                        review.find('div', {'class': 'text'}).text,
                        rating,
                        review.find('span', {'class': 'display-name-link'}).text,
                        review.find('a', {'class': 'title'}).text))
        conn.commit()

    print(f"Sleeping for {sleep} seconds to avoid excessive requests")
    time.sleep(sleep)

    for _ in tqdm(range(num_scrolls)):
        pagination_url = f'https://www.imdb.com/title/{id_}/reviews/_ajax?ref_=undefined&paginationKey={page_key}'
        req = urllib.request.Request(pagination_url)
        resp = urllib.request.urlopen(req).read()
        bs4_page = _BeautifulSoup(resp, 'lxml')

        try:
            page_key = bs4_page.find('div', {'class': 'load-more-data'})['data-key']
        except TypeError:
            print("ERROR: Excessive requests. Increase the sleep time or use proxies for longer scraping")
            exit(1)

        for review in bs4_page.findAll('div', {'class': 'review-container'}):
            try:
                rating = review.find('span', {'class': 'rating-other-user-rating'}).find('span').text
            except Exception:
                rating = None
            cursor.execute("INSERT INTO REVIEWS VALUES (?,?,?,?,?)",
                           (None,
                            review.find('div', {'class': 'text'}).text,
                            rating,
                            review.find('span', {'class': 'display-name-link'}).text,
                            review.find('a', {'class': 'title'}).text))
            conn.commit()

        print(f"Sleeping for {sleep} seconds to avoid excessive requests")
        time.sleep(sleep)

def _update_usa(self):
    """ Update whitelist based on usa.gov """
    print 'Getting agencies from usa.gov...'
    url_base = 'https://www.usa.gov'
    letters = _string.ascii_lowercase
    agency_dic = {}
    for letter in letters:
        url = url_base + '/federal-agencies/' + letter
        soup = _BeautifulSoup(_urlopen(url).read())
        links_content = [l for l in soup.find_all('ul')
                         if 'class' in l.attrs and 'one_column_bullet' in l['class']]
        if len(links_content) == 1:
            links_list = links_content[0].find_all('a')
            for agency_html in links_list:
                name_short = self._preprocess_name(agency_html.string)
                agency_dic[name_short] = {'html': agency_html,
                                          'url': url_base + agency_html['href'],
                                          'name_full': agency_html.string,
                                          'source': 'usa.gov'}
                print agency_html.string
        elif len(links_content) == 0:
            pass
        else:
            raise ValueError('Too many list elements found! Please modify the HTML parser.')
    self.agency_dictionary.update(agency_dic)

def __init__(self, parent, login=False, username=None, password=None):
    self._parent = parent
    self.download_page_soup = None
    self.current_archive_id = None
    self.session = s = _requests.Session()
    self.login = l = login

    # If login requested, populate login info
    if l:
        # Set post parameters
        login_data = {
            'username': username,
            'password': password,
            'action': 'auth',
            'redirect': '/'
        }

        self._login_credentials_present(username, password)

        self._parent.throttle.throttle('page')
        r = s.post(_LOGIN_URL, data=login_data)

        if r.status_code != 200:
            raise ConnectionError(
                f'Encountered a problem connecting '
                f'during ArchiveDownloader initialization:'
                f' code = {r.status_code}, login = {l}')

        # Check successful login
        soup = _BeautifulSoup(r.text, 'lxml')
        if soup(text='Log in Failed!'):
            raise NavigatorException('Login credentials rejected by the server.')

def _scrape_contents(self):
    ### Scrape the contents of the currently displayed calendar
    # Scrape the nav page
    soup = _BeautifulSoup(self._browser.page_source, 'lxml')

    # Isolate & store the calendar contents
    self._contents = soup.find('table', {'class': 'table-condensed'})

def _scrape_contents(self):
    ### Scrape the contents of the currently displayed calendar
    # Scrape the nav page
    soup = _BeautifulSoup(self._browser.page_source, 'lxml')

    # Isolate & store the ATT contents
    self._contents = soup.find('table', attrs={
        'id': 'archiveTimes'
    }).find('tbody')

def _get_id(self, query):
    url = 'https://www.imdb.com/find?' + urllib.parse.urlencode({'q': query, 'ref_': 'nv_sr_sm'})
    req = urllib.request.Request(url, headers=self.headers)
    response = urllib.request.urlopen(req)
    bs4_page = _BeautifulSoup(response.read(), 'lxml')
    anchor_tag = bs4_page.find('td', {'class': 'result_text'}).find('a')['href']
    return anchor_tag.split('/')[2]

def soup(content='', headers=None):
    if content.startswith('http'):
        content = url_get(content, headers=headers)
    if content:
        _soup = _BeautifulSoup(content, 'lxml')
        # soup.find('title', attrs={'itemprop': "name"})
        return _soup
    return None

def _get_version_ids(self):
    base_url = 'http://uscode.house.gov/download/annualhistoricalarchives/downloadxhtml.shtml'
    soup = _BeautifulSoup(_urllib2.urlopen(base_url))
    tags = [t for t in soup.find_all('a') if '.zip' in t['href']]
    id_vals = [_re.search('[0-9]+', t['href']).group(0) for t in tags]
    return id_vals

def _get_feed_name(feed_id):
    s = _requests.Session()
    with s:
        r = s.get(_FEED_URL_STEM + feed_id)
        if r.status_code != 200:
            raise ConnectionError(f'Problem connecting: {r.status_code}')
        soup = _BeautifulSoup(r.text, 'lxml')
        return soup.find('span', attrs={'class': 'px13'}).text

def _update_wikipedia(self):
    # do a little bit of name preprocessing here too
    from requests import ConnectionError
    from wikipedia import PageError

    print 'Getting data from Wikipedia...'
    page_current = _wikipedia.page('List_of_federal_agencies_in_the_United_States')
    html = page_current.html()
    subset = html[_re.search('<h2>.*?Legislative Branch', html).start():
                  _re.search('<h2>.*?See also', html).start()]
    soup = _BeautifulSoup(subset)
    links = soup.find_all(lambda x: x.name == 'a' and x.has_attr('href')
                          and '/wiki/' in x['href'] and 'File:' not in x['href'])
    agency_dic = {self._preprocess_name(link['title']): {'html': link,
                                                         'url': 'https://en.wikipedia.org' + link['href'],
                                                         'name_full': link['title'],
                                                         'source': 'wikipedia'}
                  for link in links}

    category_pages = ['https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                      'cmtitle=Category:Defunct_agencies_of_the_United_States_government&cmlimit=500&format=json',
                      'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                      'cmtitle=Category:Committees_of_the_United_States_Senate&cmlimit=500&format=json',
                      'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                      'cmtitle=Category:Joint_committees_of_the_United_States_Congress' +
                      '&cmlimit=500&format=json',
                      'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                      'cmtitle=Category:Committees_of_the_United_States_House_of_Representatives' +
                      '&cmlimit=500&format=json',
                      ]

    for category_page in category_pages:
        content_defunct = _json.loads(_urlopen(category_page).read())
        for result in content_defunct['query']['categorymembers']:
            if result['ns'] == 0:
                url_defunct = 'https://en.wikipedia.org/wiki/' + _re.sub(' ', '_', result['title'])
                print(result['title'])
                try:
                    page_defunct = _wikipedia.page(result['title'])
                    name_short = self._preprocess_name(result['title'])
                    agency_dic[name_short] = {'html': page_defunct.html(),
                                              'url': url_defunct,
                                              'name_full': result['title'],
                                              'source': 'wikipedia'}
                except (ConnectionError, PageError):
                    print('Failed to get agency HTML!')

    self.agency_dictionary.update(agency_dic)

def get_download_soup(self, archive_id):
    self.current_archive_id = archive_id
    s = self.session
    self._parent.throttle.throttle()
    r = s.get(_ARCHIVE_DOWNLOAD_STEM + archive_id)
    if r.status_code != 200:
        raise ConnectionError(
            f'Problem connecting while getting soup from '
            f'{_ARCHIVE_DOWNLOAD_STEM + archive_id}: {r.status_code}')

    self.download_page_soup = _BeautifulSoup(r.text, 'lxml')

    return self.download_page_soup

def login(self, login, password):
    """
    Log in to your router - you need to do this before you request any other page

    :param login: Your login. It can't be anything other than 'admin', so...
    :param password: Password to your admin account
    """
    # Get the main page to obtain the public RSA key
    login_r = self._session.get(self._url + '/loginpage.htm', verify=False)
    if not login_r.ok:
        raise ConnectionError

    # Scrape the key
    login_soup = _BeautifulSoup(login_r.content, features='html.parser')
    pub_key_txt = login_soup.find(id='divpem').text
    pub_key_txt = pub_key_txt.replace('\n', '').strip()

    # This is what the site does (in aes.js) before sending the password.
    # Why does it append 16 random digits to the password? ¯\_(ツ)_/¯
    pwdv = password + ':' + ''.join(
        _random.choice(_string.digits) for i in range(16))

    # We need to use a JavaScript engine...
    ctx = _py_mini_racer.MiniRacer()
    # ...to execute the code that encrypts the password before sending.
    # I couldn't get it working in Python, so I just stole all the required JS
    # and execute it :)
    ctx.eval((_pathlib.Path(__file__).parent / 'stolen_javascript.js').read_text())
    pwd_hash = ctx.eval(f"""
        var key = RSA.getPublicKey("{pub_key_txt}");
        RSA.encrypt("{pwdv}", key);
    """)

    # Get cookies by logging in
    auth_r = self._session.post(
        self._url +
        f'/log/in?un={_urllib_parse.quote(login)}&pw={_urllib_parse.quote(pwd_hash)}'
        f'&rd=%2Fuir%2Fdwrhome.htm&rd2=%2Fuir%2Floginpage.htm&Nrd=1&Nlmb=',
        verify=False)

    def _is_redirect_ok(location: str):
        question = location.index('?')
        return location[:question] != '/uir/loginpage.htm'

    if not auth_r.ok or not _is_redirect_ok(
            auth_r.history[0].headers['Location']):
        raise ConnectionError

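
# Hedged usage sketch for login() above. The enclosing class is not shown in this
# snippet, so the class name `Router`, its constructor signature, and the router
# address below are illustrative assumptions only.
router = Router('https://192.168.0.1')           # hypothetical class and address
router.login('admin', 's3cret-admin-password')   # raises ConnectionError if the login is rejected
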
def get_all_lyrics(artist, song, linesep='\n', timeout=None):
    """Retrieve a list of all the lyrics versions of a song."""
    url = create_url(artist, song)
    response = _requests.get(url, timeout=timeout)
    soup = _BeautifulSoup(response.content, "html.parser")
    lyricboxes = soup.findAll('div', {'class': 'lyricbox'})

    if not lyricboxes:
        raise LyricsNotFound('Cannot download lyrics')

    for lyricbox in lyricboxes:
        for br in lyricbox.findAll('br'):
            br.replace_with(linesep)

    return [lyricbox.text.strip() for lyricbox in lyricboxes]

def get_all_lyrics(artist, song, language='', linesep=' \n ', timeout=None):
    """Retrieve a list of all the lyrics versions of a song."""
    url = create_url(artist, song, language)
    response = _requests.get(url, timeout=timeout)
    soup = _BeautifulSoup(response.content, "html.parser")
    lyricboxes = soup.findAll('div', {'class': 'lyricbox'})

    if not lyricboxes:
        raise LyricsNotFound('Cannot download lyrics')

    for lyricbox in lyricboxes:
        for br in lyricbox.findAll('br'):
            br.replace_with(linesep)

    return [lyricbox.text.strip() for lyricbox in lyricboxes]

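
# Hedged usage sketch for the two get_all_lyrics() variants above (the second one
# adds a `language` argument). The artist/song values are placeholders, not
# examples taken from the original project.
versions = get_all_lyrics('Leonard Cohen', 'Hallelujah', timeout=10)
print(len(versions), 'lyrics version(s) found')
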
def _get_feed_name(self, feed_id):
    s = _requests.Session()
    with s:
        r = s.get(_FEED_URL_STEM + feed_id)
        if r.status_code != 200:
            raise ConnectionError(
                f'Problem connecting while getting feed name: '
                f' {r.status_code}')
        soup = _BeautifulSoup(r.text, 'lxml')
        try:
            feed_name = soup.find('span', attrs={'class': 'px13'}).text
        except AttributeError:
            raise NavigatorException(f'Invalid feed_id ({feed_id}).')

    self.feed_name = feed_name

def _get_ids(self):
    id_vals = []
    years = range(1988, 2017)
    for year in years:
        soup = _BeautifulSoup(
            _urlopen(
                'http://www.legislation.gov.uk/ukpga/{0}'.format(year)))
        n_results = _re.search('has returned ([0-9]+) results',
                               soup.text.lower()).group(1)
        id_vals += [
            str(year) + '_' + str(i) for i in range(1, int(n_results) + 1)
        ]
    return id_vals

def bgip():
    'http://www.89ip.cn/index_<2,2>.html'
    load = _plug.load('bgip')
    allip = set()
    for html in load:
        soup = _BeautifulSoup(html, 'lxml')
        tbody = soup.find('tbody')
        for tr in tbody:
            l = []
            for td in tr:
                for td_str in td:
                    l.append(td_str.strip())
            result = _plug.ip_match(l, protocol='http')
            if result:
                allip.add(result)
    return allip

def _get_data(self, publication_id):
    max_attempts = 10
    attempts = 0
    xml_content = None
    soup = None
    while attempts < max_attempts:
        search_id = _re.sub('_', '/', publication_id)
        try:
            xml_content = _urlopen(
                'http://www.legislation.gov.uk/ukpga/{0}/data.xml'.format(
                    search_id)).read()
            soup = _BeautifulSoup(xml_content, 'xml')
            break
        except:
            attempts += 1

    if 'amendment' in soup.title.text.lower():
        amend = True
    else:
        amend = False

    if 'repeal' in soup.title.text.lower():
        repeal = True
    else:
        repeal = False

    if soup.EnactmentDate is not None:
        date = soup.EnactmentDate['Date']
    elif soup.PrimaryPrelims is not None:
        date = soup.PrimaryPrelims['RestrictStartDate']
    else:
        date = None
        print 'warning! No date found.'

    meta = _format_meta_entry(country=u'united_kingdom',
                              title=soup.title.text,
                              id=publication_id,
                              date=date,
                              type=u'annual',
                              xml=xml_content,
                              amendment=amend,
                              repealed=repeal)

    return meta

def xici():
    'http://www.xicidaili.com/nn/<1,2>'
    load = _plug.load('xici')
    allip = set()
    for html in load:
        soup = _BeautifulSoup(html, 'lxml')
        td = soup.find_all('td', class_='country')
        for b in td:
            l = []
            for d in b.next_siblings:
                if d.string != '\n':
                    l.append(str(d.string))
            if l != []:
                result = _plug.ip_match(l)
                if result:
                    allip.add(result)
    return allip

def llip():
    'http://www.66ip.cn/<2,2>.html'
    load = _plug.load('llip')
    allip = set()
    for html in load:
        soup = _BeautifulSoup(html, 'lxml')
        tab = soup.find('table', bordercolor='#6699ff')
        tr_all = tab.find_all('tr')
        for tr in tr_all:
            l = []
            for td in tr:
                for td_str in td.children:
                    l.append(td_str)
            result = _plug.ip_match(l, protocol='http')
            if result:
                allip.add(result)
    return allip

def __scrape_nav_page(self):
    if self.verbose:
        print('\tScraping navigation page...')

    self.__check_browser()

    # Wait for page to render
    element = _WebDriverWait(self.browser, 10).until_not(
        _EC.text_to_be_present_in_element(
            (_By.XPATH, _first_uri_in_att_xpath), self.current_first_uri))
    self.current_first_uri = self.__get_current_first_uri()

    # Scrape page content
    soup = _BeautifulSoup(self.browser.page_source, 'lxml')

    # Isolate the calendar and the archiveTimes table
    self.calendar_soup = soup.find('table', {'class': 'table-condensed'})
    self.att_soup = soup.find('table', attrs={
        'id': 'archiveTimes'
    }).find('tbody')

def _update_register(self):
    print 'Getting agencies from the federal register...'
    url_base = 'https://www.federalregister.gov/agencies'
    soup = _BeautifulSoup(_urlopen(url_base))
    links = soup.find_all(lambda x: x.name == 'li' and x.has_attr('data-filter-live')
                          and not x.has_attr('class'))
    agency_dic = {}
    for link in links:
        agency = link.find('a')
        name_short = self._preprocess_name(agency.string)
        agency_dic[name_short] = {'html': agency,
                                  'url': agency['href'],
                                  'name_full': agency.string,
                                  'source': 'federal_register'}
        print agency.string
    self.agency_dictionary.update(agency_dic)

def get_lyrics_for_all_languages(artist, song, linesep='\n', timeout=None):
    """Retrieve the lyrics of the song in all languages available"""
    url = create_url(artist, song, '')
    response = _requests.get(url, timeout=timeout)
    soup = _BeautifulSoup(response.content, "html.parser")
    lyricboxes = soup.find('table', {'class': 'banner banner-song'})

    result = dict()
    # Pass the caller's linesep/timeout through instead of hard-coding them
    result['default'] = get_lyrics_by_language(artist, song, '',
                                               linesep=linesep, timeout=timeout)

    for a in lyricboxes.findAll('a', href=True):
        result[a.getText()] = get_lyrics_by_language(artist, song,
                                                     a['href'].split('/')[-1],
                                                     linesep=linesep, timeout=timeout)

    return result

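
# Hedged usage sketch for get_lyrics_for_all_languages() above: the returned dict
# maps each available language label (plus the 'default' key) to that version's
# text. The artist/song values are placeholders.
translations = get_lyrics_for_all_languages('Rammstein', 'Sonne', timeout=10)
for language, text in translations.items():
    print(language, '->', len(text), 'characters')
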
def parse_page_now(url, df, timeout=None):
    response = _requests.get(url, timeout=timeout)
    soup = _BeautifulSoup(response.content, "html.parser")
    data = soup.findAll('li', attrs={'class': 'category-page__member'})
    if not data:
        raise LanguageNotFound('No such language')

    for div in data:
        links = div.findAll('a')
        for a in links:
            # Take everything after the '/wiki/' prefix (str.strip('/wiki/') would
            # also remove stray 'w'/'i'/'k' characters from the ends of the title)
            lyric_link = a['href'].split('/wiki/', 1)[-1]
            artist = lyric_link.split(":")[0]
            title = lyric_link.split(":")[1]
            if artist == "Category":
                continue
            df = df.append({'Artist': artist, 'Title': title}, ignore_index=True)

    if soup.find('div', attrs={'class': 'category-page__pagination'}) is None:
        return df

    next_page_text = soup.find(
        'div', attrs={'class': 'category-page__pagination'}).find(
            'a', attrs={'class': 'category-page__pagination-next wds-button wds-is-secondary'})

    if next_page_text is not None:
        next_page_url = next_page_text['href']
        df = parse_page_now(next_page_url, df)

    return df

def scrape(login: str, password: str):
    session = _requests.Session()
    login_site_r = session.get('https://mobilevikings.pl/en/account/login/')
    login_soup = _BeautifulSoup(login_site_r.content, features='html.parser')
    csrf_middle_token = login_soup.find('input', attrs={'name': 'csrfmiddlewaretoken'})['value']

    payload = {
        'csrfmiddlewaretoken': csrf_middle_token,
        'next': '/mysims/',
        'username': login,
        'password': password
    }

    login_r = session.post(
        'https://mobilevikings.pl/en/account/login/',
        data=payload,
        headers={'Referer': 'https://mobilevikings.pl/en/account/login/'})

    json_r = session.get(
        'https://mobilevikings.pl/mysims/sim/148128/balance/json/',
        headers={'Referer': login_r.url})

    return _json.loads(json_r.content)

def scrape(no_cache=False, cache_file_name='vote-results.json', cache_expire_time=24 * 60 * 60):
    """
    This function downloads the site http://ewybory.eu/sondaze and scrapes it for support data.
    It then saves the results in a cache file (named vote-results.json by default) and uses
    this file for the next 24 hours. If some party's support can't be read for some reason,
    it will be -1.

    :param no_cache: Don't save results in the cache file
    :param cache_file_name: Alternative cache file path
    :param cache_expire_time: Time (in seconds) after which the cache file will be discarded
        and the site will be downloaded again
    :return: Dict with results
    """
    result = {
        'success': False,
        'support': {
            'pis': -1,
            'ko': -1,
            'lewica': -1,
            'konfederacja': -1,
            'psl': -1,
            'polska2050': -1
        },
        'growth': {
            'pis': 0,
            'ko': 0,
            'lewica': 0,
            'konfederacja': 0,
            'psl': 0,
            'polska2050': 0
        }
    }

    get_site = False
    try:
        if no_cache:
            raise ValueError
        modify = _os.path.getmtime(cache_file_name)
        if modify < _time.time() - cache_expire_time:
            # If cache file is older than 24h
            raise IOError
        with open(cache_file_name, 'rb') as f:
            print('Got results from cache')
            return _json.load(f)
    except FileNotFoundError:
        print('Cache file not found!')
        get_site = True
    except IOError:
        print('Cache file too old!')
        get_site = True
    except ValueError:
        print('No-cache set to true, not touching cache files!')
        get_site = True

    if get_site:
        print('Getting the site from internet...')
        res = _get_site()
        if res is None:
            print("Can't get the site! Most probably no internet :/")
            return result

        soup = _BeautifulSoup(res.content, 'html.parser')
        print('Parsing with soup...')
        div = soup.find('div', class_='entry-content clearfix')
        table = div.find('table')
        tr = table.find('tr')
        ths = tr.find_all('th', class_='name_party_poll')

        name_party = {
            'pis': 'pis',
            'ko': 'ko',
            'lewica': 'lewica',
            'konfederacja': 'konfederacja',
            'psl': 'psl',
            'polska 2050': 'polska2050',
            'n.solidarność': 'nowasolidarnosc'
        }

        for i in range(len(name_party)):
            party = name_party.get(_get_name(ths[i]).lower(), None)
            if party is None:
                print("Looks like some unknown party is on the graph? "
                      "It's possible that this means that this repo needs an update - "
                      "feel free to make an issue on GitHub about that :)")
                continue
            sup = _get_sup(ths[i])
            result['support'][party] = sup[0]
            result['growth'][party] = sup[1]

        result['success'] = True

        if not no_cache:
            with open(cache_file_name, 'w') as f:
                _json.dump(result, f)

    return result

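
# Hedged usage sketch for scrape() above: the first call downloads
# http://ewybory.eu/sondaze and writes vote-results.json; later calls within
# cache_expire_time are served from that file instead.
poll = scrape()
if poll['success']:
    print('PiS support:', poll['support']['pis'], 'change:', poll['growth']['pis'])
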
def _get_data(self, publication_id):
    import bs4

    search_term = _re.sub('_', '/', publication_id)

    text_soup = None
    text_content = None
    try:
        text_url = 'https://www.congress.gov/bill/{0}/text'.format(
            search_term)
        text_soup = _BeautifulSoup(_urlopen(text_url))
    except:
        pass

    if text_soup is not None:
        if text_soup.find('pre') is not None:
            text_content = str(text_soup.find('pre'))
        else:
            text_content = str(
                text_soup.find('table', attrs={'class': 'lbexTableStyleEnr'}))

    meta_url = 'https://www.congress.gov/bill/{0}/all-info'.format(
        search_term)
    meta_soup = _BeautifulSoup(_urlopen(meta_url))

    title = _re.search(
        ': (.*)',
        meta_soup.find('meta', attrs={'name': 'dc.title'})['content'])
    if title is not None:
        title = title.group(1)

    date = meta_soup.find('meta', attrs={'name': 'dc.date'})['content']

    sponsor = meta_soup.find('meta', attrs={'name': 'dc.creator'})
    if sponsor is not None:
        sponsor = sponsor['content']
        sponsor_party = _re.search(sponsor + ' \[([A-Z])', meta_soup.text)
        if sponsor_party is not None:
            sponsor_party = sponsor_party.group(1)
        else:
            sponsor_party = None

    cosponsors = [
        tag.text for tag in meta_soup.find_all('a', href=True)
        if 'member/' in tag['href'] and sponsor not in tag.text
    ]

    policy_area = _re.search('Policy Area:\s*(.*)', meta_soup.text)
    if policy_area is not None:
        policy_area = policy_area.group(1)

    committee_entries = meta_soup.find_all('tr', class_='committee')
    referred = [entry.find('th').text for entry in committee_entries]

    hearings_held = []
    for entry in committee_entries:
        committee_name = entry.find('th').text
        actions = [entry.find_all('td')[1].text]
        entry = entry.next_sibling
        while type(entry) == bs4.element.Tag and (
                'class' not in entry.attrs or 'committee' not in entry['class']):
            actions.append(entry.find_all('td')[1].text)
            entry = entry.next_sibling
            if type(entry) == bs4.element.NavigableString:
                break
        hearings = [action for action in actions if 'Hearing' in action]
        hearings_held += [committee_name] * len(hearings)

    if 'amend' in title:
        amendment = True
    else:
        amendment = False

    if 'resolution' in publication_id:
        subtype = u'resolution'
    else:
        subtype = u'law'

    meta = _format_meta_entry(country=u'united_states',
                              title=title,
                              id=publication_id,
                              date=date,
                              type=u'annual',
                              subtype=subtype,
                              amendment=amendment,
                              sponsor=sponsor,
                              sponsor_party=sponsor_party,
                              cosponsors=cosponsors,
                              referred=referred,
                              hearings=hearings_held,
                              policy_area=policy_area,
                              html=text_content)

    return meta

from v2ex.errors import Need2FA, NeedLogin, SigninFailed
from v2ex.utils import parse_cookies

DEFAULT_HEADERS = {
    'referer': 'https://www.v2ex.com/',
    'accept-language': 'en,zh;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}

log = logging.getLogger(__name__)

BeautifulSoup = lambda text: _BeautifulSoup(text, features='lxml')


def check_session(response):
    # "The page you want to view requires logging in first"
    if '你要查看的页面需要先登录' in response.text:
        raise NeedLogin
    # "Two-step verification login"
    if '两步验证登录' in response.text:
        raise Need2FA


def logged_in(response):
    if response.url.path == '/2fa':
        return False
    # "Are you sure you want to log out of V2EX?"
    if '确定要从 V2EX 登出?' in response.text:
        return True
    return False

def _get_data(self, publication_id):
    from urllib2 import HTTPError

    def get_xml(xml_link):
        try:
            xml_data = _urlopen(xml_link).read()
            if 'xml' in xml_data[0:100]:
                return xml_data
            else:
                return None
        except HTTPError:
            return None

    def get_html(html_link):
        html_response = _urlopen(html_link)
        html_data = html_response.read()
        return html_data

    parl_url = 'http://www.parl.gc.ca'
    html_base = 'http://www.parl.gc.ca/HousePublications/Publication.aspx?Language=E&Mode=1&DocId={0}&Col=1'

    html_docs = []
    xml_docs = []

    initial_html = _urlopen(html_base.format(publication_id)).read()
    initial_soup = _BeautifulSoup(initial_html)

    full_doc_links = [
        tag for tag in initial_soup.find_all('a')
        if 'Click here for the entire document' in tag.text
    ]
    next_links = [
        tag for tag in initial_soup.find_all('a') if 'Next Page' in repr(tag)
    ]

    full_link_success = False
    if len(full_doc_links) == 1:
        url = parl_url + full_doc_links[0]['href']
        try:
            print 'full link...'
            print publication_id, url
            html_local = get_html(url)
            xml_local = get_xml(url + '&xml=true')
            html_docs.append(html_local)
            xml_docs.append(xml_local)
            full_link_success = True
        except:
            pass

    next_link_success = False
    if full_link_success is False and len(next_links) > 0:
        try:
            while len(next_links) > 0:
                file_regex = _re.search('File=[0-9]+', next_links[0]['href'])
                # occasionally pages are malformed with "next" links that don't actually go anywhere
                if file_regex is not None:
                    url = html_base.format(
                        publication_id) + '&' + file_regex.group(0)
                    print 'next links...'
                    print publication_id, url
                    html_docs.append(get_html(url))
                    xml_docs.append(get_xml(url + '&xml=true'))
                    next_links = [
                        tag for tag in _BeautifulSoup(
                            html_docs[-1]).find_all('a')
                        if 'Next Page' in repr(tag)
                    ]
                else:
                    break
            next_link_success = True
        except:
            pass

    if full_link_success is False and next_link_success is False:
        print 'failsafe'
        print publication_id, html_base
        xml_docs.append(
            get_xml(html_base.format(publication_id) + '&xml=true'))
        html_docs.append(initial_html)

    self.data[publication_id].update({'html': html_docs, 'xml': xml_docs})

    return self.data[publication_id]

def _get_ids(self): """ Note structure here is a little different than later classes - metadata and IDs retrieved at same time. """ def get_meta(bill_content): """ Get metadata search results based on a given URL. """ title = bill_content.BillTitle.find(name='Title', language='en').text if 'amend' in title.lower(): amend = True else: amend = False # rarely, no published version of a bill is available publication_tags = [ t for t in bill_content.find_all('Publication') if t.find(name='Title', language='en').text == 'Royal Assent' ] if len(publication_tags) == 1: publication_id = publication_tags[0]['id'] else: publication_id = None # all other metadata appear to be consistently present date = bill_content.Events.LastMajorStageEvent.Event['date'] session = bill_content.ParliamentSession['parliamentNumber'] subtype = bill_content.BillType.find(name='Title', language='en').text sponsor = bill_content.SponsorAffiliation.Person.FullName.text sponsor_party = bill_content.SponsorAffiliation.PoliticalParty.find( name='Title', language='en').text majority_party = bill_content.PrimeMinister.PoliticalParty.find( name='Title', language='en').text committee_tags = bill_content.find_all(name='Committee', accronym=True) committee_names = [t['accronym'] for t in committee_tags] committee_data = { c: committee_names.count(c) for c in set(committee_names) } metadata = _format_meta_entry(country=u'canada', title=title, id=publication_id, date=date, session=session, type=u'annual', subtype=subtype, amendment=amend, sponsor=sponsor, sponsor_party=sponsor_party, majority_party=majority_party, hearings=committee_data) return metadata base_url = 'http://www.parl.gc.ca{0}' bill_types = [ '/LegisInfo/Result.aspx?BillType=Senate%20Government%20Bill' + '&BillStatus=RoyalAssentGiven&Language=E&Mode=1', '/LegisInfo/Result.aspx?BillType=Private%20Member%E2%80%99s%20Bill' + '&BillStatus=RoyalAssentGiven&Language=E&Mode=1', '/LegisInfo/Result.aspx?BillType=House%20Government%20Bill' + '&BillStatus=RoyalAssentGiven&Language=E&Mode=1', '/LegisInfo/Result.aspx?BillType=Senate%20Public%20Bill' ] searches = [] for bill_type in bill_types: search_content = _BeautifulSoup( _urlopen(base_url.format(bill_type))) sessions = [ _re.sub('&Page=1', '&download=xml', tag['href']) for tag in search_content.find_all('a') if _re.search( '[0-9]{2}-[0-9]\s*\([0-9]+\)', tag.text) is not None ] searches += sessions id_vals = [] for s in searches: url = base_url.format(s) content = _BeautifulSoup(_urlopen(url).read(), features='xml') bills = content.find_all('Bill') for bill in bills: meta = get_meta(bill) if meta['id'] not in self.log_data['Annual']['Canada']: id_vals.append(meta['id']) self.data[meta['id']] = meta return id_vals
def _get_ids(self):
    # URL corresponding to the following search:
    # - All congresses from 1989 forward (first date with full bill text and metadata)
    # - Only legislation that can become law
    # - Only public bills/laws
    # - Only actual laws
    max_attempts = 10
    id_vals = []

    search_url = 'https://www.congress.gov/advanced-search/legislation?query=%7B%22congresses%22%3A%5B%22114%22%2C'\
                 '%22113%22%2C%22112%22%2C%22111%22%2C%22110%22%2C%22109%22%2C%22108%22%2C%22107%22%2C%22106%22%2C'\
                 '%22105%22%2C%22104%22%2C%22103%22%2C%22102%22%2C%22101%22%5D%2C%22restrictionType%22%3A%22field%'\
                 '22%2C%22restrictionFields%22%3A%5B%22billSummary%22%2C%22allBillTitles%22%5D%2C%22wordVariants%2'\
                 '2%3A%22true%22%2C%22legislationTypes%22%3A%5B%22hr%22%2C%22hjres%22%2C%22s%22%2C%22sjres%22%5D%2'\
                 'C%22legislationScope%22%3A%22Public%22%2C%22legislativeAction%22%3A%22115%22%2C%22legislativeAct'\
                 'ionWordVariants%22%3A%22true%22%2C%22sponsorTypes%22%3A%5B%22sponsor%22%2C%22sponsor%22%5D%2C%22'\
                 'sponsorTypeBool%22%3A%22Or%22%2C%22committeeBoolType%22%3A%22Or%22%2C%22legislationCanBecomeLaw%'\
                 '22%3A%22true%22%2C%22sponsorState%22%3A%22One%22%2C%22sourceTab%22%3A%22legislation%22%7D'

    driver = _webdriver.Firefox()
    driver.get(search_url)

    n_results = _Select(driver.find_element_by_name('pageSize'))
    n_results.select_by_visible_text('250 per page')

    while True:
        soup = _BeautifulSoup(driver.page_source)
        result_tags = [
            t for t in soup.find_all('span')
            if 'class' in t.attrs and 'result-heading' in t['class']
        ]
        result_urls = list(set([t.a['href'] for t in result_tags]))
        new_ids = [
            _re.search('bill/(.*?)\?', url).group(1) for url in result_urls
        ]
        new_ids = [_re.sub('/', '_', e) for e in new_ids]
        print new_ids
        id_vals += new_ids

        attempts = 0
        while attempts < max_attempts:
            try:
                next_button = driver.find_element_by_class_name('next')
                next_button.click()
                _sleep(5)
                break
            except:
                print('reattempting pageforward...')
                _sleep(5)
                attempts += 1

        attempts = 0
        closed = False
        while attempts < max_attempts:
            print 'checking closure...'
            try:
                closure_check = driver.find_element_by_class_name(
                    'next').get_attribute('outerHTML')
                if 'next off' in closure_check:
                    closed = True
                break
            except:
                attempts += 1
                print('reattempting closure check...')
                _sleep(5)

        if closed:
            break

    return id_vals