class WebpageSummarizer(object):
    """
    Generates summary of a given web page.

    The browser session is closed at the end of every ``summarize_webpage``
    call and re-created on demand, so one instance can be used for several
    pages in a row.
    """

    def __init__(self):
        self.browser = self._new_browser()

    @staticmethod
    def _new_browser():
        """
        Builds a browser that identifies itself as a desktop Firefox.

        :return: A configured ``StatefulBrowser`` instance.
        """
        browser = StatefulBrowser(
            user_agent='Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0')
        browser.session.headers.update({'Upgrade-Insecure-Requests': '1'})
        return browser

    def summarize_webpage(self, url, summarization_ratio):
        """
        Takes a web page URL and returns the title and a summary of the web page.

        Any error while fetching or parsing the page is printed and whatever
        was collected up to that point is returned (empty strings by default).

        :param url: Web page URL.
        :param summarization_ratio: Fraction of original text to include in the summary.
        :return: Web page title and summarized web page text. The full paragraph
            text is returned in place of the summary when no summary could be
            produced.
        """
        title = summarized_text = ''
        try:
            # The ``finally`` clause below closes the browser; recent
            # MechanicalSoup releases drop the session on close(), so build a
            # fresh browser before reusing this instance.
            if getattr(self.browser, 'session', None) is None:
                self.browser = self._new_browser()

            self.browser.open(url)
            page = self.browser.get_current_page()

            # Find all the paragraphs because they contain the main web page text
            page_text = ' '.join(p.text for p in page.find_all('p'))

            # A page without a <title> must not prevent the summary
            if page.title is not None:
                title = page.title.text.strip()

            try:
                summarized_text = summarize(page_text, ratio=summarization_ratio).strip()
            except ValueError as e:
                # NOTE(review): assumes summarize() signals too-short input
                # with ValueError (gensim-style) - confirm against the import.
                print(e)
                summarized_text = ''

            # Fall back to the whole text when no summary was produced
            if summarized_text == '':
                summarized_text = page_text
        except Exception as e:
            # Best effort: report the problem and return what we have
            print(e)
        finally:
            self.browser.close()
        return title, summarized_text
def scrape_HTML(url):
    """Scrapes the HTML from W4MPJobs.

    Opens ``url``, selects the desired radio buttons on the first form of the
    page, submits that form and returns the body of the response. The browser
    is always closed, even when a step fails.

    :param url: URL of the page holding the search form.
    :return: Text of the response obtained by submitting the form.
    """
    browser = StatefulBrowser()
    try:
        page = browser.open(url)
        form = Form(page.soup.form)

        # Selects all on the number of results radio button
        number_results_data = {"ctl00$MainContent$RadioButtonList2": 9999}
        form.set_radio(number_results_data)

        # Selects NMW or more on salary radio button
        salary_data = {"ctl00$MainContent$rblSalary": "nmwormore"}
        form.set_radio(salary_data)

        # Selects outside London on the location radio button.
        # Alternative values noted by the original author: "inlondon", "both"
        location_data = {"ctl00$MainContent$rblJobs": "outside"}
        form.set_radio(location_data)

        # Submits the form and reads the body while the session is still open
        response = browser.submit(form, page.url)
        return response.text
    finally:
        # Release the HTTP session whether or not the scrape succeeded
        browser.close()
class Session:
    """Browser-backed Facebook session offering log in, profile lookup and search.

    ``browser_wrapper`` is the object through which pages are fetched; it must
    provide ``open(browser, url)`` and ``submit_selected(browser)``.
    The class can be used as a context manager; leaving the ``with`` block
    logs out when a connection is active.
    """

    BASE_URL = 'https://m.facebook.com'

    def __init__(self, browser_wrapper):
        self._connected = False
        self._current_html = None
        self._browser_wrapper = browser_wrapper
        self._browser = StatefulBrowser()
        # Headers must be set on the underlying requests session to be sent;
        # assigning an ``addHeaders`` attribute on the browser has no effect.
        self._browser.session.headers.update({
            'User-Agent': 'Firefox',
            'Accept-Language': 'en-US,en;q=0.5',
        })

    def __del__(self):
        self._dispose()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._dispose()

    @property
    def connected(self):
        """Whether a successful log in is currently active."""
        return self._connected

    def log_in(self, username, password):
        """Log in with the given credentials.

        The session is marked as connected only when the page reached after
        submitting the form contains the search form.

        :param username: Value for the ``email`` field of the login form.
        :param password: Value for the ``pass`` field of the login form.
        :return: This session, to allow call chaining.
        :raises LogInError: If any step of the login sequence fails.
        """
        try:
            # Log in to non-mobile site is more reliable
            self._browser_wrapper.open(self._browser, 'https://www.facebook.com')
            self._browser.select_form('form[id="login_form"]')
            self._browser['email'] = username
            self._browser['pass'] = password
            self._browser_wrapper.submit_selected(self._browser)
            # Check if we really are in account profile page
            if self._browser.get_current_page().find('form', action='/search/top/'):
                self._connected = True
        except Exception as err:
            # Keep the original failure attached for diagnosis
            raise LogInError(f'Unable to log in as {username}') from err
        return self

    def log_out(self):
        """Close the browser and mark the session as disconnected (no-op when not connected)."""
        if self._connected:
            self._browser.close()
            self._connected = False

    def profile_info(self, id_):
        """Retrieve informations for a given profile.

        :param id_: Profile identifier appended to ``BASE_URL``.
        :return: Tuple ``(name, image, info)``, or ``None`` when the page
            could not be fetched or parsed.
        :raises NotConnectedError: If the session is not logged in.
        """
        self._ensure_connected()
        try:
            self._browser_wrapper.open(self._browser, f'{Session.BASE_URL}/{id_}')
            page = self._browser.get_current_page()
            name = self._sanitize_title(page.find('title').text)
            image = parse_image(self._browser.get_current_page(), name)
            info = parse_info(self._browser.get_current_page())
            return name, image, info
        except Exception:
            return None

    def search(self, query):
        """
        Execute search of a given text returning a tuple with ID, descriptions and URI.

        Uses the authenticated search when connected and the public directory
        otherwise. Returns ``None`` when the page could not be fetched or parsed.
        """
        from urllib.parse import quote

        # Percent-encode each term so characters such as '&' or '#' cannot
        # alter the structure of the URL; terms are joined with '+'.
        url_query = '+'.join(quote(term, safe='') for term in query.split())
        if self._connected:
            url_path = f'/search/top/?q={url_query}'
        else:
            url_path = f'/public/{url_query}'
        try:
            # url_path already carries the query; it must not be appended again
            self._browser_wrapper.open(
                self._browser, f'{Session.BASE_URL}{url_path}')
            return parse_search(self._browser.get_current_page(), Session.BASE_URL)
        except Exception:
            return None

    def _ensure_connected(self):
        """Raise ``NotConnectedError`` unless the session is logged in."""
        if not self._connected:
            raise NotConnectedError('No active connection or required login')

    def _sanitize_title(self, title):
        """Return the part of ``title`` before the first '-', stripped."""
        # Handle cases like 'Some One - Home'
        if '-' in title:
            return title.split('-')[0].strip()
        return title

    def _dispose(self):
        """Log out if a connection is still active."""
        if self._connected:
            self.log_out()
from mechanicalsoup import StatefulBrowser

# Demo: open python.org, follow the blogs link, fill in the selected form's
# "q" field, submit it and print the resulting URL and the links on the page.
browser = StatefulBrowser()
try:
    browser.open("http://www.python.org/")
    browser.follow_link("/blogs/")
    #browser.follow_link(text="Python News")
    print(browser.get_url())

    # With no selector, select_form() picks the page's first form
    browser.select_form()
    browser.get_current_form().print_summary()
    browser["q"] = "Raymond Hettinger"
    browser.submit_selected()
    print(browser.get_url())

    print("-" * 40)
    print(browser.links())
finally:
    # Release the HTTP session even when one of the steps above fails
    browser.close()