def get_article(link):
    """Fetch an article page and return its cleaned paragraph text.

    Parameters
    ----------
    link : str (or object convertible to str)
        URL of the article page (a MediaWiki-style page is assumed —
        the original also selected ``#firstHeading`` but never used it).

    Returns
    -------
    str
        All ``<p>`` text joined with newlines, with footnote markers
        such as ``[12]`` removed.
    """
    # BUG FIX: the original called requests.get(str(link), "utf8"),
    # which passes "utf8" as the `params` argument and appends "?utf8"
    # to the request URL. The literal was almost certainly meant as an
    # encoding hint and is simply dropped here.
    response = requests.get(str(link))
    html = bs4.BeautifulSoup(response.text, "lxml")
    paragraphs = html.select("p")
    # Join once; the original re-joined the full list on every loop
    # iteration, shadowing its own loop variable.
    article = "\n".join(p.text for p in paragraphs)
    # Strip footnote references like "[1]" (raw string for the regex).
    return re.sub(r"\[\d+\]", "", article.strip())
def _extract_elements(html, selector):
    """Extract element value(s) from *html* by CSS selector.

    A trailing ``@name`` on the selector switches extraction from the
    element text to the ``name`` attribute. Returns a list when several
    elements match, a single value when exactly one matches, and ``""``
    when nothing matches.
    """
    attribute = ""
    if "@" in selector:
        selector, attribute = selector.split("@")

    matches = html.select(selector)
    # NOTE(review): this deliberately mirrors the original's exact
    # `type(...) is list` check (a list *subclass* would fall through).
    if type(matches) is list:
        if len(matches) == 1:
            logger.info("Single result: %s", selector)
            return _get_value(matches[0], attribute)
        if len(matches) > 1:
            logger.info("List result: %s", selector)
            return [_get_value(m, attribute) for m in matches]

    logger.info("Failed to extract: %s", selector)
    return ""
def scrape_page(url, selector):
    """Scrape *url*, yielding the ``href`` of every link matching *selector*.

    Parameters
    ----------
    url : str
        Page to fetch (parsed via ``parse_markup_in_url``).
    selector : str
        CSS selector for the link elements.

    Yields
    ------
    str
        Each non-empty ``href`` attribute value found.
    """
    html = parse_markup_in_url(url)
    if not html:
        # Lazy %-style logger args; the original eagerly formatted with %.
        logger.warning("Empty html for: %s", url)
        # BUG FIX: `return []` inside a generator only stashes the list
        # on StopIteration — callers never see it. Bare return is the idiom.
        return
    investment_links = html.select(selector)
    if not investment_links:
        logger.warning("No links for: %s", url)
        return
    for investment_link in investment_links:
        if not investment_link:
            logger.warning("Empty investment links in page %s", url)
            continue
        investment_url = investment_link.get("href", "")
        if not investment_url:
            logger.warning("Empty investment link")
            continue
        yield investment_url
def search_for_word(self, word, depth="shallow"):
    """Take a japanese word and spit out well-formatted dictionaries
    for each entry.

    Parameters
    ----------
    word : str
        The Japanese word to look up.
    depth : str
        ``"shallow"`` parses only the first page of results;
        ``"deep"`` additionally follows every "More Words" link.
        Any other value yields an empty list (preserved from the
        original control flow).

    Returns
    -------
    list
        One dictionary per result entry.
    """
    self._extract_html(uri_for_search(word))
    results = self.html.select(".concept_light.clearfix")

    # Both depths parse the first page identically; the original
    # duplicated this loop verbatim in each branch.
    fmtd_results = []
    if depth in ("shallow", "deep"):
        fmtd_results = [
            self._extract_dictionary_information(r) for r in results]

    if depth == "deep":
        # Presumably: when more than 20 results exist, the page carries
        # a "More Words" link — follow it until it disappears.
        # (TODO confirm against the site; comment in original was ambiguous.)
        more = self.html.select_one(".more")
        while more:
            link = more.get("href")
            response = requests.get("http:" + link, timeout=5)
            html = BeautifulSoup(response.content, "html.parser")
            results = html.select(".concept_light.clearfix")
            fmtd_results.extend(
                self._extract_dictionary_information(r) for r in results)
            more = html.select_one(".more")

    return fmtd_results
def check_html(self):
    """Smoke-check for the ``html`` form-helper module."""
    # @TODO: make this its own test suite!
    # html.select must not crash when given an empty options list:
    html.select("box", [])
error('{0:horrific_exception}<br /><b>{0:message}:</b> {1}'.format(L, exc)) if debug and post.getvalue('debug'): raise # Call generator generate() # Select options operatorLevels = [str_.format(L) for str_ in ( '1. {:basic} (+, -, *, /)', '2. {:advanced} (i++, ++i, +=, *=, ...)', '3. {:bitwise_operators} (&, |, ^, %, ...)', '4. {:bit_shifts} (<<, >>, ...)')] pointerLevels = [str_.format(L) for str_ in ('{:none}', '{:single_references}', '{:multiple_references}')] functionLevels = [str_.format(L) for str_ in ('{:none}', '{:by_value}', '{:by_reference}')] # Build selects form['operators'] = html.select('operators', operatorLevels, selected=post.getvalue('operators')) form['pointers'] = html.select('pointers', pointerLevels, selected=post.getvalue('pointers')) form['functions'] = html.select('functions', functionLevels, selected=post.getvalue('functions')) # Build Checkboxes form['identifiers'] = '<br />'.join([ html.checkbox('void', '{:void_functions}'.format(L), checked=(default or post.getbool('void'))), html.checkbox('float', '{:floating_point}'.format(L), checked=post.getbool('float')), html.checkbox('arrays', '{:arrays}'.format(L), disabled=True), html.checkbox('strings', '{:strings}'.format(L), disabled=True) ]) form['additionals'] = '<br />'.join([ html.checkbox('conditionals', '{:conditional_statements}'.format(L), checked=post.getbool('conditionals')), html.checkbox('loops', '{:loops}'.format(L), disabled=True) ]) form['toggleDebug'] = '''