Beispiel #1
0
def get_article(link):
    """Download a web page and return the text of its paragraphs.

    The page is fetched with ``requests`` and parsed with BeautifulSoup
    using the ``lxml`` parser.  The text of every ``<p>`` element is joined
    with newlines, surrounding whitespace is stripped, and bracketed numeric
    footnote markers such as ``[12]`` are removed.

    Args:
        link: URL of the page; it is converted with ``str()`` before the
            request is made.

    Returns:
        The cleaned paragraph text, or an empty string when the page
        contains no ``<p>`` elements.
    """
    # Only the URL is passed: a second positional argument would be taken
    # by requests as query-string parameters, not as a text encoding.
    response = requests.get(str(link))
    html = bs4.BeautifulSoup(response.text, "lxml")

    # Join once (not once per paragraph) so a page without paragraphs
    # yields "" instead of leaving the result unbound.
    article = '\n'.join(para.text for para in html.select("p"))

    # removing the footnote markers, e.g. "[3]"
    return re.sub(r'\[\d+\]', '', article.strip())
Beispiel #2
0
def _extract_elements(html, selector):
    """Extract the value(s) of the elements matching ``selector``.

    ``selector`` may carry a suffix in the form ``"<selector>@<attr>"``; the
    part after ``@`` is split off and handed to ``_get_value`` together with
    each matched element.  Without a suffix, ``attr`` is the empty string.

    Returns:
        A list of values when several elements match, a single value when
        exactly one matches, and ``""`` when nothing matches or when
        ``html.select`` does not return a list.
    """
    attr = ""
    if "@" in selector:
        selector, attr = selector.split("@")
    elements = html.select(selector)

    # isinstance (rather than an exact type check) so that list subclasses
    # are accepted too -- e.g. the ResultSet returned by BeautifulSoup.
    if isinstance(elements, list):
        if len(elements) > 1:
            logger.info("List result: %s", selector)
            return [_get_value(e, attr) for e in elements]
        if len(elements) == 1:
            logger.info("Single result: %s", selector)
            return _get_value(elements[0], attr)

    logger.info("Failed to extract: %s", selector)
    return ""
Beispiel #3
0
def scrape_page(url, selector):
    """Scrape a URL, yielding the ``href`` of each element matching ``selector``.

    The markup is obtained through ``parse_markup_in_url(url)``.  A warning
    is logged, and nothing is yielded, when the parsed markup is empty or
    when no element matches.  Matched elements that are falsy or that have
    no (or an empty) ``href`` are skipped with a warning.

    Args:
        url: Address of the page to scrape.
        selector: Selector passed to ``html.select`` to find the links.

    Yields:
        The non-empty ``href`` value of each matching element.
    """
    html = parse_markup_in_url(url)
    if not html:
        logger.warning("Empty html for: %s", url)
        # Bare return: this is a generator, so a returned value would
        # never reach a caller that iterates over it.
        return

    investment_links = html.select(selector)
    if not investment_links:
        logger.warning("No links for: %s", url)
        return

    for investment_link in investment_links:
        if not investment_link:
            logger.warning("Empty investment links in page %s", url)
            continue
        investment_url = investment_link.get("href", "")
        if not investment_url:
            logger.warning("Empty investment link")
            continue
        yield investment_url
Beispiel #4
0
    def search_for_word(self, word, depth="shallow"):
        """Look up a Japanese word and return one formatted result per entry.

        The search page for ``word`` is loaded through ``self._extract_html``
        and every ".concept_light.clearfix" element is passed to
        ``self._extract_dictionary_information``.

        With ``depth="shallow"`` only the first results page is processed.
        With ``depth="deep"`` the ".more" link is followed page after page
        until a page has none.  Any other ``depth`` gives an empty list.
        """
        entry_selector = ".concept_light.clearfix"

        self._extract_html(uri_for_search(word))
        first_page = self.html.select(entry_selector)

        if depth != "shallow" and depth != "deep":
            return []

        # Both modes start with the entries of the first results page.
        formatted = [
            self._extract_dictionary_information(entry) for entry in first_page
        ]
        if depth == "shallow":
            return formatted

        # Deep mode: keep following the "More Words" link while there is one.
        next_link = self.html.select_one(".more")
        while next_link:
            href = next_link.get("href")
            response = requests.get(r"http:" + href, timeout=5)
            page = BeautifulSoup(response.content, "html.parser")
            formatted.extend(
                self._extract_dictionary_information(entry)
                for entry in page.select(entry_selector)
            )
            next_link = page.select_one(".more")

        return formatted
Beispiel #5
0
 def check_html(self):
     """Call ``html.select`` with an empty options list; it must not crash."""
     # TODO: make this its own test suite!
     no_options = []
     html.select("box", no_options)
Beispiel #6
0
 def check_html(self):
     """Call ``html.select`` with an empty options list; it must not crash."""
     # TODO: make this its own test suite!
     no_options = []
     html.select("box", no_options)
Beispiel #7
0
                error('{0:horrific_exception}<br /><b>{0:message}:</b> {1}'.format(L, exc))
                if debug and post.getvalue('debug'):
                    raise

    # Call generator
    generate()

# Option labels for the selects: every template is formatted with L,
# which resolves the named format specs (e.g. "{:basic}").
operatorLevels = [
    template.format(L)
    for template in (
        '1. {:basic} (+, -, *, /)',
        '2. {:advanced} (i++, ++i, +=, *=, ...)',
        '3. {:bitwise_operators} (&, |, ^, %, ...)',
        '4. {:bit_shifts} (<<, >>, ...)',
    )
]
pointerLevels = [
    template.format(L)
    for template in ('{:none}', '{:single_references}', '{:multiple_references}')
]
functionLevels = [
    template.format(L)
    for template in ('{:none}', '{:by_value}', '{:by_reference}')
]

# Build one select per option list, passing the posted value as `selected`.
for select_name, select_options in (
    ('operators', operatorLevels),
    ('pointers', pointerLevels),
    ('functions', functionLevels),
):
    form[select_name] = html.select(
        select_name, select_options, selected=post.getvalue(select_name))

# Build the checkbox groups; the boxes of a group are joined with line breaks.
identifier_boxes = [
    html.checkbox('void', '{:void_functions}'.format(L), checked=(default or post.getbool('void'))),
    html.checkbox('float', '{:floating_point}'.format(L), checked=post.getbool('float')),
    html.checkbox('arrays', '{:arrays}'.format(L), disabled=True),
    html.checkbox('strings', '{:strings}'.format(L), disabled=True),
]
form['identifiers'] = '<br />'.join(identifier_boxes)

additional_boxes = [
    html.checkbox('conditionals', '{:conditional_statements}'.format(L), checked=post.getbool('conditionals')),
    html.checkbox('loops', '{:loops}'.format(L), disabled=True),
]
form['additionals'] = '<br />'.join(additional_boxes)
form['toggleDebug'] = '''