def get_raw_entities(self):
    # Collect entities from the "At a Glance" accordion section of the page.
    entities = []
    items = self.root.xpath('.//h2[@class="accordion-title" and contains(., "At a Glance")]/following-sibling::div//p')
    for item in items:
        # The leading text node is a run of star characters; its length is the rating.
        num_stars = len(item.text.strip())
        starred = num_stars == 3
        name = item.xpath('.//strong')[0].text.strip()
        # Replace the <strong> name with a sentinel so the remaining description can be split out.
        temp_html = re.sub('<strong>.*</strong>', 'SPLIT_POINT', etree.tostring(item))
        temp_node = html_parsing.parse_tree_from_string(temp_html.encode('utf-8'))
        desc = html_parsing.tostring(temp_node).split('SPLIT_POINT')[1].strip()
        entities.append(data.Entity(name=name, starred=starred, description=desc))
    return entities
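A minimal usage sketch, assuming this method lives on a scraper class constructed from a url and a parsed tree as in the build_scrapers example below; the FrommersScraper name and the url are placeholders, not taken from the original code:

# Hypothetical usage: parse a page and read its "At a Glance" entities.
tree = html_parsing.parse_tree('http://example.com/some-destination')
scraper = FrommersScraper('http://example.com/some-destination', tree, False)
for entity in scraper.get_raw_entities():
    print '%s starred=%s' % (entity.name, entity.starred)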
Example 2
def build_scrapers(url, client_page_source=None, force_fetch_page=False, allow_expansion=True, for_guide=False):
    # Prefer the page source supplied by the client; otherwise fetch it only when required.
    page_source_tree = html_parsing.parse_tree_from_string(client_page_source) if client_page_source else None
    if not page_source_tree and (url_requires_server_page_source(url) or force_fetch_page):
        page_source_tree = html_parsing.parse_tree(url)

    scraped_pages = []
    for scraper_class in ALL_SCRAPERS:
        handleable_urls = scraper_class.handleable_urls(url, page_source_tree, allow_expansion)
        if handleable_urls:
            # Fetch all handleable urls in parallel, retrying each up to 3 times.
            reqs = [html_parsing.make_request(u) for u in handleable_urls]
            resps = utils.parallelize(utils.retryable(urllib2.urlopen, 3), [(req,) for req in reqs])
            for handleable_url, resp in zip(handleable_urls, resps):
                if not resp:
                    print "Failed to fetch url: %s" % handleable_url
                    continue
                tree = etree.parse(resp, html_parsing.htmlparser())
                scraper = scraper_class(handleable_url, tree, for_guide)
                scraped_pages.append(scraper)
            # Only the first scraper class that can handle the url is used.
            break
    return scraped_pages
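A short usage sketch, assuming this function and the scraper classes above are importable from the same module; the url is illustrative only:

# Hypothetical usage: build scrapers for a url and walk their scraped entities.
scraped_pages = build_scrapers('http://example.com/attraction', force_fetch_page=True)
for page in scraped_pages:
    for entity in page.get_raw_entities():
        print entity.name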
Example 3
def extract_urls_from_page_source(url, page_source):
    urls = []
    tree = html_parsing.parse_tree_from_string(page_source)
    # Gather urls both from <a href> anchors and from bare urls appearing in the page text.
    urls.extend(extract_all_links_from_anchors(url, tree))
    urls.extend(extract_all_links_from_text(html_parsing.tostring(tree.getroot())))
    return urls
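A hedged example of calling this helper on raw HTML; the markup and urls are made up for illustration:

# Hypothetical usage: pull urls out of a fetched page's source,
# covering both anchor hrefs and bare urls in text.
page_source = '<html><body><a href="http://example.com/a">A</a> see http://example.com/b</body></html>'
found = extract_urls_from_page_source('http://example.com/', page_source)
print found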