import sys
import time
import socket
import datetime
import urllib2

from BeautifulSoup import BeautifulSoup

# buildOpener, get_meta_content, URLField and LINK_SET_INDICATOR are
# defined elsewhere in the crawler module.
def crawlSingleURL(link, idx, total_links):
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag; fall back to an empty string when the
        # document has no <html>/<head>/<title> chain.
        try:
            titleTag = str(soup.html.head.title.string)
        except AttributeError:
            titleTag = ""
        end = time.time()
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.populate()
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except socket.timeout:
        print "ERR: timeout [%s/%s] " % (idx, total_links)
    except urllib2.URLError:
        print "ERR: URLError [%s/%s] " % (idx, total_links)
    except Exception, e:
        # Swallowing every other exception with a bare pass hides real
        # coding errors; report it before returning None.
        print "ERR<crawlSingleURL>: %s" % e
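# A minimal sketch of the two helpers used above -- hypothetical
# reconstructions, not the project's actual implementations.
def buildOpener():
    """ Build a urllib2 opener; the real version may install cookie or
    proxy handlers as well. """
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    return opener

def get_meta_content(meta_tags):
    """ Return the 'content' attribute of the first matching meta tag,
    or an empty string when none carries one. """
    for tag in meta_tags:
        content = tag.get('content')
        if content:
            return content
    return ""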
def test_beautifulsoup__repr_html(self):
    contents = self.read_test_file('test.html')
    BeautifulSoup._repr_html_ = render
    soup = BeautifulSoup(contents)
    output = soup._repr_html_()
    self.assertTrue(output)
    self.assertTrue(isinstance(output, string_representation))
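# The render hook and string_representation alias come from the module
# under test.  A minimal sketch of the idea, assuming render simply
# returns the soup's markup so IPython can display it inline:
def render(soup):
    return soup.prettify()

# BeautifulSoup 3 prettify() returns a byte string on Python 2.
string_representation = str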
def build_page_info(page_url, data):
    """ Build page statistics based on a beautiful soup invocation.
    Note: this may reload the data content again in order to have a
    fresh start.  See http://www.w3schools.com/tags/default.asp for
    HTML tag references. """
    soup = BeautifulSoup(data)
    stats = PageInfoStats(page_url)
    # Simple switch statement: dispatch to a handler depending on tag
    # type.  Built once, outside the loop, since it never changes.
    page_info_switch = {
        KEY_HTML_TAGS[TAG_a]:      lambda x: set_stats_prop(stats, 'anchor_ct', x),
        KEY_HTML_TAGS[TAG_b]:      lambda x: set_stats_prop(stats, 'bold_ct', x),
        KEY_HTML_TAGS[TAG_bq]:     lambda x: set_stats_prop(stats, 'block_ct', x),
        KEY_HTML_TAGS[TAG_div]:    lambda x: set_stats_prop(stats, 'div_ct', x),
        KEY_HTML_TAGS[TAG_h1]:     lambda x: set_stats_prop(stats, 'h1_ct', x),
        KEY_HTML_TAGS[TAG_h2]:     lambda x: set_stats_prop(stats, 'h2_ct', x),
        KEY_HTML_TAGS[TAG_i]:      lambda x: set_stats_prop(stats, 'italic_ct', x),
        KEY_HTML_TAGS[TAG_img]:    lambda x: set_stats_prop(stats, 'img_ct', x),
        KEY_HTML_TAGS[TAG_p]:      lambda x: set_stats_prop(stats, 'para_ct', x),
        KEY_HTML_TAGS[TAG_span]:   lambda x: set_stats_prop(stats, 'span_ct', x),
        KEY_HTML_TAGS[TAG_strong]: lambda x: set_stats_prop(stats, 'strong_ct', x),
        KEY_HTML_TAGS[TAG_table]:  lambda x: set_stats_prop(stats, 'table_ct', x),
    }
    for info_tag in KEY_HTML_TAGS:
        tag_arr = soup.findAll(info_tag)
        n = len(tag_arr)
        page_info_switch[info_tag](n)
    return stats
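# Hypothetical definitions consistent with the usage above; the real
# module supplies its own constants and setter.
(TAG_a, TAG_b, TAG_bq, TAG_div, TAG_h1, TAG_h2,
 TAG_i, TAG_img, TAG_p, TAG_span, TAG_strong, TAG_table) = range(12)

KEY_HTML_TAGS = ['a', 'b', 'blockquote', 'div', 'h1', 'h2',
                 'i', 'img', 'p', 'span', 'strong', 'table']

def set_stats_prop(stats, prop_name, count):
    """ Assign one tag-count attribute on the PageInfoStats instance. """
    setattr(stats, prop_name, count)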
def crawlSingleURLForContent(link, idx, total_links):
    """ Crawl this URL but only extract the content for content
    analysis.  A more extensive model than crawlSingleURL. """
    try:
        # clean_content and doc_ignore_content are crawler helpers
        # defined elsewhere in the module.
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        istats = build_page_info(link, data)
        data = clean_content(data)
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag
        try:
            titleTag = str(soup.html.head.title.string)
        except AttributeError:
            titleTag = ""
        # Ignore content we aren't concerned with
        partial_content = doc_ignore_content(soup)
        end = time.time()
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)
        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        field.populate()
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except urllib2.URLError:
        print "ERR: URLError [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: a bare pass here would silently ignore coding errors,
        # so log the exception before returning None.
        print "ERR<crawlSingleURLForContent>: %s" % e
def crawlBuildLinks(link_list):
    """ Iterate through the list of links and collect links found on
    each page through the use of the beautiful soup lib. """
    opener = buildOpener()
    total_links = 0
    total_links_tag = 0
    sub_links = None
    for link in link_list:
        try:
            data = opener.open(link).read()
            soup = BeautifulSoup(data)
            sub_links_tag = soup.findAll('a')
            total_links_tag = total_links_tag + len(sub_links_tag)
            sub_links = [processSubLink(el) for el in sub_links_tag
                         if validateSubLink(el)]
            # Filter out duplicates with set
            sub_links = set(sub_links)
            total_links = total_links + len(sub_links)
        except Exception, e:
            print "ERR <crawlBuildLinks>: %s" % e
            print "    <crawlBuildLinks>: url=[%s]" % link
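# Plausible sketches of the link helpers above -- the real module
# applies its own validation and normalization rules.
def validateSubLink(tag):
    """ Keep only anchors that carry an absolute http href. """
    href = tag.get('href')
    return bool(href) and href.startswith('http')

def processSubLink(tag):
    """ Normalize the href before it enters the link set. """
    return tag.get('href').strip()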
def extractPageData(opener, url_str):
    """ Request a page through the urllib2 libraries and, through
    beautiful soup, extract the page content data including the number
    of links, imgs, etc. """
    req = None
    cur_time = datetime.datetime.now()
    status_code_res = 0
    model = WebAnalysisModel()
    try:
        # Note: time.clock() measures CPU time on Unix; time.time()
        # would be the safer wall-clock choice here.
        start = time.clock()
        req = urllib2.Request(url_str)
        req.add_header('user-agent', FF_USER_AGENT)
        data = opener.open(req).read()
        soup = BeautifulSoup(data)
        links = soup.findAll('a')
        imgs = soup.findAll('img')
        para = soup.findAll('p')
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # [length-in-chars, word-count] pairs for the two meta tags
        keywords_arr = [0, 0]
        descr_arr = [0, 0]
        if keywords:
            keywords_arr[0] = len(keywords)
            keywords_arr[1] = len(keywords.split(","))
        if descr:
            descr_arr[0] = len(descr)
            descr_arr[1] = len(descr.split(","))
        end = time.clock()
        response_time = int((end - start) * 1000.0)
        # Build a web content model
        model.links_ct = len(links)
        model.inbound_link_ct = 0
        model.outbound_links_ct = 0
        model.image_ct = len(imgs)
        model.meta_keywords_len = keywords_arr[0]
        model.meta_descr_len = descr_arr[0]
        model.meta_keywords_wct = keywords_arr[1]
        model.meta_descr_wct = descr_arr[1]
        model.para_tag_ct = len(para)
        model.geo_locations_ct = 0
        model.document_size = 0
        model.request_time = response_time
        status_code_res = 200
    except urllib2.HTTPError, e:
        print 'Error status code: ', e.code
        print "ERR [%s]:scan_url HTTPError: url=%s" % (cur_time, url_str)
        status_code_res = e.code
        print e
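# Example driver for extractPageData.  The excerpt above appears to be
# truncated after the HTTPError handler, so the full version presumably
# returns the populated WebAnalysisModel; FF_USER_AGENT and
# WebAnalysisModel are assumed to be defined alongside it.
if __name__ == '__main__':
    opener = urllib2.build_opener()
    extractPageData(opener, 'http://www.example.com/')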
def formatDescrWithSoup(content):
    """ Format the html descriptions with beautiful soup. """
    soup = BeautifulSoup(content)
    res = filterOnlyTextSoup(soup)
    if (res is None) or (len(res) == 0):
        # On error or other issues, return empty
        # TODO: should we return content?
        return ""
    else:
        try:
            res = formatHtmlEntities(res)
            return res.strip()
        except Exception, e:
            print e
            return ""
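# filterOnlyTextSoup and formatHtmlEntities are helpers from the same
# module.  A plausible sketch of the text filter, collecting only the
# soup's bare text nodes while skipping script and style content:
def filterOnlyTextSoup(soup):
    text_nodes = soup.findAll(text=True)
    parts = [node.strip() for node in text_nodes
             if node.parent.name not in ('script', 'style')]
    return " ".join([p for p in parts if p])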
def __init__(self, html, type=NODE, **kwargs):
    """ The base class for Text, Comment and Element.
        All DOM nodes can be navigated in the same way
        (e.g. Node.parent, Node.children, ...).
    """
    self.type = type
    # Parse the markup unless we were already handed a parsed soup
    # (SOUP aliases the BeautifulSoup class in this module).  A plain
    # conditional avoids the and/or idiom's falsy-value pitfall.
    if isinstance(html, SOUP):
        self._p = html
    else:
        self._p = BeautifulSoup.BeautifulSoup(u(html), **kwargs)
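# Usage sketch, assuming the module defines (as in the pattern.web
# DOM wrapper this constructor resembles):
#   import BeautifulSoup
#   SOUP = BeautifulSoup.BeautifulSoup
#   u = unicode  # coerce the input markup to unicode
node = Node(u"<p>Hello <b>world</b></p>")
print node.type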