Example No. 1
def crawlSingleURL(link, idx, total_links):
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag
        titleTag = None
        try:
            titleTag = soup.html.head.title
            titleTag = str(titleTag.string)
        except:
            titleTag = ""
        end = time.time()

        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.populate()
        if ((idx % LINK_SET_INDICATOR) == 0):
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except socket.timeout:
        print "ERR: timeout [%s/%s] " % (idx, total_links)
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # Any other error is swallowed and the function returns None.
        pass
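buildOpener, get_meta_content, URLField, and LINK_SET_INDICATOR are helpers defined elsewhere in the crawler module. As a rough sketch only (an assumption based on how the result of soup.findAll('meta', ...) is consumed, not the original code), get_meta_content could look like this:

def get_meta_content(meta_tags):
    """ Hypothetical helper: return the 'content' attribute of the first
    matching <meta> tag found by soup.findAll(), or '' when absent. """
    for tag in meta_tags:
        content = tag.get('content')
        if content:
            return content
    return ""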
Example No. 2
def crawlSingleURL(link, idx, total_links):
	try:
		opener = buildOpener()
		start = time.time()
		data = opener.open(link).read()
		soup = BeautifulSoup(data)
		meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
		meta_data_descr = soup.findAll('meta', {'name':'description'})
		keywords = get_meta_content(meta_data_keywords)
		descr = get_meta_content(meta_data_descr)
		# Extract the title tag
		titleTag = None
		try:
			titleTag = soup.html.head.title
			titleTag = str(titleTag.string)
		except:
			titleTag = ""			
		end = time.time()

		# Return the basic URL data structure
		field = URLField(link, titleTag, descr, keywords)
		field.populate()	
		if ((idx % LINK_SET_INDICATOR) == 0):			
			sys.stdout.write("[%s/%s] " % (idx, total_links))
		# Exit crawl single URL with url field.
		# @return URLField
		return field
	except socket.timeout:
		print "ERR: timeout [%s/%s] " % (idx, total_links)
	except urllib2.URLError:
		print "ERR: URL error [%s/%s] " % (idx, total_links)
	except Exception, e:
		# Any other error is swallowed and the function returns None.
		pass
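A possible driver for the function above (assumed, not part of the original listing): crawlSingleURL returns None on its error paths, so failed fetches can simply be filtered out.

def crawlLinks(link_list):
    """ Hypothetical driver: crawl every link and keep the URLField
    results, skipping links that timed out or raised a URL error. """
    total = len(link_list)
    fields = []
    for idx, link in enumerate(link_list):
        field = crawlSingleURL(link, idx, total)
        if field is not None:
            fields.append(field)
    return fields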
Example No. 3
 def test_beautifulsoup__repr_html(self):
     contents = self.read_test_file('test.html')
     BeautifulSoup._repr_html_ = render
     soup = BeautifulSoup(contents)
     output = soup._repr_html_()
     self.assertTrue(output)
     self.assertTrue(isinstance(output, string_representation))
def build_page_info(page_url, data):
	""" Build page statistics based on beautiful soup invoke,
	note: this may reload the data content again in order have a fresh start.
	See http://www.w3schools.com/tags/default.asp 
	for HTML tag references."""
	soup = BeautifulSoup(data)
	stats = PageInfoStats(page_url)
	for info_tag in KEY_HTML_TAGS:
		tag_arr = soup.findAll(info_tag)
		n = len(tag_arr)
		# Simple switch statement, change handler depending on tag type
		page_info_switch = {
			KEY_HTML_TAGS[TAG_a]: lambda x: set_stats_prop(stats, 'anchor_ct', x),
			KEY_HTML_TAGS[TAG_b]: lambda x: set_stats_prop(stats, 'bold_ct', x), 
			KEY_HTML_TAGS[TAG_bq]: lambda x: set_stats_prop(stats, 'block_ct', x), 
			KEY_HTML_TAGS[TAG_div]: lambda x: set_stats_prop(stats, 'div_ct', x), 
			KEY_HTML_TAGS[TAG_h1]: lambda x: set_stats_prop(stats, 'h1_ct', x), 
			KEY_HTML_TAGS[TAG_h2]: lambda x: set_stats_prop(stats, 'h2_ct', x), 
			KEY_HTML_TAGS[TAG_i]: lambda x: set_stats_prop(stats, 'italic_ct', x), 
			KEY_HTML_TAGS[TAG_img]: lambda x: set_stats_prop(stats, 'img_ct', x), 
			KEY_HTML_TAGS[TAG_p]: lambda x: set_stats_prop(stats, 'para_ct', x), 
			KEY_HTML_TAGS[TAG_span]: lambda x: set_stats_prop(stats, 'span_ct', x), 
			KEY_HTML_TAGS[TAG_strong]: lambda x: set_stats_prop(stats, 'strong_ct', x), 
			KEY_HTML_TAGS[TAG_table]: lambda x: set_stats_prop(stats, 'table_ct', x)
			} [info_tag](n)
	return stats
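For the dictionary-based switch above to resolve, KEY_HTML_TAGS and the TAG_* indices must line up with the PageInfoStats attributes. The definitions below are an assumption inferred from how they are indexed here, not the original module:

# Hypothetical definitions consistent with how build_page_info indexes them.
KEY_HTML_TAGS = ['a', 'b', 'blockquote', 'div', 'h1', 'h2',
                 'i', 'img', 'p', 'span', 'strong', 'table']
(TAG_a, TAG_b, TAG_bq, TAG_div, TAG_h1, TAG_h2,
 TAG_i, TAG_img, TAG_p, TAG_span, TAG_strong, TAG_table) = range(12)

def set_stats_prop(stats, prop_name, count):
    """ Hypothetical setter used by the switch: store the tag count
    on the PageInfoStats instance under the given attribute name. """
    setattr(stats, prop_name, count)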
Example No. 5
 def test_beautifulsoup__repr_html(self):
     contents = self.read_test_file('test.html')
     BeautifulSoup._repr_html_ = render
     soup = BeautifulSoup(contents)
     output = soup._repr_html_()
     self.assertTrue(output)
     self.assertTrue(isinstance(output, string_representation))
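The test patches BeautifulSoup._repr_html_ with a render callable so notebooks can display the parsed document inline. The original render is not shown; a minimal sketch under that assumption:

def render(soup):
    """ Hypothetical _repr_html_ hook: return the parsed document as
    pretty-printed markup so IPython/Jupyter can display it inline. """
    return soup.prettify()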
Example No. 6
def crawlSingleURLForContent(link, idx, total_links):
    """ Crawl this URL but only extract the content for content
	analysis.  A more extensive model than crawlSingleURL"""
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        istats = build_page_info(link, data)
        data = clean_content(data)
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)

        # Extract the title tag
        titleTag = None
        try:
            titleTag = soup.html.head.title
            titleTag = str(titleTag.string)
        except:
            titleTag = ""
        # Ignore content we aren't concerned with
        partial_content = doc_ignore_content(soup)

        end = time.time()
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)

        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)

        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        field.populate()
        if ((idx % LINK_SET_INDICATOR) == 0):
            sys.stdout.write("[%s/%s] " % (idx, total_links))

        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: if pass allowed, compile errors will be ignored.
        print "ERR<crawlSingleURLForContent>: %s" % e
        pass
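clean_content and doc_ignore_content are defined elsewhere in the module. One plausible reading of doc_ignore_content, suggested by the "Ignore content we aren't concerned with" comment, is that it drops script and style markup and keeps only the visible text; a hedged sketch, not the original implementation:

def doc_ignore_content(soup):
    """ Hypothetical content filter: remove tags we do not analyze
    (scripts, styles) and return the remaining visible text. """
    for tag in soup.findAll(['script', 'style']):
        tag.extract()
    text_nodes = soup.findAll(text=True)
    return " ".join(node.strip() for node in text_nodes if node.strip())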
Example No. 7
def crawlSingleURLForContent(link, idx, total_links):
	""" Crawl this URL but only extract the content for content
	analysis.  A more extensive model than crawlSingleURL"""
	try:
		opener = buildOpener()
		start = time.time()
		data = opener.open(link).read()
		istats = build_page_info(link, data)
		data = clean_content(data)
		soup = BeautifulSoup(data)
		meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
		meta_data_descr = soup.findAll('meta', {'name':'description'})
		keywords = get_meta_content(meta_data_keywords)
		descr = get_meta_content(meta_data_descr)

		# Extract the title tag
		titleTag = None
		try:
			titleTag = soup.html.head.title
			titleTag = str(titleTag.string)
		except:
			titleTag = ""
		# Ignore content we aren't concerned with
		partial_content = doc_ignore_content(soup)
		
		end = time.time()
		# Return the basic URL data structure
		field = URLField(link, titleTag, descr, keywords)

		field.descr = field.tokenizeTags(field.descr)
		field.keywords = field.tokenizeTags(field.keywords)

		field.full_content = data
		field.extract_content = partial_content
		field.info_stats = istats
		field.populate()
		if ((idx % LINK_SET_INDICATOR) == 0):
			sys.stdout.write("[%s/%s] " % (idx, total_links))
	   		
		# Exit crawl single URL with url field.
		# @return URLField
		return field
	except urllib2.URLError:
		print "ERR: URL error [%s/%s] " % (idx, total_links)
	except Exception, e:
		# NOTE: if pass allowed, compile errors will be ignored.
		print "ERR<crawlSingleURLForContent>: %s" % e
		pass
Example No. 8
def crawlBuildLinks(link_list):
	""" Iterate through the list of links and collect the links found
	on each page through the use of the Beautiful Soup lib."""
	opener = buildOpener()
	total_links = 0
	total_links_tag = 0
	sub_links = None
	for link in link_list:
		try:
			data = opener.open(link).read()
			soup = BeautifulSoup(data)
			sub_links_tag = soup.findAll('a')
			total_links_tag = total_links_tag + len(sub_links_tag)
			sub_links = [processSubLink(el) for el in sub_links_tag if validateSubLink(el)]			
			# Filter out duplicates with set
			sub_links = set(sub_links)		
			total_links = total_links + len(sub_links)
		except Exception, e:
			print "ERR <crawlBuildLinks>: %s" % e
			print "    <crawlBuildLinks>: url=[%s]" % link
Example No. 9
def extractPageData(opener, url_str):
    """Request a page through urllib2 libraries, through beautiful soup,
	extract the page content data including number links, imgs, etc"""
    req = None
    cur_time = datetime.datetime.now()
    status_code_res = 0
    model = WebAnalysisModel()
    try:
        start = time.clock()
        req = urllib2.Request(url_str)
        req.add_header('user-agent', FF_USER_AGENT)
        data = opener.open(req).read()
        soup = BeautifulSoup(data)

        links = soup.findAll('a')
        imgs = soup.findAll('img')
        para = soup.findAll('p')
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})

        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        keywords_arr = [0, 0]
        descr_arr = [0, 0]
        if keywords:
            keywords_arr[0] = len(keywords)
            keywords_arr[1] = len(keywords.split(","))
        if descr:
            descr_arr[0] = len(descr)
            descr_arr[1] = len(descr.split(","))

        end = time.clock()
        response_time = int((end - start) * 1000.0)

        # Build a web content model
        model.links_ct = len(links)
        model.inbound_link_ct = 0
        model.outbound_links_ct = 0
        model.image_ct = len(imgs)
        model.meta_keywords_len = keywords_arr[0]
        model.meta_descr_len = descr_arr[0]
        model.meta_keywords_wct = keywords_arr[1]
        model.meta_descr_wct = descr_arr[1]
        model.para_tag_ct = len(para)
        model.geo_locations_ct = 0
        model.document_size = 0
        model.request_time = response_time
        status_code_res = 200
    except urllib2.HTTPError, e:
        print 'Error status code: ', e.code
        print "ERR [%s]:scan_url HTTPError: url=%s" % (cur_time, url_str)
        status_code_res = e.code
        print e
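keywords_arr and descr_arr hold the character length and the comma-separated term count of each meta value, for example:

# Worked example of the keywords_arr computation above.
keywords = "python, crawler, beautifulsoup"
keywords_arr = [len(keywords), len(keywords.split(","))]
# keywords_arr == [30, 3]: 30 characters, 3 comma-separated terms.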
Example No. 10
def build_page_info(page_url, data):
    """ Build page statistics based on beautiful soup invoke,
	note: this may reload the data content again in order have a fresh start.
	See http://www.w3schools.com/tags/default.asp 
	for HTML tag references."""
    soup = BeautifulSoup(data)
    stats = PageInfoStats(page_url)
    for info_tag in KEY_HTML_TAGS:
        tag_arr = soup.findAll(info_tag)
        n = len(tag_arr)
        # Simple switch statement, change handler depending on tag type
        page_info_switch = {
            KEY_HTML_TAGS[TAG_a]:
            lambda x: set_stats_prop(stats, 'anchor_ct', x),
            KEY_HTML_TAGS[TAG_b]:
            lambda x: set_stats_prop(stats, 'bold_ct', x),
            KEY_HTML_TAGS[TAG_bq]:
            lambda x: set_stats_prop(stats, 'block_ct', x),
            KEY_HTML_TAGS[TAG_div]:
            lambda x: set_stats_prop(stats, 'div_ct', x),
            KEY_HTML_TAGS[TAG_h1]:
            lambda x: set_stats_prop(stats, 'h1_ct', x),
            KEY_HTML_TAGS[TAG_h2]:
            lambda x: set_stats_prop(stats, 'h2_ct', x),
            KEY_HTML_TAGS[TAG_i]:
            lambda x: set_stats_prop(stats, 'italic_ct', x),
            KEY_HTML_TAGS[TAG_img]:
            lambda x: set_stats_prop(stats, 'img_ct', x),
            KEY_HTML_TAGS[TAG_p]:
            lambda x: set_stats_prop(stats, 'para_ct', x),
            KEY_HTML_TAGS[TAG_span]:
            lambda x: set_stats_prop(stats, 'span_ct', x),
            KEY_HTML_TAGS[TAG_strong]:
            lambda x: set_stats_prop(stats, 'strong_ct', x),
            KEY_HTML_TAGS[TAG_table]:
            lambda x: set_stats_prop(stats, 'table_ct', x)
        }[info_tag](n)
    return stats
Example No. 11
def crawlBuildLinks(link_list):
    """ Iterate through the list of links and collect the links found
    on each page through the use of the Beautiful Soup lib."""
    opener = buildOpener()
    total_links = 0
    total_links_tag = 0
    sub_links = None
    for link in link_list:
        try:
            data = opener.open(link).read()
            soup = BeautifulSoup(data)
            sub_links_tag = soup.findAll('a')
            total_links_tag = total_links_tag + len(sub_links_tag)
            sub_links = [
                processSubLink(el) for el in sub_links_tag
                if validateSubLink(el)
            ]
            # Filter out duplicates with set
            sub_links = set(sub_links)
            total_links = total_links + len(sub_links)
        except Exception, e:
            print "ERR <crawlBuildLinks>: %s" % e
            print "    <crawlBuildLinks>: url=[%s]" % link
Example No. 12
def formatDescrWithSoup(content):
    """Format the html descriptions with beautiful soup"""
    soup = BeautifulSoup(content)
    res = filterOnlyTextSoup(soup)
    if res is None or len(res) == 0:
        # On error or other issues, return empty
        # TODO: should we return content?
        return ""
    else:
        try:
            res = formatHtmlEntities(res)
            return res.strip()
        except Exception, e:
            print e
            return ""
Example No. 13
def extractPageData(opener, url_str):
	"""Request a page through urllib2 libraries, through beautiful soup,
	extract the page content data including number links, imgs, etc"""
	req = None
	cur_time = datetime.datetime.now()
	status_code_res = 0
	model = WebAnalysisModel()
	try:
		start = time.clock()
		req = urllib2.Request(url_str)
		req.add_header('user-agent', FF_USER_AGENT)		
		data = opener.open(req).read()
		soup = BeautifulSoup(data)
		
		links = soup.findAll('a')
		imgs = soup.findAll('img')
		para = soup.findAll('p')
		meta_data_keywords = soup.findAll('meta', {'name':'keywords'})
		meta_data_descr = soup.findAll('meta', {'name':'description'})

		keywords = get_meta_content(meta_data_keywords)
		descr = get_meta_content(meta_data_descr)
		keywords_arr = [0, 0]
		descr_arr = [0, 0]
		if keywords:
			keywords_arr[0] = len(keywords)
			keywords_arr[1] = len(keywords.split(","))
		if descr:
			descr_arr[0] = len(descr)
			descr_arr[1] = len(descr.split(","))
		
		end = time.clock()
		response_time = int((end - start) * 1000.0)

		# Build a web content model
		model.links_ct = len(links)
		model.inbound_link_ct = 0
		model.outbound_links_ct = 0 
		model.image_ct = len(imgs)
		model.meta_keywords_len = keywords_arr[0]
		model.meta_descr_len = descr_arr[0]
		model.meta_keywords_wct = keywords_arr[1]
		model.meta_descr_wct = descr_arr[1]
		model.para_tag_ct = len(para)
		model.geo_locations_ct = 0 
		model.document_size = 0
		model.request_time = response_time
		status_code_res = 200
	except urllib2.HTTPError, e:
		print 'Error status code: ', e.code
		print "ERR [%s]:scan_url HTTPError: url=%s" % (cur_time, url_str)
		status_code_res = e.code
		print e
Example No. 14
 def __init__(self, html, type=NODE, **kwargs):
     """ The base class for Text, Comment and Element.
         All DOM nodes can be navigated in the same way (e.g. Node.parent, Node.children, ...)
     """
     self.type = type
     # Parse the raw markup with BeautifulSoup unless an already-parsed
     # SOUP object was passed in (old-style "and/or" conditional expression).
     self._p = not isinstance(html, SOUP) and BeautifulSoup.BeautifulSoup(u(html), **kwargs) or html