# Imports assumed by these crawler routines (Python 2 with the
# BeautifulSoup 3 API).  LINK_SET_INDICATOR, FF_USER_AGENT and
# NO_COLS_SERVICE are module-level constants defined elsewhere in botlist.
import sys
import time
import socket
import urllib2

from BeautifulSoup import BeautifulSoup

def crawlSingleURL(link, idx, total_links):
    """ Crawl a single URL and extract its title, description and
    keywords meta data into a URLField. """
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag
        titleTag = None
        try:
            titleTag = soup.html.head.title
            titleTag = str(titleTag.string)
        except:
            titleTag = ""
        end = time.time()
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.populate()
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except socket.timeout:
        print "ERR: timeout [%s/%s] " % (idx, total_links)
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # Silently skip pages that fail for any other reason.
        pass
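# --- Helper sketches (not part of the original listing) ---
# crawlSingleURL assumes a few helpers defined elsewhere in botlist.
# The versions below are minimal sketches of what they plausibly do,
# not the real implementations: buildOpener() returns a urllib2 opener
# with a browser-style user agent, and get_meta_content() pulls the
# 'content' attribute out of the first matching <meta> tag.
def buildOpener():
    """ Sketch: build a urllib2 opener sending a Firefox-style user agent.
    The socket.timeout handler above implies a default socket timeout is
    installed somewhere; setting one here is an assumption. """
    socket.setdefaulttimeout(30)
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', FF_USER_AGENT)]
    return opener

def get_meta_content(meta_tags):
    """ Sketch: return the 'content' attribute of the first meta tag
    found, or an empty string when the page has no such tag. """
    if meta_tags:
        return meta_tags[0].get('content', '')
    return ''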
def connectLinkService(requrl):
    """ First, connect to the botlist URL service and extract the
    most recent list of links.  This will seed the botlist spider
    crawler. """
    opener = buildOpener()
    req = urllib2.Request(requrl)
    req.add_header('user-agent', FF_USER_AGENT)
    link_data = opener.open(req).read()
    link_data = [line.strip() for line in link_data.split('\n')]
    link_data = [line for line in link_data
                 if len(line) > 0 and len(line.split('::|')) == NO_COLS_SERVICE]
    content = [col.split('::|') for col in link_data]
    return content
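# Usage sketch: seed the crawler from the link service.  The column
# layout of the '::|'-delimited rows is not shown in this listing, so
# the assumption that the URL sits in the first column is a guess;
# adjust the index to the real NO_COLS_SERVICE layout.
def seedCrawl(requrl):
    rows = connectLinkService(requrl)
    links = [row[0] for row in rows]   # hypothetical URL column
    fields = []
    for idx, link in enumerate(links):
        field = crawlSingleURL(link, idx, len(links))
        if field is not None:
            fields.append(field)
    return fields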
def crawlSingleURLForContent(link, idx, total_links):
    """ Crawl this URL but only extract the content for content
    analysis.  A more extensive model than crawlSingleURL. """
    try:
        opener = buildOpener()
        start = time.time()
        data = opener.open(link).read()
        istats = build_page_info(link, data)
        data = clean_content(data)
        soup = BeautifulSoup(data)
        meta_data_keywords = soup.findAll('meta', {'name': 'keywords'})
        meta_data_descr = soup.findAll('meta', {'name': 'description'})
        keywords = get_meta_content(meta_data_keywords)
        descr = get_meta_content(meta_data_descr)
        # Extract the title tag
        titleTag = None
        try:
            titleTag = soup.html.head.title
            titleTag = str(titleTag.string)
        except:
            titleTag = ""
        # Ignore content we aren't concerned with
        partial_content = doc_ignore_content(soup)
        end = time.time()
        # Return the basic URL data structure
        field = URLField(link, titleTag, descr, keywords)
        field.descr = field.tokenizeTags(field.descr)
        field.keywords = field.tokenizeTags(field.keywords)
        field.full_content = data
        field.extract_content = partial_content
        field.info_stats = istats
        field.populate()
        if (idx % LINK_SET_INDICATOR) == 0:
            sys.stdout.write("[%s/%s] " % (idx, total_links))
        # Exit crawl single URL with url field.
        # @return URLField
        return field
    except urllib2.URLError:
        print "ERR: URL error [%s/%s] " % (idx, total_links)
    except Exception, e:
        # NOTE: if pass allowed, compile errors will be ignored.
        print "ERR<crawlSingleURLForContent>: %s" % e
        pass
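# Sketch of the URLField record both crawl routines populate.  The real
# botlist class carries more state and a richer populate(); this stub
# only names the attributes and methods touched above so the listing
# stands alone.  tokenizeTags() here is a guess at the tag
# normalization behavior.
class URLField(object):
    def __init__(self, link, title, descr, keywords):
        self.link = link
        self.title = title
        self.descr = descr
        self.keywords = keywords
        self.full_content = None
        self.extract_content = None
        self.info_stats = None

    def tokenizeTags(self, raw_tags):
        """ Split a comma/space separated tag string into lowercase tokens. """
        if not raw_tags:
            return []
        return [tok.strip().lower()
                for tok in raw_tags.replace(',', ' ').split()
                if tok.strip()]

    def populate(self):
        """ Hook for computing derived fields; a no-op in this sketch. """
        pass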
def crawlBuildLinks(link_list):
    """ Iterate through the list of links and collect links found
    on each page through the use of the beautiful soup lib. """
    opener = buildOpener()
    total_links = 0
    total_links_tag = 0
    sub_links = None
    for link in link_list:
        try:
            data = opener.open(link).read()
            soup = BeautifulSoup(data)
            sub_links_tag = soup.findAll('a')
            total_links_tag = total_links_tag + len(sub_links_tag)
            sub_links = [processSubLink(el) for el in sub_links_tag
                         if validateSubLink(el)]
            # Filter out duplicates with set
            sub_links = set(sub_links)
            total_links = total_links + len(sub_links)
        except Exception, e:
            print "ERR <crawlBuildLinks>: %s" % e
            print "    <crawlBuildLinks>: url=[%s]" % link
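# End-to-end usage sketch: pull the seed links from the botlist link
# service, crawl each one for content, then walk the same pages to
# tally their outbound links.  SERVICE_URL is a placeholder, not a real
# botlist endpoint, and the first-column URL assumption matches the
# seedCrawl sketch above.
if __name__ == '__main__':
    SERVICE_URL = 'http://example.com/botlist/recent-links'   # placeholder
    rows = connectLinkService(SERVICE_URL)
    links = [row[0] for row in rows]   # hypothetical URL column
    for idx, link in enumerate(links):
        crawlSingleURLForContent(link, idx, len(links))
    crawlBuildLinks(links)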