Example #1
 def is_page_robot_scannable(self):
     """
     Returns a boolean that tells whether the page is robot scrapeable.
     """
     robotcheck = RobotFileParser()
     robotcheck.set_url(self.urlparse[0]+'://'+self.urlparse[1]+'/robots.txt')
     robotcheck.read()
     return robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
Example #2
 def test_parse(self):
     from robotparser import RobotFileParser
     rules = RobotFileParser()
     rules.set_url("http://www.sogou.com/robots.txt")
     rules.read()
     self.assertEqual(
         rules.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt"),
         False)
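Note: most examples on this page target Python 2, where the parser lives in the robotparser module. On Python 3 the same class is available as urllib.robotparser.RobotFileParser with the same set_url/read/can_fetch interface; a minimal sketch of the equivalent test:

 def test_parse_py3(self):
     # Python 3 version of the test above; urllib.robotparser exposes the same API.
     from urllib.robotparser import RobotFileParser
     rules = RobotFileParser()
     rules.set_url("http://www.sogou.com/robots.txt")
     rules.read()
     self.assertEqual(
         rules.can_fetch("mozilla", "http://www.sogou.com/sohu/robots.txt"),
         False)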
Example #3
 def _allowed_to_open(self, url):
     host = urlparse.urlsplit(url)[1]
     robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
     rp = RobotFileParser(robots_url)
     try:
         rp.read()
     except:
         return False
     return rp.can_fetch(self._agent_name, url)
Example #4
def get_robots(url):
    '''
    Initialize robots parser for this domain
    :param url:
    :return:
    '''
    rp = RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
Example #5
 def http_open(self, request):
     #request -- urllib2.Request
     url = request.get_full_url()
     host = urlsplit(url)[1]
     robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
     rp = RobotFileParser(robots_url)
     rp.read()
     if not rp.can_fetch(self.agentname, url):
         raise RuntimeError('Forbidden by robots.txt')
     return urllib2.HTTPHandler.http_open(self, request)
Example #6
def checkRobots(URL):

    time.sleep(1)
    parsed = urlparse(URL)
    robotsUrl = parsed.scheme + "://" + parsed.netloc + "/robots.txt"
    robotParser = RobotFileParser()
    robotParser.set_url(robotsUrl)
    robotParser.read()
    result = robotParser.can_fetch("*", URL)
    return result
Example #7
    def http_open(self, req):

        url = req.get_full_url()
        host = urlsplit(url)[1]
        robots_url = urlunsplit(('http', host, '/robots.txt', '', ''))
        robotfileparser = RobotFileParser(robots_url)
        robotfileparser.read()
        if not robotfileparser.can_fetch(self.crawlername, url):
            raise RuntimeError('Forbidden by robots.txt')
        return urllib2.HTTPHandler.http_open(self, req)
Example #8
def checkRobots(URL):

	time.sleep(1)
	parsed = urlparse(URL)
	robotsUrl = parsed.scheme + "://"+ parsed.netloc+"/robots.txt"
	robotParser = RobotFileParser()
	robotParser.set_url(robotsUrl)
	robotParser.read()
	result = robotParser.can_fetch("*",URL)
	return result
Example #9
	def can_fetch(self,url):
		host,path=urlparse.urlparse(url)[1:3]
		if	(self.rules.has_key(host)):
			return self.rules[host].can_fetch(self.agent,url)
		else:
			rp=RobotFileParser()
			robot_url="http://"+host+"/robots.txt"
			rp.set_url(robot_url)
			rp.read()
			self.rules[host]=rp
			return rp.can_fetch(self.agent,url)	
Example #10
 def can_fetch(self, url):
     host, path = urlparse.urlparse(url)[1:3]
     if (self.rules.has_key(host)):
         return self.rules[host].can_fetch(self.agent, url)
     else:
         rp = RobotFileParser()
         robot_url = "http://" + host + "/robots.txt"
         rp.set_url(robot_url)
         rp.read()
         self.rules[host] = rp
         return rp.can_fetch(self.agent, url)
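The two variants above assume an object that carries an agent string and a rules cache; a minimal sketch of a wrapper class such a method could live in (the class name is hypothetical, the attribute names come from the snippets):

from robotparser import RobotFileParser
import urlparse

class RobotRules(object):  # hypothetical wrapper for the cached can_fetch() above
    def __init__(self, agent="*"):
        self.agent = agent  # user agent string checked against robots.txt
        self.rules = {}     # maps host -> RobotFileParser instance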
Example #11
 def check_robots(self, url):
     '''check the robots.txt in this url's domain'''
     hostname = urlparse(url).netloc
     if hostname not in self.domain_list.keys():      # no records in domain_list
         rp = RobotFileParser('http://%s/robots.txt' % hostname)
         print("%s: fetching %s" % (url, rp.url))
         try:
             rp.read()                                # get new robots.txt
         except IOError, e:                           # url's server not available (connection timeout)
             log.error(str(e))
             rp.disallow_all = True                   # reject all requests
         self.domain_list[hostname] = rp              # add domain entry into domain_list
Example #12
class HolidayScrapper:
    def __init__(self):
        self.rp = RobotFileParser()
        self.rp.set_url('https://www.timeanddate.com/robots.txt')
        self.rp.read()
        if not self.rp.can_fetch('WasThereAHoliday', init_url):
            raise RuntimeError('Scrapping forbidden due to robots.txt file')
        self.countries = self.get_countries(self.get_page(init_url))
        try:
            # removing entries which are not countries
            self.countries.remove('un')
        except ValueError:
            pass
        try:
            # removing entries which are not countries
            self.countries.remove('world')
        except ValueError:
            pass

    def get_data(self):
        all_data = pd.DataFrame()
        for cntr in self.countries:
            print 'Fetching data for ' + cntr
            try:
                url = 'https://www.timeanddate.com/holidays/' + cntr + '/2016#!hol=8389401'
                if not self.rp.can_fetch('WasThereAHoliday', url):
                    raise RuntimeError(
                        'Scrapping forbidden due to robots.txt file')
                soup = self.get_page('https://www.timeanddate.com/holidays/' +
                                     cntr + '/2016#!hol=8389401')
                html_table = soup.find('table')
                df_table = pd.read_html(str(html_table))[0]
                df_table['country'] = cntr
                all_data = all_data.append(df_table)
            except ValueError:
                print 'Problem occurred when fetching data for ' + cntr
                pass
        return all_data

    @staticmethod
    def get_page(url):
        page = requests.get(url, headers=headers)
        soup = BeautifulSoup(page.text, 'lxml')
        return soup

    @staticmethod
    def get_countries(soup):
        countries = []
        select_list = soup.find(id="co")
        for cntr in select_list.children:
            countries.append(cntr['value'])
        return countries
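HolidayScrapper also relies on module-level init_url and headers values that are not part of the snippet; a minimal sketch of what they might look like (the names come from the class above, the values are placeholders only):

# Hypothetical module-level configuration assumed by HolidayScrapper;
# the names come from the class above, the values are placeholders only.
init_url = 'https://www.timeanddate.com/holidays/'
headers = {'User-Agent': 'WasThereAHoliday'}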
Example #13
def robots_check(url):

    # creating url for robots.txt
    root_url = tld.get_tld(url)
    prefix = "http://www."
    suffix = "/robots.txt"
    robots_url = prefix + root_url + suffix

    # checking url validity
    rp = RobotFileParser()
    rp.set_url(robots_url)
    rp.read()
    return rp.can_fetch("*", url)
Example #14
 def try_add_robot(self, url):
     parsed_url = urlparse(url)
     if parsed_url.netloc not in self.robots:
         try:
             robot_url = parsed_url.scheme + '://' + parsed_url.netloc + \
                         '/robots.txt'
             rp = RobotFileParser(robot_url)
             rp.read()
             self.robots[parsed_url.netloc] = rp
         except IOError as e:
             print str(e)
         except Exception as e:
             print str(e)
Example #15
 def check_robots(self, url):
     '''check the robots.txt in this url's domain'''
     hostname = urlparse(url).netloc
     if hostname not in self.domain_list.keys():  # no records in domain_list
         rp = RobotFileParser('http://%s/robots.txt' % hostname)
         print("%s: fetching %s" % (url, rp.url))
         try:
             rp.read()  # get new robots.txt
         except IOError, e:  # url's server not available (connection timeout)
             log.error(str(e))
             rp.disallow_all = True  # reject all requests
         self.domain_list[hostname] = rp  # add domain entry into domain_list
Example #16
def can_read(url):

    domain = domain_name(url)
    if domain not in Permissions:
        rp = RobotFileParser()
        rp.set_url(urljoin('http://' + domain, 'robots.txt'))
        try:
            rp.read()
        except:
            return False

        Permissions[domain] = rp

    res = False
    try:
        res = Permissions[domain].can_fetch("*", url)
    except:
        return False

    return res
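can_read depends on a module-level Permissions cache and a domain_name helper that are not shown; a minimal sketch of those assumed pieces (Python 2, matching the surrounding examples):

# Assumed module-level pieces for can_read(); the names come from the snippet above.
from robotparser import RobotFileParser
from urlparse import urlparse, urljoin

Permissions = {}  # maps domain -> RobotFileParser, filled lazily by can_read()

def domain_name(url):
    # hypothetical helper: return the host portion of an absolute URL
    return urlparse(url).netloc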
Example #17
    def urlopen(self, host):
        robo_url = host.get_robots_url()

        print self.robotdict

        cached_parser = self.robotdict.get(robo_url)
        if cached_parser:
            logging.info("Found in Cache: " + robo_url)
        else:
            logging.info("Fetching: " + robo_url)
            cached_parser = RobotFileParser()
            self.robotdict.put(robo_url, cached_parser)
            cached_parser.set_url(robo_url)
            cached_parser.read()

        if cached_parser.can_fetch('*', host.get_url()):
            print 'Going to fetch:', host.get_url()
            return self.fetch_file(host.get_url())
        else:
            logging.info("Forbidden by Robots.txt")
            return None
Example #18
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""

    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)
Example #19
def _get_soup(path):
    """Gets soup from the given path, respecting robots.txt"""

    full_path = BASE_URL + path

    # Set a user-agent
    user_agent = 'dcnotify/%s' % __version__
    http_headers = {'User-Agent': '%s' % user_agent}

    # Honor robots.txt
    robots = RobotFileParser()
    robots.set_url("%s/robots.txt" % BASE_URL)
    robots.read()
    if not robots.can_fetch(user_agent, full_path):
        raise ValueError("Path disallowed by robots.txt")

    # Make a request, raising any HTTP errors that might occur
    request = get(full_path, headers=http_headers)
    request.raise_for_status()

    return bs(request.text)
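Both copies of _get_soup assume a few module-level names that are not shown: BASE_URL, __version__, get from requests, and bs as an alias for BeautifulSoup. A minimal sketch of those assumed bindings (values are placeholders):

from bs4 import BeautifulSoup as bs
from requests import get

BASE_URL = "https://example.com"  # placeholder site root
__version__ = "0.1"               # placeholder package version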
Example #20
class spider(object):
    CurLink = ""
    linknText = []
    headings = []

    def __init__(self, link):
        self.CurLink = link
        self.r = RobotFileParser()

    def crawl(self):
        self.r.set_url(urlparse.unquote(self.CurLink))
        self.r.read()

        self.html = urlopen(self.CurLink).read()
        self.bs = BeautifulSoup(self.html, "lxml")

        for i in self.bs.findAll("h1", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h2", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h3", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h4", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h5", text=True):
            self.headings.append(i.text)
        for i in self.bs.findAll("h6", text=True):
            self.headings.append(i.text)

        for link in self.bs.findAll('a', href=True):
            aLink = urlparse.urljoin(self.CurLink, link['href'])

            if (self.r.can_fetch("*", aLink)):
                self.linknText.append({
                    "URL": aLink,
                    "AnchorText": link.string
                })
Example #21
    def __init__(self, url):
        self.url = urlManip.cleanURL(url)
        self.pages = []
        self.suggestions = set()
        self.loaded = False
        logger.info("Loading %s..." % (self.url))
        try:
            requests.get(self.url)
            self.loaded = True
        except IOError as e:
            logger.error("%s cannot be loaded: %s" % (self.url, e))

        # if the website can be loaded
        if self.loaded == True:
            logger.info("Load successful. Generating suggestions...")

            # get robots.txt
            rp = RobotFileParser(self.url + "robots.txt")
            try:
                rp.read()
            except IOError:
                logger.warning("robots.txt cannot be found.")

            # get home page
            self.pages.append(Page(self.url))

            # get all pages on homepage
            self.pages[0].load()
            for link in self.pages[0].internalLinks:
                if rp.can_fetch("*", link):
                    if link[:4] == 'http':
                        self.pages.append(Page(link))
                    else:
                        self.pages.append(Page(self.url + link))
                else:
                    logger.debug("Ignoring %s based on robots.txt" % link)
Example #22
class Crawler():
	
	# Variables
	parserobots = False
	output 	= None
	report 	= False

	config 	= None
	domain	= ""

	exclude = []
	skipext = []
	drop    = []
	
	debug	= False

	tocrawl = set([])
	crawled = set([])
	excluded = set([])

	marked = {}

	# TODO also search for window.location={.*?}
	linkregex = re.compile(b'<a href=[\'|"](.*?)[\'"].*?>')

	rp = None
	response_code={}
	nb_url=1 # Number of url.
	nb_rp=0 # Number of url blocked by the robots.txt
	nb_exclude=0 # Number of url excluded by extension or word
	
	output_file = None

	target_domain = ""

	def __init__(self, parserobots=False, output=None, report=False ,domain="", exclude=[], skipext=[], drop=[], debug=False):
		self.parserobots = parserobots
		self.output 	= output
		self.report 	= report
		self.domain 	= domain
		self.exclude 	= exclude
		self.skipext 	= skipext
		self.drop		= drop
		self.debug		= debug

		if self.debug:
			logging.basicConfig(level=logging.DEBUG)

		self.tocrawl = set([domain])

		try:
			self.target_domain = urlparse.urlparse(domain)[1]
		except:
			raise ValueError("Invalid domain")


		if self.output:
			try:
				self.output_file = open(self.output, 'w')
			except:
				logging.debug ("Output file not available.")
				exit(255)

	def run(self):
		print (config.xml_header, file=self.output_file)

		logging.debug("Start the crawling process")

		while len(self.tocrawl) != 0:
			self.__crawling()

		logging.debug("Crawling as reach the end of all found link")

		print (config.xml_footer, file=self.output_file)


	def __crawling(self):
		crawling = self.tocrawl.pop()

		url = urlparse.urlparse(crawling)
		self.crawled.add(crawling)
		request = Request(crawling, headers={"User-Agent":config.crawler_user_agent})
		
		try:
			response = urlopen(request)
		except Exception as e:
			if hasattr(e,'code'):
				if e.code in self.response_code:
					self.response_code[e.code]+=1
				else:
					self.response_code[e.code]=1

				# Gestion des urls marked pour le reporting
				if self.report:
					if e.code in self.marked:
						self.marked[e.code].append(crawling)
					else:
						self.marked[e.code] = [crawling]

			logging.debug ("{1} ==> {0}".format(e, crawling))
			return self.__continue_crawling()

		# Read the response
		try:
			msg = response.read()
			if response.getcode() in self.response_code:
				self.response_code[response.getcode()]+=1
			else:
				self.response_code[response.getcode()]=1

			response.close()

			# Get the last modify date
			if 'last-modified' in response.headers:
				date = response.headers['Last-Modified']
			else:
				date = response.headers['Date']

			date = datetime.strptime(date, '%a, %d %b %Y %H:%M:%S %Z')

		except Exception as e:
			logging.debug ("{1} ===> {0}".format(e, crawling))
			return None


		print ("<url><loc>"+url.geturl()+"</loc><lastmod>"+date.strftime('%Y-%m-%dT%H:%M:%S+00:00')+"</lastmod></url>", file if file else self.output_file)
		if self.output_file:
			self.output_file.flush()

		# Found links
		links = self.linkregex.findall(msg)
		for link in links:
			link = link.decode("utf-8")
			#logging.debug("Found : {0}".format(link))		
			if link.startswith('/'):
				link = 'http://' + url[1] + link
			elif link.startswith('#'):
				link = 'http://' + url[1] + url[2] + link
			elif not link.startswith('http'):
				link = 'http://' + url[1] + '/' + link
			
			# Remove the anchor part if needed
			if "#" in link:
				link = link[:link.index('#')]

			# Drop attributes if needed
			for toDrop in self.drop:
				link=re.sub(toDrop,'',link)

			# Parse the url to get domain and file extension
			parsed_link = urlparse.urlparse(link)
			domain_link = parsed_link.netloc
			target_extension = os.path.splitext(parsed_link.path)[1][1:]

			if (link in self.crawled):
				continue
			if (link in self.tocrawl):
				continue
			if (link in self.excluded):
				continue
			if (domain_link != self.target_domain):
				continue
			if ("javascript" in link):
				continue
			
			# Count one more URL
			self.nb_url+=1

			# Check if the navigation is allowed by the robots.txt
			if (not self.can_fetch(link)):
				self.exclude_link(link)
				self.nb_rp+=1
				continue

			# Check if the current file extension is allowed or not.
			if (target_extension in self.skipext):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			# Check if the current url doesn't contain an excluded word
			if (not self.exclude_url(link)):
				self.exclude_link(link)
				self.nb_exclude+=1
				continue

			self.tocrawl.add(link)
			
		return None

	def __continue_crawling(self):
		if self.tocrawl:
			self.__crawling()

	def exclude_link(self,link):
		if link not in self.excluded:
			self.excluded.add(link)

	def checkRobots(self):
		if self.domain[len(self.domain)-1] != "/":
			self.domain += "/"
		request = Request(self.domain+"robots.txt", headers={"User-Agent":config.crawler_user_agent})
		self.rp = RobotFileParser()
		self.rp.set_url(self.domain+"robots.txt")
		self.rp.read()

	def can_fetch(self, link):
		try:
			if self.parserobots:
				if self.rp.can_fetch("*", link):
					return True
				else:
					logging.debug ("Crawling of {0} disabled by robots.txt".format(link))
					return False

			if not self.parserobots:
				return True

			return True
		except:
			# On error continue!
			logging.debug ("Error during parsing robots.txt")
			return True

	def exclude_url(self, link):
		for ex in self.exclude:
			if ex in link:
				return False
		return True

	def make_report(self):
		print ("Number of found URL : {0}".format(self.nb_url))
		print ("Number of link crawled : {0}".format(len(self.crawled)))
		if self.parserobots:
			print ("Number of link block by robots.txt : {0}".format(self.nb_rp))
		if self.skipext or self.exclude:
			print ("Number of link exclude : {0}".format(self.nb_exclude))

		for code in self.response_code:
			print ("Nb Code HTTP {0} : {1}".format(code, self.response_code[code]))

		for code in self.marked:
			print ("Link with status {0}:".format(code))
			for uri in self.marked[code]:
				print ("\t- {0}".format(uri))
Example #23
class SiteMap():

    def __init__(self, main_page=None, robotrules=True):
        """
            Constructor method that initializes the members that are used during the crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """

        logging.info("Consider Robot.txt ? ==> "+str(robotrules))
        self.robotrules = robotrules
        self.site_map = {}                          # map that records the visits of urls, datemodified and assets
        self.network = {}                           # map that maintains the network/graph of webpages visited
                                                    # The intention of this map is for visual rendering using d3.js

        self.unvisited = set([])                    # a set to keep the list of urls yet to be visited
        self.start_page = None                       # the root page, this is used to avoid cycle and keeping crawl
                                                    # process limited to single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:"+main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
            This method holds the invoking control of the crawler method and drives the crawling process.
            Basically a BFS style method that keeps popping the elements from the queue [self.unvisited set]
            and scraping the urls.

            Once the crawling process is done, this creates sitemap using the self.site_map dictionary with
            just url, date-modified tags with dummy frequency and priorities.
        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()
        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
            xmlns:xhtml="http://www.w3.org/1999/xhtml"
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
            http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
        """
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
                 \t\t<loc>%s</loc>\n\
                 \t\t<lastmod>%s</lastmod>\n\
                 \t\t<changefreq>monthly</changefreq>\n\
                 \t\t<priority> 1 </priority>\n\
                 \t</url>\
        "

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"

        xml += footer
        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
            A utility method to just write the contents of the file into a given file name.
            Alert: This overwrites if the file does exist in the current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content:   contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
            There are different ways a href could specify a location and it varies in different ways based on how
            the page is designed. This method takes few styles into consideration and ignores some, cleans and creates
            a valid url link so as to keep it ready for the crawl method.
        :param url:   base url of the current page
        :param href:  one of the hyper links of the page
        :return:      a well formed and valid http link
        """
        if href.startswith('/'):
            return "http://%s%s"%(url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s"%(url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s"%(url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href

        return href

    def get_out_going_edges(self, url, html_body):
        """
            This method encompasses the BFS along with the coupling with crawl and generator as it changes the state
            of the unvisited map. Basically this method extracts the links that belong to the same domain as the start
            page, cleans them with compose_url_from_href method and updates the map. This also avoids unnecessary traps
            like href links pointing to 'javascript', 'mailto' etc.
        :param url:         current page url
        :param html_body:   current page's html content
        :return:            returns all the valid and well-formed outgoing links from this page
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):

            href = a['href']
            href = self.compose_url_from_href(url, href.decode("utf-8"))

            # clean the href so that it will have legitimate urls instead of #cluttered ones and q=param prints
            href = urldefrag(href)[0]  # skip intra links [this took time to find out !] ##1
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]  ##2

            new_page = urlparse(href)

            # add to the queue only if it doesn't cause a cycle
            # assumption: if a link ends with domain.com, assuming it can be crawled to make sitemap complete
            if  not str(new_page.netloc).endswith(self.start_page):          # doesn't belong to domain
                continue

            if  self.robot_allows(href) and \
                not href in self.site_map.keys()            and \
                not href in self.unvisited                  and \
                not 'javascript:' in href                   and \
                not 'mailto:' in href:
                self.unvisited.add(href)
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page

    def record_visit(self, url, headers, html_body):
        """
            Any time a specific url of a site is changed, its last-modified date and time are kept in the page headers.
            This info helps bots and crawlers to not to crawl the page if it has not been updated since last crawl.
            This method is used to preserve the url crawled and its last-modified time along with assets scraped into
            the container dictionary for later usage to generate sitemap and visualization network.
        :param url:         url of the just finished crawling page
        :param headers:     header information of the crawled page
        :param html_body:   html content of the page
        :return:            None
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']

        self.site_map[url] = {
            'date': date,
            'assets': self.get_static_assets(html_body)
        }

    def get_static_assets(self, html_body):
        """
            An HTML page could contain other links such as .css, .img, .mp4 and .js. All these files are not dynamic
            though they could produce dynamic results. The code or text that exists in these files is constant and
            static. These files are referred as static assets and for the definition of this challenge, I have chosen
            to keep all the info in a single dictionary and extract them at the end for reports, results and stats.
        :param html_body:       html content of the page.
        :return:                returns a dictionary that encompasses .css, .img, .js files as lists.
        """
        # add static assets of the page .css, .js and image urls may be ?
        soup = BeautifulSoup(html_body, "html.parser")
        img = soup.findAll("img")
        css = soup.findAll("link", {"rel": "stylesheet"})

        # js is tricky: I faced an issue with inline javascript and ignoring it for the time being.
        # an extract like html_body with just needed parts is a must for excluding inline scripts and styles.
        jss = []
        for x in soup.findAll('script'):
            try:
                jss.append(x['src'])  # only external scripts carry a 'src' attribute
            except KeyError:
                pass

        csss = []
        imgs = []
        for link in css:
            csss.append(link['href'])
        for link in img:
            imgs.append(link['src'])

        return {
                'css': csss,
                'img': imgs,
                'js':  jss
        }

    def crawl(self):
        """
            The main driver method that crawls the pages. This main does below steps:
            for every unvisited [vertex|page] that belongs to the requested domain:
                crawl the page
                record valid links and their last-modified-dates
        :return:   None
        """
        page = self.unvisited.pop()
        # if robot.txt is defined, use Disallow to avoid pages. domain.robot.txt doesn't exist so the crawler
        # must find all the pages for report.
        logging.info("Starting to Crawl Page: " + page)

        url = urlparse(page)
        try:
            response = urlopen(page)
        except:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read() # response.getcode()
            response.close()
            # record visit and assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(len(self.unvisited), len(self.site_map)))
        except:
            logging.debug("Issue while opening url: " + page)
            return None
        connects = self.get_out_going_edges(url, html_body)

        # simple Graph that keeps the order of the pages crawled.
        for i, url in enumerate(connects):
            self.network[page] = {
                'to': connects,
                'assets': {
                    'css': self.site_map[page]['assets']['css'],
                    'js':  self.site_map[page]['assets']['js'],
                    'img': self.site_map[page]['assets']['img']
                }
            }
        return None

    def get_site_map(self):
        """
            Returns the compiled sitemap structure
        :return:       sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
            Returns the compiled network in the order of the crawled pages
        :return:       network graph
        """
        return self.network

    def get_network_json_format(self):
        """
            Returns the crawl traverse order sequence in json format
        :return:       network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
            This could be useful if one is testing
        :param url: start page to start the crawling.
        :return:
        """
        self.start_page = url

    def robot_allows(self, link):
        if not self.robotrules: return True
        try:
            if self.robot_txt_rules.can_fetch("*", link):
                    return True
            return False
        except:
            return True
Example #24
class spider(object):
    CurLink = ""
    linkURI = []
    texts = []
    Meta = {}

    def __init__(self, link):
        self.CurLink = link
        self.r = RobotFileParser()

    def crawl(self):
        self.r.set_url(urlparse.unquote(self.CurLink))
        self.r.read()

        self.html = urlopen(self.CurLink).read()
        self.bs = BeautifulSoup(self.html, "lxml")

        for script in self.bs(["script", "style"]):
            script.extract()
        text = self.bs.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines
                  for phrase in line.split("  "))
        for chunk in chunks:
            if chunk:
                self.texts.append(chunk)

        # site = urlparse.urlsplit(self.CurLink).scheme + "://" + urlparse.urlsplit(self.CurLink).netloc + "/sitemap.aspx"
        # r = requests.get(site)
        if requests.get(
                urlparse.urlsplit(self.CurLink).scheme + "://" +
                urlparse.urlsplit(self.CurLink).netloc +
                "/sitemap.aspx").ok == True:
            root = etree.fromstring(
                requests.get(
                    urlparse.urlsplit(self.CurLink).scheme + "://" +
                    urlparse.urlsplit(self.CurLink).netloc +
                    "/sitemap.xml").content)
            for sitemap in root:
                children = sitemap.getchildren()
                self.linkURI.append(children[0].text)
        elif requests.get(
                urlparse.urlsplit(self.CurLink).scheme + "://" +
                urlparse.urlsplit(self.CurLink).netloc +
                "/sitemap.xml").ok == True:
            root = etree.fromstring(
                requests.get(
                    urlparse.urlsplit(self.CurLink).scheme + "://" +
                    urlparse.urlsplit(self.CurLink).netloc +
                    "/sitemap.xml").content)
            for sitemap in root:
                children = sitemap.getchildren()
                self.linkURI.append(children[0].text)
        else:
            for link in self.bs.findAll('a', href=True):
                aLink = urlparse.urljoin(self.CurLink, link['href'])

                if (self.r.can_fetch("*", aLink)):
                    self.linkURI.append(aLink)

        page = metadata_parser.MetadataParser(url=self.CurLink)
        meta = page.metadata

        keyw = "null"
        descr = "null"
        if (meta.get('meta').get('Keywords')):
            keyw = meta['meta']['Keywords'].split(', ')

        if (meta.get('meta').get('Description')):
            descr = meta['meta']['Description']

        self.Meta = {
            'title': meta['page']['title'],
            'url': meta['_internal']['url_actual'],
            'description': descr,
            'keyword': keyw
        }
Example #25
class SimpleCrawler:

  USER_AGENT = 'SimpleCrawler/0.1'
  HEADERS = {
    'User-Agent': USER_AGENT,
    'Accept-Encoding': 'gzip',
    'Connection': 'keep-alive'
    }
  CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)
  
  def __init__(self, starturl, index_html='', maxlevel=1,
               cookie_file=None, acldb=None, urldb=None, default_charset=None,
               delay=0, timeout=300, debug=0):
    (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
    assert proto == 'http'
    #Thread.__init__(self)
    self.debug = debug
    self.index_html = index_html
    if cookie_file:
      self.cookiejar = MozillaCookieJar(cookie_file)
      self.cookiejar.load()
    else:
      self.cookiejar = None
    self.robotstxt = RobotFileParser()
    self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
    self.robotstxt.read()
    self.conn = None
    self.urldb = urldb
    self.acldb = acldb
    self.curlevel = 0
    self.delay = delay
    self.timeout = timeout
    self.default_charset = default_charset
    if starturl.endswith('/'):
      starturl += self.index_html
    self.urls = [(starturl, maxlevel)]
    self.crawled = {}                   # 1:injected, 2:crawled
    return

  def accept_url(self, url):
    if url.endswith('/'):
      url += self.index_html
    if self.acldb and not self.acldb.allowed(url):
      return None
    return url
  
  def inject_url(self, url):
    if (not self.curlevel) or (not url) or (url in self.crawled): return False
    if not self.robotstxt.can_fetch(self.USER_AGENT, url):
      if self.debug:
        print >>stderr, 'DISALLOW: %r' % url
      return None
    if self.debug:
      print >>stderr, 'INJECT: %r' % url
    self.crawled[url] = 1
    self.urls.append((url, self.curlevel-1))
    return True

  def get1(self, url, maxretry=3, maxredirect=3):
    if self.debug:
      print >>stderr, 'GET: %r' % url
    # loop
    for rtry in range(maxredirect):
      # forge urllib2.Request object.
      req = Request(url)
      # add cookie headers if necessary.
      if self.cookiejar:
        self.cookiejar.add_cookie_header(req)
        headers = req.unredirected_hdrs
        headers.update(self.HEADERS)
      else:
        headers = self.HEADERS
      # get response.
      for ctry in range(maxretry):
        try:
          if not self.conn:
            print >>stderr, 'Making connection: %r...' % (self.hostport,)
            self.conn = HTTPConnection(self.hostport)
          self.conn.request('GET', req.get_selector().replace(' ',''), '', headers)
          self.conn.sock.settimeout(self.timeout)
          resp = self.conn.getresponse()
          break
        except BadStatusLine, x:
          # connection closed unexpectedly
          print >>stderr, 'Connection closed unexpectedly.'
          # it restarts the connection...
          self.conn.close()
          self.conn = None
        except socket.error, x:
          # connection closed unexpectedly
          print >>stderr, 'Socket error:', x
          self.conn.close()
          self.conn = None
      else:
Example #26
class Webpage(object):
    """
    Objects that refer to individual webpages. If the url is scrapeable the
    object will be filled with that data, indexed, and inserted into a database
    to be searched.
    """
    number_of_scraped_pages = 0

    def __init__(self, url):
        """
        Creates a webpage object and assigns it the provided url.
        """
        self.url = url
        if self.url not in black_list and self.url not in scraped_urls:
            self.needs_to_be_scraped = True
        else:
            self.needs_to_be_scraped = False

    def page_robot_scannable(self):
        """
        Checks whether the page is allowed to be crawled
        """
        if self.needs_to_be_scraped is True:
            # REFACTOR to remove try statement.
            try:
                headers = {'User-agent':settings.SPIDER_USER_AGENT}
                self.urlparse = urlparse.urlparse(self.url)
                self.robotcheck = RobotFileParser()
                self.robotcheck.set_url('http://'+self.urlparse[1]+'/robots.txt') # Only works with http right now.
                self.robotcheck.read()
                self.needs_to_be_scraped = self.robotcheck.can_fetch(settings.SPIDER_USER_AGENT, self.url)
            except:
                self.needs_to_be_scraped = False

    def get_page(self):
        """
        The url is requested with a GET request. The page html is scraped
        directly, while elements of it are scraped in parse_page
        """
        self.headers = {'User-agent':settings.SPIDER_USER_AGENT}
        #REFACTOR to remove try
        try:
            self.request = requests.get(self.url, headers=self.headers)
            self.pagehtml = BeautifulSoup(self.request.text) #REFACTOR, don't use BeautifulSoup
            self.count = self.instanceID.next()
            Webpage.number_of_scraped_pages += 1
        except:
            raise Exception

    def get_visible_elements(self, element):
        """
        Checks that the element is not contained in <style>, <script>, <head>,
        <title> or [document]. It also cannot be commented out.
        """
        if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
            return False
        elif re.match('<!--.*-->', str(element)):
            return False
        return True

    def parse_page(self):
        """
        This method parses the HTML page and extracts the title of the page,
        the outgoing links, the number of outgoing links, and the text.
        """
        self.title = self.pagehtml.find('title').text
        self.page_text = self.pagehtml.findAll(text=True)

        for item in filter(get_visible_elements, self.pagetext):
            if item != '\n':
                self.pagetext+= item
        self.pagelinks = {}

        for link in self.pagehtml.findAll('a'):
            self.pagelinks[link.get('href')] = 1

        for link in self.pagehtml:
            pass

        # determine if link is relative or absolute. if relative, change it to absolute

    def inverted_index_page_text(self):
        """
        Iterates through the words in the page text and creates and adds them
        to an index.
        """
        self.pagetextlist = self.pagetext.split(' ') #Noted error: This catches punctuation along with words.
        for index, word in enumerate(self.pagetextlist):
            if word not in STOP_WORDS:
                if not inverted_index.get(word):
                    inverted_index[word]={'url':self.url,'offsets':[index]}
                else:
                    inverted_index[word]['offsets'].append(index)

    def set_page_scraped(self):
        """
        Once the page is scraped it is flagged as such
        """
        self.needs_to_be_scraped = False
Example #27
 def _create_robot_file_parser(self, url):
     host = urlparse.urlsplit(url)[1]
     robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
     rp = RobotFileParser(robots_url)
     rp.read()
     return rp
Example #28
class MarioDepth:
    def __init__(self, starturl, callback, callpre=None, callfail=None, concount=MAXCONCOUNT, depth=2, accept_url_patterns=None, reject_url_patterns=None):
        self.concount = concount
        self.callback = callback
        self.callpre = callpre
        self.callfail = callfail
        self.depth = depth
        self.starturl = starturl
        self.baseurl = URL.baseurl(starturl)
        self.urls = []
        self.crawled = {}
        self.link_title_db = LinkTitleDB()
        self.accept_url_patterns = accept_url_patterns
        self.reject_url_patterns = reject_url_patterns
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.referer = starturl
        try:
            self.robotstxt.read()
        except:
            logger.debug(Traceback())
        #self.lightcloud = LightCloud.connect('n0')
    
    def __call__(self, n=None):
        if n: self.concount = n
        current_depth = self.depth
        self.urls.append((self.starturl, current_depth))
        while self.urls:
            self.depth_get()
            logger.debug('%d unprocessed urls'%(len(self.urls)))
    
    def depth_get(self):
        mario = MarioBatch(callback=self.next_depth, callpre=self.callpre, callfail=self.callfail)
        pool = coros.CoroutinePool(max_size=len(self.urls))
        while self.urls:
            waiters = []
            #self.add_job(mario)
            counter = 0
            while self.urls:
                if counter > 9: break;
                counter += 1
                waiters.append(pool.execute(self.add_job, mario))
            logger.debug('Depth break')
            for waiter in waiters:
                waiter.wait()
            mario(self.concount)
    
    def add_job(self, mario):
        if not self.urls: return
        url, depth = self.urls.pop()
        if self.visited(url, depth): return
        mario.add_job(url, args=depth)
        
    def visited(self, url, depth):
        #is_duplicate = URL.is_duplicate(url, self.lightcloud)
        return depth==0 and is_duplicate or depth < self.depth and self.crawled.has_key(url) and self.crawled[url] == 2
    
    def next_depth(self, response):
        #with_timeout(1, self.lightcloud.set, LightCloud.crawled_url_key(response.effective_url), response.url, timeout_value=None)
        for link, title in URL.link_title(response.body, response.effective_url):
            if not self.inject_url(link, response.args):continue
            self.link_title_db.add(link, response.effective_url, title)
        if callable(self.callback): self.callback(response)
        self.crawled[response.effective_url] = 2
        if response.effective_url != response.url:
            self.crawled[response.url] = 2
        self.referer = response.effective_url
    
    def inject_url(self, url, depth):
        if not (depth and url and url not in self.crawled): 
            #logger.debug('IGNORE(%d): %r'%(depth, url))
            return None
        if isinstance(url, unicode): url = url.encode('utf-8')
        if self.reject_url(url): 
            logger.debug('REJECT: %r' % url)
            return None
        try:
            can_fetch = self.robotstxt.can_fetch(USER_AGENT['safari'], url)
        except:
            can_fetch = True
        if self.baseurl!='http://hi.baidu.com/' and not can_fetch:
            logger.debug('DISALLOW: %r' % url)
            return None
        logger.debug('INJECT(%d): %r' % (depth-1, url))
        self.crawled[url] = 1
        self.urls.append((url, depth-1))
        return True
    
    def reject_url(self, url):
        return self.baseurl != URL.baseurl(url) and (not self.accept_url_patterns or not re.match('|'.join(self.accept_url_patterns), url) or self.reject_url_patterns or re.match('|'.join(self.reject_url_patterns), url))
        
Example #29
class WebPage(object):
    def __init__(self, url):
        self.page_url = url
        self.parsed_url = urlparse.urlparse(url)
        self.lang = ""
        self.isDownload = False
        self.title = ""
        self.text = ""
        self.soup = None
        self.robot = RobotFileParser()

    def __normalize_link__(self, link):
        if not link:
            return None
        if link.startswith('//'):
            return self.parsed_url.scheme + ':' + link
        elif link.startswith('/'):
            return self.parsed_url.scheme + '://' + self.parsed_url.hostname + link
        elif link.startswith('http://') or link.startswith('https://'):
            return link
        elif link.startswith("irc://"):
            return None
        elif link.startswith('#') or link.startswith('javascript:'):
            return None
        else:
            return urlparse.urljoin(self.page_url, link)

    def __delete_unnecessary_tags(self):
        if self.soup is None:
            return

        if self.soup.title is None:
            self.title = ""
        else:
            self.title = self.soup.title.string

        for tag in self.soup(
            ['style', 'script', '[document]', 'head', 'title']):
            tag.decompose()

    def __get_stems(self, text):
        if self.lang in LANGUAGES:
            stemer = snowballstemmer.stemmer(LANGUAGES[self.lang])
        else:
            raise NotImplementedError("That lang not implemented")
        stems_dict = dict()

        for char in [",", ". ", "!", "?", " - ", "/n"]:
            text = text.replace(char, " ")

        for word in text.split():
            stem_word = stemer.stemWord(word.lower())
            if stem_word in stems_dict:
                stems_dict[stem_word] += 1
            else:
                stems_dict[stem_word] = 1

        return stems_dict

    def download_page(self):
        try:
            self.robot.set_url("{0}://{1}/robots.txt".format(
                self.parsed_url.scheme, self.parsed_url.hostname))
            self.robot.read()
            if self.robot.can_fetch("*", self.page_url):
                response = requests.get(self.page_url, verify=False)
            else:
                return False
        except requests.exceptions.InvalidSchema:
            return False
        except KeyError:
            return False
        except Exception:
            return False

        if response.status_code == 200:
            self.soup = BeautifulSoup(response.text, "html.parser")
            self.__delete_unnecessary_tags()
            self.text = "".join(self.soup.strings)
            try:
                self.lang = detect(self.text)
            except Exception:
                self.lang = "en"
            self.isDownload = True
            return True
        else:
            return False

    def get_links(self):
        if not self.isDownload:
            raise Exception("You should download page")

        def get_links_generator():
            for link in self.soup.find_all("a"):
                normalized_link = self.__normalize_link__(link.get("href"))
                if normalized_link is None:
                    continue
                else:
                    yield normalized_link

        return get_links_generator()

    def get_text_stems(self):
        if not self.isDownload:
            raise Exception("You should download page")
        return self.__get_stems(self.text)

    def get_title_stems(self):
        if not self.isDownload:
            raise Exception("You should download page")
        return self.__get_stems(self.title)

    def get_domain(self):
        return self.parsed_url.hostname
Example #30
	def test_parse(self):
		from robotparser import RobotFileParser
		rules=RobotFileParser()
		rules.set_url("http://www.sogou.com/robots.txt")
		rules.read()
		self.assertEqual(rules.can_fetch("mozilla","http://www.sogou.com/sohu/robots.txt"),False)
Example #31
class SiteMap():
    def __init__(self, main_page=None, robotrules=True):
        """
            Constructor method that initializes the members that are used during the crawling process
        :param main_page: The root page that needs to be crawled for generation of sitemap
        """

        logging.info("Consider Robot.txt ? ==> " + str(robotrules))
        self.robotrules = robotrules
        self.site_map = {
        }  # map that records the visits of urls, datemodified and assets
        self.network = {
        }  # map that maintains the network/graph of webpages visited
        # The intention of this map is for visual rendering using d3.js

        self.unvisited = set(
            [])  # a set to keep the list of urls yet to be visited
        self.start_page = None  # the root page, this is used to avoid cycle and keeping crawl
        # process limited to single domain.
        self.robot_txt_rules = None

        if main_page:
            self.unvisited.add(main_page)
            try:
                self.start_page = urlparse(main_page).netloc
            except:
                logging.error("Improper URL, Please provide a Valid Url:" +
                              main_page)
                exit(0)

        if self.robotrules == "True":
            try:
                logging.info("robot.txt respected")
                self.robot_txt_rules = RobotFileParser()
                self.robot_txt_rules.set_url(main_page + "/robots.txt")
                self.robot_txt_rules.read()
            except:
                logging.error("Unable to read the robot.txt file")
                self.robotrules = False  # error reading robot.txt, ignore it forever

    @timeit
    def generate(self, site_map=None):
        """
            This method holds the invoking control of the crawler method and drives the crawling process.
            Basically a BFS style method that keeps popping the elements from the queue [self.unvisited set]
            and scraping the urls.

            Once the crawling process is done, this creates sitemap using the self.site_map dictionary with
            just url, date-modified tags with dummy frequency and priorities.
        :param site_map: name of the site_map file so as to create xml entries.
        :return:
        """
        while self.unvisited:
            self.crawl()
        # create xml from the site_map dictionary
        header = """<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
            xmlns:xhtml="http://www.w3.org/1999/xhtml"
            xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
            xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9
            http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
        """
        footer = """\n</urlset>\n"""
        entry = "\t<url>\n\
                 \t\t<loc>%s</loc>\n\
                 \t\t<lastmod>%s</lastmod>\n\
                 \t\t<changefreq>monthly</changefreq>\n\
                 \t\t<priority> 1 </priority>\n\
                 \t</url>\
        "

        xml = header
        for url in self.site_map.keys():
            xml += entry % (url, self.site_map[url]['date']) + "\n"

        xml += footer
        if site_map != None:
            self.write_to_file(site_map, xml)
        else:
            self.write_to_file("sitemap.xml", xml)
        return xml

    def write_to_file(self, file_name, content):
        """
            A utility method to just write the contents of the file into a given file name.
            Alert: This overwrites if the file does exist in the current directory.
        :param file_name: name of the file, sitemap in our case.
        :param content:   contents of the file
        :return: None
        """
        f = open(file_name, 'w')
        f.write(content)
        f.close()

    def compose_url_from_href(self, url, href):
        """
            There are different ways a href could specify a location and it varies in different ways based on how
            the page is designed. This method takes few styles into consideration and ignores some, cleans and creates
            a valid url link so as to keep it ready for the crawl method.
        :param url:   base url of the current page
        :param href:  one of the hyper links of the page
        :return:      a well formed and valid http link
        """
        if href.startswith('/'):
            return "http://%s%s" % (url.netloc, href)
        elif href.startswith('#'):
            return "http://%s%s%s" % (url.netloc, url.path, href)
        elif href.startswith('./'):
            return "http://%s%s" % (url.netloc, href[1:])
        elif not href.startswith('http'):
            return "http://" + url.netloc + '/' + href

        return href

    def get_out_going_edges(self, url, html_body):
        """
            This method encompasses the BFS along with the coupling with crawl and generator as it changes the state
            of the unvisited map. Basically this method extracts the links that belong to the same domain as the start
            page, cleans them with compose_url_from_href method and updates the map. This also avoids unnecessary traps
            like href links pointing to 'javascript', 'mailto' etc.
        :param url:         current page url
        :param html_body:   current page's html content
        :return:            returns all the valid and well-formed outgoing links from this page
        """
        soup = BeautifulSoup(html_body, "html.parser")
        valid_links_for_this_page = []
        for a in soup.find_all('a', href=True):

            href = a['href']
            href = self.compose_url_from_href(url, href.decode("utf-8"))

            # clean the href so that it will have legitimate urls instead of #cluttered ones and q=param prints
            href = urldefrag(href)[
                0]  # skip intra links [this took time to find out !] ##1
            # remove query params as only the path matters
            if href.find('?') != -1:
                href = href[:href.find('?')]  ##2

            new_page = urlparse(href)

            # add to the queue only if it doesn't cause a cycle
            # assumption: if a link ends with domain.com, assuming it can be crawled to make sitemap complete
            if not str(new_page.netloc).endswith(
                    self.start_page):  # doesn't belong to domain
                continue

            if  self.robot_allows(href) and \
                not href in self.site_map.keys()            and \
                not href in self.unvisited                  and \
                not 'javascript:' in href                   and \
                not 'mailto:' in href:
                self.unvisited.add(href)
                valid_links_for_this_page.append(href)

        return valid_links_for_this_page

    def record_visit(self, url, headers, html_body):
        """
            Any time a specific url of a site is changed, its last-modified date and time are kept in the page headers.
            This info helps bots and crawlers to not to crawl the page if it has not been updated since last crawl.
            This method is used to preserve the url crawled and its last-modified time along with assets scraped into
            the container dictionary for later usage to generate sitemap and visualization network.
        :param url:         url of the just finished crawling page
        :param headers:     header information of the crawled page
        :param html_body:   html content of the page
        :return:            None
        """
        if 'last-modified' in headers:
            date = headers['Last-Modified']
        else:
            date = headers['Date']

        self.site_map[url] = {
            'date': date,
            'assets': self.get_static_assets(html_body)
        }

    def get_static_assets(self, html_body):
        """
            An HTML page could contain other links such as .css, .img, .mp4 and .js. All these files are not dynamic
            though they could produce dynamic results. The code or text that exists in these files is constant and
            static. These files are referred as static assets and for the definition of this challenge, I have chosen
            to keep all the info in a single dictionary and extract them at the end for reports, results and stats.
        :param html_body:       html content of the page.
        :return:                returns a dictionary that encompasses .css, .img, ijs files as lists.
        """
        # collect the page's static assets: stylesheets, images and external scripts
        soup = BeautifulSoup(html_body, "html.parser")
        img = soup.findAll("img")
        css = soup.findAll("link", {"rel": "stylesheet"})

        # js is tricky: inline javascript has no 'src' attribute, so only external scripts are collected here.
        jss = []
        for x in soup.findAll('script'):
            try:
                jss.append(x['src'])
            except KeyError:
                # inline <script> blocks carry no src attribute
                pass

        csss = [link['href'] for link in css]
        imgs = [link['src'] for link in img]

        return {'css': csss, 'img': imgs, 'js': jss}

    def crawl(self):
        """
            The main driver method that crawls the pages. This main does below steps:
            for every unvisited [vertex|page] that belongs to the requested domain:
                crawl the page
                record valid links and their last-modified-dates
        :return:   None
        """
        page = self.unvisited.pop()
        # if robots.txt is defined, honour its Disallow rules to skip pages; if the domain has no robots.txt,
        # the crawler must find all the pages for the report.
        logging.info("Starting to Crawl Page: " + page)

        url = urlparse(page)
        try:
            response = urlopen(page)
        except Exception:
            logging.debug("Issue with the url: " + page)
            return None
        try:
            html_body = response.read()  # response.getcode()
            response.close()
            # record visit and assets
            self.record_visit(page, response.headers, html_body)
            logging.debug("Queued Pages: {0}, Crawled Pages: {1}".format(
                len(self.unvisited), len(self.site_map)))
        except Exception:
            logging.debug("Issue while opening url: " + page)
            return None
        connects = self.get_out_going_edges(url, html_body)

        # simple graph that keeps the order of the pages crawled; one record per page, no loop over the links needed.
        self.network[page] = {
            'to': connects,
            'assets': {
                'css': self.site_map[page]['assets']['css'],
                'js': self.site_map[page]['assets']['js'],
                'img': self.site_map[page]['assets']['img']
            }
        }
        return None

    def get_site_map(self):
        """
            Returns the compiled sitemap structure
        :return:       sitemap data structure
        """
        return self.site_map

    def get_network_graph(self):
        """
            Returns the compiled network in the order of the crawled pages
        :return:       network graph
        """
        return self.network

    def get_network_json_format(self):
        """
            Returns the crawl traverse order sequence in json format
        :return:       network in json format
        """
        return json.dumps(self.network)

    def set_start_page(self, url):
        """
            This could be useful when testing with a different start page.
        :param url: start page from which to start the crawling.
        :return:    None
        """
        self.start_page = url

    def robot_allows(self, link):
        if not self.robotrules:
            return True
        try:
            return self.robot_txt_rules.can_fetch("*", link)
        except Exception:
            return True
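
Side note: a minimal, standalone sketch of the href normalization above done with the standard
library instead of manual string concatenation (Python 3 names shown; in Python 2 the same
functions live in the urlparse module). The function name normalize_href is only illustrative.

from urllib.parse import urljoin, urldefrag, urlparse

def normalize_href(page_url, href):
    """Resolve a relative href against the current page and strip fragments and query params."""
    absolute = urljoin(page_url, href)   # handles '', '#...', './...' and bare relative paths
    absolute = urldefrag(absolute)[0]    # drop intra-page fragments
    parsed = urlparse(absolute)
    return "%s://%s%s" % (parsed.scheme, parsed.netloc, parsed.path)

# normalize_href("http://example.com/a/b.html", "./c.html?q=1#top")
# -> "http://example.com/a/c.html"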
Beispiel #32
0
class SimpleCrawler:

    USER_AGENT = 'SimpleCrawler/0.1'
    HEADERS = {
        'User-Agent': USER_AGENT,
        'Accept-Encoding': 'gzip',
        'Connection': 'keep-alive'
    }
    CONTENT_TYPE_PAT = re.compile(r'([^\s;]+)(.*charset=([^\s;]+))?', re.I)

    def __init__(self,
                 starturl,
                 index_html='',
                 maxlevel=1,
                 cookie_file=None,
                 acldb=None,
                 urldb=None,
                 default_charset=None,
                 delay=0,
                 timeout=300,
                 debug=0):
        (proto, self.hostport, _x, _y, _z) = urlsplit(starturl)
        assert proto == 'http'
        #Thread.__init__(self)
        self.debug = debug
        self.index_html = index_html
        if cookie_file:
            self.cookiejar = MozillaCookieJar(cookie_file)
            self.cookiejar.load()
        else:
            self.cookiejar = None
        self.robotstxt = RobotFileParser()
        self.robotstxt.set_url(urljoin(starturl, '/robots.txt'))
        self.robotstxt.read()
        self.conn = None
        self.urldb = urldb
        self.acldb = acldb
        self.curlevel = 0
        self.delay = delay
        self.timeout = timeout
        self.default_charset = default_charset
        if starturl.endswith('/'):
            starturl += self.index_html
        self.urls = [(starturl, maxlevel)]
        self.crawled = {}  # 1:injected, 2:crawled
        return

    def accept_url(self, url):
        if url.endswith('/'):
            url += self.index_html
        if self.acldb and not self.acldb.allowed(url):
            return None
        return url

    def inject_url(self, url):
        if (not self.curlevel) or (not url) or (url in self.crawled):
            return False
        if not self.robotstxt.can_fetch(self.USER_AGENT, url):
            if self.debug:
                print >> stderr, 'DISALLOW: %r' % url
            return None
        if self.debug:
            print >> stderr, 'INJECT: %r' % url
        self.crawled[url] = 1
        self.urls.append((url, self.curlevel - 1))
        return True

    def get1(self, url, maxretry=3, maxredirect=3):
        if self.debug:
            print >> stderr, 'GET: %r' % url
        # redirect loop: try the request up to maxredirect times.
        for rtry in range(maxredirect):
            # forge urllib2.Request object.
            req = Request(url)
            # add cookie headers if necessary.
            if self.cookiejar:
                self.cookiejar.add_cookie_header(req)
                headers = req.unredirected_hdrs
                headers.update(self.HEADERS)
            else:
                headers = self.HEADERS
            # get response.
            for ctry in range(maxretry):
                try:
                    if not self.conn:
                        print >> stderr, 'Making connection: %r...' % (
                            self.hostport, )
                        self.conn = HTTPConnection(self.hostport)
                    self.conn.request('GET',
                                      req.get_selector().replace(' ', ''), '',
                                      headers)
                    self.conn.sock.settimeout(self.timeout)
                    resp = self.conn.getresponse()
                    break
                except BadStatusLine, x:
                    # connection closed unexpectedly
                    print >> stderr, 'Connection closed unexpectedly.'
                    # it restarts the connection...
                    self.conn.close()
                    self.conn = None
                except socket.error, x:
                    # low-level socket error
                    print >> stderr, 'Socket error:', x
                    self.conn.close()
                    self.conn = None
            else:
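
The Python 2 example above (httplib/urllib2, print >> stderr) is truncated at this point. As a
rough, self-contained sketch of the same robots-gated fetch pattern in Python 3 — the timeout
value and URL handling are placeholders, not taken from the original class:

from urllib.parse import urljoin
from urllib.request import Request, urlopen
from urllib.robotparser import RobotFileParser

USER_AGENT = 'SimpleCrawler/0.1'

def polite_get(url, timeout=300):
    # check robots.txt for this host before issuing the request
    rp = RobotFileParser()
    rp.set_url(urljoin(url, '/robots.txt'))
    rp.read()
    if not rp.can_fetch(USER_AGENT, url):
        return None  # disallowed by robots.txt
    req = Request(url, headers={'User-Agent': USER_AGENT})
    with urlopen(req, timeout=timeout) as resp:
        return resp.read()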
Beispiel #33
0
class CrawlerProcess:
    def __init__(self, index, lock):
        self.current_process_id = index

        self.lock = lock

        number_of_retries = 0

        #print("[CREATED CRAWLER PROCESS]", self.current_process_id)

        # Create the chrome driver with which we will fetch and parse sites
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument("headless")
        self.driver = webdriver.Chrome(chrome_options=chrome_options)
        """
            site is a dictionary with all the fields from the database (domain, robots_content, sitemap_content)
        """
        self.site = None
        """
            robots_parser is an object which allows us to use the robots.txt file
        """
        self.robots_parser = None
        """
            Holds all the pages which will be added to the frontier at the end of each run
        """
        self.pages_to_add_to_frontier = []
        """
            current_page is a dictionary with an id (database id for updating) and url field
        """
        self.current_page = database_handler.get_page_from_frontier(self.lock)
        """
            If a page was fetched from the frontier the crawler can continue, otherwise try again in DELAY seconds
            
            If the frontier is still empty after MAX_NUMBER_OF_RETRIES was reached, we can assume that the frontier is
            really empty and no crawler process is going to insert new pages
        """
        while self.current_page or number_of_retries < MAX_NUMBER_OF_RETRIES:
            if self.current_page:
                number_of_retries = 0

                try:
                    self.crawl()
                except Exception as error:
                    print(
                        "[CRAWLER PROCESS] An unhandled error occurred while parsing page: {}"
                        .format(self.current_page["url"]), error)
            else:
                # No page was fetched from the frontier, try again in DELAY seconds
                number_of_retries += 1

                print(
                    "[CRAWLER PROCESS] Frontier is empty, retrying in {} seconds".format(
                        DELAY), self.current_process_id)

                time.sleep(DELAY)

            # Reset all variables after a page was successfully transferred from the frontier

            self.current_page = database_handler.get_page_from_frontier(
                self.lock)

            self.site = None

            self.robots_parser = None

            self.pages_to_add_to_frontier = []

        self.quit()

        print(
            "[STOPPED CRAWLER PROCESS] Frontier is empty after several tries",
            self.current_process_id)

    def crawl(self):
        #print(" {} - [CRAWLING PAGE]".format(self.current_process_id), self.current_page["url"])

        domain = self.get_domain_url(self.current_page["url"])

        self.site = database_handler.get_site(domain)

        if self.site is None:
            self.create_site(domain)
        else:
            if self.site["robots_content"] is not None:
                # Create robots_parser from robots.txt saved in the database
                self.parse_robots(self.site["robots_content"])

        self.current_page["site_id"] = self.site["id"]

        self.current_page["accessed_time"] = datetime.now()

        if self.allowed_to_crawl_current_page(
                self.current_page["url"]) is False:
            #print("     [CRAWLING] Robots do not allow this page to be crawled: {}".format(self.current_page["url"]))

            self.current_page["page_type_code"] = PAGE_TYPES["disallowed"]

            self.current_page["http_status_code"] = 500

            database_handler.remove_page_from_frontier(self.current_page)

            return
        else:
            # If a crawl delay is available in robots wait until the page can be crawled then continue
            self.wait_for_crawl_delay_to_elapse()

        # The crawler is allowed to crawl the current site, therefore we can perform a request
        page_response = self.fetch_response(self.current_page["url"])

        if page_response:
            # No errors while fetching the response

            content_type = ""

            if "content-type" in page_response.headers:
                # Content type is not always present (e.g. when Transfer-Encoding is set)
                content_type = page_response.headers['content-type']

            self.current_page["http_status_code"] = page_response.status_code

            if CONTENT_TYPES["HTML"] in content_type:
                # We got an HTML page

                html_content = self.fetch_rendered_page_source(
                    self.current_page["url"])

                if html_content is not None:
                    if self.is_duplicate_page(html_content):
                        print(
                            "     [CRAWLING] Found page duplicate, that has already been parsed: ",
                            self.current_page["url"])

                        self.current_page["page_type_code"] = PAGE_TYPES[
                            "duplicate"]

                        self.current_page[
                            "hash_content"] = hash_driver.create_content_hash(
                                html_content)

                    else:
                        # page is not treated as duplicate page - insert hash signature to db
                        database_handler.insert_page_signatures(
                            self.current_page["id"],
                            self.current_page["hash_signature"])

                        self.current_page["page_type_code"] = PAGE_TYPES[
                            "html"]

                        self.current_page["html_content"] = html_content

                        self.current_page[
                            "hash_content"] = hash_driver.create_content_hash(
                                html_content)

                        parsed_page = self.parse_page(
                            self.current_page["html_content"])

                        if len(parsed_page['links']):
                            for link in parsed_page['links']:
                                self.add_page_to_frontier_array(link)

                        if len(parsed_page['images']):
                            for image_url in parsed_page['images']:
                                self.add_page_to_frontier_array(image_url)
                else:
                    # An error occurred while rendering page

                    self.current_page["page_type_code"] = PAGE_TYPES["error"]

                    self.current_page["http_status_code"] = 500

            elif CONTENT_TYPES["IMG"] in content_type:
                # We can be pretty sure that we have an image

                self.current_page["page_type_code"] = PAGE_TYPES["image"]

                filename = self.get_image_filename(self.current_page["url"])

                image_data = {
                    "page_id": self.current_page["id"],
                    "content_type": content_type,
                    "data": page_response.content,
                    "data_size": len(page_response.content),
                    "accessed_time": datetime.now(),
                    "filename": filename
                }

                database_handler.insert_image_data(image_data)

            else:
                # The crawler detected a non-image binary file

                self.current_page["page_type_code"] = PAGE_TYPES["binary"]

                data_type_code = None

                # Find the correct data_type_code from all the content types
                for code, value in CONTENT_TYPES.items():
                    if content_type == value:
                        data_type_code = code

                if data_type_code is None:
                    # The content type is not in the allowed values, therefore we can ignore it
                    #print("     [CRAWLING] Page response content-type is not in CONTENT_TYPES: ", content_type)
                    pass
                else:
                    page_data = {
                        "page_id": self.current_page["id"],
                        "data_type_code": data_type_code,
                        "data": page_response.content,
                        "data_size": len(page_response.content)
                    }

                    database_handler.insert_page_data(page_data)

        else:
            # An error occurred while fetching page (SSL certificate error, timeout, etc.)

            self.current_page["page_type_code"] = PAGE_TYPES["error"]

            self.current_page["http_status_code"] = 500

        # Update the page in the database, remove FRONTIER type and replace it with the correct one
        database_handler.remove_page_from_frontier(self.current_page)

        # Add all the links from the page and sitemap to the frontier
        database_handler.add_pages_to_frontier(self.pages_to_add_to_frontier)

        #print(" {} - [CRAWLING] Finished crawling".format(self.current_process_id))

    """
        Fetch a response from the url, so that we get the status code and find out if any errors occur while fetching
        (some sites for example require a certificate to connect, some sites timeout, etc.)
    """

    def fetch_response(self, url):
        try:
            response = requests.get(url)

            return response
        except requests.exceptions.RequestException as exception:
            print("     [CRAWLING - ERROR]", exception)

            return None

    """
        Create a new site object and insert it into the database
    """

    def create_site(self, domain):
        # We need to create a new site object

        self.site = {"domain": domain}

        robots_content = self.fetch_robots(domain)

        sitemap_content = None

        if robots_content is not None:
            # Create robots_parser from fetched robots.txt
            self.parse_robots(robots_content)

            sitemaps = self.robots_parser.get_sitemaps()

            if len(sitemaps) > 0:
                for sitemap_url in sitemaps:
                    sitemap_content = self.fetch_sitemap(sitemap_url)

                    if sitemap_content is not None:
                        self.parse_sitemap(sitemap_content)

        self.site["robots_content"] = robots_content
        self.site["sitemap_content"] = sitemap_content

        # Insert the new site into database and return the id
        self.site["id"] = database_handler.insert_site(self.site)

    """
        Fetch and render the site in the chrome driver then return the resulting html so that it can be saved in the 
        current page html_content
    """

    def fetch_rendered_page_source(self, url):
        try:
            self.driver.get(url)

            return self.driver.page_source
        except Exception as error:
            print("     [CRAWLING] Error while fetching rendered page source",
                  error)

            return None

    """
        Get the domain name of the current site so that we can check if the site is already in the database or if we
        have to create it
    """

    def get_domain_url(self, url):
        parsed_uri = urlparse(url)

        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    """
        Get the filename from an online image resource
        https://stackoverflow.com/questions/10552188/python-split-url-to-find-image-name-and-extension
    """

    def get_image_filename(self, image_url):
        filename = image_url.split('/')[-1]

        return filename

    def fetch_robots(self, domain):
        response = self.fetch_response(domain.rstrip("/") + "/robots.txt")

        # We need to check if the returned file is actually a txt file, because some sites route back to the index page
        if response and response.status_code == 200 and "text/plain" in response.headers[
                'content-type']:
            return response.text

        return None

    def fetch_sitemap(self, sitemap_url):
        response = self.fetch_response(sitemap_url)

        if response and response.status_code == 200:
            # Sitemap found
            return response.text

        return None

    """
        This function parses the robots.txt from memory using the modified robotparser class
        The self.robots_parser includes functions to check if the parser is allowed to parse a certain site
    """

    def parse_robots(self, robots_text):
        self.robots_parser = RobotFileParser(robots_text)
        self.robots_parser.read()

    """
        https://stackoverflow.com/questions/31276001/parse-xml-sitemap-with-python
        
        This only works for the standard XML sitemap
    """

    def parse_sitemap(self, sitemap_xml):
        try:
            soup = BeautifulSoup(sitemap_xml, 'lxml')

            sitemap_tags = soup.find_all("loc")

            if sitemap_tags is None:
                return

            for sitemap_tag in sitemap_tags:
                url = self.get_parsed_url(sitemap_tag.text)

                if url:
                    self.add_page_to_frontier_array(url)
        except Exception as error:
            print(error)

    """
        Checks if robots are set for the current site and if they allow the crawling of the current page
    """

    def allowed_to_crawl_current_page(self, url):
        if self.robots_parser is not None:
            return self.robots_parser.can_fetch('*', url)

        return True

    """
        Checks if crawl-delay property is set and if it exists check if the required time has elapsed
    """

    def wait_for_crawl_delay_to_elapse(self):
        try:
            if self.robots_parser is not None:
                crawl_delay = self.robots_parser.crawl_delay('*')

                if crawl_delay is not None:
                    if "last_crawled_at" in self.site and self.site[
                            "last_crawled_at"] is not None:
                        site_last_crawled_at = self.site["last_crawled_at"]

                        can_crawl_again_at = site_last_crawled_at + timedelta(
                            seconds=crawl_delay)

                        current_time = datetime.now()

                        time_difference = (can_crawl_again_at -
                                           current_time).total_seconds()

                        if time_difference > 0:
                            #print("     [CRAWLING] Crawl delay has not yet elapsed for site: {}".format(
                            #   self.site["domain"]))

                            # wait only for the remaining time instead of the full crawl delay
                            time.sleep(time_difference)
        except Exception as error:
            print("     [CRAWLING] Error while handling crawl delay", error)

    """
        Use the chrome driver to fetch all links and image sources in the rendered page (the driver already returns 
        absolute urls)
        
        Note: Sometimes throws StaleElementReferenceException, need to check what that's about. The exception itself 
        just means that the desired element is no longer rendered in DOM. Maybe the memory was getting low, since I got the
        error when I was running 10 crawler processes.
    """

    def parse_page(self, html_content):
        links = []
        images = []

        try:
            browser = self.driver

            anchor_tags = browser.find_elements_by_tag_name("a")

            for anchor_tag in anchor_tags:
                href = anchor_tag.get_attribute("href")

                url = self.get_parsed_url(href)

                if url:
                    links.append(url)

            image_tags = browser.find_elements_by_tag_name("img")

            for image_tag in image_tags:
                src = image_tag.get_attribute("src")

                if src:
                    image_url = self.get_parsed_image_url(src)

                    if image_url:
                        images.append(image_url)

            soup = BeautifulSoup(html_content, 'html.parser')

            script_tags = soup.findAll('script')

            for script_tag in script_tags:
                links_from_javascript = self.parse_links_from_javacript(
                    script_tag.text)

                for link in links_from_javascript:
                    links.append(self.get_parsed_url(link))

            return {"links": links, "images": images}
        except Exception as error:
            print("[ERROR WHILE RENDERING WITH WEB DRIVER]", error)

            return {"links": links, "images": images}

    """
        Find all the hrefs that are set in javascript code (window.location changes)
    """

    def parse_links_from_javacript(self, javascript_text):
        links = []

        try:
            links = re.findall(
                r'(http://|https://)([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
                javascript_text)

            if not links:
                return []

            links = [''.join(link) for link in links]
        except Exception as error:
            print("     [CRAWLING] Error while parsing links from Javascript",
                  error)

        return links

    """
        Create a parsed url (ignore javascript and html actions, remove hashes, fix relative urls etc.)
    """

    # TODO: remove index.html index.php
    def get_parsed_url(self, url):
        if url is None or url == "":
            return None

        domain = self.site["domain"]

        if not url.startswith("http"):
            # Since the chrome driver returns absolute urls, the url is most likely javascript or action

            if 'javascript:' in url:
                # This is just javascript code inside a href
                return None

            if ('mailto:' in url) or ('tel:' in url):
                # This is an action inside a href
                return None

            if url[0] == "#":
                # Link starts with a # (it's a target link)
                return None

            if url is "/":
                # This is the index page, which we already have in the frontier
                return None

            if url.startswith("www"):
                url = "http://{}".format(url).strip()
            """
                Fix relative urls just in case
                
                This function might not work correctly since it's almost impossible to know which root url the link 
                takes when it's added to the site
            """
            if url[0] == "/":
                if domain[-1] == "/":
                    # Make sure only one slash is present
                    url = url[1:]
            else:
                if domain[-1] != "/":
                    url = "/{}".format(url)

            url = "{}{}".format(domain, url).strip()

        # Remove everything after the hash
        if "#" in url:
            url = url.split("#")[0]

        # Encode special characters (the second parameter are characters that the encoder will not encode)
        url = quote(url.encode("UTF-8"), ':/-_.~&?+=')

        return url

    """
        Parse image urls
    """

    def get_parsed_image_url(self, url):
        if url is None or url == "":
            return None

        # Do not parse base64 images
        if url.startswith("data:image"):
            return None

        if not url.startswith("http"):
            # This is very unlikely, since the chrome driver returns all the image sources with absolute urls

            domain = self.site["domain"]
            """
                Fix relative urls just in case

                This function might not work correctly since it's almost impossible to know which root url the link 
                takes when it's added to the site
            """
            if url[0] == "/":
                if domain[-1] == "/":
                    # Make sure only one slash is present
                    url = url[1:]
            else:
                if domain[-1] != "/":
                    url = "/{}".format(url)

            # Create an absolute url
            url = "{}{}".format(domain, url).strip()

        return url

    """
        The duplicate page should not have the html_content value set, page_type_code should be DUPLICATE and
         that's it
    """

    def is_duplicate_page(self, html_content):

        # sha256 digest of complete html_content
        h = hash_driver.create_content_hash(html_content)

        # first check if page is exact copy of already parsed documents
        if database_handler.find_page_duplicate(h):
            return True
        else:

            # create set of hash shingles
            # in order to prevent pages using lots of same tags to be treated as similar, remove html tags
            hash_set = hash_driver.text_to_shingle_set(
                self.remove_markups(html_content))

            # hash signature will be inserted to db later
            self.current_page["hash_signature"] = hash_set

            # calculate similarity between current document and already parsed documents using Jaccard similarity
            similarity = database_handler.calculate_biggest_similarity(
                hash_set)

            #print("SIMILARITY: ", similarity)

            return similarity > MAX_SIMILARITY

    """
       Remove markup tags from html content 
    """

    def remove_markups(self, html_content):
        return BeautifulSoup(html_content, "html.parser").text

    def add_page_to_frontier_array(self, page_url):
        page_domain = self.get_domain_url(page_url)

        if ALLOWED_DOMAIN in page_domain:
            # Only add pages in the allowed domain

            self.pages_to_add_to_frontier.append({
                "from":
                self.current_page["id"],
                "to":
                page_url
            })

    def quit(self):
        self.driver.quit()
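
hash_driver and database_handler above are project-specific modules that are not shown here. As a
generic sketch of the shingling and Jaccard similarity that is_duplicate_page relies on — the
function names, k=5 and the md5 hashing below are assumptions, not taken from the original code:

import hashlib

def text_to_shingle_set(text, k=5):
    # hash every k-word window of the text into a set of shingle signatures
    words = text.split()
    shingles = set()
    for i in range(max(len(words) - k + 1, 1)):
        shingle = " ".join(words[i:i + k])
        shingles.add(hashlib.md5(shingle.encode("utf-8")).hexdigest())
    return shingles

def jaccard_similarity(set_a, set_b):
    # intersection over union of the two shingle sets
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)

# A page would then be flagged as a near-duplicate when its best similarity against the stored
# signatures exceeds a threshold such as MAX_SIMILARITY.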
Beispiel #34
0
            # the cached robots.txt is stale, so refresh it
            if rp.mtime() < (time.time() - settings.ROBOTS_TXT_CACHE):
                self.logger.info("Refresh %s/robots.txt cache" % hostname)
                try:
                    rp.read()
                except Exception, e:
                    print(e)
                    self.logger.info("Unable to get or parse %s/robots.txt" % hostname)
                    rp.disallow_all = False
                    rp.allow_all = True
            else:
                self.logger.debug("Retrieve cached %s/robots.txt" % hostname)
        else:
            # First (or very long) time we see this domain, create a new
            # RobotFileParser and read it once
            self.logger.info("First hit on %s/robots.txt" % hostname)
            rp = RobotFileParser(url="%s://%s/robots.txt" % (scheme, hostname))
            try:
                rp.read()
            except Exception, e:
                print(e)
                self.logger.info("Unable to get or parse %s/robots.txt" % hostname)
                rp.disallow_all = False
                rp.allow_all = True

        # In any case, we update the last robotstxt fetched time
        rp.modified()
        self.robotstxt[hostname] = rp

        return rp
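
The fragment above relies on RobotFileParser.mtime() and modified() to age out cached robots.txt
entries. A compact Python 3 sketch of the same per-host caching idea — ROBOTS_TXT_CACHE and the
cache dictionary are illustrative stand-ins for the settings and instance state used above:

import time
from urllib.robotparser import RobotFileParser

ROBOTS_TXT_CACHE = 3600  # seconds a cached robots.txt stays valid

robots_cache = {}

def get_robot_parser(scheme, hostname):
    rp = robots_cache.get(hostname)
    if rp is None or rp.mtime() < time.time() - ROBOTS_TXT_CACHE:
        rp = RobotFileParser(url="%s://%s/robots.txt" % (scheme, hostname))
        try:
            rp.read()
        except Exception:
            rp.allow_all = True  # fail open, as the fragment above does
        rp.modified()            # record when this robots.txt was (re)fetched
        robots_cache[hostname] = rp
    return rp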
Beispiel #35
0
 def _create_robot_file_parser(self, url):
     host = urlparse.urlsplit(url)[1]
     robots_url = urlparse.urlunsplit(('http', host, '/robots.txt', '', ''))
     rp = RobotFileParser(robots_url)
     rp.read()
     return rp
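
For completeness, a small usage sketch for a parser built this way — the agent string and URLs
are placeholders, and crawl_delay() is only available on Python 3.6+:

from urllib.robotparser import RobotFileParser

rp = RobotFileParser("https://www.example.com/robots.txt")
rp.read()
if rp.can_fetch("MyCrawler/1.0", "https://www.example.com/some/page.html"):
    delay = rp.crawl_delay("MyCrawler/1.0")  # None when robots.txt sets no Crawl-delay
    # fetch the page here, sleeping for `delay` seconds between requests if it is set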