def setUp(self):
    logging.basicConfig(filename='test_log.txt',
                        level=logging.DEBUG,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    self.urlUtil = UrlUtil()
    self.urlAddress = 'file://' + getcwd() + '/index.html'
Example #2
    def updateDatabaseJSON(self, setId, filename, art_type, force_write):
        # Check what artwork is already set (avoid shadowing the json module)
        response = self.json_api.VideoLibrary.GetMovieSetDetails(
            setid=setId, properties=["art"])
        set_details = response.get("setdetails", {})
        set_art = set_details.get("art", {})

        filename = UrlUtil.denormalise(filename, True)

        updated = 0
        if art_type in set_art:
            existing_filename = set_art[art_type]
            log("Existing %s: %s" % (art_type, existing_filename))
            if force_write or (existing_filename.lower() != filename.lower()):
                log(
                    "Updating artwork:\nType: %s\nExisting: %s\nNew:      %s" %
                    (art_type, existing_filename, filename), xbmc.LOGDEBUG)
                self.json_api.VideoLibrary.SetMovieSetDetails(
                    setid=setId, art={art_type: filename})
                updated += 1
        else:
            log("Adding artwork:\nType: %s\nFile: %s" % (art_type, filename),
                xbmc.LOGDEBUG)
            self.json_api.VideoLibrary.SetMovieSetDetails(
                setid=setId, art={art_type: filename})
            updated += 1
        return updated
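
A minimal sketch of how this method might be invoked, assuming an instance of the owning class (here called "updater") with a working json_api wrapper; the set id and artwork path are hypothetical:

# Hypothetical call: set (or overwrite) the poster artwork for movie set 42.
changed = updater.updateDatabaseJSON(
    setId=42,
    filename='special://masterprofile/movie_sets/42/poster.jpg',
    art_type='poster',
    force_write=False)
if changed:
    log("Poster written for set 42", xbmc.LOGDEBUG)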
class UrlUtilityTest(unittest.TestCase):
	"""Test to check whether correct response is received and the\
 file is downloaded for URLs"""

	def setUp(self):
		module_logger.debug('method setUp was called')
		## Create the object on which the tests will be performed
		self.url = UrlUtil()
		## Create a file to test against
		with open('urlTestFile', 'w+') as testFile:
			testFile.write('This is the file for testing URL Response\n')
		module_logger.debug('method setUp completed successfully')


	def tearDown(self):
		module_logger.debug('method tearDown was called')
		remove('urlTestFile')
		module_logger.debug('method tearDown completed successfully')


	def testDownload(self):
		"""Testing the downloading of a file from a URL"""

		module_logger.debug('method testDownload was called')
		urlAddress = 'file://' + getcwd() + '/urlTestFile'  ## 'http://www.google.co.in/images/srpr/logo4w.png'
		self.url.downloadImage(urlAddress, 'dlFile.jpg')
		self.assertTrue(path.isfile('dlFile.jpg'))
		remove('dlFile.jpg')
		module_logger.debug('method testDownload completed successfully')


	def testUrlResponse(self):
		"""Testing that the correct URL response is received"""

		module_logger.debug('method testUrlResponse was called')
		urlAddress = 'file://' + getcwd() + '/urlTestFile'  ## 'http://www.google.co.in/images/srpr/logo4w.png'
		self.assertTrue(self.url.getUrlResponse(urlAddress))
		module_logger.debug('method testUrlResponse completed successfully')


	def runTest(self):
		filePointer = open('Test-Results.txt', 'a')
		suite = unittest.TestLoader().loadTestsFromTestCase(UrlUtilityTest)
		result = unittest.TextTestRunner(verbosity=2, stream=filePointer).run(suite)
		filePointer.close()
		return result
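
A minimal sketch of driving this suite directly, assuming the module is executed as a script; the driver itself is hypothetical, only runTest comes from the original:

# Hypothetical driver: run the suite and summarise the outcome on stdout.
if __name__ == '__main__':
	results = UrlUtilityTest().runTest()
	print('Tests run: %d, failures: %d' % (results.testsRun, len(results.failures)))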
 def setUp(self):
     module_logger.debug('method setUp was called')
     #logging.basicConfig(filename='test_log.txt',
     #                level=logging.DEBUG,
     #                format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
     self.urlUtil = UrlUtil()
     self.urlAddress = 'file://' + os.path.dirname(__file__) + '/index.html'
     module_logger.debug('method setUp completed successfully')
Example #6
class WebCrawler:
    def __init__(self):
        self.url_util = UrlUtil()
        self.html_requester = HtmlRequester()
        self.html_parser = HtmlParser()

    def crawl(self, url):
        """
        Returns the URLs reachable from the parameter URL,
        along with the assets of each URL.
        Only URLs whose hostname (including subdomain) matches that of the parameter URL are returned.
        """

        url = self.url_util.normalise_url(url)
        hostname = self.url_util.get_hostname(url)

        urlsToVisit = [url]
        urlsVisited = []
        output = []
        # Each iteration of this loop processes the next URL to visit.
        while urlsToVisit:

            url = urlsToVisit.pop(0)
            urlsVisited.append(url)

            html = self.html_requester.get_html(url)
            links = self.html_parser.get_links(html)
            same_hostname_urls = self.html_parser.get_same_hostname_urls(
                hostname, links)
            assets = self.html_parser.get_assets(same_hostname_urls)
            web_pages = self.html_parser.get_web_pages(same_hostname_urls)

            output.append({"url": url, "assets": assets})
            print json.dumps({"url": url, "assets": assets}, indent=4)

            for web_page in web_pages:
                # Do not visit a page more than once
                if web_page not in urlsToVisit and web_page not in urlsVisited:
                    urlsToVisit.append(web_page)

        return json.dumps(output, indent=4).splitlines()
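
A minimal sketch of using the crawler, assuming WebCrawler and its collaborators are importable; the start URL is hypothetical:

# Hypothetical entry point: crawl a site and print one JSON line per page visited.
crawler = WebCrawler()
for line in crawler.crawl("http://example.com"):
    print line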
Example #7
 def handle_register(self):
     email = self.request.get('email')
     passwd = self.request.get('passwd')
     if not email or not passwd:
         # Keep the (success, message) return shape consistent with the other branches
         return False, 'email and passwd are required'
     entity = Email.create(email, passwd)
     if not entity:
         return False, '%s has already been registered' % email
     url = UrlUtil.urlencode('/confirm_register', {
         'key': entity.key.urlsafe(),
         'passwd': passwd
     })
     self.send_email(
         email, 'Register confirmation mail', """Please click the URL below to log in:
             %s
         """ % ('https://selfcontrolboard.appspot.com' + url))
     if self.is_test_env():
         return True, 'http://localhost:8080' + url
     else:
         return True, 'ok'
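
A minimal sketch of wiring this method into a request flow, assuming a webapp2-style base handler that supplies request parsing, send_email and is_test_env; BaseHandler and the route are hypothetical:

# Hypothetical POST handler delegating to handle_register.
class RegisterHandler(BaseHandler):
    def post(self):
        ok, message = self.handle_register()
        self.response.write(message if ok else 'registration failed: %s' % message)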
class ScrapyExtractorTest(unittest.TestCase):
    """Testing for extraction of different kind of objects"""

    def setUp(self):
        module_logger.debug('method setUp was called')
        #logging.basicConfig(filename='test_log.txt',
        #                level=logging.DEBUG,
        #                format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.urlUtil = UrlUtil()
        self.urlAddress = 'file://' + os.path.dirname(__file__) + '/index.html'
        module_logger.debug('method setUp completed successfully')


    ##(i) Image extraction test
    def testImageExtract(self):
        """Testing for image extraction from given page"""
        
        module_logger.debug('method testImageExtract was called')
        #Initialize the extractor
        configFile = os.path.dirname(__file__) + '/imageExtractionTest.yml'
        extractor = ScrapyExtractor(configFile, self.urlUtil)  
        
        #Create an image item
        item = ImageArrayItem() 
        item.init()
                
        #Create an HtmlResponse object for performing XPath operations on it
        bodyForResponse = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=bodyForResponse)
        
        #Extract images from the HtmlResponse object
        extractedData = extractor.extract(response, item)
        
        #Load the correct data and verify it against the extracted data
        with open(os.path.dirname(__file__) + '/correctImageData', 'r') as dataFile:
            trueData = json.load(dataFile)
        self.assertEqual(extractedData, trueData)
        #print extractedData
        module_logger.debug('method testImageExtract completed successfully')
                
    ##(ii) Text extraction test
    def testTextExtract(self):
        """Testing for text extraction from given page"""
        
        module_logger.debug('method testTextExtract was called')
        #Initialize the extractor
        configFile = os.path.dirname(__file__) + '/textExtractionTest.yml'
        extractor = ScrapyExtractor(configFile, self.urlUtil)  
        
        #Create a text item
        item = TextArrayItem() 
        #item.init()
        
        #Create an HtmlResponse object for performing XPath operations on it
        bodyForResponse = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=bodyForResponse)
        
        #Extract text from the HtmlResponse object
        extractedData = extractor.extract(response, item)
        
        #Load the correct data and verify it against the extracted data
        #print extractedData
        with open(os.path.dirname(__file__) + '/correctTextData', 'r') as dataFile:
            trueData = json.load(dataFile)
        self.assertEqual(extractedData, trueData)
        module_logger.debug('method testTextExtract completed successfully')

        
    #(iii) Link Extraction Test
    def testLinkExtract(self):
        """Testing for recursive link extraction from given page and following it"""
        
        module_logger.debug('method testLinkExtract was called')
        #Initialize the extractor
        configFile = os.path.dirname(__file__) + '/linkExtractionTest.yml'
        extractor = ScrapyExtractor(configFile, self.urlUtil)  
     
        #Create a link item
        item = LinkArrayItem()
        item.init()
        #Create an HtmlResponse object for performing XPath operations on it
        bodyForResponse = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=bodyForResponse)
        
        #Extract links from the HtmlResponse object
        extractedData = extractor.extract(response, item)
        
        #Load the correct data and verify it against the extracted data
        #print extractedData
        with open(os.path.dirname(__file__) + '/correctLinkData', 'r') as dataFile:
            trueData = json.load(dataFile)
        self.assertEqual(extractedData, trueData)
        module_logger.debug('method testLinkExtract completed successfully')


    def runTest(self):
        filePointer = open(os.pardir + '/Logger/Test-Results.txt', 'a')
        suite = unittest.TestLoader().loadTestsFromTestCase(ScrapyExtractorTest)
        result = unittest.TextTestRunner(verbosity=2, stream=filePointer).run(suite)     
        filePointer.close()
        return result
Example #10
 def __init__(self):
     self.url_util = UrlUtil()
     self.html_requester = HtmlRequester()
     self.html_parser = HtmlParser()
Example #11
 def __init__(self):
     self.url_util = UrlUtil()
Example #12
class HtmlParser:
    def __init__(self):
        self.url_util = UrlUtil()

    def get_links(self, html):
        """
        Extracts and returns the links in the parameter HTML
        """
        links = []
        link_prefixes = ["href=\"", "src=\""]
        for line in html:
            link_found = True
            # Each iteration of this loop finds a link in the line.
            while link_found:
                link_found = False
                start = len(line)
                end = len(line)
                # The next link in the line could have any link_prefix.
                for link_prefix in link_prefixes:
                    if link_prefix in line:
                        link_found = True
                        link_prefix_start = line.index(link_prefix) + len(
                            link_prefix)
                        link_prefix_end = line.index("\"", link_prefix_start)
                        # If this link's prefix is the closest to the start of the line so far, record it
                        if link_prefix_start < start:
                            start = link_prefix_start
                            end = link_prefix_end
                if link_found:
                    links.append(line[start:end])
                    line = line[end:]
        return links

    def get_same_hostname_urls(self, hostname, links):
        """
        Returns the links from the parameter links that have the same hostname as the parameter hostname
        The links returned are converted into absolute urls.
        """

        hostname = self.url_util.normalise_url(hostname)

        same_hostname_urls = []
        for link in links:

            # Normalise link
            if link.endswith("/"):
                link = link[:-1]
            if link.startswith("https://"):
                # str.lstrip removes a character set, not a prefix, so slice instead
                link = "http://" + link[len("https://"):]
            if link.startswith("//"):
                link = "http:" + link

            if link.startswith(hostname):
                # Link starts with hostname.
                same_hostname_urls.append(link)
            elif not link.startswith("http://"):
                # Link is relative, so prefix the link with hostname
                if link != "" and not link.startswith("/"):
                    link = "/" + link
                same_hostname_urls.append(hostname + link)

        return same_hostname_urls

    def get_assets(self, urls):
        """
        Returns the urls from the parameter urls that refer to assets
        """
        assets = []
        asset_extensions = self.get_asset_extensions()
        for url in urls:
            for asset_extension in asset_extensions:
                if url.endswith(asset_extension):
                    # Assets should be unique.
                    if url not in assets:
                        assets.append(url)
                    break
        return assets

    def get_web_pages(self, urls):
        """
        Returns the urls from the parameter urls that refer to web pages
        """
        web_pages = []
        asset_extensions = self.get_asset_extensions()
        for url in urls:
            is_web_page = True
            for asset_extension in asset_extensions:
                if url.endswith(asset_extension):
                    is_web_page = False
                    break
            # Web pages should be unique.
            if is_web_page and url not in web_pages:
                web_pages.append(url)
        return web_pages

    def get_asset_extensions(self):
        """
        Returns file extensions for assets
        """
        asset_extensions = []
        file_path = os.path.join(os.path.dirname(__file__), 'resources',
                                 'asset_extensions.txt')
        with open(file_path, 'r') as file:
            for line in file:
                asset_extensions.append(line.rstrip('\n'))
        return asset_extensions
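
A minimal sketch of the parser pipeline on two hypothetical lines of HTML; it assumes resources/asset_extensions.txt lists extensions such as .png, and that UrlUtil.normalise_url leaves http://example.com unchanged:

# Hypothetical input: get_links expects HTML as a list of lines.
parser = HtmlParser()
html = ['<a href="http://example.com/about">About</a>',
        '<img src="/logo.png">']
links = parser.get_links(html)
urls = parser.get_same_hostname_urls("http://example.com", links)
print parser.get_assets(urls)     # expected: ['http://example.com/logo.png']
print parser.get_web_pages(urls)  # expected: ['http://example.com/about']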
class Test(unittest.TestCase):
    """Testing for extraction of different kind of objects"""

    def setUp(self):
        logging.basicConfig(filename='test_log.txt',
                            level=logging.DEBUG,
                            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.urlUtil = UrlUtil()
        self.urlAddress = 'file://' + getcwd() + '/index.html'
    
    
    ##(i) Image extraction test
    def testImageExtract(self):
        """Testing for image extraction from given page"""
        
        #Initialize the extractor
        configFile = 'imageExtractionTest.yml'
        extractor = ScrapyExtractor(configFile, self.urlUtil)  
        
        #Create an image item
        item = ImageArrayItem() 
        item.init()
                
        #Create an HtmlResponse object for performing XPath operations on it
        bodyForResponse = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=bodyForResponse)
        
        #Extract images from the HtmlResponse object
        extractedData = extractor.extract(response, item)
        
        #Load the correct data and verify it against the extracted data
        with open('correctImageData', 'r') as dataFile:
            trueData = json.load(dataFile)
        self.assertEqual(extractedData, trueData)
        #print extractedData
                
    ##(ii) Text extraction test
    def testTextExtract(self):
        """Testing for text extraction from given page"""
        
        #Initialize the extractor
        configFile = 'textExtractionTest.yml'
        extractor = ScrapyExtractor(configFile, self.urlUtil)  
        
        #Create a text item
        item = TextArrayItem() 
        #item.init()
        
        #Create an HtmlResponse object for performing XPath operations on it
        bodyForResponse = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=bodyForResponse)
        
        #Extract text from the HtmlResponse object
        extractedData = extractor.extract(response, item)
        
        #Load the correct data and verify it against the extracted data
        #print extractedData
        with open('correctTextData', 'r') as dataFile:
            trueData = json.load(dataFile)
        self.assertEqual(extractedData, trueData)
        
        
    #(iii) Link Extraction Test
    def testLinkExtract(self):
        """Testing for recursive link extraction from given page and following it"""
        
        #Initialize the extractor
        configFile = 'linkExtractionTest.yml'
        extractor = ScrapyExtractor(configFile, self.urlUtil)  
     
        #Create a link item
        item = LinkArrayItem()
        item.init()
        #Create an HtmlResponse object for performing XPath operations on it
        bodyForResponse = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=bodyForResponse)
        
        #Extract links from the HtmlResponse object
        extractedData = extractor.extract(response, item)
        
        #Load the correct data and verify it against the extracted data
        #print extractedData
        with open('correctLinkData', 'r') as dataFile:
            trueData = json.load(dataFile)
        self.assertEqual(extractedData, trueData)