def setUp(self):
    """Configure file logging and build the file:// URL of the fixture page."""
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(filename='test_log.txt',
                        level=logging.DEBUG,
                        format=log_format)
    # Object under test.
    self.urlUtil = UrlUtil()
    # index.html is expected to sit in the current working directory.
    self.urlAddress = 'file://%s/index.html' % getcwd()
def updateDatabaseJSON(self, setId, filename, art_type, force_write):
    """Set artwork of the given type on a movie set via the JSON-RPC API.

    Only writes when the artwork is missing, differs from *filename*
    (case-insensitive), or *force_write* is set.

    Returns the number of artwork entries written (0 or 1).
    """
    # Check what artwork is already set on this movie set.
    # Renamed from 'json' — the old local shadowed the json module.
    response = self.json_api.VideoLibrary.GetMovieSetDetails(
        setid=setId, properties=["art"])
    # Default to dicts: the old code defaulted "setdetails" to a list and
    # then indexed it with a string key, raising TypeError whenever the
    # key was absent from the response.
    set_details = response.get("setdetails", {})
    set_art = set_details.get("art", {})
    filename = UrlUtil.denormalise(filename, True)
    updated = 0
    if art_type in set_art:
        existing_filename = set_art[art_type]
        log("Existing %s: %s" % (art_type, existing_filename))
        if force_write or (existing_filename.lower() != filename.lower()):
            log(
                "Updating artwork:\nType: %s\nExisting: %s\nNew: %s"
                % (art_type, existing_filename, filename), xbmc.LOGDEBUG)
            self.json_api.VideoLibrary.SetMovieSetDetails(
                setid=setId, art={art_type: filename})
            updated += 1
    else:
        log("Adding artwork:\nType: %s\nFile: %s" % (art_type, filename),
            xbmc.LOGDEBUG)
        self.json_api.VideoLibrary.SetMovieSetDetails(
            setid=setId, art={art_type: filename})
        updated += 1
    return updated
class UrlUtilityTest(unittest.TestCase):
    """Checks that UrlUtil downloads files and reports URL responses."""

    def setUp(self):
        module_logger.debug('method setUp was called')
        # Object the tests will be performed on.
        self.url = UrlUtil()
        # Fixture file the tests fetch back through a file:// URL.
        # 'with' guarantees the handle is closed even if write() raises;
        # the old code used a bare open()/close() pair and also shadowed
        # the builtin name 'file'.
        with open('urlTestFile', 'w+') as fixture:
            fixture.write('This is the file for testing URL Response\n')
        module_logger.debug('method setUp completed successfully')

    def tearDown(self):
        module_logger.debug('method tearDown was called')
        remove('urlTestFile')
        module_logger.debug('method tearDown completed successfully')

    def testDownload(self):
        """Testing the downloading of file from url"""
        module_logger.debug('method testDownload was called')
        urlAddress = 'file://' + getcwd() + '/urlTestFile'
        self.url.downloadImage(urlAddress, 'dlFile.jpg')
        self.assertTrue(path.isfile('dlFile.jpg'))
        remove('dlFile.jpg')
        module_logger.debug('method testDownload completed successfully')

    def testUrlResponse(self):
        """Testing to get the correct url response"""
        module_logger.debug('method testUrlResponse was called')
        urlAddress = 'file://' + getcwd() + '/urlTestFile'
        self.assertTrue(self.url.getUrlResponse(urlAddress))
        module_logger.debug('method testUrlResponse completed successfully')

    def runTest(self):
        """Run the whole suite, appending results to Test-Results.txt."""
        # 'with' replaces the manual open()/close() pair so the results
        # file is closed even if the runner raises.
        with open('Test-Results.txt', 'a') as results:
            suite = unittest.TestLoader().loadTestsFromTestCase(
                UrlUtilityTest)
            result = unittest.TextTestRunner(verbosity=2,
                                             stream=results).run(suite)
        return result
def setUp(self):
    """Create the UrlUtil under test and the file:// URL of the fixture page."""
    module_logger.debug('method setUp was called')
    self.urlUtil = UrlUtil()
    # Resolve index.html relative to this test module, not the CWD.
    base_dir = os.path.dirname(__file__)
    self.urlAddress = 'file://%s/index.html' % base_dir
    module_logger.debug('method setUp completed successfully')
def setUp(self):
    """Create the UrlUtil under test and the fixture file the tests use.

    Fix: the original bound the handle to the builtin name ``file`` and
    left it open if write() raised; ``with`` closes it deterministically.
    """
    module_logger.debug('method setUp was called')
    # Object the tests will be performed on.
    self.url = UrlUtil()
    # Fixture file to test upon.
    with open('urlTestFile', 'w+') as fixture:
        fixture.write('This is the file for testing URL Response\n')
    module_logger.debug('method setUp completed successfully')
class WebCrawler:
    """Breadth-first crawler restricted to one hostname, reporting assets."""

    def __init__(self):
        self.url_util = UrlUtil()
        self.html_requester = HtmlRequester()
        self.html_parser = HtmlParser()

    def crawl(self, url):
        """
        Returns the URLs reachable from the parameter URL

        The assets of each URL are also returned. Only URLs with the
        same hostname including subdomain as the parameter URL are
        returned.
        """
        url = self.url_util.normalise_url(url)
        hostname = self.url_util.get_hostname(url)
        urls_to_visit = [url]
        # Every URL ever queued. A set gives O(1) membership tests; the
        # old code scanned two lists (one misspelled 'urlsVisted') per
        # link, O(n^2) overall. "queued or visited" is exactly this set.
        seen = set(urls_to_visit)
        output = []
        # Each iteration of this loop processes the next URL to visit.
        while urls_to_visit:
            url = urls_to_visit.pop(0)
            html = self.html_requester.get_html(url)
            links = self.html_parser.get_links(html)
            same_hostname_urls = self.html_parser.get_same_hostname_urls(
                hostname, links)
            assets = self.html_parser.get_assets(same_hostname_urls)
            web_pages = self.html_parser.get_web_pages(same_hostname_urls)
            output.append({"url": url, "assets": assets})
            # Parenthesised single-arg print keeps the original Python 2
            # behaviour and is also valid Python 3.
            print(json.dumps({"url": url, "assets": assets}, indent=4))
            for web_page in web_pages:
                # Do not visit a page more than once.
                if web_page not in seen:
                    seen.add(web_page)
                    urls_to_visit.append(web_page)
        return json.dumps(output, indent=4).splitlines()
def handle_register(self):
    """Register an e-mail/password pair and send a confirmation link.

    Returns False when either field is missing, (False, message) when the
    address is already registered, and (True, url-or-'ok') on success.
    NOTE(review): the bare ``return False`` is inconsistent with the
    2-tuples returned elsewhere — confirm how callers unpack the result
    before normalising it.
    """
    email = self.request.get('email')
    passwd = self.request.get('passwd')
    if not email or not passwd:
        return False
    entity = Email.create(email, passwd)
    if not entity:
        # Presumably Email.create returns a falsy value when the address
        # already exists — verify against its implementation.
        return False, '%s has been registered' % email
    # Confirmation link carries the entity key and the plaintext password.
    url = UrlUtil.urlencode('/confirm_register', {
        'key': entity.key.urlsafe(),
        'passwd': passwd
    })
    self.send_email(
        email,
        'Register confirm mail',
        """Please click below url to login:
%s
""" % ('https://selfcontrolboard.appspot.com' + url))
    # Test environments get the local URL back so it can be followed.
    if self.is_test_env():
        return True, 'http://localhost:8080' + url
    else:
        return True, 'ok'
def updateDatabaseJSON(self, setId, filename, art_type, force_write):
    """Set artwork of the given type on a movie set via the JSON-RPC API.

    Writes only when the artwork is absent, differs from *filename*
    (case-insensitive), or *force_write* is set.

    Returns the number of artwork entries written (0 or 1).
    """
    # Check what artwork is already set. The local was renamed from
    # 'json', which shadowed the json module.
    response = self.json_api.VideoLibrary.GetMovieSetDetails(
        setid=setId, properties=["art"]
    )
    # Dict defaults: the old code defaulted "setdetails" to a list and
    # then did set_details["art"], raising TypeError on a missing key.
    set_details = response.get("setdetails", {})
    set_art = set_details.get("art", {})
    filename = UrlUtil.denormalise(filename, True)
    updated = 0
    if art_type in set_art:
        existing_filename = set_art[art_type]
        log("Existing %s: %s" % (art_type, existing_filename))
        if force_write or (existing_filename.lower() != filename.lower()):
            log(
                "Updating artwork:\nType: %s\nExisting: %s\nNew: %s"
                % (art_type, existing_filename, filename),
                xbmc.LOGDEBUG
            )
            self.json_api.VideoLibrary.SetMovieSetDetails(
                setid=setId, art={art_type: filename}
            )
            updated += 1
    else:
        log(
            "Adding artwork:\nType: %s\nFile: %s" % (art_type, filename),
            xbmc.LOGDEBUG
        )
        self.json_api.VideoLibrary.SetMovieSetDetails(
            setid=setId, art={art_type: filename}
        )
        updated += 1
    return updated
class ScrapyExtractorTest(unittest.TestCase):
    """Testing for extraction of different kind of objects"""

    def setUp(self):
        module_logger.debug('method setUp was called')
        self.urlUtil = UrlUtil()
        # Fixture page resolved relative to this test module.
        self.urlAddress = 'file://' + os.path.dirname(__file__) + '/index.html'
        module_logger.debug('method setUp completed successfully')

    def _extract(self, config_name, item):
        # Shared fixture plumbing for the three extraction tests: build
        # the extractor, fetch the fixture page and run the extraction.
        config_file = os.path.dirname(__file__) + '/' + config_name
        extractor = ScrapyExtractor(config_file, self.urlUtil)
        body = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=body)
        return extractor.extract(response, item)

    def _expected(self, data_name):
        # Load the known-good data. 'with' closes the fixture file; the
        # old json.load(open(...)) pattern leaked the handle.
        with open(os.path.dirname(__file__) + '/' + data_name, 'r') as fh:
            return json.load(fh)

    ## (i) Image extraction test
    def testImageExtract(self):
        """Testing for image extraction from given page"""
        module_logger.debug('method testImageExtract was called')
        item = ImageArrayItem()
        item.init()
        extracted = self._extract('imageExtractionTest.yml', item)
        self.assertTrue(extracted == self._expected('correctImageData'))
        module_logger.debug('method testImageExtract completed successfully')

    ## (ii) Text extraction test
    def testTextExtract(self):
        """Testing for text extraction from given page"""
        module_logger.debug('method testTextExtract was called')
        # NOTE: the original deliberately did not call item.init() here.
        item = TextArrayItem()
        extracted = self._extract('textExtractionTest.yml', item)
        self.assertTrue(extracted == self._expected('correctTextData'))
        module_logger.debug('method testTextExtract completed successfully')

    ## (iii) Link extraction test
    def testLinkExtract(self):
        """Testing for recursive link extraction from given page and following it"""
        module_logger.debug('method testLinkExtract was called')
        item = LinkArrayItem()
        item.init()
        extracted = self._extract('linkExtractionTest.yml', item)
        self.assertTrue(extracted == self._expected('correctLinkData'))
        module_logger.debug('method testLinkExtract completed successfully')

    def runTest(self):
        """Run this suite, appending results under the Logger directory."""
        with open(os.pardir + '/Logger/Test-Results.txt', 'a') as results:
            suite = unittest.TestLoader().loadTestsFromTestCase(
                ScrapyExtractorTest)
            result = unittest.TextTestRunner(verbosity=2,
                                             stream=results).run(suite)
        return result
def __init__(self):
    """Wire up the collaborators the crawler delegates to."""
    # URL normalisation / hostname helpers.
    self.url_util = UrlUtil()
    # Fetches raw HTML for a URL.
    self.html_requester = HtmlRequester()
    # Extracts links and assets out of fetched HTML.
    self.html_parser = HtmlParser()
def __init__(self):
    """Create the URL helper this object delegates to."""
    self.url_util = UrlUtil()
class HtmlParser:
    """Extracts links, asset URLs and web-page URLs out of raw HTML lines."""

    def __init__(self):
        self.url_util = UrlUtil()

    def get_links(self, html):
        """
        Extracts and returns the links in the parameter HTML
        """
        links = []
        link_prefixes = ["href=\"", "src=\""]
        for line in html:
            link_found = True
            # Each iteration of this loop finds a link in the line.
            while link_found:
                link_found = False
                start = len(line)
                end = len(line)
                # The next link in the line could have any link_prefix.
                for link_prefix in link_prefixes:
                    if link_prefix in line:
                        link_found = True
                        link_prefix_start = line.index(link_prefix) + len(
                            link_prefix)
                        link_prefix_end = line.index("\"", link_prefix_start)
                        # Keep whichever candidate starts earliest in line.
                        if link_prefix_start < start:
                            start = link_prefix_start
                            end = link_prefix_end
                if link_found:
                    links.append(line[start:end])
                    line = line[end:]
        return links

    def get_same_hostname_urls(self, hostname, links):
        """
        Returns the links from the parameter links that have the same
        hostname as the parameter hostname

        The links returned are converted into absolute urls.
        """
        hostname = self.url_util.normalise_url(hostname)
        same_hostname_urls = []
        for link in links:
            # Normalise link.
            if link.endswith("/"):
                link = link.rstrip("/")
            if link.startswith("https://"):
                # BUG FIX: str.lstrip strips a *character set*, not a
                # prefix, so lstrip("https://") also ate leading
                # h/t/p/s/:/ characters of the host itself (e.g.
                # "https://stack.com" became "ack.com"). Slice the
                # prefix off instead.
                link = "http://" + link[len("https://"):]
            if link.startswith("//"):
                link = "http:" + link
            if link.startswith(hostname):
                # Link starts with hostname.
                same_hostname_urls.append(link)
            elif not link.startswith("http://"):
                # Link is relative, so prefix the link with hostname.
                if link != "" and not link.startswith("/"):
                    link = "/" + link
                same_hostname_urls.append(hostname + link)
        return same_hostname_urls

    def get_assets(self, urls):
        """
        Returns the urls from the parameter urls that refer to assets
        """
        assets = []
        asset_extensions = self.get_asset_extensions()
        for url in urls:
            for asset_extension in asset_extensions:
                if url.endswith(asset_extension):
                    # Assets should be unique.
                    if url not in assets:
                        assets.append(url)
                    break
        return assets

    def get_web_pages(self, urls):
        """
        Returns the urls from the parameter urls that refer to web pages
        """
        web_pages = []
        asset_extensions = self.get_asset_extensions()
        for url in urls:
            is_web_page = True
            for asset_extension in asset_extensions:
                if url.endswith(asset_extension):
                    is_web_page = False
                    break
            # Web pages should be unique.
            if is_web_page and url not in web_pages:
                web_pages.append(url)
        return web_pages

    def get_asset_extensions(self):
        """
        Returns file extensions for assets
        """
        asset_extensions = []
        file_path = os.path.join(os.path.dirname(__file__), 'resources',
                                 'asset_extensions.txt')
        # 'fh' rather than 'file': don't shadow the builtin.
        with open(file_path, 'r') as fh:
            for line in fh:
                asset_extensions.append(line.rstrip('\n'))
        return asset_extensions
# NOTE(review): this looks like an older draft of ScrapyExtractorTest —
# TestCase inheritance is commented out (so assertTrue would fail at
# runtime) and testLinkExtract ends at a bare Python-2 print with no
# assertion; confirm whether it was truncated. Code left untouched.
class Test(): #class Test(unittest.TestCase):
    """Testing for extraction of different kind of objects"""
    def setUp(self):
        # Route DEBUG logging to a file; note paths here are relative to
        # the CWD, unlike the __file__-based sibling test class.
        logging.basicConfig(filename='test_log.txt',
                            level=logging.DEBUG,
                            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        self.urlUtil = UrlUtil()
        self.urlAddress = 'file://' + getcwd() + '/index.html'

    ##(i) Image extraction test
    def testImageExtract(self):
        """Testing for image extraction from given page"""
        #Initialize the extractor
        configFile = 'imageExtractionTest.yml'
        extractor = ScrapyExtractor(configFile, self.urlUtil)
        #Create an image item
        item = ImageArrayItem()
        item.init()
        #Create an HtmlResponse object for performing XPath operations on it
        bodyForResponse = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=bodyForResponse)
        #Extract images from the HtmlResponse object
        extractedData = extractor.extract(response, item)
        #Load the correct data and verify it with extracted
        trueData = json.load(open('correctImageData','r'))
        self.assertTrue(extractedData == trueData)
        #print extractedData

    ##(ii) Text extraction test
    def testTextExtract(self):
        """Testing for text extraction from given page"""
        #Initialize the extractor
        configFile = 'textExtractionTest.yml'
        extractor = ScrapyExtractor(configFile, self.urlUtil)
        #Create a text item (init() intentionally left commented out)
        item = TextArrayItem()
        #item.init()
        #Create an HtmlResponse object for performing XPath operations on it
        bodyForResponse = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=bodyForResponse)
        #Extract text from the HtmlResponse object
        extractedData = extractor.extract(response, item)
        #Load the correct data and verify it with extracted
        #print extractedData
        trueData = json.load(open('correctTextData','r'))
        self.assertTrue(extractedData == trueData)

    #(iii) Link Extraction Test
    def testLinkExtract(self):
        """Testing for recursive link extraction from given page and following it"""
        #Initialize the extractor
        configFile = 'linkExtractionTest.yml'
        extractor = ScrapyExtractor(configFile, self.urlUtil)
        #Create a link item
        item = LinkArrayItem()
        item.init()
        #Create an HtmlResponse object for performing XPath operations on it
        bodyForResponse = self.urlUtil.getUrlResponse(self.urlAddress)
        response = HtmlResponse(self.urlAddress, body=bodyForResponse)
        #Extract links from the HtmlResponse object
        extractedData = extractor.extract(response, item)
        #Load the correct data and verify it with extracted
        # (no verification follows — Python 2 print only; see NOTE above)
        print extractedData