def startThreads(self, url, bundle_url):
    """Explore every ``_source`` link on the page at *url*, one after another.

    Loads the bundle keys for *bundle_url*, fetches *url*, and for each
    ``<a target="_source">`` anchor found in the leading part of the response
    builds a ``SearchResult`` and calls its ``exploreSource()``.  The tags and
    attributes it returns are merged into ``Explore.allTags`` and
    ``Explore.allAttrs`` with ``.union()``.

    Despite the name, this method starts no threads; links are processed
    sequentially.  Nothing is returned; progress and the elapsed wall-clock
    time are printed to standard output.

    Args:
        url: Address of the search-results page to fetch.
        bundle_url: Address handed to ``ADFBundle.grabBundleKeysByURL``.
    """
    startTime = time.time()
    ADFBundle.grabBundleKeysByURL(bundle_url)
    # Single-argument print so the line renders identically to the original
    # ``print "processing... ", url`` statement.
    print("%s %s" % ("processing... ", url))

    response = HttpAgent(url).RequestResponse()

    # Parse only the first 50000 bytes of the response instead of the
    # whole document.
    snippet = response.read(50000)
    soup = BeautifulSoup(snippet)
    links = soup.findAll('a', {"target": "_source"})

    # Links are resolved against the scheme and host of the final response URL.
    parse_url = urlparse(response.geturl())
    hostname = parse_url.scheme + '://' + parse_url.netloc

    for link in links:
        try:
            new_searchResult = SearchResult(hostname, link['href'], None)
            newTags, newAttrs = new_searchResult.exploreSource()
            # Accumulate the results in the class-level collections.
            Explore.allTags = Explore.allTags.union(newTags)
            Explore.allAttrs = Explore.allAttrs.union(newAttrs)
        except Exception:
            # Best effort: one bad link must not abort the run.  Catching
            # Exception (not a bare except) lets Ctrl-C and SystemExit
            # still stop the loop.
            print("link unexplored")

    elapsedTime = time.time() - startTime
    print("Elapsed Time: %s" % elapsedTime)
def startThreads(url, bundle_url, filename, param):
    """Process the ``_source`` links on the page at *url* with worker threads.

    Loads the bundle keys for *bundle_url*, fetches *url*, opens the two
    output files under ``outputs/``, writes the header row that matches
    ``param['container']`` (if any), starts five daemon ``ThreadUrl``
    workers and puts each ``<a target="_source">`` anchor on ``queue``.
    The call blocks until ``queue.join()`` returns, then closes the files.

    ``queue`` and ``today_date_label`` are names this function reads from
    the enclosing module.

    Args:
        url: Address of the search-results page to fetch.
        bundle_url: Address handed to ``ADFBundle.grabBundleKeysByURL``.
        filename: Label appended to both output file names.
        param: Mapping of options.  ``'container'`` is required and prefixes
            the output file names; ``'dialog'``, ``'explore'`` and ``'icon'``
            also select a header row.  ``'processSize'`` optionally caps the
            number of links queued; ``'All'``, a missing key or a value that
            ``int()`` rejects means no cap.

    Returns:
        Wall-clock seconds spent in this call, as a float.
    """
    # Header row for the results file, keyed by container type.  Containers
    # not listed here get no header.
    headers = {
        'dialog': [
            "Page Link", "Product Family", "Dialog Number", "Dialog Title",
            "Dialog ID", "Dialog Modal", "Dialog Parents",
            "Button Group Name", "# of Command Buttons", "# of CANCEL",
            "# of OK", "# of DONE", "# of SAVE and CLOSE", "Component Name",
            "Component Attributes",
        ],
        'explore': [
            "Page Link", "Product Family", "Tag Number", "Tag Name",
            "Tag Parents", "Tag Attributes",
        ],
        'icon': [
            "Page Link", "Product Family", "Tag Number", "Tag Name",
            "Tag Parents", "Attribute Name", "File Extension",
            "Image Source", "Original Attribute Value",
        ],
    }

    startTime = time.time()
    ADFBundle.grabBundleKeysByURL(bundle_url)
    # Single-argument print so the line renders identically to the original
    # ``print "processing... ", url`` statement.
    print("%s %s" % ("processing... ", url))

    response = HttpAgent(url).RequestResponse()
    soup = BeautifulSoup(response)
    links = soup.findAll('a', {"target": "_source"})

    container = param['container']

    # Resolve the optional cap once, up front, instead of re-parsing it for
    # every link.  An unusable value falls back to "process everything".
    try:
        if param['processSize'] == 'All':
            limit = None
        else:
            limit = int(param['processSize'])
    except (KeyError, TypeError, ValueError):
        limit = None

    name_tail = "_" + today_date_label + "_" + filename + ".txt"
    missing_path = "outputs/" + container + "MissingBundle" + name_tail
    results_path = "outputs/" + container + "SearchResults" + name_tail
    bf = None

    # The with-statement closes both files even if fetching, thread start-up
    # or the queue wait raises.
    with open(missing_path, 'w') as mf, open(results_path, 'w') as f:
        header = headers.get(container)
        if header is not None:
            f.write('\t'.join(header) + '\n')

        # Links are resolved against the scheme and host of the final
        # response URL.
        parse_url = urlparse(response.geturl())
        hostname = parse_url.scheme + '://' + parse_url.netloc

        for _ in range(5):
            worker = ThreadUrl(hostname, queue, mf, f, bf, param)
            worker.setDaemon(True)
            worker.start()

        for counter, link in enumerate(links):
            # Stop once `limit` links have been queued (used to process
            # only a few links when testing).
            if limit is not None and counter == limit:
                break
            queue.put(link)

        # Wait until the workers have marked every queued link as done.
        queue.join()

    elapsedTime = time.time() - startTime
    # Report the duration of this call, i.e. the value being returned,
    # rather than the unrelated Icon.elapsedTime attribute.
    print("Elapsed Time: %s" % elapsedTime)
    return elapsedTime
def startThreads(self, url, bundle_url):
    """Explore every ``_source`` link on the page at *url*, one after another.

    Loads the bundle keys for *bundle_url*, fetches *url*, and for each
    ``<a target="_source">`` anchor found in the leading part of the response
    builds a ``SearchResult`` and calls its ``exploreSource()``.  The tags and
    attributes it returns are merged into ``Explore.allTags`` and
    ``Explore.allAttrs`` with ``.union()``.

    Despite the name, this method starts no threads; links are processed
    sequentially.  Nothing is returned; progress and the elapsed wall-clock
    time are printed to standard output.

    Args:
        url: Address of the search-results page to fetch.
        bundle_url: Address handed to ``ADFBundle.grabBundleKeysByURL``.
    """
    startTime = time.time()
    ADFBundle.grabBundleKeysByURL(bundle_url)
    # Single-argument print so the line renders identically to the original
    # ``print "processing... ", url`` statement.
    print("%s %s" % ("processing... ", url))

    response = HttpAgent(url).RequestResponse()

    # Parse only the first 50000 bytes of the response instead of the
    # whole document.
    snippet = response.read(50000)
    soup = BeautifulSoup(snippet)
    links = soup.findAll('a', {"target": "_source"})

    # Links are resolved against the scheme and host of the final response URL.
    parse_url = urlparse(response.geturl())
    hostname = parse_url.scheme + '://' + parse_url.netloc

    for link in links:
        try:
            new_searchResult = SearchResult(hostname, link['href'], None)
            newTags, newAttrs = new_searchResult.exploreSource()
            # Accumulate the results in the class-level collections.
            Explore.allTags = Explore.allTags.union(newTags)
            Explore.allAttrs = Explore.allAttrs.union(newAttrs)
        except Exception:
            # Best effort: one bad link must not abort the run.  Catching
            # Exception (not a bare except) lets Ctrl-C and SystemExit
            # still stop the loop.
            print("link unexplored")

    elapsedTime = time.time() - startTime
    print("Elapsed Time: %s" % elapsedTime)
def startThreads(url, bundle_url, filename, param):
    """Process the ``_source`` links on the page at *url* with worker threads.

    Loads the bundle keys for *bundle_url*, fetches *url*, opens the two
    output files under ``outputs/``, writes the header row that matches
    ``param['container']`` (if any), starts five daemon ``ThreadUrl``
    workers and puts each ``<a target="_source">`` anchor on ``queue``.
    The call blocks until ``queue.join()`` returns, then closes the files.

    ``queue`` and ``today_date_label`` are names this function reads from
    the enclosing module.

    Args:
        url: Address of the search-results page to fetch.
        bundle_url: Address handed to ``ADFBundle.grabBundleKeysByURL``.
        filename: Label appended to both output file names.
        param: Mapping of options.  ``'container'`` is required and prefixes
            the output file names; ``'dialog'``, ``'explore'`` and ``'icon'``
            also select a header row.  ``'processSize'`` optionally caps the
            number of links queued; ``'All'``, a missing key or a value that
            ``int()`` rejects means no cap.

    Returns:
        Wall-clock seconds spent in this call, as a float.
    """
    # Header row for the results file, keyed by container type.  Containers
    # not listed here get no header.
    headers = {
        'dialog': [
            "Page Link", "Product Family", "Dialog Number", "Dialog Title",
            "Dialog ID", "Dialog Modal", "Dialog Parents",
            "Button Group Name", "# of Command Buttons", "# of CANCEL",
            "# of OK", "# of DONE", "# of SAVE and CLOSE", "Component Name",
            "Component Attributes",
        ],
        'explore': [
            "Page Link", "Product Family", "Tag Number", "Tag Name",
            "Tag Parents", "Tag Attributes",
        ],
        'icon': [
            "Page Link", "Product Family", "Tag Number", "Tag Name",
            "Tag Parents", "Attribute Name", "File Extension",
            "Image Source", "Original Attribute Value",
        ],
    }

    startTime = time.time()
    ADFBundle.grabBundleKeysByURL(bundle_url)
    # Single-argument print so the line renders identically to the original
    # ``print "processing... ", url`` statement.
    print("%s %s" % ("processing... ", url))

    response = HttpAgent(url).RequestResponse()
    soup = BeautifulSoup(response)
    links = soup.findAll('a', {"target": "_source"})

    container = param['container']

    # Resolve the optional cap once, up front, instead of re-parsing it for
    # every link.  An unusable value falls back to "process everything".
    try:
        if param['processSize'] == 'All':
            limit = None
        else:
            limit = int(param['processSize'])
    except (KeyError, TypeError, ValueError):
        limit = None

    name_tail = "_" + today_date_label + "_" + filename + ".txt"
    missing_path = "outputs/" + container + "MissingBundle" + name_tail
    results_path = "outputs/" + container + "SearchResults" + name_tail
    bf = None

    # The with-statement closes both files even if fetching, thread start-up
    # or the queue wait raises.
    with open(missing_path, 'w') as mf, open(results_path, 'w') as f:
        header = headers.get(container)
        if header is not None:
            f.write('\t'.join(header) + '\n')

        # Links are resolved against the scheme and host of the final
        # response URL.
        parse_url = urlparse(response.geturl())
        hostname = parse_url.scheme + '://' + parse_url.netloc

        for _ in range(5):
            worker = ThreadUrl(hostname, queue, mf, f, bf, param)
            worker.setDaemon(True)
            worker.start()

        for counter, link in enumerate(links):
            # Stop once `limit` links have been queued (used to process
            # only a few links when testing).
            if limit is not None and counter == limit:
                break
            queue.put(link)

        # Wait until the workers have marked every queued link as done.
        queue.join()

    elapsedTime = time.time() - startTime
    # Report the duration of this call, i.e. the value being returned,
    # rather than the unrelated Icon.elapsedTime attribute.
    print("Elapsed Time: %s" % elapsedTime)
    return elapsedTime