Example #1
	def startThreads(self, url, bundle_url):
		startTime = time.time()
		ADFBundle.grabBundleKeysByURL(bundle_url)

		print "processing... ", url 
		new_agent = HttpAgent(url)
		response = new_agent.RequestResponse()

		#Read only the first ~50 KB of data
		snippet = response.read(50000)

		soup = BeautifulSoup(snippet)
		links = soup.findAll('a', {"target" : "_source"})

		#Derive the hostname from the response URL so relative links can be resolved
		parse_url = urlparse(response.geturl())
		hostname = parse_url.scheme + '://' + parse_url.netloc

		counter = 0
		#Explore each search-result link and accumulate its tags and attributes
		for link in links:
			counter += 1

			try:
				new_searchResult = SearchResult(hostname, link['href'], None)
				newTags, newAttrs = new_searchResult.exploreSource()  #Get the CSet variables

				Explore.allTags = Explore.allTags.union(newTags)
				Explore.allAttrs = Explore.allAttrs.union(newAttrs)
			except Exception:
				print "link unexplored"

		elapsedTime = (time.time() - startTime)
		print "Elapsed Time: %s" % elapsedTime
Example #2
def startThreads(url, bundle_url, filename, param):
	elapsedTime = 0
	startTime = time.time()
	ADFBundle.grabBundleKeysByURL(bundle_url)

	print "processing... ", url 
	new_agent = HttpAgent(url)
	response = new_agent.RequestResponse()

	soup = BeautifulSoup(response)
	links = soup.findAll('a', {"target" : "_source"})

	#Tab-separated output files: missing-bundle report (mf) and search results (f)
	mf = open("outputs/" + param['container'] + "MissingBundle_" + today_date_label + "_" + filename + ".txt", 'w')
	f = open("outputs/" + param['container'] + "SearchResults_" + today_date_label + "_" + filename + ".txt", 'w')
	bf = None

	#Write a tab-separated header row appropriate to the container type
	if param['container'] == 'dialog':
		headerOutputLn = ["Page Link", "Product Family", "Dialog Number", "Dialog Title", "Dialog ID", "Dialog Modal", "Dialog Parents", "Button Group Name", "# of Command Buttons", "# of CANCEL", "# of OK", "# of DONE", "# of SAVE and CLOSE", "Component Name", "Component Attributes"]
		print >>f, '\t'.join(headerOutputLn)
	elif param['container'] == 'explore':
		headerOutputLn = ["Page Link", "Product Family", "Tag Number", "Tag Name", "Tag Parents", "Tag Attributes"]
		print >>f, '\t'.join(headerOutputLn)
	elif param['container'] == 'icon':
		headerOutputLn = ["Page Link", "Product Family", "Tag Number", "Tag Name", "Tag Parents", "Attribute Name", "File Extension", "Image Source", "Original Attribute Value"]
		print >>f, '\t'.join(headerOutputLn)

	#Derive the hostname from the response URL so relative links can be resolved
	parse_url = urlparse(response.geturl())
	hostname = parse_url.scheme + '://' + parse_url.netloc

	#Spawn five daemon worker threads that consume links from the queue
	for i in range(5):
		t = ThreadUrl(hostname, queue, mf, f, bf, param)
		t.setDaemon(True)
		t.start()

	counter = 0
	#Populate the queue with search-result links
	for link in links:
		try:
			#Process only a limited number of links when testing; 'All' means no limit
			if param['processSize'] != 'All' and counter == int(param['processSize']):
				break
		except (KeyError, ValueError):
			pass

		counter += 1
		#Add to queue
		queue.put(link)

	#wait on queue until everything has been processed
	queue.join()

	f.close()
	mf.close()
	elapsedTime = (time.time() - startTime)
	print "Elapsed Time: %s" % elapsedTime
	return elapsedTime
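Example #2 depends on a module-level queue and a ThreadUrl worker class, neither of which is shown. Below is a minimal sketch of the consumer pattern the calls imply: daemon threads pull links from a Queue and call task_done() so that queue.join() in startThreads can return. The constructor signature mirrors the ThreadUrl(...) call above; the run() body is a placeholder assumption.

import threading
import Queue

queue = Queue.Queue()

class ThreadUrl(threading.Thread):
	#Hypothetical sketch: ThreadUrl is not defined in these examples.
	#The constructor signature mirrors the call in startThreads.
	def __init__(self, hostname, queue, mf, f, bf, param):
		threading.Thread.__init__(self)
		self.hostname = hostname
		self.queue = queue
		self.mf = mf
		self.f = f
		self.bf = bf
		self.param = param

	def run(self):
		while True:
			link = self.queue.get()
			try:
				#Placeholder: process one search-result link and write
				#rows to self.f / self.mf according to self.param
				pass
			finally:
				#task_done() pairs with queue.join() in startThreads
				self.queue.task_done()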