#tocrawl.put(sys.argv[3])
# Crawl frontier: URLs still to be visited, seeded from the command line.
# Slicing (instead of indexing argv[1]..argv[3]) takes the same three seeds
# but does not die with IndexError when fewer are supplied.
tocrawl = list(sys.argv[1:4])
crawled = set()

# Captures the content attribute of a <meta name="keywords" content="..." /> tag.
keywordregex = re.compile(r'<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
# Captures the href target of an anchor tag. The opening-quote class holds
# only the two quote characters; a "|" inside [...] is a literal pipe, not
# alternation, so it must not appear there.
linkregex = re.compile(r'<a\s*href=[\'"](.*?)[\'"].*?>')
# NOTE(review): this binds the re.compile function itself, not a compiled
# pattern -- looks unfinished; confirm what crawlregex is meant to match.
crawlregex = re.compile

#call webVisit class -- used to keep track of visited websites
visit = webVisit()
#DB manager
mdb = DB_comms()

while 1:
    # Build a fresh robots.txt parser for every URL: a RobotFileParser keeps
    # its allow/deny state between reads, so reusing one instance across
    # sites can carry one site's verdict over to the next.
    rp = rbp.RobotFileParser()

    print('doing something')
    # Test for an empty frontier explicitly: random.randrange(0) raises
    # ValueError (not the KeyError a set.pop() would), so an except KeyError
    # around the pop can never fire.
    if not tocrawl:
        raise StopIteration
    # Visit the pending URLs in random order.
    crawling = tocrawl.pop(random.randrange(len(tocrawl)))
    print('something finished')
    print(crawling)

    url = urlparse.urlparse(crawling)
    #get website location on www
    site_url = url.netloc
import robotparser as rbp
from Queue import Empty

# Crawl frontier: FIFO queue of URLs still to be visited, seeded from the
# command line. Slicing (instead of indexing argv[1]..argv[3]) takes the
# same three seeds but does not die with IndexError when fewer are supplied.
tocrawl = Queue()
for seed in sys.argv[1:4]:
    tocrawl.put(seed)
crawled = set()

# Captures the content attribute of a <meta name="keywords" content="..." /> tag.
keywordregex = re.compile(r'<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>')
# Captures the href target of an anchor tag. The opening-quote class holds
# only the two quote characters; a "|" inside [...] is a literal pipe, not
# alternation, so it must not appear there.
linkregex = re.compile(r'<a\s*href=[\'"](.*?)[\'"].*?>')
# NOTE(review): this binds the re.compile function itself, not a compiled
# pattern -- looks unfinished; confirm what crawlregex is meant to match.
crawlregex = re.compile

#call webVisit class -- used to keep track of visited websites
visit = webVisit()
#DB manager
mdb = DB_comms()

while 1:
    # A new parser is created on every iteration on purpose: with a single
    # shared instance the URL could not be switched to a different website
    # once one had been set. For example, facebook.com would return false
    # (as it should), but after switching to reddit.com it would still
    # return false, which it shouldn't.
    rp = rbp.RobotFileParser()

    # Queue.get() blocks forever on an empty queue and never raises KeyError,
    # so use the non-blocking form and stop when the frontier is exhausted.
    # NOTE(review): assumes this loop is the queue's only producer; if other
    # threads feed the queue, use a blocking get with a timeout instead.
    try:
        crawling = tocrawl.get_nowait()
    except Empty:
        raise StopIteration
    print("\t\t\t out of queue: " + crawling)

    url = urlparse.urlparse(crawling)