#tocrawl.put(sys.argv[1]) #tocrawl.put(sys.argv[2]) #tocrawl.put(sys.argv[3]) tocrawl = [] tocrawl.append(sys.argv[1]) tocrawl.append(sys.argv[2]) tocrawl.append(sys.argv[3]) crawled = set([]) keywordregex = re.compile('<meta\sname=["\']keywords["\']\scontent=["\'](.*?)["\']\s/>') linkregex = re.compile('<a\s*href=[\'|"](.*?)[\'"].*?>') crawlregex = re.compile #call webVisit class -- used to keep track of visited websites visit = webVisit() #DB manager mdb = DB_comms() rp = rbp.RobotFileParser() while 1: #rp = rbp.RobotFileParser() try: print 'doing something' crawling = tocrawl.pop(random.randrange(len(tocrawl))) print 'something finished' print crawling except KeyError: raise StopIteration url = urlparse.urlparse(crawling)
def __init__(self):
    """Open the crawler's MySQL connection and set up visit tracking.

    Side effects: opens a live DB connection and cursor; stores them on
    the instance (self.connection, self.cursor) along with a webVisit
    helper (self.visit).
    """
    # NOTE(review): `mdb` is used here as a MySQL driver module
    # (mdb.connect(...)), but elsewhere in this file the same name is
    # rebound with `mdb = DB_comms()` -- verify which binding is live
    # when this constructor actually runs.
    # SECURITY(review): credentials are hardcoded in source; move them to
    # a config file or environment variables.
    self.connection = mdb.connect(host="192.168.0.6", user="******", passwd = "password", db = "crawlbot")
    self.cursor = self.connection.cursor()
    # Per-instance visited-site tracker (webVisit is defined elsewhere).
    self.visit = webVisit()