def traverse_inner(list, pattern, depth, count):
    """Record every unseen URL matching ``pattern`` and follow its links.

    For each URL in ``list`` that matches ``pattern`` (via ``re.match``) and
    is not reported by ``UrlDbHelper.inCheckedTable``, the URL is passed to
    ``UrlDbHelper.addToCheckedTable`` and, while ``depth < MAX_DEPTH``, the
    links returned by ``UrlFetcher.fetch_all_links(url)`` are traversed
    recursively at ``depth + 1``.

    Args:
        list: Iterable of URL strings to examine. (The name shadows the
            builtin; it is kept so existing callers keep working.)
        pattern: Regular expression handed to ``re.match`` for each URL.
        depth: Current recursion depth; also used as the indent width of
            the progress output.
        count: Number of URLs added so far.

    Returns:
        The updated number of added URLs, including those added by nested
        calls. Integers are passed by value, so returning the counter is the
        only way the caller can observe the additions.
    """
    urls = list  # local alias so the builtin-shadowing name is not used below
    for url in urls:
        # Skip URLs outside the pattern or already recorded. The short-circuit
        # order matches the original: the DB lookup only runs on a match.
        if not re.match(pattern, url) or UrlDbHelper.inCheckedTable(url):
            continue

        try:
            UrlDbHelper.addToCheckedTable(url)
        except Exception:
            # Best effort: report the failure and still try to follow links.
            # ``Exception`` (not a bare except) lets Ctrl-C stop the crawl.
            print("error adding url:" + url)
        else:
            count += 1
            # Single-argument print(...) emits the same text under Python 2's
            # print statement and Python 3's print function.
            print(" " * depth + url + " depth:" + str(depth))
            # Commit in batches of 100 successful additions.
            if count % 100 == 0:
                UrlDbHelper.commit()

        if depth >= MAX_DEPTH:
            continue
        try:
            links = UrlFetcher.fetch_all_links(url)
            # Carry the running total through the recursion so the batch
            # commit and the final report see nested additions too.
            count = traverse_inner(links, pattern, depth + 1, count)
        except Exception:
            # Best effort: a failing page or subtree must not abort the
            # traversal of the remaining URLs at this level.
            continue
    return count


def traverse(root, pattern):
    """Traverse from ``root`` and report how many URLs were added.

    Calls ``UrlDbHelper.initDB()``, runs ``traverse_inner`` on ``root``
    starting at depth 0 with a count of 0, calls ``UrlDbHelper.commit()``
    once more for any additions since the last batch commit, and prints the
    total.

    Args:
        root: URL string at which the traversal starts.
        pattern: Regular expression a URL must match to be recorded.
    """
    UrlDbHelper.initDB()
    # Argument order is (list, pattern, depth, count); use the returned total
    # because the callee cannot modify an int in the caller's scope.
    count = traverse_inner([root], pattern, 0, 0)
    UrlDbHelper.commit()
    print(str(count) + " tuples have been added")