def crawl(url):
    global webinfo, startcount, httputil
    if webinfo.siteexists(url) == False:
        metainfo = httputil.getmeta(url)
        links = httputil.getlinks(url)
        print "meta data size :" + str(len(metainfo)) + " links size : " + str(len(links))
        webinfo.savemetainfo(url, metainfo)
        webinfo.saveweblinks(url, links)
def crawlfromdb():
    global webinfo, httputil
    # pull the next uncrawled link from the database; getuncrawled() is
    # expected to return False once nothing is left
    url = webinfo.getuncrawled()
    if url != False:
        metainfo = httputil.getmeta(url)
        links = httputil.getlinks(url)
        print "meta data size :" + str(len(metainfo)) + " links size : " + str(len(links))
        webinfo.savemetainfo(url, metainfo)
        webinfo.saveweblinks(url, links)
        webinfo.markcrawled(url)
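As a sketch (not part of the original script), the single-shot crawlfromdb() above could be wrapped in a loop that drains the database until getuncrawled() reports nothing left; the name crawlalluncrawled and the assumption that getuncrawled() returns False when the queue is empty are assumptions here:

# A minimal sketch, assuming getuncrawled() returns False once every stored
# link has been marked crawled; crawlalluncrawled is a hypothetical helper,
# not part of the original script.
def crawlalluncrawled():
    global webinfo
    while webinfo.getuncrawled() != False:
        # this checks the queue once here and once inside crawlfromdb(),
        # so it costs one extra database query per pass
        crawlfromdb()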
def crawl(url):
    global webinfo, startcount, httputil, crawledlinks, start_with_url
    # remember the URL currently being crawled so a restart can resume from it
    f = open(start_with_url, 'w')
    f.write(url)
    f.close()
    if startcount == 10:
        # instead of exiting, stop recursing and crawl the rest from the db
        crawlfromdb()
        return
    metainfo = httputil.getmeta(url)
    links = httputil.getlinks(url)
    print "meta data size :" + str(len(metainfo)) + " links size : " + str(len(links))
    webinfo.savemetainfo(url, metainfo)
    webinfo.saveweblinks(url, links)
    print url + " " + str(len(crawledlinks))
    for i in links:
        if webinfo.siteexists(i):
            print i + " already crawled!!!!!!.. so skipping....."
        else:
            startcount += 1
            crawl(str(i))
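For context, here is a hedged sketch of the module-level setup the recursive crawl() assumes; WebInfo and HttpUtil are placeholders for whatever database and HTTP helper classes the full script defines, and the file name and seed URL are examples only:

# Hypothetical setup; WebInfo and HttpUtil stand in for the real helper
# classes, and the file name and seed URL below are examples, not values
# taken from the original script.
webinfo = WebInfo()        # persistence: siteexists / savemetainfo / saveweblinks / getuncrawled / markcrawled
httputil = HttpUtil()      # fetching: getmeta / getlinks
crawledlinks = []          # links collected so far
startcount = 0             # counts recursive crawl() calls before switching to the db
start_with_url = "start_with_url.txt"   # file that remembers the current seed URL

if __name__ == "__main__":
    crawl("http://example.com/")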