Example #1
def crawl(url):
    global webinfo, startcount, httputil
    if not webinfo.siteexists(url):  # only crawl URLs we have not stored before
        metainfo = httputil.getmeta(url)
        links = httputil.getlinks(url)
        print("meta data size: " + str(len(metainfo)) + " links size: " + str(len(links)))
        webinfo.savemetainfo(url, metainfo)
        webinfo.saveweblinks(url, links)
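
These snippets all rely on two helper objects that are never shown: httputil (fetches pages) and webinfo (persists what was found). A minimal sketch of the assumed httputil interface follows, using requests and the standard-library HTMLParser as stand-ins; the class name and internals here are hypothetical, only the getmeta/getlinks methods are taken from the snippets.

import requests
from html.parser import HTMLParser

class _LinkParser(HTMLParser):
    """Collects href values from anchor tags."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)

class HttpUtil:
    """Sketch of the assumed httputil helper."""
    def getmeta(self, url):
        # Response headers stand in for page metadata here.
        return dict(requests.get(url, timeout=10).headers)

    def getlinks(self, url):
        parser = _LinkParser()
        parser.feed(requests.get(url, timeout=10).text)
        return parser.links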
Example #2
def crawlfromdb():
    global webinfo, httputil
    # Assumed contract: getuncrawled() returns the next pending URL from the
    # database, or a falsy value (False) when nothing is left to crawl. The
    # return value must be bound to `url` before it can be used below.
    url = webinfo.getuncrawled()
    if url:
        metainfo = httputil.getmeta(url)
        links = httputil.getlinks(url)
        print("meta data size: " + str(len(metainfo)) + " links size: " + str(len(links)))
        webinfo.savemetainfo(url, metainfo)
        webinfo.saveweblinks(url, links)
        webinfo.markcrawled(url)
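
Example #2 additionally assumes that webinfo can report which stored URLs are still pending and mark them done. An in-memory sketch of that store follows (a real version would sit on a database); the class name and dictionary layout are hypothetical, only the method names come from the snippets.

class WebInfo:
    """In-memory sketch of the assumed webinfo store."""
    def __init__(self):
        self.meta = {}        # url -> metadata
        self.links = {}       # url -> outgoing links
        self.crawled = set()  # urls already processed

    def siteexists(self, url):
        return url in self.meta

    def savemetainfo(self, url, metainfo):
        self.meta[url] = metainfo

    def saveweblinks(self, url, links):
        self.links[url] = links

    def getuncrawled(self):
        # Next saved URL not yet marked crawled, or False when none remain.
        for url in self.meta:
            if url not in self.crawled:
                return url
        return False

    def markcrawled(self, url):
        self.crawled.add(url)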
Example #3
def crawl(url):
    global webinfo, startcount, httputil, crawledlinks, start_with_url
    # Checkpoint the current URL so a restart can resume from here.
    f = open(start_with_url, 'w')
    f.write(url)
    f.close()
    if startcount == 10:
        crawlfromdb()  # instead of exit, go and crawl the rest from db.
        return  # stop recursing once the limit is hit (assumed intent of the comment above)
    metainfo = httputil.getmeta(url)
    links = httputil.getlinks(url)
    print("meta data size: " + str(len(metainfo)) + " links size: " + str(len(links)))
    webinfo.savemetainfo(url, metainfo)
    webinfo.saveweblinks(url, links)
    print(url + " " + str(len(crawledlinks)))
    for i in links:
        if webinfo.siteexists(i):
            print(i + " already crawled!!!!!!.. so skipping.....")  # report the skipped link, not the parent url
        else:
            startcount += 1
            crawl(str(i))
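
A usage sketch tying the examples together, assuming the HttpUtil and WebInfo stand-ins above; the seed URL and checkpoint filename are placeholders.

# Wire up the globals the snippets expect, then start a bounded crawl.
webinfo = WebInfo()
httputil = HttpUtil()
crawledlinks = set()
startcount = 0
start_with_url = "last_url.txt"  # hypothetical checkpoint filename

crawl("https://example.com")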