Example #1
import urllib2
import sys
import httputil
from store import webinfo

crawledlinks = []                      # URLs already visited in this run
startcount = 0
start_with_url = 'start_with_url.txt'  # file that persists the seed URL
httputil = httputil.util()             # HTTP helper: fetches meta data and links
webinfo = webinfo.ds()                 # datastore for crawl results
def main(argv=sys.argv):
    if len(argv) == 1:
        # No URL given on the command line: resume from the saved seed URL.
        with open(start_with_url, 'r') as f:
            newurl = f.read().strip()
    else:
        newurl = argv[1]
    crawl(newurl)

def crawlfromdb():
    global webinfo, httputil
    # getuncrawled() is assumed to return a pending URL,
    # or False when nothing is left to crawl.
    url = webinfo.getuncrawled()
    if url:
        metainfo = httputil.getmeta(url)
        links = httputil.getlinks(url)
        print "meta data size : " + str(len(metainfo)) + " links size : " + str(len(links))
        webinfo.savemetainfo(url, metainfo)
        webinfo.saveweblinks(url, links)
        webinfo.markcrawled(url)
def crawl(url):
    global webinfo, startcount, httputil, crawledlinks, start_with_url
    # Persist the current URL so an interrupted crawl can be resumed by main().
    with open(start_with_url, 'w') as f:
        f.write(url)
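
Example #1 never shows the store.webinfo module it saves into. Below is a minimal in-memory sketch of the ds interface, assuming the contract implied by crawlfromdb (getuncrawled() hands back a pending URL or False, and newly discovered links become crawl candidates); the real module presumably persists to a database, so treat the names and behavior as inferred rather than authoritative.

class ds:
    def __init__(self):
        self.meta = {}        # url -> meta information
        self.links = {}       # url -> outgoing links
        self.pending = set()  # discovered but not yet crawled
        self.crawled = set()  # already crawled
    def getuncrawled(self):
        # Return any pending URL, or False when none are left.
        for url in self.pending:
            if url not in self.crawled:
                return url
        return False
    def savemetainfo(self, url, metainfo):
        self.meta[url] = metainfo
    def saveweblinks(self, url, links):
        self.links[url] = links
        self.pending.update(links)  # queue new links for crawling
    def markcrawled(self, url):
        self.crawled.add(url)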
Example #2
def setUp(self):
    self.httputil = httputil.util()
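
Both examples instantiate httputil.util() without showing the module itself. A minimal sketch of what it might look like, assuming getmeta(url) returns the page's <meta> tag attributes and getlinks(url) its outgoing href targets (method names inferred from the calls in Example #1; the real implementation may differ):

import urllib2
from HTMLParser import HTMLParser

class _PageParser(HTMLParser):
    # Collects <meta> attribute dicts and <a href> targets from one page.
    def __init__(self):
        HTMLParser.__init__(self)
        self.meta = []
        self.links = []
    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'meta':
            self.meta.append(attrs)
        elif tag == 'a' and 'href' in attrs:
            self.links.append(attrs['href'])

class util:
    def _parse(self, url):
        # Fetch the page and run it through the parser above.
        parser = _PageParser()
        parser.feed(urllib2.urlopen(url).read())
        return parser
    def getmeta(self, url):
        return self._parse(url).meta
    def getlinks(self, url):
        return self._parse(url).links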