Example #1
0
	def index(self, db):
		""" Index the given database.

		For every page stored in `db` the index steps consist of:
		1, separate the content into individual words.
		2, record each word together with its position in the page.
		3, calculate the term frequency of each word in the current page.

		db -- file name of the sqlite database holding the pages. The table
		      that is read is named after the file name minus its last three
		      characters (presumably a ".db" suffix -- TODO confirm).

		All rows are written through self.cur; nothing is committed here.
		Note: the index process is time-consuming. """
		conn = sqlite3.connect(db)
		# Hand text columns back as plain str. It is set before any statement
		# runs so that it also covers the url bound as a parameter below.
		conn.text_factory = str
		cur  = conn.cursor()
		try:
			dbname = db[:-3]
			sql  = "select url from %s" % dbname
			urls = [ row[0] for row in cur.execute(sql).fetchall()]
			progress = ProgressMeter(total=len(urls))
			# A table name cannot be bound, but the url can: binding it keeps
			# urls that contain a single quote from breaking (or rewriting)
			# the statement.
			content_sql = "select content from %s where url=?" % dbname
			# traverse all webpages
			for (cnt, url) in enumerate(urls):
				urlid = self.getid('urllist','url',url)
				html = cur.execute(content_sql, (url,)).fetchone()[0]
				items = self.getitems(html)
				# NOTE(review): the title still relies on replace_quote() for
				# quoting; switch to a bound parameter once it is confirmed
				# what replace_quote() does to the text.
				title = replace_quote(items['title'])
				sql = "insert into urltitle values(%d,'%s')" % (urlid, title)
				self.cur.execute(sql)
				content = items['content']
				words = self.analyzer.run(content)
				total = len(words)
				tfdir = {}
				# traverse all words in current webpage
				for (pos, word) in enumerate(words):
					tfdir[word] = tfdir.get(word, 0) + 1
					wordid = self.getid('wordlist','word',word)
					# %d can only render a number, so these statements are safe
					sql = "insert into wordlocation values(%d,%d,%d)" % (urlid, wordid, pos)
					self.cur.execute(sql)
				# term frequency = occurrences of the word / words in the page
				for (word, tf) in tfdir.items():
					wordid = self.getid('wordlist','word',word)
					sql = "insert into wordinfo values(%d,%d,%f)" % \
						  (urlid, wordid, float(tf)/total)
					self.cur.execute(sql)
				# update the progress
				if (cnt % REFRESH_CNT) == 0 or cnt == progress.total-1:
					progress.update(cnt+1)
			del progress
		finally:
			# release the source database even when indexing fails half-way
			cur.close()
			conn.close()
Example #2
0
	def merge(self, tardb, srcdbs):
		""" Merge the distributed databases.

		Sqlite does not support multi-threading very well, so each spider
		saves its own collected pages in an individual database. This
		function merges those databases into a single one.

		tardb  -- file name of the target database. A table named after the
		          file name minus its last three characters is created in it.
		srcdbs -- iterable of source database file names; each one is read
		          from the table named the same way.

		The target is committed once, after every source has been copied. """
		tartable = tardb[:-3]
		tconn = sqlite3.connect(tardb)
		# Nothing is read back from the target, so this only affects binding:
		# under Python 2 it permits 8-bit byte strings as parameters.
		tconn.text_factory = str
		try:
			tcur  = tconn.cursor()
			tcur.execute("create table %s (url text, content text)" % tartable)
			for src in srcdbs:
				# single parenthesised argument: same output on Python 2 and 3
				print("Processing %s ..." % src)
				sconn = sqlite3.connect(src)
				sconn.text_factory = str
				try:
					scur  = sconn.cursor()
					rows = scur.execute("select * from %s " % src[:-3]).fetchall()
					scur.close()
				finally:
					# the source connection used to be left open
					sconn.close()
				for row in rows:
					# The url is bound, so a quote inside it cannot break the
					# statement any more.
					# NOTE(review): the content still relies on replace_quote()
					# for quoting; bind it as well once it is confirmed what
					# replace_quote() does to the text.
					sql = "insert into %s (url, content) values(?,'%s')" % \
						  (tartable, replace_quote(row[1]))
					tcur.execute(sql, (row[0],))
			tconn.commit()
		finally:
			tconn.close()
Example #3
0
	def save_page(self, cur, url, data):
		""" Save the whole page to the database.

		cur  -- cursor used to execute the insert
		url  -- address of the page
		data -- page content; it is passed through replace_quote() before
		        being embedded in the statement

		The row is inserted into the table named by self.name. Nothing is
		committed here. """
		data = replace_quote(data)
		# The url used to be embedded verbatim, so a single quote in it ended
		# the SQL literal early. Doubling the quote is the standard SQL escape
		# and stores the url unchanged. The cursor comes from the caller, so
		# the statement is kept as one literal string rather than relying on
		# how that connection binds parameters.
		url = ("%s" % url).replace("'", "''")
		sql = "insert into %s values('%s','%s');" % (self.name, url, data)
		cur.execute(sql)