Example #1
	def build_links(self, db):
		""" Analyze the original pages, and rebuild the link relationships. """
		print "Building links' connections."
		conn = sqlite3.connect(db)
		conn.text_factory = str
		cur  = conn.cursor()
		dbname = db[:-3]	# strip the '.db' suffix to get the table name
		sql  = "select url from %s" % dbname
		urls = [url[0] for url in cur.execute(sql).fetchall()]

		urlids    = self.urls2ids(urls)
		from_urls = dict((urlid, []) for urlid in urlids)
		to_urls   = dict((urlid, []) for urlid in urlids)

		progress = ProgressMeter(total=len(urls))
		for (cnt, url) in enumerate(urls):
			urlid = self.get_urlid(url)
			p = MyHTMLParser(url)
			# parameterize the url so quotes in it cannot break the query
			sql = "select content from %s where url=?" % dbname
			content = cur.execute(sql, (url,)).fetchone()[0]
			try:
				p.feed(content)
			except Exception:
				ferrmsg('Error: feed error in %s.' % url, 'Rank')
			to_urls[urlid] = self.urls2ids(p.htm_urls())
			for lid in to_urls[urlid]:
				# only track links that point inside the crawled set
				if lid in from_urls:
					from_urls[lid].append(urlid)
			# update the progress meter periodically
			if (cnt % REFRESH_CNT) == 0 or cnt == progress.total - 1:
				progress.update(cnt + 1)
		self.url_ids  = urlids
		self.from_ids = from_urls
		self.to_ids   = to_urls
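The from_ids/to_ids maps built here are exactly the input a PageRank-style ranker (see the PageRanker used in Example #4) would iterate over. A minimal sketch of such an iteration, assuming the same dict-of-lists layout; the damping factor and iteration count are illustrative, not taken from the project:

def pagerank(url_ids, from_ids, to_ids, damping=0.85, iterations=20):
	""" Sketch: power-iteration PageRank over the link maps built above. """
	n = len(url_ids)
	ranks = dict((uid, 1.0 / n) for uid in url_ids)
	for _ in range(iterations):
		new_ranks = {}
		for uid in url_ids:
			inbound = 0.0
			for src in from_ids[uid]:
				# each inbound page passes on its rank, split over its out-links
				if to_ids[src]:
					inbound += ranks[src] / len(to_ids[src])
			new_ranks[uid] = (1.0 - damping) / n + damping * inbound
		ranks = new_ranks
	return ranks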
Example #2
	def getitems(self, html):
		""" Analyze the original webpage, and extract the valuable info.
		Here only the page title and the page content are extracted. """
		p = Parser()
		try:
			p.feed(html)
		except Exception:
			ferrmsg('Error: feed error!', 'Index')
		items = {}
		items['title']   = p.get_title()
		items['content'] = p.get_content()
		return items
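Parser here is project code whose internals are not shown. A hypothetical stand-in with the same get_title/get_content interface, built on the standard-library HTMLParser, could look like this:

from HTMLParser import HTMLParser	# 'html.parser' in Python 3

class Parser(HTMLParser):
	""" Hypothetical stand-in: collects the <title> text and the visible text. """
	def __init__(self):
		HTMLParser.__init__(self)
		self.in_title = False
		self.title = ''
		self.text = []

	def handle_starttag(self, tag, attrs):
		if tag == 'title':
			self.in_title = True

	def handle_endtag(self, tag):
		if tag == 'title':
			self.in_title = False

	def handle_data(self, data):
		if self.in_title:
			self.title += data
		else:
			self.text.append(data)

	def get_title(self):
		return self.title.strip()

	def get_content(self):
		return ' '.join(s.strip() for s in self.text if s.strip())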
Example #3
	def download(self):
		""" Download the page at the given url. """
		self.data = RET_ERROR	# default, so a non-200 response is not reported as success
		try:
			request = urllib2.Request(url=self.url, headers=self.headers)
			page = self.opener.open(request)
			if page.code == 200:
				gzipdata = page.read()
				gzipstream = StringIO(gzipdata)
				try:
					# the server may ignore 'Accept-encoding: gzip', so fall
					# back to the raw body if the data is not compressed
					self.data = gzip.GzipFile(fileobj=gzipstream).read()
				except IOError:
					self.data = gzipdata
		except Exception:
			ferrmsg('Error: invalid URL "%s"' % self.url, 'Spider')
			self.data = RET_ERROR
		return self.data
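The method relies on self.opener, self.headers, and RET_ERROR from the surrounding class. A self-contained sketch of the same download-and-maybe-gunzip pattern, using plain urllib2.urlopen, for illustration:

import gzip
import urllib2
from StringIO import StringIO

def fetch(url, headers=None):
	""" Standalone sketch: fetch a page, decompressing it if the server gzipped it. """
	request = urllib2.Request(url=url, headers=headers or {'Accept-encoding': 'gzip'})
	page = urllib2.urlopen(request)
	body = page.read()
	try:
		# decompress if the server honored 'Accept-encoding: gzip'
		return gzip.GzipFile(fileobj=StringIO(body)).read()
	except IOError:
		# plain (uncompressed) response
		return body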
Example #4
	def query(self, q, op='and'):
		""" Query and rank the results. Calculate the scores according to
		both the content rank and the page rank factors.
		Note: the content rank scores are computed in real time, while the
		page rank scores are preloaded from the database. """
		db   = self.config.indexdb
		sort = int(self.config.sort)
		cr_fac = float(self.config.rankers['content'])
		pr_fac = float(self.config.rankers['page'])

		query = SimpleQuery(db)
		words = query.parse_query(q)
		urls  = query.query(q)

		if len(urls) == 0:
			return []

		scores = {}
		valid_fac   = {}
		valid_score = {}
		if cr_fac > 0:
			cr = ContentRanker(db, sort)
			cr_scores = cr.score(urls, words)
			cr_scores = normalize(cr_scores)
			valid_fac['content'] = cr_fac
			valid_score['content'] = cr_scores

		if pr_fac > 0:
			pr = PageRanker(db, sort)
			pr_scores = pr.score(urls, words)
			pr_scores = normalize(pr_scores)
			valid_fac['page'] = pr_fac
			valid_score['page'] = pr_scores

		for urlid in urls:
			scores[urlid] = 0.0
			try:
				# weighted sum of the normalized scores from each active ranker
				for key in valid_fac:
					scores[urlid] += valid_fac[key] * valid_score[key][urlid]
			except KeyError:
				ferrmsg("Error: urlid(%s) is not found in the results of every ranker." % urlid,
						'SECore')

		res = sorted(scores.items(), key=lambda v: v[1], reverse=bool(sort))
		res = [t[0] for t in res]
		return res
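normalize is assumed to map each ranker's raw scores onto a common scale so the weighted sum is meaningful. One plausible version (min-max scaling into [0, 1]; the project's actual scaling may differ):

def normalize(scores, small=1e-6):
	""" Hypothetical min-max normalization of a {urlid: score} dict to [0, 1]. """
	values = scores.values()
	lo, hi = min(values), max(values)
	span = (hi - lo) or small	# avoid division by zero when all scores are equal
	return dict((uid, (s - lo) / span) for (uid, s) in scores.items())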
Example #5
	def get_page(self, url):
		""" Download the page and collect all the urls found in it. """
		rer = Retriever(url, self.headers)
		retval = rer.download()
		if retval is RET_ERROR:
			return retval
		self.num_gets += 1
		if url not in self.bloom:
			self.bloom.add(url)

		p = MyHTMLParser(url)
		try:
			p.feed(retval)
		except Exception:
			ferrmsg('Error: feed error in url: %s' % url, 'Spider')
		for link in p.htm_urls():
			# skip links already crawled or already waiting in the queue
			if (link not in self.bloom) and (link not in self.queue.queue):
				self.queue.put(link)
		return retval
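get_page handles a single URL; a hypothetical driver loop shows how it would be called against the shared queue and bloom filter (the seed URL and page budget are made up for illustration):

def crawl(spider, seed, max_pages=100):
	""" Hypothetical driver: crawl breadth-first until the page budget is spent. """
	spider.queue.put(seed)
	while spider.num_gets < max_pages and not spider.queue.empty():
		url = spider.queue.get()
		if url in spider.bloom:
			continue	# already fetched through another path
		spider.get_page(url)	# downloads, updates the bloom filter, enqueues new links

crawl(spider, 'http://example.com/', max_pages=50)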