def build_links(self, db):
    """ Analyze the original pages, and rebuild the link relationships. """
    print "Building links' connections."
    conn = sqlite3.connect(db)
    conn.text_factory = str
    cur = conn.cursor()
    # The table is named after the database file, minus its '.db' suffix.
    dbname = db[:-3]
    sql = "select url from %s" % dbname
    urls = [url[0] for url in cur.execute(sql).fetchall()]
    urlids = self.urls2ids(urls)
    from_urls = dict((urlid, []) for urlid in urlids)
    to_urls = dict((urlid, []) for urlid in urlids)
    progress = ProgressMeter(total=len(urls))
    for (cnt, url) in enumerate(urls):
        urlid = self.get_urlid(url)
        p = MyHTMLParser(url)
        # Parameterize the url so embedded quotes cannot break the query.
        sql = "select content from %s where url=?" % dbname
        content = cur.execute(sql, (url,)).fetchone()[0]
        try:
            p.feed(content)
        except Exception:
            ferrmsg('Error: feed error in %s.' % url, 'Rank')
        to_urls[urlid] = self.urls2ids(p.htm_urls())
        for lid in to_urls[urlid]:
            # Record the inbound link only for urls we actually crawled.
            if lid in from_urls:
                from_urls[lid].append(urlid)
        # Update the progress meter periodically.
        if (cnt % REFRESH_CNT) == 0 or cnt == progress.total - 1:
            progress.update(cnt + 1)
    self.url_ids = urlids
    self.from_ids = from_urls
    self.to_ids = to_urls
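The `from_ids`/`to_ids` maps built here are exactly what a PageRank-style iteration consumes: inbound links for scoring, out-degrees for weighting. A minimal sketch of such an iteration over these maps (the damping factor `d` and iteration count are illustrative assumptions, not values taken from this codebase):

def pagerank(url_ids, from_ids, to_ids, d=0.85, iters=20):
    """Propagate scores along the link graph (sketch)."""
    scores = dict((uid, 1.0) for uid in url_ids)
    for _ in range(iters):
        for uid in url_ids:
            # Each inbound page contributes its score divided by its out-degree.
            rank = sum(scores[src] / len(to_ids[src])
                       for src in from_ids[uid] if to_ids[src])
            scores[uid] = (1 - d) + d * rank
    return scores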
def getitems(self, html):
    """ Analyze the original webpage, and extract the valuable info.
        Here only the page title and the page content are extracted.
    """
    # Construct the parser outside the try block so that it is always
    # bound when the title and content are read below.
    p = Parser()
    try:
        p.feed(html)
    except Exception:
        ferrmsg('Error: feed error!', 'Index')
    items = {}
    items['title'] = p.get_title()
    items['content'] = p.get_content()
    return items
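`getitems` assumes a `Parser` that accumulates the `<title>` text and the visible page text while the document is fed in. A minimal sketch of such a class on top of the standard-library `HTMLParser` (an illustrative stand-in; the project's actual `Parser` may differ):

from HTMLParser import HTMLParser

class Parser(HTMLParser):
    """Collect the <title> text and all visible text (sketch)."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.in_title = False
        self.title = ''
        self.texts = []

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.in_title = True

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data
        self.texts.append(data)

    def get_title(self):
        return self.title.strip()

    def get_content(self):
        return ' '.join(t.strip() for t in self.texts if t.strip())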
def download(self):
    """ Download a given url's page. """
    try:
        request = urllib2.Request(url=self.url, headers=self.headers)
        page = self.opener.open(request)
        if page.code == 200:
            gzipdata = page.read()
            gzipstream = StringIO(gzipdata)
            try:
                # The server may ignore Accept-Encoding and reply with
                # plain text; fall back to the raw bytes in that case.
                self.data = gzip.GzipFile(fileobj=gzipstream).read()
            except IOError:
                self.data = gzipdata
        else:
            # A non-200 response would otherwise leave self.data unset.
            self.data = RET_ERROR
    except Exception:
        ferrmsg('Error: invalid URL "%s"' % self.url, 'Spider')
        self.data = RET_ERROR
    return self.data
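The gzip fallback above only matters when the request advertises gzip support in the first place. A minimal sketch of the opener and headers this method expects, with the imports it relies on (the header values are illustrative assumptions):

import gzip
import urllib2
from StringIO import StringIO

headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MySpider/1.0)',
    # Ask for gzip; download() falls back to the raw bytes if the
    # server replies uncompressed anyway.
    'Accept-Encoding': 'gzip',
}
opener = urllib2.build_opener()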
def query(self, q, op='and'):
    """ Query and rank the results.

        Calculate the scores according to both factors: the content
        rank and the page rank.

        Note: the content rank scores are calculated in real time,
        while the page rank scores are merely fetched, having been
        precomputed and stored in the database.
    """
    db = self.config.indexdb
    sort = int(self.config.sort)
    cr_fac = float(self.config.rankers['content'])
    pr_fac = float(self.config.rankers['page'])
    query = SimpleQuery(db)
    words = query.parse_query(q)
    urls = query.query(q)
    if len(urls) == 0:
        return []
    scores = {}
    valid_fac = {}
    valid_score = {}
    if cr_fac > 0:
        cr = ContentRanker(db, sort)
        valid_fac['content'] = cr_fac
        valid_score['content'] = normalize(cr.score(urls, words))
    if pr_fac > 0:
        pr = PageRanker(db, sort)
        valid_fac['page'] = pr_fac
        valid_score['page'] = normalize(pr.score(urls, words))
    for urlid in urls:
        scores[urlid] = 0.0
        try:
            # Weighted sum of each enabled ranker's normalized score.
            for key in valid_fac.keys():
                scores[urlid] += valid_fac[key] * valid_score[key][urlid]
        except KeyError:
            ferrmsg("Error: urlid(%s) is not found in the results of "
                    "every ranker." % urlid, 'SECore')
    res = sorted(scores.items(), key=lambda v: v[1], reverse=sort)
    return [t[0] for t in res]
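The weighted sum above only makes sense if each ranker's scores are on a comparable scale, which is what `normalize` must guarantee. A minimal sketch of it as min-max scaling into [0, 1] (an illustrative assumption about its actual behavior):

def normalize(scores):
    """Min-max scale a {urlid: score} dict into [0, 1] (sketch)."""
    if not scores:
        return scores
    lo, hi = min(scores.values()), max(scores.values())
    span = (hi - lo) or 1.0  # all-equal scores: avoid dividing by zero
    return dict((uid, (s - lo) / span) for uid, s in scores.items())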
def get_page(self, url):
    """ Download the page and analyze all the urls in this page. """
    rer = Retriever(url, self.headers)
    retval = rer.download()
    if retval is RET_ERROR:
        return retval
    self.num_gets += 1
    if url not in self.bloom:
        self.bloom.add(url)
    p = MyHTMLParser(url)
    try:
        p.feed(retval)
    except Exception:
        ferrmsg('Error: feed error in url: %s' % url, 'Spider')
    for link in p.htm_urls():
        # Enqueue only links that are neither already seen (Bloom
        # filter) nor already waiting in the queue.
        if (link not in self.bloom) and (link not in self.queue.queue):
            self.queue.put(link)
    return retval
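A minimal sketch of the crawl loop that would drive `get_page`, draining the queue breadth-first until it is empty or a page budget is hit (the `crawl` method, the `max_pages` cap, and the `store` persistence hook are illustrative assumptions):

def crawl(self, seed, max_pages=100):
    """Breadth-first crawl driven by get_page (sketch)."""
    self.queue.put(seed)
    while not self.queue.empty() and self.num_gets < max_pages:
        url = self.queue.get()
        page = self.get_page(url)   # enqueues new links as a side effect
        if page is not RET_ERROR:
            self.store(url, page)   # hypothetical persistence hook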