def index(self, db):
    """Index the pages stored in the database file *db*.

    Indexing consists of: (1) split each page's content into words,
    (2) record each word's position in the page (``wordlocation``),
    (3) compute the per-page term frequency (``wordinfo``).

    Note: the index process is time-consuming.

    :param db: path to a sqlite database file; the table name is the
               file name minus its 3-char extension (e.g. "x.db" -> "x").
    """
    conn = sqlite3.connect(db)
    conn.text_factory = str
    cur = conn.cursor()
    dbname = db[:-3]
    # Table names cannot be bound as SQL parameters; dbname comes from
    # the local file name, not from untrusted input.
    sql = "select url from %s" % dbname
    urls = [row[0] for row in cur.execute(sql).fetchall()]
    progress = ProgressMeter(total=len(urls))
    # Traverse all webpages.
    for cnt, url in enumerate(urls):
        urlid = self.getid('urllist', 'url', url)
        # Parameterized query: crawled URLs may contain quotes or other
        # SQL metacharacters, so never interpolate them into the SQL.
        sql = "select content from %s where url=?" % dbname
        html = cur.execute(sql, (url,)).fetchone()[0]
        items = self.getitems(html)
        # Parameterized insert makes manual quote escaping unnecessary.
        self.cur.execute("insert into urltitle values(?,?)",
                         (urlid, items['title']))
        words = self.analyzer.run(items['content'])
        tfdir = {}
        # Record every word occurrence with its position in the page.
        for pos, word in enumerate(words):
            tfdir[word] = tfdir.get(word, 0) + 1
            wordid = self.getid('wordlist', 'word', word)
            self.cur.execute("insert into wordlocation values(?,?,?)",
                             (urlid, wordid, pos))
        total = len(words)
        # Store the term frequency (count / page length) for each word.
        for word, tf in tfdir.items():
            wordid = self.getid('wordlist', 'word', word)
            self.cur.execute("insert into wordinfo values(?,?,?)",
                             (urlid, wordid, float(tf) / total))
        # Update the progress display every REFRESH_CNT pages and at the end.
        if (cnt % REFRESH_CNT) == 0 or cnt == progress.total - 1:
            progress.update(cnt + 1)
    cur.close()
    conn.close()  # previously leaked: only the cursor was closed
def merge(self, tardb, srcdbs):
    """Merge the distributed databases into a single one.

    Because sqlite does not support multi-threading very well, each
    spider saves its collected pages in its own database.  This
    function collects every (url, content) row from the *srcdbs*
    files into a single table in *tardb*.

    :param tardb:  path of the target database file; its table name is
                   the file name minus the 3-char extension.
    :param srcdbs: iterable of source database file paths.
    """
    tconn = sqlite3.connect(tardb)
    tcur = tconn.cursor()
    # Table names cannot be bound as parameters; they derive from the
    # local file names, not from untrusted input.
    tname = tardb[:-3]
    tcur.execute("create table %s (url text, content text)" % tname)
    insert_sql = "insert into %s (url, content) values(?,?)" % tname
    for src in srcdbs:
        print("Processing %s ..." % src)
        sconn = sqlite3.connect(src)
        sconn.text_factory = str
        scur = sconn.cursor()
        rows = scur.execute("select * from %s " % src[:-3]).fetchall()
        for url, content in rows:
            # Parameterized insert: page content may contain quotes, so
            # no manual escaping (replace_quote) is needed — the driver
            # binds the values safely and stores them verbatim.
            tcur.execute(insert_sql, (url, content))
        scur.close()
        sconn.close()  # previously leaked: only the cursor was closed
    tconn.commit()
    tconn.close()
def save_page(self, cur, url, data):
    """Save one fetched page into the table named ``self.name``.

    Uses a parameterized insert so quotes and other SQL metacharacters
    in *url*/*data* are bound safely — the previous manual escaping
    (replace_quote) is no longer needed.

    :param cur:  an open sqlite cursor to write through.
    :param url:  the page's URL.
    :param data: the page's raw content.
    """
    # Table names cannot be bound as parameters; self.name is local
    # configuration, not untrusted input.
    sql = "insert into %s values(?,?);" % self.name
    cur.execute(sql, (url, data))