def buildTFIDF(self):
    # Create new tf-idf tables
    cur2 = self.conn2.cursor()
    print("DB Initiation - Creating tf-idf tables")
    cur2.execute('''DROP TABLE IF EXISTS blogs_tf_idf''')
    cur2.execute('''DROP TABLE IF EXISTS blogs_title_tf_idf''')
    cur2.execute('''DROP TABLE IF EXISTS comments_tf_idf''')
    self.conn2.commit()
    cur2.execute('''CREATE TABLE blogs_tf_idf
                    (blog_id INTEGER, word_id INTEGER, count INTEGER, tf_idf FLOAT,
                     PRIMARY KEY(blog_id, word_id),
                     FOREIGN KEY(word_id) REFERENCES word_dict(id))''')
    self.conn2.commit()
    cur2.execute('''CREATE TABLE blogs_title_tf_idf
                    (blog_id INTEGER, word_id INTEGER, count INTEGER, tf_idf FLOAT,
                     PRIMARY KEY(blog_id, word_id),
                     FOREIGN KEY(word_id) REFERENCES word_dict(id))''')
    self.conn2.commit()
    cur2.execute('''CREATE TABLE comments_tf_idf
                    (blog_id INTEGER, comment_id INTEGER, word_id INTEGER,
                     count INTEGER, tf_idf FLOAT,
                     PRIMARY KEY(blog_id, comment_id, word_id),
                     FOREIGN KEY(word_id) REFERENCES word_dict(id))''')
    self.conn2.commit()

    print("DB TFIDF Initialization - Loop Entries")
    cur = self.conn.cursor()
    # Select the blog ids, titles, and bodies from all the blogs
    allEntries = cur.execute("SELECT blog_id,title,body FROM blogs"
                             + ("" if self.rowLimit is None else (" LIMIT " + str(self.rowLimit))))
    blogsTFIDF = dict()
    blogsTitleTFIDF = dict()
    commentsTFIDF = dict()
    idx = 0
    # Loop over all the blogs for tf-idf preparation
    blogCount = Blog.getCount(self.conn) if self.rowLimit is None else self.rowLimit
    for i in allEntries:
        # Segment the title and push it into a counter. transformTextToIDs yields
        # word_dict rows (word, id, count, freq, idf), so for each key w below,
        # w[1] is the word id and w[4] is the word's idf.
        allWordsTitle = self.transformTextToIDs(i[1])
        titleCounter = collections.Counter(allWordsTitle)
        eleLen = sum(titleCounter.values())
        # There may be cases with no valid words found
        if eleLen > 0:
            # tf-idf = (term count / document length) * idf
            blogsTitleTFIDF[i[0]] = {w[1]: (ctn, ctn / eleLen * w[4])
                                     for w, ctn in titleCounter.items()}
        # Segment the body and push it into a counter
        allWordsBody = self.transformTextToIDs(i[2])
        bodyCounter = collections.Counter(allWordsBody)
        eleLen = sum(bodyCounter.values())
        # There may be cases with no valid words found
        if eleLen > 0:
            blogsTFIDF[i[0]] = {w[1]: (ctn, ctn / eleLen * w[4])
                                for w, ctn in bodyCounter.items()}
        # Get the comments and push all their words
        comments = Comment.getFromDB(i[0])
        commentsTFIDF[i[0]] = dict()
        for c in comments:
            allWordsComment = self.transformTextToIDs(c.body)
            commentCounter = collections.Counter(allWordsComment)
            eleLen = sum(commentCounter.values())
            # There may be cases with no valid words found
            if eleLen > 0:
                commentsTFIDF[i[0]][c.comment_id] = {w[1]: (ctn, ctn / eleLen * w[4])
                                                     for w, ctn in commentCounter.items()}
        # Log progress
        idx += 1
        if idx % 500 == 0:
            print("Processing... (", idx / blogCount * 100, " %)")

    # Loop over the title data and insert it into the db
    titleTFIDFLen = len(blogsTitleTFIDF)
    idx = 0
    for blog_id, titleWords in blogsTitleTFIDF.items():
        for word_id, titleTfidf in titleWords.items():
            cur2.execute("INSERT INTO blogs_title_tf_idf VALUES(?, ?, ?, ?)",
                         (blog_id, word_id, titleTfidf[0], titleTfidf[1]))
        # Log progress
        idx += 1
        if idx % 500 == 0:
            print("Processing - Blog Titles ... (", idx / titleTFIDFLen * 100, " %)")

    # Loop over the body data and insert it into the db
    blogTFIDFLen = len(blogsTFIDF)
    idx = 0
    for blog_id, blogWords in blogsTFIDF.items():
        for word_id, blogTfidf in blogWords.items():
            cur2.execute("INSERT INTO blogs_tf_idf VALUES(?, ?, ?, ?)",
                         (blog_id, word_id, blogTfidf[0], blogTfidf[1]))
        # Log progress
        idx += 1
        if idx % 500 == 0:
            print("Processing - Blogs ... (", idx / blogTFIDFLen * 100, " %)")

    # Loop over all the comments and insert them into the db
    commentTFIDFLen = len(commentsTFIDF)
    idx = 0
    for blog_id, comments in commentsTFIDF.items():
        for comment_id, commentWords in comments.items():
            for word_id, commentTfidf in commentWords.items():
                cur2.execute("INSERT INTO comments_tf_idf VALUES(?, ?, ?, ?, ?)",
                             (blog_id, comment_id, word_id, commentTfidf[0], commentTfidf[1]))
        # Log progress
        idx += 1
        if idx % 500 == 0:
            print("Processing - Comments ... (", idx / commentTFIDFLen * 100, " %)")
    self.conn2.commit()
def build(conn=None, conn2=None, rowLimit=None, segType=2):
    '''
    Build the dictionary of all the Chinese words and English words.

    Parameters
    ====================================
    conn `sqlite3.Connection` - A SQLite connection object for the data source.
        Defaults to a new connection to the global DB_FILE database file.
    conn2 `sqlite3.Connection` - A SQLite connection object for the word dictionary.
        Defaults to a new connection to the global DB_FILE2 database file.
    rowLimit `int` - The maximum number of blog rows to process.
    segType `int` - 0: by characters; 1: by characters, but remove English words; 2: by jieba.

    Returns
    ====================================
    `WordDict` - A dictionary object for the connection of the currently built dictionary.
    '''
    # Open the default connections lazily: a sqlite3.connect(...) default argument
    # would be evaluated once, at function definition time, and shared across calls.
    if conn is None:
        conn = sqlite3.connect(DB_FILE)
    if conn2 is None:
        conn2 = sqlite3.connect(DB_FILE2)

    cur = conn.cursor()
    # Count the number of blogs to process
    if rowLimit is None:
        cur.execute("SELECT COUNT(blog_id) FROM blogs")
        blogCount = cur.fetchone()[0]
    else:
        blogCount = rowLimit

    # Create the dictionary table in the new db
    cur2 = conn2.cursor()
    print("DB Initiation - Creating dictionary table")
    cur2.execute('''DROP TABLE IF EXISTS word_dict''')
    cur2.execute('''DROP TABLE IF EXISTS blogs_tf_idf''')
    cur2.execute('''DROP TABLE IF EXISTS blogs_title_tf_idf''')
    cur2.execute('''DROP TABLE IF EXISTS comments_tf_idf''')
    conn2.commit()
    cur2.execute('''CREATE TABLE word_dict
                    (word TEXT, id INTEGER, count INTEGER, freq FLOAT, idf FLOAT,
                     PRIMARY KEY(id))''')
    conn2.commit()

    wordDict = WordDict(conn, conn2, segType=segType, rowLimit=rowLimit)

    print("DB Initiation - Loop Entries")
    # Select the blog ids, titles, and bodies from all the blogs
    allEntries = cur.execute("SELECT blog_id,title,body FROM blogs"
                             + ("" if rowLimit is None else (" LIMIT " + str(rowLimit))))
    wordCount = dict()
    idx = 0
    wordDict.initalCorpusCount()
    corpusCount = wordDict.corpusCount
    # Loop over all the blogs for dictionary preparation. Each word is counted at
    # most once per document (via set()), so wordCount holds document frequencies.
    for i in allEntries:
        # Segment the title and count each distinct word once
        allWordsTitle = WordDict.segment(i[1], segType=segType)
        for w in set(allWordsTitle):
            wordCount[w] = wordCount.get(w, 0) + 1
        # Segment the body and count each distinct word once
        allWordsBody = WordDict.segment(i[2], segType=segType)
        for w in set(allWordsBody):
            wordCount[w] = wordCount.get(w, 0) + 1
        # Get the comments and count their words as well
        comments = Comment.getFromDB(i[0])
        for c in comments:
            allWordsComment = WordDict.segment(c.body, segType=segType)
            for w in set(allWordsComment):
                wordCount[w] = wordCount.get(w, 0) + 1
        # Log progress
        idx += 1
        if idx % 500 == 0:
            print("Processing... (", idx / blogCount * 100, " %)")

    # Loop over all the words and insert them into the db:
    # freq = document frequency / corpus size, idf = log(corpus size / document frequency)
    wordCountLen = len(wordCount)
    for idx, w in enumerate(wordCount):
        cur2.execute("INSERT INTO word_dict VALUES(?, ?, ?, ?, ?)",
                     (w, idx, wordCount[w], wordCount[w] / corpusCount,
                      math.log(corpusCount / wordCount[w])))
        if idx % 500 == 0:
            print("Insertion... (", idx / wordCountLen * 100, " %)")
    conn2.commit()
    return wordDict
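# End-to-end sketch (a hedged example, assuming DB_FILE and DB_FILE2 point at the
# source and dictionary SQLite files and the blogs/comments tables are populated):
# build() fills word_dict with per-word document frequencies and idf values, then
# buildTFIDF() derives the three tf-idf tables from that dictionary.
#
#   wordDict = build(rowLimit=1000, segType=2)  # segType=2: jieba segmentation
#   wordDict.buildTFIDF()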