コード例 #1
0
    def buildTFIDF(self):
        ''' Rebuild the tf-idf tables for blog bodies, blog titles and comments.

            Drops and re-creates `blogs_tf_idf`, `blogs_title_tf_idf` and
            `comments_tf_idf` through `self.conn2`, reads the blogs through
            `self.conn` (at most `self.rowLimit` rows when it is not None),
            weighs the words of every title, body and comment, and finally
            inserts one row per (text, word) pair and commits.

            Every item returned by `self.transformTextToIDs` is read at index 1
            (stored as `word_id`) and index 4 (multiplied with the term
            frequency) - presumably a `word_dict` row whose 5th column is the
            idf; confirm against `transformTextToIDs`.

            Returns
            ====================================

            None
        '''
        cur2 = self.conn2.cursor()
        print("DB Initiation - Creating tf-idf tables")
        for table in ("blogs_tf_idf", "blogs_title_tf_idf", "comments_tf_idf"):
            cur2.execute("DROP TABLE IF EXISTS " + table)
        self.conn2.commit()
        cur2.execute('''CREATE TABLE blogs_tf_idf
                    (blog_id    INTEGER, 
                    word_id     INTEGER,
                    count       INTEGER,
                    tf_idf      FLOAT,
                    PRIMARY KEY(blog_id,word_id),
                    FOREIGN KEY(word_id) REFERENCES word_dict(id))''')
        cur2.execute('''CREATE TABLE blogs_title_tf_idf
                    (blog_id    INTEGER, 
                    word_id     INTEGER,
                    count       INTEGER,
                    tf_idf      FLOAT,
                    PRIMARY KEY(blog_id,word_id),
                    FOREIGN KEY(word_id) REFERENCES word_dict(id))''')
        cur2.execute('''CREATE TABLE comments_tf_idf
                    (blog_id    INTEGER, 
                    comment_id  INTEGER,
                    word_id     INTEGER,
                    count       INTEGER,
                    tf_idf      FLOAT,
                    PRIMARY KEY(blog_id,comment_id,word_id),
                    FOREIGN KEY(word_id) REFERENCES word_dict(id))''')
        self.conn2.commit()

        def weighWords(text):
            # Map each word id found in `text` to (occurrences, weight), where
            # weight = occurrences / total words * w[4].  An empty result means
            # no valid word was found, so nothing is ever divided by zero.
            counter = collections.Counter(self.transformTextToIDs(text))
            total = sum(counter.values())
            return {w[1]: (ctn, ctn / total * w[4]) for w, ctn in counter.items()}

        print("DB TFIDF Initialization - Loop Entries")
        cur = self.conn.cursor()
        # Select the ids, titles and bodies of all the blogs; the optional row
        # limit is bound as a parameter instead of being pasted into the SQL.
        query = "SELECT blog_id,title,body FROM blogs"
        params = ()
        if self.rowLimit is not None:
            query += " LIMIT ?"
            params = (int(self.rowLimit),)
        allEntries = cur.execute(query, params)
        blogsTFIDF = dict()
        blogsTitleTFIDF = dict()
        commentsTFIDF = dict()

        # Loop all the blogs for tf-idf preparation
        blogCount = Blog.getCount(self.conn) if self.rowLimit is None else self.rowLimit
        for idx, (blog_id, title, body) in enumerate(allEntries, start=1):
            # Texts without any valid word get no entry at all
            titleWeights = weighWords(title)
            if titleWeights:
                blogsTitleTFIDF[blog_id] = titleWeights

            bodyWeights = weighWords(body)
            if bodyWeights:
                blogsTFIDF[blog_id] = bodyWeights

            # Every blog gets a (possibly empty) comment mapping
            commentsTFIDF[blog_id] = dict()
            for c in Comment.getFromDB(blog_id):
                commentWeights = weighWords(c.body)
                if commentWeights:
                    commentsTFIDF[blog_id][c.comment_id] = commentWeights

            # Log progresses
            if (idx % 500 == 0):
                print("Processing... (", idx/blogCount*100, " %)")

        # Insert the title weights, one batch per blog
        titleTFIDFLen = len(blogsTitleTFIDF)
        for idx, (blog_id, titleWords) in enumerate(blogsTitleTFIDF.items(), start=1):
            cur2.executemany(
                "INSERT INTO blogs_title_tf_idf VALUES(?, ?, ?, ?)",
                [(blog_id, word_id, ctn, tfidf) for word_id, (ctn, tfidf) in titleWords.items()])

            # Log progresses
            if (idx % 500 == 0):
                print("Processing - Blog Titles ... (", idx/titleTFIDFLen*100, " %)")

        # Insert the body weights, one batch per blog
        blogTFIDFLen = len(blogsTFIDF)
        for idx, (blog_id, blogWords) in enumerate(blogsTFIDF.items(), start=1):
            cur2.executemany(
                "INSERT INTO blogs_tf_idf VALUES(?, ?, ?, ?)",
                [(blog_id, word_id, ctn, tfidf) for word_id, (ctn, tfidf) in blogWords.items()])

            # Log progresses
            if (idx % 500 == 0):
                print("Processing - Blogs ... (", idx/blogTFIDFLen*100, " %)")

        # Insert the comment weights, one batch per blog
        commentTFIDFLen = len(commentsTFIDF)
        for idx, (blog_id, comments) in enumerate(commentsTFIDF.items(), start=1):
            cur2.executemany(
                "INSERT INTO comments_tf_idf VALUES(?, ?, ?, ?, ?)",
                [(blog_id, comment_id, word_id, ctn, tfidf)
                 for comment_id, commentWords in comments.items()
                 for word_id, (ctn, tfidf) in commentWords.items()])

            # Log progresses
            if (idx % 500 == 0):
                print("Processing - Comments ... (", idx/commentTFIDFLen*100, " %)")

        self.conn2.commit()
コード例 #2
0
    def build(conn = None, conn2 = None, rowLimit = None, segType = 2):
        ''' Build the dictionary of all the Chinese words and English words.

            Re-creates the `word_dict` table (and drops the tf-idf tables that
            depend on it), counts in how many texts (blog titles, blog bodies
            and comments) every word occurs, and stores one row per word with
            its id, count, frequency and idf.

            Parameters
            ====================================

            conn    `sqlite3.Connection`    - A SQLite connection object for the data source. Defaults to a new connection to the global DB_FILE database file.
            conn2    `sqlite3.Connection`   - A SQLite connection object for the word dictionary. Defaults to a new connection to the global DB_FILE2 database file.
            rowLimit    `int`               - The maximum number of blogs to process; None processes all of them.
            segType     `int`               - 0: by characters; 1: by characters, but remove english words; 2: by jieba

            Returns
            ====================================

            `WordDict` - A dictionary object for the connection of currently building dictionary.
        '''

        # Connect lazily: a connection used as a default value would be opened
        # once at definition time and then shared by every call.
        if conn is None:
            conn = sqlite3.connect(DB_FILE)
        if conn2 is None:
            conn2 = sqlite3.connect(DB_FILE2)

        cur = conn.cursor()

        # Number of blogs to process, only used for the progress output
        if (rowLimit is None):
            cur.execute("SELECT COUNT(blog_id) FROM blogs")
            blogCount = cur.fetchone()[0]
        else:
            blogCount = rowLimit

        # Create dictionary table in the new db
        cur2 = conn2.cursor()
        print("DB Initiation - Creating dictionary table")
        for table in ("word_dict", "blogs_tf_idf", "blogs_title_tf_idf", "comments_tf_idf"):
            cur2.execute("DROP TABLE IF EXISTS " + table)
        conn2.commit()
        cur2.execute('''CREATE TABLE word_dict
                    (word    TEXT, 
                    id       INTEGER,
                    count    INTEGER,
                    freq     FLOAT,
                    idf      FLOAT,
                    PRIMARY KEY(id))''')
        conn2.commit()

        wordDict = WordDict(conn, conn2, segType=segType, rowLimit=rowLimit)

        print("DB Initiation - Loop Entries")
        # Select the ids, titles and bodies of all the blogs; the optional row
        # limit is bound as a parameter instead of being pasted into the SQL.
        query = "SELECT blog_id,title,body FROM blogs"
        params = ()
        if rowLimit is not None:
            query += " LIMIT ?"
            params = (int(rowLimit),)
        allEntries = cur.execute(query, params)
        wordCount = dict()
        wordDict.initalCorpusCount()
        corpusCount = wordDict.corpusCount

        def countWords(text):
            # A word is counted at most once per text, so `wordCount` ends up
            # holding the number of texts that contain each word.
            for w in set(WordDict.segment(text, segType = segType)):
                wordCount[w] = wordCount.get(w, 0) + 1

        # Loop all the blogs for dictionary preparation
        for idx, (blog_id, title, body) in enumerate(allEntries, start=1):
            countWords(title)
            countWords(body)
            for c in Comment.getFromDB(blog_id):
                countWords(c.body)

            # Log progresses
            if (idx % 500 == 0):
                print("Processing... (", idx/blogCount*100, " %)")

        # Loop all the words and insert into the db; the enumeration index
        # doubles as the word id.  Values are bound as parameters so quotes or
        # other special characters in a word cannot break the statement.
        wordCountLen = len(wordCount)
        for idx, (w, textCount) in enumerate(wordCount.items()):
            cur2.execute(
                "INSERT INTO word_dict VALUES(?, ?, ?, ?, ?)",
                (w, idx, textCount, textCount/corpusCount, math.log(corpusCount/textCount)))
            if (idx % 500 == 0):
                print("Insertion... (", idx/wordCountLen*100, " %)")

        conn2.commit()

        return wordDict