def addEdge(db, G, id_list, id_):
    """Add an edge from page ``id_`` to every known page it links to.

    Reads the comma-separated ``pointing`` column of ``url_tb`` for ``id_``
    and calls ``G.add_edge(id_, target)`` once for each distinct target id
    that is contained in ``id_list``. Finally prints ``id_``.

    Args:
        db: Database helper exposing ``executeSelectSql(sql, params)``.
        G: Graph object exposing ``add_edge(source, target)``.
        id_list: Container of page ids that are allowed as edge targets.
        id_: Id of the source page.
    """
    # ``with`` releases the lock even if the query raises, so a failing
    # query can no longer leave the lock held.
    # NOTE(review): assumes db_lock supports the context-manager protocol
    # (e.g. threading.Lock) -- confirm against its definition.
    with db_lock:
        rows = db.executeSelectSql(
            "SELECT pointing FROM url_tb WHERE id=%s", (id_, ))

    if rows and rows[0] and rows[0][0]:
        # Convert each token once and skip blank tokens (e.g. a stray
        # comma) instead of crashing in int(""). The set collapses repeated
        # links so every edge is added only once.
        candidates = (
            int(token) for token in rows[0][0].split(",") if token.strip())
        targets = {target for target in candidates if target in id_list}
        for target in targets:
            G.add_edge(id_, target)
    print(id_)
def saveIndexToDb(db, id_, word_dict):
    """Merge one page's word counts into the inverted index in ``dict_tb1``.

    For every word, the ``result`` column holds a comma-separated list of
    ``<doc id>/<count>`` postings. ``word_dict`` describes a single page, so
    each word contributes at most one posting for ``id_``:

    * a row with a non-empty ``result`` that already lists ``id_`` is left
      untouched;
    * a row with a non-empty ``result`` that does not list ``id_`` gets the
      new posting appended;
    * otherwise a new row is inserted.

    Args:
        db: Database helper exposing ``executeSelectSql`` and
            ``executeUpdateSql`` (both called as ``(sql, params)``).
        id_: Id of the page the counts belong to.
        word_dict: Mapping of word -> occurrence count for that page.
    """
    # Postings for this page start with "<id_>/". Comparing whole entries
    # avoids the substring false positive of an unanchored pattern, where
    # id 1 would also be "found" inside "21/5".
    doc_prefix = "{0}/".format(id_)

    for word, count in word_dict.items():
        posting = "{0}/{1}".format(id_, count)
        # Hold the lock across the whole read-modify-write so another thread
        # cannot change the row between the SELECT and the UPDATE/INSERT.
        # ``with`` also releases the lock if a statement raises.
        # NOTE(review): assumes db_lock supports the context-manager
        # protocol (e.g. threading.Lock) -- confirm against its definition.
        with db_lock:
            rows = db.executeSelectSql(
                "SELECT result FROM dict_tb1 WHERE word=%s", (word, ))
            if rows and rows[0] and rows[0][0]:
                # The word already has an inverted-index entry.
                existing = rows[0][0]
                already_indexed = any(
                    entry.startswith(doc_prefix)
                    for entry in existing.split(","))
                if already_indexed:
                    # This page is already recorded for the word.
                    continue
                db.executeUpdateSql(
                    "UPDATE dict_tb1 SET result=%s WHERE word=%s",
                    (existing + "," + posting, word))
            else:
                # No usable inverted-index entry for the word yet.
                db.executeUpdateSql(
                    "INSERT INTO dict_tb1 (word,result) VALUES(%s,%s)",
                    (word, posting))
    print("向数据库插入一个word_dict:" + str(id_))
def parseOnePageHtml(db, id):
    """Resolve the outgoing links of page ``id`` and store them in ``url_tb``.

    Loads the page's ``url`` and ``content``, extracts its links with
    ``parseHrefFromHtml``, looks up the ``url_tb`` id of every link, and
    writes the ids found, comma-separated, to the page's ``pointing``
    column. If the page has no url or content, an empty string is written.

    Args:
        db: Database helper exposing ``executeSelectSql`` and
            ``executeUpdateSql`` (both called as ``(sql, params)``).
        id: Id of the page to process (name kept for caller compatibility
            although it shadows the builtin).
    """
    print("%s" % (id))

    # ``with`` releases the lock even when a query raises, so a failing
    # query can no longer leave the lock held.
    # NOTE(review): assumes db_lock supports the context-manager protocol
    # (e.g. threading.Lock) -- confirm against its definition.
    with db_lock:
        content_rows = db.executeSelectSql(
            "SELECT content from url_tb WHERE id=%s", (id, ))
        url_rows = db.executeSelectSql(
            "SELECT url from url_tb WHERE id=%s", (id, ))

    linked_ids = []
    has_content = content_rows and content_rows[0] and content_rows[0][0]
    has_url = url_rows and url_rows[0] and url_rows[0][0]
    if has_content and has_url:
        url_list = parseHrefFromHtml(url_rows[0][0], content_rows[0][0])
        print(url_list)
        for url in url_list:
            # The lock is taken per lookup rather than around the whole
            # loop, so other users of db_lock can interleave.
            with db_lock:
                id_rows = db.executeSelectSql(
                    "SELECT id FROM url_tb WHERE url=%s", (url, ))
            # Links whose url has no row in url_tb are ignored.
            if id_rows and id_rows[0] and id_rows[0][0]:
                linked_ids.append(str(id_rows[0][0]))

    new_pointing = ','.join(linked_ids)
    print(new_pointing)
    with db_lock:
        db.executeUpdateSql("UPDATE url_tb SET pointing=%s WHERE id=%s",
                            (new_pointing, id))
def buildOnePage(db, id_):
    """Build and store the word index for the page with id ``id_``.

    Loads the page's HTML from ``url_tb``, extracts its title
    (``getTitleFromHtml``) and tag-free text (``rm_tags``), stores both via
    ``saveTitleTextToDb``, counts the tokens that ``jieba.cut_for_search``
    yields for the text, and passes the counts to ``saveIndexToDb``. When
    the page has no content, the counts passed on are empty.

    Args:
        db: Database helper exposing ``executeSelectSql(sql, params)``; it
            is also forwarded to the save helpers.
        id_: Id of the page to index.
    """
    print("开始建立第{0}页的分词索引".format(id_))
    word_dict = {}

    # ``with`` releases the lock even if the query raises.
    # NOTE(review): assumes db_lock supports the context-manager protocol
    # (e.g. threading.Lock) -- confirm against its definition.
    with db_lock:
        rows = db.executeSelectSql(
            "SELECT content FROM url_tb WHERE id=%s", (id_, ))

    if rows and rows[0] and rows[0][0]:
        html = rows[0][0]
        title = getTitleFromHtml(html)
        content = rm_tags(html)
        saveTitleTextToDb(db, content, title, id_)
        for word in jieba.cut_for_search(content):
            # Start at 0 for unseen words. The previous code incremented
            # only when the word was missing, which raised KeyError on the
            # first token of every page.
            word_dict[word] = word_dict.get(word, 0) + 1

    saveIndexToDb(db, id_, word_dict)
    print("单页的分词索引已建立")
def saveTitleTextToDb(db, content, title, id_):
    """Store a page's extracted text and title in its ``url_tb`` row.

    Args:
        db: Database helper exposing ``executeUpdateSql(sql, params)``.
        content: Value written to the ``text`` column.
        title: Value written to the ``title`` column.
        id_: Id of the row to update.
    """
    # Both updates run under one lock acquisition; ``with`` releases the
    # lock even if either statement raises.
    # NOTE(review): assumes db_lock supports the context-manager protocol
    # (e.g. threading.Lock) -- confirm against its definition.
    with db_lock:
        db.executeUpdateSql("UPDATE url_tb SET text=%s WHERE id=%s",
                            (content, id_))
        db.executeUpdateSql("UPDATE url_tb SET title=%s WHERE id=%s",
                            (title, id_))