def get_tf_idf(str_list: list) -> dict:
    """Return the stored word-to-code dictionary, extended with new words.

    The dictionary is read from the ``record`` collection (the document
    whose ``the標題`` field equals ``"tf_idf_dict"``).  When no such document
    is found, a fresh dictionary holding only that marker entry is used.
    Every word of *str_list* that is not yet a key receives the next integer
    code, and the stored document is then replaced by the updated dictionary.

    Args:
        str_list: Words to register.  Any ``.`` in a word is replaced by
            ``*`` before the word is used as a key (presumably because
            MongoDB field names may not contain dots -- TODO confirm).

    Returns:
        The updated dictionary, including the ``the標題`` marker entry.
    """
    with mongodb.Mongodb() as db:
        log(getframeinfo(currentframe()), 'db.search_any("record", "the標題", "tf_idf_dict") started')
        tf_idf = db.search_any("record", "the標題", "tf_idf_dict")
        if tf_idf:
            log(getframeinfo(currentframe()), 'db.search_any("record", "the標題", "tf_idf_dict") finished')
            tf_idf_dict = tf_idf[0]
        else:
            log(getframeinfo(currentframe()), 'db.search_any("record", "the標題", "tf_idf_dict") failed')
            tf_idf_dict = {"the標題": "tf_idf_dict"}

        log(getframeinfo(currentframe()), 'tf_idf_dict add new word started')
        # Codes continue from the current dictionary size; that size also
        # counts non-word entries such as the marker.
        next_code = len(tf_idf_dict)
        for word in str_list:
            # str.replace is a no-op when the word has no dot, so no
            # separate membership test is needed.
            key = word.replace('.', '*')
            if key not in tf_idf_dict:
                tf_idf_dict[key] = next_code
                next_code += 1
        log(getframeinfo(currentframe()), 'tf_idf_dict add new word finished')

        # Replace the stored document: drop the old one, insert the new one.
        # NOTE(review): if db.db is a pymongo Database, Collection.remove is
        # deprecated in favour of delete_many -- confirm before upgrading.
        log(getframeinfo(currentframe()), 'cleaning db record started')
        db.db["record"].remove({"the標題": "tf_idf_dict"})
        log(getframeinfo(currentframe()), 'cleaning db record finished')
        log(getframeinfo(currentframe()), 'insert tf_idf_dict to db started')
        db.insert_one("record", tf_idf_dict)
        log(getframeinfo(currentframe()), 'insert tf_idf_dict to db finished')
        return tf_idf_dict
def go_go_id(start: int, end: int) -> None:
    """Pair raw and segmented articles in a range and update the latter's id.

    Fetches the same window from ``articles`` and ``jie_ba_Articles`` and,
    for each positional pair, calls ``db.update_one_id`` with the segmented
    article's ``_id`` and the raw article's ``_id``.

    Args:
        start: First position of the window passed to ``db.num_articles``.
        end: Last position of the window; ``end - start + 1`` is passed as
            the third argument (presumably an item count -- TODO confirm).

    Raises:
        AssertionError: If the two collections return a different number of
            documents for the window.
    """
    with mongodb.Mongodb() as db:
        log(getframeinfo(currentframe()), 'get all data in "articles"')
        raw_articles_list = db.num_articles("articles", start, end - start + 1)
        log(getframeinfo(currentframe()), 'get all data in "articles" finished')
        log(getframeinfo(currentframe()), 'get all data in "jie_ba_Articles"')
        jie_ba_articles_list = db.num_articles("jie_ba_Articles", start, end - start + 1)
        log(getframeinfo(currentframe()), 'get all data in "jie_ba_Articles" finished')

        total = len(jie_ba_articles_list)
        # Raised explicitly instead of using `assert`, so the check still
        # runs under `python -O`; otherwise zip() below would silently
        # truncate to the shorter list.
        if len(raw_articles_list) != total:
            raise AssertionError(
                'collection "articles"{} and collection "jie_ba_Articles"{} mismatch!'.format(
                    len(raw_articles_list), total))

        log(getframeinfo(currentframe()), 'synthesising result list id started')
        for jieba_article, raw_article in zip(jie_ba_articles_list, raw_articles_list):
            db.update_one_id("jie_ba_Articles", jieba_article["_id"], raw_article["_id"])
        log(getframeinfo(currentframe()), 'synthesising result list id finished')
def interface(search_key: str) -> list:
    """Search both collections by title and merge the results pairwise.

    Args:
        search_key: Title text passed to ``db.search_title`` for the
            ``articles`` and ``jie_ba_Articles`` collections.

    Returns:
        One dict per positional pair of results.  Each dict holds the
        segmented article's fields overlaid with the raw article's fields,
        so on a key collision the raw article's value wins.

    Raises:
        AssertionError: If the two searches return a different number of
            documents.
    """
    with mongodb.Mongodb() as db:
        log(getframeinfo(currentframe()), 'searching title in "articles":', search_key)
        raw_articles_list = db.search_title("articles", search_key)
        log(getframeinfo(currentframe()), 'searching title in "articles":', search_key, 'finished')
        log(getframeinfo(currentframe()), 'searching title in "jie_ba_Articles":', search_key)
        jie_ba_articles_list = db.search_title("jie_ba_Articles", search_key)
        log(getframeinfo(currentframe()), 'searching title in "jie_ba_Articles":', search_key, 'finished')

    total = len(jie_ba_articles_list)
    # Raised explicitly instead of using `assert`, so the check still runs
    # under `python -O`; otherwise zip() below would silently truncate.
    if len(raw_articles_list) != total:
        raise AssertionError(
            'collection "articles"{} and collection "jie_ba_Articles"{} mismatch!'.format(
                len(raw_articles_list), total))

    log(getframeinfo(currentframe()), 'synthesising result list started')
    result_list = []
    pairs = zip(jie_ba_articles_list, raw_articles_list)
    for current_progress, (jieba_article, raw_article) in enumerate(pairs, start=1):
        result_list.append({**jieba_article, **raw_article})
        log(getframeinfo(currentframe()), 'articles ', current_progress, '/', total, ' encoded')
    log(getframeinfo(currentframe()), 'synthesising result list finished')
    return result_list
def get_all_data() -> list:
    """Load every raw and segmented article and merge them pairwise.

    Returns:
        One dict per positional pair of documents from ``jie_ba_Articles``
        and ``articles``.  Each dict holds the segmented article's fields
        overlaid with the raw article's fields, so on a key collision the
        raw article's value wins.

    Raises:
        AssertionError: If the two collections hold a different number of
            documents.
    """
    with mongodb.Mongodb() as db:
        log(getframeinfo(currentframe()), 'get all data in "articles"')
        raw_articles_list = db.db_all("articles")
        log(getframeinfo(currentframe()), 'get all data in "articles" finished')
        log(getframeinfo(currentframe()), 'get all data in "jie_ba_Articles"')
        jie_ba_articles_list = db.db_all("jie_ba_Articles")
        log(getframeinfo(currentframe()), 'get all data in "jie_ba_Articles" finished')

    total = len(jie_ba_articles_list)
    # Raised explicitly instead of using `assert`, so the check still runs
    # under `python -O`; otherwise zip() below would silently truncate.
    if len(raw_articles_list) != total:
        raise AssertionError(
            'collection "articles"{} and collection "jie_ba_Articles"{} mismatch!'.format(
                len(raw_articles_list), total))

    log(getframeinfo(currentframe()), 'synthesising result list started')
    result_list = []
    pairs = zip(jie_ba_articles_list, raw_articles_list)
    for current_progress, (jieba_article, raw_article) in enumerate(pairs, start=1):
        result_list.append({**jieba_article, **raw_article})
        log(getframeinfo(currentframe()), 'articles ', current_progress, '/', total, ' encoded')
    log(getframeinfo(currentframe()), 'synthesising result list finished')
    return result_list
def idf_dict_first_process() -> None:
    """Write the starting idf dictionary to ``idf_dict.txt``.

    Uses the ``idf_dict`` document stored in the ``record`` collection, with
    its ``_id`` field removed, when one exists; otherwise writes a fresh
    dictionary with a zero total and the ``idf_dict`` marker.
    """
    with mongodb.Mongodb() as db:
        stored = db.search_any("record", "the標題", "idf_dict")
        if stored:
            snapshot = stored[0]
            # The database id is dropped before the document is written out.
            del snapshot['_id']
        else:
            snapshot = {"THE總共": 0, "the標題": "idf_dict"}
        crawler.json_write("idf_dict.txt", snapshot)
def up_dict() -> None:
    """Replace the stored tf/idf dictionaries with the ones saved on disk.

    Reads ``tf_dict.txt`` and ``idf_dict.txt``, removes the ``tf_dict`` and
    ``idf_dict`` documents from the ``record`` collection, then inserts the
    two dictionaries that were read.
    """
    tf_dict = crawler.json_read("tf_dict.txt")
    idf_dict = crawler.json_read("idf_dict.txt")
    with mongodb.Mongodb() as db:
        # Both old documents are removed before either new one is inserted.
        for title in ("tf_dict", "idf_dict"):
            db.db["record"].remove({"the標題": title})
        # TODO: the original author flagged these inserts as needing an update.
        for document in (tf_dict, idf_dict):
            db.insert_one("record", document)
def decode(query: list) -> list:
    """Replace the codes in *query* with their words, in place.

    The word-to-code dictionary is read from the ``record`` collection (the
    document whose ``the標題`` field equals ``"tf_idf_dict"``).

    Args:
        query: List of codes.  Entries with no matching code are left
            untouched.

    Returns:
        The same list object, with every recognised code replaced by its
        word.

    Raises:
        IndexError: If no ``tf_idf_dict`` document is stored.
    """
    with mongodb.Mongodb() as db:
        the_dict = db.search_any("record", "the標題", "tf_idf_dict")[0]

    # Build the reverse mapping once.  The document's bookkeeping entries are
    # not vocabulary, so they are skipped.  setdefault keeps the first word
    # seen for a code, matching dictionary order.
    word_by_code = {}
    for word, num in the_dict.items():
        if word in ("_id", "the標題"):
            continue
        word_by_code.setdefault(num, word)

    # Single pass: each entry is decoded at most once, so a decoded word can
    # never be re-matched against a later dictionary value.
    for position, code in enumerate(query):
        try:
            query[position] = word_by_code[code]
        except (KeyError, TypeError):
            # Unknown code, or an unhashable entry: leave it as it is.
            continue
    return query
def go_go_go(num: int) -> None:
    """Segment up to *num* further articles and store the results.

    Processing starts at the index equal to the current number of documents
    in ``jie_ba_Articles`` and stops after *num* articles or at the end of
    ``articles``, whichever comes first.  Each article's ``content`` is
    passed to ``get_JIEBA.get_jie_ba``; the result gets the article's
    ``title`` and is inserted into ``jie_ba_Articles``.

    Args:
        num: Maximum number of articles to process in this call.
    """
    with mongodb.Mongodb() as db:
        source_articles = db.db_all("articles")
        segmented_articles = db.db_all("jie_ba_Articles")

        first = len(segmented_articles)
        available = len(source_articles)
        stop = first + num

        # Denominator shown in the progress line: the last index of this
        # batch when the batch ends before the collection does, otherwise
        # the collection size.
        shown_total = stop - 1 if stop < available else available

        for index in range(first, min(stop, available)):
            article = source_articles[index]
            jie_ba_return = get_JIEBA.get_jie_ba(article["content"])
            jie_ba_return["title"] = article["title"]
            db.insert_one("jie_ba_Articles", jie_ba_return)
            print("{0}/{1} finished!".format(index, shown_total))
def go_go_encode(start: int, end: int) -> None:
    """Encode the segments of a window of segmented articles.

    For every article in the window that has no ``encode`` field yet, each
    entry of its ``segments`` is translated to its code from the stored
    ``tf_idf_dict`` document, and the resulting list is handed to
    ``db.update_one_encode``.

    Args:
        start: First position of the window passed to ``db.num_articles``.
        end: Last position of the window; ``end - start + 1`` is passed as
            the third argument (presumably an item count -- TODO confirm).

    Raises:
        IndexError: If no ``tf_idf_dict`` document is stored.
        KeyError: If a segment has no code in the dictionary.
    """
    with mongodb.Mongodb() as db:
        jie_ba_db_data = db.num_articles("jie_ba_Articles", start, end - start + 1)
        tf_idf_dict = db.search_any("record", "the標題", "tf_idf_dict")[0]

        # Denominator for the progress log.
        # NOTE(review): this is an absolute position while the counter below
        # starts at 1, so the two only line up when start == 0 -- confirm
        # the intended display.
        if len(jie_ba_db_data) < end - start:
            a = start + len(jie_ba_db_data)
        else:
            a = end + 1

        for x, article in enumerate(jie_ba_db_data, start=1):
            if "encode" not in article:
                encode = []
                for word in article["segments"]:
                    # Dictionary keys are stored with '.' replaced by '*',
                    # so fall back to that form when the raw word is not a
                    # key; otherwise any segment containing a dot raises
                    # KeyError.
                    key = word if word in tf_idf_dict else word.replace('.', '*')
                    encode.append(tf_idf_dict[key])
                db.update_one_encode("jie_ba_Articles", article["_id"], encode)
            log(getframeinfo(currentframe()), x, "/", a, " finished!")
def interface(search_key: str) -> list:
    """Search articles by title, attach raw-article fields and encode segments.

    For each result from ``jie_ba_Articles`` the function picks one result
    from ``articles`` (see the matching loop below), copies that article's
    ``author``, ``label``, ``url``, ``date_added`` and ``content`` onto it,
    and finally adds an ``encoded`` list holding the ``tf_idf_dict`` value of
    every segment.

    Args:
        search_key: Title text passed to ``db.search_title`` for both
            collections.

    Returns:
        The list returned by the ``jie_ba_Articles`` search, with each
        element updated in place.
    """
    # NOTE(review): indentation was reconstructed from a whitespace-collapsed
    # source; confirm the nesting of the matching loop against the original.
    with mongodb.Mongodb() as db:
        # Disabled code kept as a bare string literal; it has no runtime effect.
        ''' original_db_data = db.db_all("articles") jie_ba_db_data = db.db_all("jie_ba_Articles") if len(jie_ba_db_data) < len(original_db_data): print('original articles has {0} , jie ba articles has {1}'.format(len(original_db_data), len(jie_ba_db_data))) an = input('press number 0 to update or any key to skip') if an == 0: for i in range(len(jie_ba_db_data), len(original_db_data)): jie_ba_return = get_JIEBA.get_jie_ba(original_db_data[i]["content"]) jie_ba_return["title"] = original_db_data[i]["title"] db.insert_one("jie_ba_Articles", jie_ba_return) print("{0}/{1} finished!".format(i, len(original_db_data))) '''
        log(getframeinfo(currentframe()), 'fetching get_tf_idf')
        tf_idf_dict = get_JIEBA.get_tf_idf()
        log(getframeinfo(currentframe()), 'fetching get_tf_idf finished')
        log(getframeinfo(currentframe()), 'searching title in "articles":', search_key)
        articles_list = db.search_title("articles", search_key)
        log(getframeinfo(currentframe()), 'searching title in "articles":', search_key, 'finished')
        log(getframeinfo(currentframe()), 'searching title in "jie_ba_Articles":', search_key)
        jie_ba_articles_list = db.search_title("jie_ba_Articles", search_key)
        log(getframeinfo(currentframe()), 'searching title in "jie_ba_Articles":', search_key, 'finished')
        log(getframeinfo(currentframe()), 'jie_ba_articles_list processing started')
        for a in jie_ba_articles_list:
            # Narrow the candidate raw articles (indices into articles_list)
            # one segment at a time until at most one candidate is left.
            w_num = 0  # index of the segment currently used for matching
            count = 99  # smallest match position seen so far for this segment
            temp = list(range(0, len(articles_list)))
            while len(temp) > 1:
                # Raises IndexError once w_num runs past the last segment.
                w2 = a["segments"][w_num]
                # NOTE(review): temp is modified (del) and rebound (slice)
                # while this loop iterates over it, so some candidates may be
                # skipped -- confirm this is intended.
                for ind in temp:
                    if articles_list[int(ind)]["content"].find(w2) == -1:
                        # Segment not present in this candidate: drop it.
                        del temp[temp.index(ind)]
                    elif articles_list[int(ind)]["content"].find(w2) > count:
                        # Segment appears later than the best position so far.
                        del temp[temp.index(ind)]
                    elif articles_list[int(ind)]["content"].find(w2) < count:
                        # New best position: remember it and discard the
                        # candidates listed before this one.
                        count = articles_list[int(ind)]["content"].find(w2)
                        temp = temp[temp.index(ind):]
                w_num += 1
                count = 99
            # Raises IndexError if every candidate was eliminated.
            a["author"] = articles_list[int(temp[0])]["author"]
            a["label"] = articles_list[int(temp[0])]["label"]
            a["url"] = articles_list[int(temp[0])]["url"]
            a["date_added"] = articles_list[int(temp[0])]["date_added"]
            a["content"] = articles_list[int(temp[0])]["content"]
        log(getframeinfo(currentframe()), 'jie_ba_articles_list processing finished')
        log(getframeinfo(currentframe()), 'tf_idf_dict synthesising started')
        x = 1  # 1-based progress counter for the message below
        for article in jie_ba_articles_list:
            encode = []
            for word in article["segments"]:
                # Raises KeyError when a segment is missing from tf_idf_dict.
                encode.append(tf_idf_dict[word])
            article["encoded"] = encode
            print("{0}/{1} encoded".format(x, len(jie_ba_articles_list)))
            x += 1
        log(getframeinfo(currentframe()), 'tf_idf_dict synthesising finished')
        return jie_ba_articles_list
def get_tf_idf() -> dict:
    """Return the word-to-rank dictionary, building and storing it if needed.

    When a ``tf_idf_dict`` document already exists in the ``record``
    collection it is returned as is.  Otherwise every document of
    ``jie_ba_Articles`` is scanned and each word of its ``segments`` is
    scored as::

        log10(occurrences) * (1 - articles_containing_word / (articles + 1))

    Words are ranked by descending score (rank 1 is the highest) and the
    resulting dictionary is written to ``tf_idf_dict.txt`` and stored in the
    ``record`` collection, replacing any previous ``tf_idf_dict`` document.

    Returns:
        A dict mapping each word to its rank, plus the marker entry
        ``"the標題": "tf_idf_dict"``.
    """
    # Keys with a special meaning in the stored dictionaries; never scored.
    reserved = ("THE總共", "the標題")

    with mongodb.Mongodb() as db:
        log(getframeinfo(currentframe()), 'db.search_any("record", "the標題", "tf_idf_dict") started')
        tf_idf = db.search_any("record", "the標題", "tf_idf_dict")
        if tf_idf:
            log(getframeinfo(currentframe()), 'db.search_any("record", "the標題", "tf_idf_dict") finished')
            # search_any returns a list of documents; callers index the
            # result by word, so hand back the document itself.
            return tf_idf[0]
        log(getframeinfo(currentframe()), 'db.search_any("record", "the標題", "tf_idf_dict") failed')

        log(getframeinfo(currentframe()), 'db.db_all("jie_ba_Articles") started')
        jie_ba_articles_list = db.db_all("jie_ba_Articles")
        log(getframeinfo(currentframe()), 'db.db_all("jie_ba_Articles") finished')

        tf_dict = {}        # word -> total number of occurrences
        idf_dict = {}       # word -> number of articles containing the word
        total_articles = 0
        log(getframeinfo(currentframe()), 'jie_ba_articles_list processing started')
        for one_article in jie_ba_articles_list:
            jie_ba_word_list = one_article["segments"]
            for word in jie_ba_word_list:
                tf_dict[word] = tf_dict.get(word, 0) + 1
            # A set makes the per-article "seen" test O(1) per word.
            for word in set(jie_ba_word_list):
                idf_dict[word] = idf_dict.get(word, 0) + 1
            total_articles += 1
        log(getframeinfo(currentframe()), 'jie_ba_articles_list processing finished')

        log(getframeinfo(currentframe()), 'idf_dict processing started')
        idf_weight = {
            word: 1 - (article_count / (total_articles + 1))
            for word, article_count in idf_dict.items()
        }
        log(getframeinfo(currentframe()), 'idf_dict processing finished')

        log(getframeinfo(currentframe()), 'tf_dict processing started')
        scores = {
            word: math.log10(occurrences) * idf_weight[word]
            for word, occurrences in tf_dict.items()
            if word not in reserved
        }
        log(getframeinfo(currentframe()), 'tf_dict processing finished')

        # Only the numeric scores are sorted.  Sorting them together with the
        # string-valued marker entry raises TypeError on Python 3.
        ranked_words = sorted(scores, key=scores.get, reverse=True)

        log(getframeinfo(currentframe()), 'tf_idf_list processing started')
        # The marker is set once and never overwritten by a rank, so the
        # document stays findable by its "the標題" field.
        tf_idf_dict = {"the標題": "tf_idf_dict"}
        for rank, word in enumerate(ranked_words, start=1):
            tf_idf_dict[word] = rank
        log(getframeinfo(currentframe()), 'tf_idf_list processing finished')

        # NOTE(review): words containing '.' are stored unchanged here --
        # confirm the database accepts such field names.
        log(getframeinfo(currentframe()), 'writing tf_idf_dict.txt to local disk started')
        crawler.json_write("tf_idf_dict.txt", tf_idf_dict)
        log(getframeinfo(currentframe()), 'writing tf_idf_dict.txt to local disk finished')

        log(getframeinfo(currentframe()), 'cleaning db record started')
        db.db["record"].remove({"the標題": "tf_idf_dict"})
        log(getframeinfo(currentframe()), 'cleaning db record finished')
        log(getframeinfo(currentframe()), 'insert tf_idf_dict to db started')
        db.insert_one("record", tf_idf_dict)
        log(getframeinfo(currentframe()), 'insert tf_idf_dict to db finished')
        return tf_idf_dict