Ejemplo n.º 1
0
def get_tf_idf(str_list: list)->dict:
    """Extend the stored ``tf_idf_dict`` record with unseen words and save it.

    The record is looked up in the ``record`` collection; when nothing is
    found a fresh dictionary holding only the title marker is used.  Every
    word of ``str_list`` that is not yet a key receives the next free
    integer, counting up from the dictionary's size at load time.  Any ``.``
    in a word is replaced by ``*`` before the word is used as a key.

    Args:
        str_list: Words to register.

    Returns:
        The updated dictionary, which has also been re-inserted into the
        ``record`` collection.
    """
    # NOTE(review): this value is never read below.
    frameinfo = getframeinfo(currentframe())

    with mongodb.Mongodb() as db:

        log(getframeinfo(currentframe()), 'db.search_any("record", "the標題", "tf_idf_dict") started')
        found = db.search_any("record", "the標題", "tf_idf_dict")
        if not found:
            log(getframeinfo(currentframe()), 'db.search_any("record", "the標題", "tf_idf_dict") failed')
            tf_idf_dict = {"the標題": "tf_idf_dict"}
        else:
            log(getframeinfo(currentframe()), 'db.search_any("record", "the標題", "tf_idf_dict") finished')
            tf_idf_dict = found[0]

        log(getframeinfo(currentframe()), 'tf_idf_dict add new word started')
        next_index = len(tf_idf_dict)
        for word in str_list:
            key = word.replace('.', '*') if '.' in word else word
            if key in tf_idf_dict:
                continue
            tf_idf_dict[key] = next_index
            next_index += 1
        log(getframeinfo(currentframe()), 'tf_idf_dict add new word finished')

        # Replace the stored record: drop the old one, then insert the new one.
        log(getframeinfo(currentframe()), 'cleaning db record started')
        db.db["record"].remove({"the標題": "tf_idf_dict"})
        log(getframeinfo(currentframe()), 'cleaning db record finished')

        log(getframeinfo(currentframe()), 'insert tf_idf_dict to db started')
        db.insert_one("record", tf_idf_dict)
        log(getframeinfo(currentframe()), 'insert tf_idf_dict to db finished')

        return tf_idf_dict
Ejemplo n.º 2
0
def go_go_id(start: int, end: int) -> None:
    """Pair raw and segmented articles by position and sync their ids.

    Both ``articles`` and ``jie_ba_Articles`` are read through
    ``db.num_articles`` with the arguments ``start`` and ``end - start + 1``.
    The two result lists are then walked in lockstep and
    ``db.update_one_id`` is called for every pair with the segmented
    article's ``_id`` and the raw article's ``_id``.

    Args:
        start: First argument forwarded to ``db.num_articles``
            (presumably the offset of the first article -- confirm against
            the ``Mongodb`` helper).
        end: Used to derive the requested amount ``end - start + 1``.

    Raises:
        AssertionError: If the two collections return a different number of
            documents.
    """
    with mongodb.Mongodb() as db:
        log(getframeinfo(currentframe()), 'get all data in "articles"')
        raw_articles_list = db.num_articles("articles", start, end - start + 1)
        log(getframeinfo(currentframe()),
            'get all data in "articles" finished')

        log(getframeinfo(currentframe()), 'get all data in "jie_ba_Articles"')
        jie_ba_articles_list = db.num_articles("jie_ba_Articles", start,
                                               end - start + 1)
        log(getframeinfo(currentframe()),
            'get all data in "jie_ba_Articles" finished')

        total = len(jie_ba_articles_list)
        assert len(raw_articles_list) == total, \
            'collection "articles"{} and collection "jie_ba_Articles"{} mismatch!'.format(len(raw_articles_list), total)

        # The updates must run while the context manager is still active;
        # once the ``with`` block exits, ``db`` is outside its managed scope.
        log(getframeinfo(currentframe()), 'synthesising result list id started')
        for jieba_article, raw_article in zip(jie_ba_articles_list,
                                              raw_articles_list):
            db.update_one_id("jie_ba_Articles", jieba_article["_id"],
                             raw_article["_id"])
        log(getframeinfo(currentframe()), 'synthesising result list id finished')
Ejemplo n.º 3
0
def interface(search_key: str) -> list:
    """Search both collections by title and merge the hits pairwise.

    ``db.search_title`` is called for ``articles`` and ``jie_ba_Articles``
    with ``search_key``.  The two result lists are zipped by position and
    each pair is merged into one dictionary; on a key clash the value from
    the ``articles`` document wins because it is unpacked last.

    Args:
        search_key: Value forwarded to ``db.search_title``.

    Returns:
        A list with one merged dictionary per article pair.

    Raises:
        AssertionError: If the two searches return a different number of
            documents.
    """
    raw_hits = []
    segmented_hits = []
    with mongodb.Mongodb() as db:
        log(getframeinfo(currentframe()), 'searching title in "articles":',
            search_key)
        raw_hits = db.search_title("articles", search_key)
        log(getframeinfo(currentframe()), 'searching title in "articles":',
            search_key, 'finished')

        log(getframeinfo(currentframe()),
            'searching title in "jie_ba_Articles":', search_key)
        segmented_hits = db.search_title("jie_ba_Articles", search_key)
        log(getframeinfo(currentframe()),
            'searching title in "jie_ba_Articles":', search_key, 'finished')

    merged = []
    total = len(segmented_hits)
    assert len(raw_hits) == total, \
        'collection "articles"{} and collection "jie_ba_Articles"{} mismatch!'.format(len(raw_hits), total)

    log(getframeinfo(currentframe()), 'synthesising result list started')
    for done, (segmented, raw) in enumerate(zip(segmented_hits, raw_hits), 1):
        merged.append({**segmented, **raw})
        log(getframeinfo(currentframe()), 'articles ', done, '/',
            total, ' encoded')
    log(getframeinfo(currentframe()), 'synthesising result list finished')

    return merged
Ejemplo n.º 4
0
def get_all_data() -> list:
    """Read both collections in full and merge their documents pairwise.

    ``db.db_all`` is called for ``articles`` and ``jie_ba_Articles``.  The
    two result lists are zipped by position and each pair is merged into a
    single dictionary; on a key clash the value from the ``articles``
    document wins because it is unpacked last.

    Returns:
        A list with one merged dictionary per article pair.

    Raises:
        AssertionError: If the two collections return a different number of
            documents.
    """
    raw_docs = []
    segmented_docs = []
    with mongodb.Mongodb() as db:
        log(getframeinfo(currentframe()), 'get all data in "articles"')
        raw_docs = db.db_all("articles")
        log(getframeinfo(currentframe()),
            'get all data in "articles" finished')

        log(getframeinfo(currentframe()), 'get all data in "jie_ba_Articles"')
        segmented_docs = db.db_all("jie_ba_Articles")
        log(getframeinfo(currentframe()),
            'get all data in "jie_ba_Articles" finished')

    combined = []
    total = len(segmented_docs)
    assert len(raw_docs) == total,\
        'collection "articles"{} and collection "jie_ba_Articles"{} mismatch!'.format(len(raw_docs), total)

    log(getframeinfo(currentframe()), 'synthesising result list started')
    pairs = zip(segmented_docs, raw_docs)
    for position, (segmented, raw) in enumerate(pairs, 1):
        combined.append({**segmented, **raw})
        log(getframeinfo(currentframe()), 'articles ', position, '/',
            total, ' encoded')
    log(getframeinfo(currentframe()), 'synthesising result list finished')

    return combined
Ejemplo n.º 5
0
def idf_dict_first_process() -> None:
    """Write the ``idf_dict`` record to ``idf_dict.txt``.

    The record is looked up in the ``record`` collection.  When it exists,
    its ``_id`` entry is deleted and the remainder is written; otherwise a
    fresh dictionary with a zero total and the title marker is written.
    """
    with mongodb.Mongodb() as db:
        found = db.search_any("record", "the標題", "idf_dict")
        if found:
            payload = found[0]
            # Drop the database id before the record is written to disk.
            del payload['_id']
        else:
            payload = {"THE總共": 0, "the標題": "idf_dict"}
        crawler.json_write("idf_dict.txt", payload)
Ejemplo n.º 6
0
def up_dict() -> None:
    """Replace the ``tf_dict`` and ``idf_dict`` records with the local files.

    Both dictionaries are read from ``tf_dict.txt`` and ``idf_dict.txt``
    through ``crawler.json_read``.  The matching records are then removed
    from the ``record`` collection and the freshly read dictionaries are
    inserted in their place.
    """
    tf_dict = crawler.json_read("tf_dict.txt")
    idf_dict = crawler.json_read("idf_dict.txt")

    with mongodb.Mongodb() as db:
        # Both old records are removed before either new one is inserted.
        for title in ("tf_dict", "idf_dict"):
            db.db["record"].remove({"the標題": title})
        # TODO: the original author flagged insert_one here as a function
        # that needs updating.
        for payload in (tf_dict, idf_dict):
            db.insert_one("record", payload)
Ejemplo n.º 7
0
def decode(query: list) -> list:
    """Translate encoded values in ``query`` back to their dictionary keys.

    The first ``tf_idf_dict`` record is loaded from the ``record``
    collection.  For every ``key -> value`` entry of that record, each
    element of ``query`` equal to the value is overwritten with the key.

    Args:
        query: List of encoded values; it is modified in place.

    Returns:
        The same ``query`` list object after the replacements.
    """
    with mongodb.Mongodb() as db:

        lookup = db.search_any("record", "the標題", "tf_idf_dict")[0]

        for word, num in lookup.items():
            for position, item in enumerate(query):
                if item == num:
                    query[position] = word

        return query
Ejemplo n.º 8
0
def go_go_go(num: int) -> None:
    """Segment up to ``num`` further articles and store the results.

    Processing starts at the index equal to the current number of documents
    in ``jie_ba_Articles`` and stops after ``num`` articles or at the end of
    ``articles``, whichever comes first.  Each article's ``content`` is
    passed to ``get_JIEBA.get_jie_ba``; the result gets the article's
    ``title`` and is inserted into ``jie_ba_Articles``.

    Args:
        num: Maximum number of articles to process in this call.
    """
    with mongodb.Mongodb() as db:

        source_docs = db.db_all("articles")
        segmented_docs = db.db_all("jie_ba_Articles")

        source_count = len(source_docs)
        first = len(segmented_docs)
        last = first + num

        # Denominator shown in the progress line only.
        shown_total = last - 1 if last < source_count else source_count

        for i in range(first, min(last, source_count)):
            document = source_docs[i]
            jie_ba_return = get_JIEBA.get_jie_ba(document["content"])
            jie_ba_return["title"] = document["title"]
            db.insert_one("jie_ba_Articles", jie_ba_return)
            print("{0}/{1} finished!".format(i, shown_total))
Ejemplo n.º 9
0
def go_go_encode(start: int, end: int) -> None:
    """Encode the segments of a range of segmented articles.

    Articles are read from ``jie_ba_Articles`` through ``db.num_articles``
    with the arguments ``start`` and ``end - start + 1``.  Every article
    that has no ``encode`` key gets its ``segments`` translated through the
    stored ``tf_idf_dict`` record, and the resulting list is saved with
    ``db.update_one_encode``.

    Args:
        start: First argument forwarded to ``db.num_articles``
            (presumably the offset of the first article -- confirm against
            the ``Mongodb`` helper).
        end: Used to derive the requested amount ``end - start + 1``.

    Raises:
        IndexError: If no ``tf_idf_dict`` record is stored.
        KeyError: If a segment is missing from the dictionary.
    """
    with mongodb.Mongodb() as db:

        jie_ba_db_data = db.num_articles("jie_ba_Articles", start,
                                         end - start + 1)
        tf_idf_dict = db.search_any("record", "the標題", "tf_idf_dict")[0]

        # Denominator shown in the progress line only.
        if len(jie_ba_db_data) < end - start:
            a = start + len(jie_ba_db_data)
        else:
            a = end + 1

        for x, article in enumerate(jie_ba_db_data, 1):
            if "encode" not in article:
                encode = []
                for word in article["segments"]:
                    # The dictionary builder stores words containing '.'
                    # under a key with '.' replaced by '*', so fall back to
                    # that spelling when the raw word is not a key.
                    key = word if word in tf_idf_dict else word.replace('.', '*')
                    encode.append(tf_idf_dict[key])
                db.update_one_encode("jie_ba_Articles", article["_id"], encode)
            log(getframeinfo(currentframe()), x, "/", a, " finished!")
Ejemplo n.º 10
0
def interface(search_key: str) -> list:
    """Search by title, attach raw-article fields and encode the segments.

    Steps, all performed inside one ``mongodb.Mongodb()`` context:

    1. Fetch the dictionary from ``get_JIEBA.get_tf_idf()``.
    2. Run ``db.search_title`` on ``articles`` and ``jie_ba_Articles``.
    3. For every segmented article, narrow the raw articles down to one
       candidate by testing where successive segments occur in each raw
       article's ``content``, then copy ``author``, ``label``, ``url``,
       ``date_added`` and ``content`` from that candidate.
    4. Add an ``encoded`` list to every segmented article by looking each
       segment up in the dictionary.

    Args:
        search_key: Value forwarded to ``db.search_title``.

    Returns:
        The list of segmented articles, enriched in place.
    """

    with mongodb.Mongodb() as db:
        # The string literal below is disabled code kept as a bare
        # expression statement; it has no effect at runtime.
        '''
        original_db_data = db.db_all("articles")
        jie_ba_db_data = db.db_all("jie_ba_Articles")

        if len(jie_ba_db_data) < len(original_db_data):
            print('original articles has {0} , jie ba articles has {1}'.format(len(original_db_data),
                                                                               len(jie_ba_db_data)))
            an = input('press number 0 to update or any key to skip')

            if an == 0:
                for i in range(len(jie_ba_db_data), len(original_db_data)):
                    jie_ba_return = get_JIEBA.get_jie_ba(original_db_data[i]["content"])
                    jie_ba_return["title"] = original_db_data[i]["title"]
                    db.insert_one("jie_ba_Articles", jie_ba_return)
                    print("{0}/{1} finished!".format(i, len(original_db_data)))
        '''
        log(getframeinfo(currentframe()), 'fetching get_tf_idf')
        # NOTE(review): the result is indexed by word further down, so it is
        # expected to be a mapping -- confirm what get_tf_idf returns.
        tf_idf_dict = get_JIEBA.get_tf_idf()
        log(getframeinfo(currentframe()), 'fetching get_tf_idf finished')

        log(getframeinfo(currentframe()), 'searching title in "articles":',
            search_key)
        articles_list = db.search_title("articles", search_key)
        log(getframeinfo(currentframe()), 'searching title in "articles":',
            search_key, 'finished')

        log(getframeinfo(currentframe()),
            'searching title in "jie_ba_Articles":', search_key)
        jie_ba_articles_list = db.search_title("jie_ba_Articles", search_key)
        log(getframeinfo(currentframe()),
            'searching title in "jie_ba_Articles":', search_key, 'finished')

        log(getframeinfo(currentframe()),
            'jie_ba_articles_list processing started')
        for a in jie_ba_articles_list:
            # w_num: index of the segment currently used for matching.
            # count: smallest match position seen so far in this pass,
            #        reset to 99 for every new segment.
            # temp:  indices into articles_list that are still candidates.
            w_num = 0
            count = 99
            temp = list(range(0, len(articles_list)))
            while len(temp) > 1:
                # NOTE(review): raises IndexError if the segments run out
                # before the candidates are reduced to one.
                w2 = a["segments"][w_num]
                # NOTE(review): temp is modified (and rebound) inside this
                # loop while it is being iterated.
                for ind in temp:
                    if articles_list[int(ind)]["content"].find(w2) == -1:
                        # Segment absent from this article: drop it.
                        del temp[temp.index(ind)]
                    elif articles_list[int(ind)]["content"].find(w2) > count:
                        # Segment occurs later than the best position: drop it.
                        del temp[temp.index(ind)]
                    elif articles_list[int(ind)]["content"].find(w2) < count:
                        # New earliest position: remember it and discard the
                        # candidates listed before this one.
                        count = articles_list[int(ind)]["content"].find(w2)
                        temp = temp[temp.index(ind):]
                w_num += 1
                count = 99

            # NOTE(review): temp[0] raises IndexError if no candidate is left.
            a["author"] = articles_list[int(temp[0])]["author"]
            a["label"] = articles_list[int(temp[0])]["label"]
            a["url"] = articles_list[int(temp[0])]["url"]
            a["date_added"] = articles_list[int(temp[0])]["date_added"]
            a["content"] = articles_list[int(temp[0])]["content"]
        log(getframeinfo(currentframe()),
            'jie_ba_articles_list processing finished')

        log(getframeinfo(currentframe()), 'tf_idf_dict synthesising started')
        # x is the 1-based progress counter printed after each article.
        x = 1
        for article in jie_ba_articles_list:
            encode = []
            for word in article["segments"]:
                encode.append(tf_idf_dict[word])
            article["encoded"] = encode
            print("{0}/{1} encoded".format(x, len(jie_ba_articles_list)))
            x += 1
        log(getframeinfo(currentframe()), 'tf_idf_dict synthesising finished')

        return jie_ba_articles_list
Ejemplo n.º 11
0
def get_tf_idf() -> dict:
    """Return the word-rank dictionary, building and storing it if absent.

    When a ``tf_idf_dict`` record already exists in the ``record``
    collection, that record is returned.  Otherwise every document of
    ``jie_ba_Articles`` is scanned and, for each word in its ``segments``:

    * the term count is the number of occurrences over all articles;
    * the document weight is ``1 - df / (N + 1)``, where ``df`` is the
      number of articles containing the word and ``N`` the article count;
    * the score is ``log10(term count) * document weight``.

    Words are ranked by descending score, starting at 1.  The resulting
    dictionary (title marker plus ``word -> rank``) is written to
    ``tf_idf_dict.txt`` and inserted into the ``record`` collection.

    Returns:
        The stored record, or the newly built dictionary.
    """
    with mongodb.Mongodb() as db:

        log(getframeinfo(currentframe()),
            'db.search_any("record", "the標題", "tf_idf_dict") started')
        tf_idf = db.search_any("record", "the標題", "tf_idf_dict")
        if tf_idf:
            log(getframeinfo(currentframe()),
                'db.search_any("record", "the標題", "tf_idf_dict") finished')
            # search_any yields a sequence of matches; return the record
            # itself so the result matches the declared ``dict`` type.
            return tf_idf[0]
        log(getframeinfo(currentframe()),
            'db.search_any("record", "the標題", "tf_idf_dict") failed')

        tf_dict = {"THE總共": 0}
        idf_dict = {"THE總共": 0}
        log(getframeinfo(currentframe()),
            'db.db_all("jie_ba_Articles") started')
        jie_ba_articles_list = db.db_all("jie_ba_Articles")
        log(getframeinfo(currentframe()),
            'db.db_all("jie_ba_Articles") finished')

        log(getframeinfo(currentframe()),
            'jie_ba_articles_list processing started')
        for one_articles in jie_ba_articles_list:
            jie_ba_word_list = one_articles["segments"]

            # Occurrence count per word and total number of words.
            for word in jie_ba_word_list:
                tf_dict[word] = tf_dict.get(word, 0) + 1
                tf_dict["THE總共"] += 1

            # Number of articles each word appears in; a set gives the
            # distinct words without a linear membership scan.
            for w in set(jie_ba_word_list):
                idf_dict[w] = idf_dict.get(w, 0) + 1
            idf_dict["THE總共"] += 1
        log(getframeinfo(currentframe()),
            'jie_ba_articles_list processing finished')

        log(getframeinfo(currentframe()), 'idf_dict processing started')
        for i in idf_dict:
            if i != "THE總共" and i != "the標題":
                idf_dict[i] = 1 - (idf_dict[i] / (idf_dict["THE總共"] + 1))
        log(getframeinfo(currentframe()), 'idf_dict processing finished')

        # Scores are kept apart from the result dictionary so that sorting
        # only ever compares numbers (the title marker is a string).
        log(getframeinfo(currentframe()), 'tf_dict processing started')
        scores = {}
        for i in tf_dict:
            if i != "THE總共" and i != "the標題":
                scores[i] = math.log10(tf_dict[i]) * idf_dict[i]
        log(getframeinfo(currentframe()), 'tf_dict processing finished')

        tf_idf_list = sorted(scores.items(),
                             key=operator.itemgetter(1),
                             reverse=True)

        log(getframeinfo(currentframe()), 'tf_idf_list processing started')
        tf_idf_dict = {"the標題": "tf_idf_dict"}
        for rank, (key, _score) in enumerate(tf_idf_list, 1):
            tf_idf_dict[key] = rank
        log(getframeinfo(currentframe()), 'tf_idf_list processing finished')

        log(getframeinfo(currentframe()),
            'writing tf_idf_dict.txt to local disk started')
        crawler.json_write("tf_idf_dict.txt", tf_idf_dict)
        log(getframeinfo(currentframe()),
            'writing tf_idf_dict.txt to local disk finished')

        log(getframeinfo(currentframe()), 'cleaning db record started')
        db.db["record"].remove({"the標題": "tf_idf_dict"})
        log(getframeinfo(currentframe()), 'cleaning db record finished')

        log(getframeinfo(currentframe()), 'insert tf_idf_dict to db started')
        db.insert_one("record", tf_idf_dict)
        log(getframeinfo(currentframe()), 'insert tf_idf_dict to db finished')

        return tf_idf_dict