# Example no. 1
# 0
def read_mongo(collection, *args):
    """Read at most ``fetch_num`` records from a collection and clean their text.

    Each document is projected to ``article_id``, ``userId`` and ``text``.
    Documents whose text is longer than ``text_len`` are passed through
    ``data_preprocessing.extract_chinese`` and collected; all other documents
    are skipped but still count towards ``fetch_num``.

    Args:
        collection: Database collection object exposing
            ``find(filter, projection)`` that returns an iterable of dicts.
        *args: Positional settings:
            args[0] -> text_len: a record is kept only if its text is
                strictly longer than this.
            args[1] -> fetch_num: maximum number of records to read.

    Example:
        read_mongo(collection, 3, 100000)

    Returns:
        list: ``[article_id, userId, cleaned_text]`` entries. A list is
        always returned, including when the collection holds fewer than
        ``fetch_num`` records or ``fetch_num`` is not positive.

    Raises:
        IndexError: if fewer than two positional settings are supplied.
    """
    text_len = args[0]
    fetch_num = args[1]

    v_data = []
    if fetch_num <= 0:
        return v_data

    # Progress is reported roughly every 10% of fetch_num; clamp to 1 so a
    # fetch_num below 10 does not yield a zero modulus.
    report_every = max(1, int(fetch_num / 10.0))

    cursor = collection.find(
        {}, {"article_id": 1, "userId": 1, "text": 1, "_id": 0})

    for index, item in enumerate(cursor):
        # A document without a "text" field is treated like a too-short one.
        text = item.get("text") if item else None

        if text is not None and len(text) > text_len:
            # NOTE(review): this passes UTF-8 bytes to extract_chinese while
            # read_mongo_base_keyword passes str; confirm which one the
            # helper expects.
            text_temp = data_preprocessing.extract_chinese(
                text.encode("utf8"))
            v_data.append([item["article_id"], item["userId"], text_temp])

        if index % report_every == 0:
            print("已经读取{}条数据".format(index))

        # Stop once exactly fetch_num records have been read, without
        # pulling an extra one from the cursor.
        if index + 1 >= fetch_num:
            break

    return v_data
# Example no. 2
# 0
def read_mongo_base_keyword(collection, *args):
    """Collect the cleaned text of every record whose text matches a keyword.

    The keyword is sent to the collection as a ``$regex`` filter on the
    ``text`` field. Matching records whose text is strictly longer than
    ``text_len`` are cleaned with ``data_preprocessing.extract_chinese``.

    Args:
        collection: Database collection object exposing ``find(filter)``.
        *args: Positional settings:
            args[0] -> text_len: a record is kept only if its text is
                strictly longer than this.
            args[1] -> keyword: pattern used for the ``$regex`` search.

    Example:
        read_mongo_base_keyword(collection, 3, "酸奶")

    Returns:
        dict: maps ``(article_id, userId)`` to the cleaned text. A later
        record with the same key pair overwrites an earlier one.
    """
    matches = collection.find({'text': {'$regex': args[1]}})
    min_len = args[0]

    cleaned_by_ids = {}
    for position, record in enumerate(matches):
        is_long_enough = len(record["text"]) > min_len
        if is_long_enough and record:
            key = (record["article_id"], record["userId"])
            cleaned_by_ids[key] = data_preprocessing.extract_chinese(
                record["text"])

        # Progress report on every tenth record, starting with the first.
        if position % 10 == 0:
            print("已经读取{}条数据".format(position))

    return cleaned_by_ids