Esempio n. 1
0
def run_result():
    """Build this month's per-SKU result collection in the "jingdong" database.

    The work follows the progress messages printed along the way:

    1. Average the cleaned price per SKU over the selected ``jdprice*`` tables.
    2. Load the latest ``month*`` result collection as a fallback source.
    3. Load category / brand / ziying attributes from the ``jdskuid*`` tables.
    4. Attach comment counts from the ``jdcomment*`` tables and pick a price
       and attribute source for every SKU that has a comment row.
    5. Carry over SKUs that exist in last month's result but got no ``type``.
    6. Write every record that has a ``cate_id`` to ``month<next month>`` in
       batches, then create the ``jdprice<current_date>_sep`` collection.

    Every written record carries a ``type`` code telling where it came from:

    ==== ======================= ==============================
    type attributes from         price from
    ==== ======================= ==============================
    0    sku tables              entry already in ``price_dic``
    1    sku tables              last month's result
    2    sku tables              default price
    3    last month's result     entry already in ``price_dic``
    4    last month's result     last month's result
    6    defaults                default price
    7    defaults (step 5)       last month's result
    8    last month (step 5)     last month's result
    ==== ======================= ==============================

    Type 5 of the earlier revision sat in a branch that could never run and
    is therefore gone.

    The function takes no arguments and returns nothing.  It relies on the
    names ``op``, ``clean_price``, ``format_cat_id``, ``timeUtil`` and
    ``current_date`` being available from the enclosing module.
    """
    default_price = 79.90
    with op.DBManger() as m:
        # ---- step 1: average cleaned price per SKU -------------------------
        pipeline = [{
            "$match": {
                "_status": 0
            },
        }]
        price_dic = {}
        # The table filters below are pinned to specific dates.  The generic
        # variants that were used before are kept for reference:
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdprice20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdprice(20\d\d\d\d\d\d)$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdprice20201209_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdprice(20210129)$"
                }}):
            # Only tables that sort after the last "_sep" collection are read.
            if last_sep and table <= last_sep:
                continue
            print("step 1: processing {}".format(table), flush=True)
            for item in m.read_from(db_collect=("jingdong", table),
                                    pipeline=pipeline):
                sid = int(item["id"])
                price = clean_price(item)
                entry = price_dic.get(sid)
                if entry is None:
                    price_dic[sid] = {"prices": (1, price)}
                else:
                    # "prices" is a running (count, sum) pair.
                    count, total = entry["prices"]
                    entry["prices"] = (count + 1, total + price)
        for entry in price_dic.values():
            count, total = entry.pop("prices")
            entry["clean_price"] = round(total / count, 2)
        # NOTE(review): result_dic is the SAME dict object as price_dic, so
        # every entry added to result_dic in step 4 also becomes visible through
        # price_dic.  Kept as-is because the type codes depend on it; confirm
        # whether a copy was intended.
        result_dic = price_dic

        # ---- step 2: SKUs of the latest monthly result ---------------------
        last_month_skuids = {}
        last_result = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^month20\d\d\d\d$"
            }})
        print("step 2: processing {}".format(last_result), flush=True)
        # The last six characters of the collection name are the month stamp.
        last_month = last_result[-6:]
        for skuid, comments, price, cate_id, brand_id, ziying in m.read_from(
                db_collect=("jingdong", last_result),
                out_field=("skuid", "comments", "clean_price", "cate_id",
                           "brand_id", "ziying")):
            if cate_id:
                last_month_skuids[int(skuid)] = {
                    "clean_price": price,
                    "comments": comments,
                    "cate_id": format_cat_id(cate_id),
                    "brand_id": brand_id,
                    "ziying": ziying
                }

        # ---- step 3: category / brand / ziying per SKU ---------------------
        skuid_sukid_dict = {}
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdskuid20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdskuid(20\d\d\d\d\d\d)retry\d*$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdskuid20201214_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdskuid(20210108)retry\d*$"
                }}):
            if last_sep and table <= last_sep:
                continue
            print("step 3: processing {}".format(table), flush=True)
            pipeline = [
                {
                    "$match": {
                        "_status": 0
                    }
                },
                {
                    "$project": {
                        "skuid": "$skuid",
                        "cate_id": "$cate_id",
                        "brand_id": "$brand_id",
                        "ziying": "$ziying",
                    }
                },
            ]
            for skuid, cate_id, brand_id, ziying in m.read_from(
                    db_collect=("jingdong", table),
                    out_field=("skuid", "cate_id", "brand_id", "ziying"),
                    pipeline=pipeline):
                skuid_sukid_dict[int(skuid)] = {
                    "cate_id": cate_id,
                    "brand_id": "0" if brand_id is None else brand_id,
                    "ziying": ziying
                }

        # ---- step 4: attach comment counts, choose price and attributes ----
        #last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdcomment20\d\d\d\d\d\d_sep"}})
        #for table in m.list_tables(dbname="jingdong",filter={"name": {"$regex": r"^jdcomment(20\d\d\d\d\d\d)retry\d*$"}}):
        last_sep = m.get_lasted_collection(
            "jingdong", filter={"name": {
                "$regex": r"^jdcomment20201218_sep"
            }})
        for table in m.list_tables(
                dbname="jingdong",
                filter={"name": {
                    "$regex": r"^jdcomment(20210302)retry\d*$"
                }}):
            if last_sep and table <= last_sep:
                continue
            print("step 4: processing {}".format(table), flush=True)
            # NOTE(review): "$gt" compares against the string "0"; an earlier
            # variant compared against the number 0.  Confirm the stored type.
            pipeline = [
                {
                    "$match": {
                        "$and": [{
                            "_status": 0
                        }, {
                            "comment": {
                                "$gt": "0"
                            }
                        }]
                    }
                },
                {
                    "$project": {
                        "skuid": "$skuid",
                        "comment": "$comment",
                    }
                },
            ]
            for skuid, comments in m.read_from_yield(
                    db_collect=("jingdong", table),
                    out_field=("skuid", "comment"),
                    pipeline=pipeline):
                sid = int(skuid)
                in_sku_table = sid in skuid_sukid_dict

                if not in_sku_table and sid not in last_month_skuids:
                    # Unknown SKU: everything falls back to defaults.
                    # NOTE(review): this replaces any existing entry, so a
                    # price computed in step 1 is discarded here — confirm.
                    result_dic[sid] = {
                        "clean_price": default_price,
                        "comments": int(comments),
                        "cate_id": "0,0,0",
                        "brand_id": "0",
                        "ziying": "-1",
                        "type": 6,
                    }
                    continue

                # Price source, in order of preference: existing entry,
                # last month's result, default price.
                if sid in price_dic:
                    price_item = result_dic[sid]
                    price_item["clean_price"] = price_dic[sid]["clean_price"]
                    source_type = 0 if in_sku_table else 3
                elif sid in last_month_skuids:
                    price_item = result_dic.setdefault(sid, {})
                    price_item["clean_price"] = last_month_skuids[sid][
                        "clean_price"]
                    source_type = 1 if in_sku_table else 4
                else:
                    # Only reachable for SKUs found in the sku tables.
                    price_item = result_dic[sid] = {}
                    price_item["clean_price"] = default_price
                    source_type = 2
                price_item["comments"] = int(comments)
                price_item["type"] = source_type

                # Attribute source: sku tables first, else last month's result.
                if in_sku_table:
                    attrs = skuid_sukid_dict[sid]
                else:
                    attrs = last_month_skuids[sid]
                price_item["cate_id"] = attrs["cate_id"]
                price_item["brand_id"] = attrs["brand_id"]
                price_item["ziying"] = attrs["ziying"]

        # ---- step 5: carry over last month's SKUs --------------------------
        print(
            "step 5: processing skuid in last_month_skuids but not in result_dic",
            flush=True)
        for sid, previous in last_month_skuids.items():
            price_item = result_dic.get(sid)
            if price_item is None:
                result_dic[sid] = {
                    "clean_price": previous["clean_price"],
                    "comments": previous["comments"],
                    "cate_id": "0,0,0",
                    "brand_id": "0",
                    "ziying": "-1",
                    "type": 7,
                }
            elif 'type' not in price_item:
                # Entry came from step 1 only (no comment row this month).
                price_item["clean_price"] = previous["clean_price"]
                price_item["comments"] = previous["comments"]
                price_item["cate_id"] = previous["cate_id"]
                price_item["brand_id"] = previous["brand_id"]
                price_item["ziying"] = previous["ziying"]
                price_item["type"] = 8

        # ---- step 6: write the result --------------------------------------
        this_month = timeUtil.get_month(deltamonth=1, current_month=last_month)
        out_table = "month" + this_month
        print("step 6: processing writing result to {}".format(out_table),
              flush=True)
        buffer = []
        buffer_size = 5000
        print("result_dic:{}".format(len(result_dic)), flush=True)
        for sid, record in result_dic.items():
            record["skuid"] = sid
            record.pop("prices", None)
            record["month"] = this_month
            if "cate_id" in record:
                buffer.append(record)
            else:
                # Records without a category are reported, not written.
                print(record)
            # Flush on the number of buffered records so that every batch
            # except the last holds exactly buffer_size records.
            if len(buffer) >= buffer_size:
                m.insert_many_dict(db_collect=("jingdong", out_table),
                                   data_dict_list=buffer)
                buffer = []
        if buffer:
            m.insert_many_dict(db_collect=("jingdong", out_table),
                               data_dict_list=buffer)
        m.create_db_collection(
            db_collection=("jingdong", "jdprice{0}_sep".format(current_date)))
Esempio n. 2
0
            enumerate(m.read_from_yield(db_collect=("jingdong",
                                                    last_summary)))):
        if item["skuid"] in this_month:
            this_item = this_month.pop(item["skuid"])
            item["comment_{}".format(month)] = this_item["comments"]
            item["price"] = this_item["clean_price"]
            item["ziying"] = this_item["ziying"]
            bid = this_item["brand_id"]
            if bid:
                item["brand_id"] = bid
            cate_id = this_item["cate_id"]
            if cate_id:
                item["cate_id"] = cate_id
        else:
            item["comment_{}".format(month)] = item["comment_{}".format(
                timeUtil.get_month(-1, current_month=month))]
        list.append(item)
        if i % buffer_size == 0:
            m.insert_many_dict(db_collect=("jingdong",
                                           "summary_201905_{}".format(month)),
                               data_dict_list=list)
            list = []
if list:
    m.insert_many_dict(db_collect=("jingdong",
                                   "summary_201905_{}".format(month)),
                       data_dict_list=list)
list = []
print("step3...", flush=True)
for i, skuid in tqdm(enumerate(this_month)):
    this_item = this_month[skuid]
    item = {}
Esempio n. 3
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from mongo import op
from multiprocess.tools import timeUtil
# Date stamp of the run; the comment table name below is derived from it.
current_date = "20200821"
# Month stamp: the date stamp without its last two characters.
current_month = current_date[:-2]
# Month stamps one, two and three months before current_month.
last_1_month = timeUtil.get_month(-1, current_month)
last_2_month = timeUtil.get_month(-2, current_month)
last_3_month = timeUtil.get_month(-3, current_month)
comment_table = "secoComment{}".format(current_date)

with op.DBManger() as m:
    for month in [last_1_month]:
        # 合并属于一个月的List
        m.drop_db_collect(db_collect=("secoo", "List{}".format(month)))
        dic = {}
        for listday in m.list_tables(
                dbname="secoo",
                filter={"name": {
                    "$regex": r"List{}\d\d$".format(month)
                }}):
            print(listday, "List{}".format(month))
            for item in m.read_from(db_collect=("secoo", listday),
                                    out_field=("pid", "price", "self")):
                dic.update({item[0]: (item[1], item[2])})
        date_tuple_list = []
        for k, (p, s) in dic.items():
            date_tuple_list.append((k, k, p, s))
        m.insert_many_tupe(db_collect=("secoo", "List{}".format(month)),
                           data_tupe_list=date_tuple_list,
Esempio n. 4
0
 def compute_result(self):
     """Rebuild the monthly "secoo" result collections for the last 3 months.

     For each of the three months preceding ``self.current_date``:

     1. Merge all daily ``List<month>DD`` collections into ``List<month>``
        (later tables overwrite earlier ones per ``pid``).
     2. Aggregate ``secoComment<current_date>`` into ``secoSales<month>``
        (products with sales) and ``secoNosales<month>`` (products without).
     3. Merge both into ``secoResult<month>``; a product present in both
        keeps its ``secoSales`` row.

     Reads ``self.current_date`` and returns nothing.
     """
     from mongo import op
     from multiprocess.tools import timeUtil
     current_date = self.current_date
     # Month stamp: the date stamp without its last two characters.
     current_month = current_date[:-2]
     last_1_month = timeUtil.get_month(-1, current_month)
     last_2_month = timeUtil.get_month(-2, current_month)
     last_3_month = timeUtil.get_month(-3, current_month)
     comment_table = "secoComment{}".format(current_date)
     with op.DBManger() as m:
         for month in [last_1_month, last_2_month, last_3_month]:
             list_table = "List{}".format(month)
             sales_table = "secoSales{}".format(month)
             nosales_table = "secoNosales{}".format(month)
             result_table = "secoResult{}".format(month)

             # Merge the daily List tables that belong to this month.
             m.drop_db_collect(db_collect=("secoo", list_table))
             merged_list = {}
             for listday in m.list_tables(
                     dbname="secoo",
                     filter={
                         "name": {
                             "$regex": r"List{}\d\d$".format(month)
                         }
                     }):
                 print(listday, list_table)
                 for item in m.read_from(db_collect=("secoo", listday),
                                         out_field=("pid", "price",
                                                    "self")):
                     merged_list[item[0]] = (item[1], item[2])
             # The pid doubles as the document _id.
             date_tuple_list = [(pid, pid, price, self_flag)
                                for pid, (price, self_flag)
                                in merged_list.items()]
             m.insert_many_tupe(db_collect=("secoo", list_table),
                                data_tupe_list=date_tuple_list,
                                fields=("_id", "pid", "price", "self"))

             # Products with sales.
             # NOTE(review): "user" is projected as the literal "******"; this
             # may be a redacted "$user" — confirm before relying on it.
             # NOTE(review): both lookups read "CleanListNew", not the
             # List<month> collection built above — confirm this is intended.
             pipeline1 = [{
                 "$match": {
                     "$and": [{
                         "_status": 0
                     }, {
                         "pid": {
                             "$ne": None
                         }
                     }]
                 }
             }, {
                 "$project": {
                     "cid": "$cid",
                     "pid_rel": "$pid_rel",
                     "pid": "$pid",
                     "user": "******",
                     "device": "$device",
                     "price": "$price",
                     "date": "$date",
                     # First six characters of the date are the month stamp.
                     "month": {
                         "$substr": ["$date", 0, 6]
                     },
                     "self": "$self",
                 }
             }, {
                 "$match": {
                     "month": "{}".format(month)
                 }
             }, {
                 "$lookup": {
                     "from": "CleanListNew",
                     "localField": "pid",
                     "foreignField": "_id",
                     "as": "tableb"
                 }
             }, {
                 "$group": {
                     "_id": {
                         "month": "$month",
                         "cid": "$cid",
                         "pid": "$pid",
                         "pid_rel": "$pid_rel",
                     },
                     "user": {
                         "$last": "$user",
                     },
                     "device": {
                         "$last": "$device",
                     },
                     "price": {
                         "$last": "$price",
                     },
                     "tmp_price": {
                         "$last": {
                             "$arrayElemAt": ["$tableb.price", 0]
                         }
                     },
                     "tmp_self": {
                         "$last": {
                             "$arrayElemAt": ["$tableb.self", 0]
                         }
                     },
                 },
             }, {
                 "$project": {
                     "_id": 0,
                     "month": "$_id.month",
                     "cid": "$_id.cid",
                     "pid_rel": "$_id.pid_rel",
                     "pid": "$_id.pid",
                     "user": "******",
                     "device": "$device",
                     # Prefer the looked-up price over the comment's price.
                     "price": {
                         "$cond": {
                             "if": {
                                 "$ne": ["$tmp_price", None]
                             },
                             "then": "$tmp_price",
                             "else": "$price"
                         }
                     },
                     "tmp_self": "$tmp_self",
                 }
             }, {
                 "$lookup": {
                     "from": "CleanListNew",
                     "localField": "pid_rel",
                     "foreignField": "_id",
                     "as": "tablec"
                 }
             }, {
                 "$project": {
                     "_id": 0,
                     "month": "$month",
                     "cid": "$cid",
                     "pid_rel": "$pid_rel",
                     "pid": "$pid",
                     "user": "******",
                     "device": "$device",
                     "price": "$price",
                     "tmp_self": "$tmp_self",
                     "tmp_self1": {
                         "$arrayElemAt": ["$tablec.self", 0]
                     },
                 }
             }, {
                 "$project": {
                     "_id": 0,
                     "month": "$month",
                     "cid": "$cid",
                     "pid_rel": "$pid_rel",
                     "pid": "$pid",
                     "user": "******",
                     "device": "$device",
                     "price": "$price",
                     # self: lookup by pid, else lookup by pid_rel, else
                     # "其他".  The inner fallback must itself be a $cond;
                     # a bare {"if", "then", "else"} document is not
                     # evaluated as a conditional.
                     "self": {
                         "$cond": {
                             "if": {
                                 "$ne": ["$tmp_self", None]
                             },
                             "then": "$tmp_self",
                             "else": {
                                 "$cond": {
                                     "if": {
                                         "$ne": ["$tmp_self1", None]
                                     },
                                     "then": "$tmp_self1",
                                     "else": "其他"
                                 }
                             }
                         }
                     },
                 }
             }, {
                 # Collapse to one row per (month, cid, pid, price) ...
                 "$group": {
                     "_id": {
                         "month": "$month",
                         "cid": "$cid",
                         "pid": "$pid",
                         "price": "$price",
                     },
                     "self": {
                         "$last": "$self"
                     }
                 },
             }, {
                 # ... then count those rows per (month, pid, price).
                 "$group": {
                     "_id": {
                         "month": "$_id.month",
                         "pid": "$_id.pid",
                         "price": "$_id.price",
                     },
                     "sales": {
                         "$sum": 1
                     },
                     "self": {
                         "$last": "$self"
                     },
                 },
             }, {
                 "$project": {
                     "_id": 0,
                     "month": "$_id.month",
                     "pid": "$_id.pid",
                     "sales": "$sales",
                     "price": "$_id.price",
                     # "1" when self equals "自营", otherwise "0".
                     "self": {
                         "$cond": {
                             "if": {
                                 "$ne": ["$self", "自营"]
                             },
                             "then": "0",
                             "else": "1"
                         }
                     },
                 }
             }, {
                 "$out": sales_table
             }]
             # Products without sales.
             pipeline2 = [{
                 "$match": {
                     "$and": [{
                         "_status": {
                             "$ne": 0
                         }
                     }, {
                         "_seed": {
                             "$ne": None
                         }
                     }]
                 }
             }, {
                 "$project": {
                     # _seed element 0 is used as pid_rel, element 1 as price.
                     "pid_rel": {
                         "$arrayElemAt": ["$_seed", 0]
                     },
                     "price": {
                         "$arrayElemAt": ["$_seed", 1]
                     },
                 }
             }, {
                 "$lookup": {
                     "from": list_table,
                     "localField": "pid_rel",
                     "foreignField": "_id",
                     "as": "tableb"
                 }
             }, {
                 "$project": {
                     "pid_rel": "$pid_rel",
                     "price": "$price",
                     "self": {
                         "$arrayElemAt": ["$tableb.self", 0]
                     },
                 }
             }, {
                 "$match": {
                     "self": {
                         "$exists": True
                     }
                 }
             }, {
                 "$group": {
                     "_id": {
                         "pid_rel": "$pid_rel",
                         "price": "$price",
                     },
                     "self": {
                         "$last": "$self"
                     },
                 },
             }, {
                 "$project": {
                     "_id": 0,
                     "month": "{}".format(month),
                     "pid": "$_id.pid_rel",
                     "sales": "0",
                     "price": "$_id.price",
                     "self": {
                         "$cond": {
                             "if": {
                                 "$ne": ["$self", "自营"]
                             },
                             "then": "0",
                             "else": "1"
                         }
                     },
                 }
             }, {
                 "$out": nosales_table
             }]
             m.aggregate(db_collect=("secoo", comment_table),
                         pipeline=pipeline1)
             m.aggregate(db_collect=("secoo", comment_table),
                         pipeline=pipeline2)

             # Merge: no-sales rows first so that sales rows win per pid.
             merged_result = {}
             for source_table in (nosales_table, sales_table):
                 for item in m.read_from(
                         db_collect=("secoo", source_table),
                         out_field=("pid", "price", "sales", "self")):
                     merged_result[item[0]] = (item[1], item[2], item[3])
             # Named self_flag so the method's own ``self`` is not rebound.
             date_tuple_list = [(pid, pid, price, sales, self_flag)
                                for pid, (price, sales, self_flag)
                                in merged_result.items()]
             m.drop_db_collect(db_collect=("secoo", result_table))
             m.insert_many_tupe(db_collect=("secoo", result_table),
                                data_tupe_list=date_tuple_list,
                                fields=("_id", "pid", "price", "sales",
                                        "self"))