def build(self):
        """Rank brands by summed ``total_sold_price`` and dump the ranking.

        Takes the frame returned by ``read("factItem")()``, drops rows whose
        ``brand`` is missing, treats a missing ``total_sold_price`` as 0 and
        produces one row per brand with these columns:

        * ``brand``
        * ``sold_sum``   -- sum of ``total_sold_price`` over the brand's rows
        * ``sold_share`` -- ``sold_sum`` divided by the ``"total"`` entry of
          the loaded ``statsAllSubMacroCondition`` (0 when that total is 0)
        * ``rank``       -- dense rank by ``sold_sum``; 1 is the highest and
          brands with an equal ``sold_sum`` share a rank

        The rows are sorted by ``sold_sum`` in descending order and passed to
        ``super().dump`` under the name ``"statsTopSoldBrands"``.
        """
        items = read("factItem")()
        # Copy after filtering so the column assignments below write to an
        # independent frame instead of a slice of the loaded one.
        items = items[items["brand"].notna()].copy()
        market_total = load("statsAllSubMacroCondition")["total"]

        items["total_sold_price"] = items["total_sold_price"].fillna(0)
        # transform("sum") puts each brand's total on every row of that brand
        # while keeping the original row index, so no groupby.apply is needed.
        items["sold_sum"] = (
            items.groupby("brand")["total_sold_price"].transform("sum"))

        # pandas/numpy division by zero produces inf/nan rather than raising
        # ZeroDivisionError, so a zero total has to be tested explicitly.
        if market_total:
            items["sold_share"] = items["sold_sum"] / market_total
        else:
            items["sold_share"] = 0

        # Keep a single row per brand, best seller first.
        items = items.drop_duplicates(["brand"]).sort_values(
            "sold_sum", ascending=False)
        items = items[["brand", "sold_sum", "sold_share"]]

        # Dense descending rank: starts at 1 even when the best sum is 0 and
        # gives tied brands the same rank.
        items = items.assign(
            rank=items["sold_sum"].rank(method="dense", ascending=False))

        super().dump(items, "statsTopSoldBrands")
Esempio n. 2
0
    def cut(self):
        """Extract TextRank keywords from two text columns and dump them as sets.

        For each source the values of one column are concatenated into a
        single string, ``jieba.analyse.textrank`` is asked for at most
        ``self.threshold[...]`` keywords, and the resulting set is dumped:

        * ``factItem`` / ``title``    (limit ``self.threshold["title"]``)
          -> ``titleHotWords``
        * ``hotWords`` / ``hotwords`` (limit ``self.threshold["hot search"]``)
          -> ``searchHotWords``
        """
        # (table to read, text column, threshold key, dump name)
        sources = (
            ("factItem", "title", "title", "titleHotWords"),
            ("hotWords", "hotwords", "hot search", "searchHotWords"),
        )
        for table, column, limit_key, target in sources:
            frame = read(table)()
            corpus = "".join(frame[column])
            keywords = jieba.analyse.textrank(
                corpus, topK=self.threshold[limit_key])
            dump(set(keywords), target)
    def statistic(self):
        """Rank brands by summed ``total_sold_price`` and dump the ranking.

        When ``Mode.statsLOCAL`` is set and ``load("statsTopSoldBrands")``
        succeeds, the method returns without recomputing. If the load raises
        ``FileNotFoundError`` a notice is printed and the result is rebuilt.

        The computation takes the frame returned by ``read("factItem")()``,
        drops rows whose ``brand`` is missing, treats a missing
        ``total_sold_price`` as 0 and produces one row per brand with:

        * ``brand``
        * ``sold_sum``   -- sum of ``total_sold_price`` over the brand's rows
        * ``sold_share`` -- ``sold_sum`` divided by the ``"total"`` entry of
          the loaded ``statsAllSubMacroCondition`` (0 when that total is 0)
        * ``rank``       -- dense rank by ``sold_sum``; 1 is the highest and
          brands with an equal ``sold_sum`` share a rank

        The rows are sorted by ``sold_sum`` in descending order and dumped as
        ``"statsTopSoldBrands"``.
        """
        if Mode.statsLOCAL:
            try:
                # Probe for the same name this method dumps below; the former
                # "statsTopSoldBrand" spelling never matched the dumped file.
                load("statsTopSoldBrands")
            except FileNotFoundError:
                print(
                    "StatisticTopSoldBrandMethod: Don't Have Local Result Files"
                )
            else:
                return

        items = read("factItem")()
        # Copy after filtering so the column assignments below write to an
        # independent frame instead of a slice of the loaded one.
        items = items[items["brand"].notna()].copy()
        market_total = load("statsAllSubMacroCondition")["total"]

        items["total_sold_price"] = items["total_sold_price"].fillna(0)
        # transform("sum") puts each brand's total on every row of that brand
        # while keeping the original row index, so no groupby.apply is needed.
        items["sold_sum"] = (
            items.groupby("brand")["total_sold_price"].transform("sum"))

        # pandas/numpy division by zero produces inf/nan rather than raising
        # ZeroDivisionError, so a zero total has to be tested explicitly.
        if market_total:
            items["sold_share"] = items["sold_sum"] / market_total
        else:
            items["sold_share"] = 0

        # Keep a single row per brand, best seller first.
        items = items.drop_duplicates(["brand"]).sort_values(
            "sold_sum", ascending=False)
        items = items[["brand", "sold_sum", "sold_share"]]

        # Dense descending rank: starts at 1 even when the best sum is 0 and
        # gives tied brands the same rank.
        items = items.assign(
            rank=items["sold_sum"].rank(method="dense", ascending=False))

        dump(items, "statsTopSoldBrands")
    def statistic(self):
        """Aggregate per-submarket-word and market-wide statistics and dump them.

        When ``Mode.statsLOCAL`` is set and every result listed in
        ``result_names`` can be loaded, the method returns without
        recomputing. If any load raises ``FileNotFoundError`` a notice is
        printed and everything is rebuilt.

        For each word of ``load("submarketWords")`` the rows of
        ``read("factItem")()`` whose ``title`` contains the word contribute to:

        * ``statsAllSubItemidSet`` / ``statsAllSubBrandSet`` /
          ``statsAllSubSellerSet`` -- sets of ``itemid`` / ``brand`` /
          ``sellernick`` (only words with at least one match get a key)
        * ``statsAllSubBiz30day`` / ``statsAllSubTotalSoldPrice`` -- sums of
          ``biz30day`` / ``total_sold_price``
        * ``statsAllSubSoldAverPrice`` -- price sum / ``biz30day`` sum,
          rounded to 2 places (0 when the ``biz30day`` sum is 0)

        The same quantities over all rows go to ``statsAllSubMacroCondition``
        (keys ``biz30day``, ``total``, ``itemid``, ``brand``, ``seller``,
        ``aver``). ``statsAllItemidMapping`` maps each ``itemid`` to
        ``(brand, biz30day, total_sold_price)``; a repeated ``itemid`` prints
        a warning and the later row wins.

        Finally, for both the ``biz30day`` and the ``total_sold_price`` sums,
        three dicts are dumped: ``...Share`` (word -> fraction of the
        market-wide sum, 0 when that sum is 0), ``...Rank`` (word -> dense
        rank, 1 = largest) and ``...ReRank`` (rank -> list of words).
        """
        result_names = (
            "statsAllSubItemidSet",
            "statsAllSubBrandSet",
            "statsAllSubSellerSet",
            "statsAllSubBiz30day",
            "statsAllSubTotalSoldPrice",
            "statsAllSubSoldAverPrice",
            "statsAllSubMacroCondition",
            "statsAllItemidMapping",
            "statsAllSubBiz30dayShare",
            "statsAllSubBiz30dayRank",
            "statsAllSubBiz30dayReRank",
            "statsAllSubTotalSoldPriceShare",
            "statsAllSubTotalSoldPriceRank",
            "statsAllSubTotalSoldPriceReRank",
        )
        if Mode.statsLOCAL:
            try:
                for name in result_names:
                    load(name)
            except FileNotFoundError:
                print(
                    "StatisticAllSubmarketMethod: Don't Have Local Result Files"
                )
            else:
                return

        def ratio_or_zero(numerator, denominator):
            # The accumulators become numpy scalars once a row has been added,
            # and numpy division by zero gives inf/nan instead of raising
            # ZeroDivisionError, so the zero case is tested explicitly.
            return numerator / denominator if denominator else 0

        def share_and_rank(totals, overall):
            # Return (share, rank, rerank) dicts for the word -> sum mapping
            # `totals`, walking the words from the largest sum to the smallest.
            share, rank_of, words_at = {}, {}, {}
            rank, previous = 0, None
            ordered = sorted(totals.items(), key=lambda kv: kv[1], reverse=True)
            for key, value in ordered:
                share[key] = ratio_or_zero(value, overall)
                # `rank == 0` forces the first entry to open rank 1 even when
                # its value is 0; afterwards only a changed value opens a new
                # rank, so ties share one.
                if rank == 0 or value != previous:
                    rank += 1
                    previous = value
                rank_of[key] = rank
                words_at.setdefault(rank, []).append(key)
            return share, rank_of, words_at

        words = load("submarketWords")
        frame = read("factItem")()
        row_count = len(frame)

        itemid_set = {}
        brand_set = {}
        seller_set = {}
        biz30day = {word: 0 for word in words}
        total_sold_price = {word: 0 for word in words}
        macro_conditions = {"biz30day": 0, "total": 0}
        itemid_mapping = {}

        # Count positions rather than index labels so the progress output
        # does not depend on the frame having an integer index.
        for position, (_, row) in enumerate(frame.iterrows()):
            if position % 100 == 0:
                print("process", position, "/", row_count)

            title = row["title"]
            item_id = row["itemid"]
            brand = row["brand"]
            seller = row["sellernick"]
            sold_count = row["biz30day"]
            revenue = row["total_sold_price"]

            for word in words:
                if word in title:
                    itemid_set.setdefault(word, set()).add(item_id)
                    brand_set.setdefault(word, set()).add(brand)
                    seller_set.setdefault(word, set()).add(seller)
                    biz30day[word] += sold_count
                    total_sold_price[word] += revenue

            macro_conditions.setdefault("itemid", set()).add(item_id)
            macro_conditions.setdefault("brand", set()).add(brand)
            macro_conditions.setdefault("seller", set()).add(seller)
            macro_conditions["biz30day"] += sold_count
            macro_conditions["total"] += revenue

            if item_id in itemid_mapping:
                print("Warning: Duplicate Itemid", item_id)
            itemid_mapping[item_id] = (brand, sold_count, revenue)

        sold_price_aver = {}
        for word in words:
            if biz30day[word]:
                sold_price_aver[word] = round(
                    total_sold_price[word] / biz30day[word], 2)
            else:
                sold_price_aver[word] = 0

        if macro_conditions["biz30day"]:
            macro_conditions["aver"] = round(
                macro_conditions["total"] / macro_conditions["biz30day"], 2)
        else:
            macro_conditions["aver"] = 0

        dump(itemid_set, "statsAllSubItemidSet")
        dump(brand_set, "statsAllSubBrandSet")
        dump(seller_set, "statsAllSubSellerSet")
        dump(biz30day, "statsAllSubBiz30day")
        dump(total_sold_price, "statsAllSubTotalSoldPrice")
        dump(sold_price_aver, "statsAllSubSoldAverPrice")

        dump(macro_conditions, "statsAllSubMacroCondition")
        dump(itemid_mapping, "statsAllItemidMapping")

        biz30day_share, biz30day_rank, biz30day_rerank = share_and_rank(
            biz30day, macro_conditions["biz30day"])
        (total_sold_price_share, total_sold_price_rank,
         total_sold_price_rerank) = share_and_rank(
            total_sold_price, macro_conditions["total"])

        dump(biz30day_share, "statsAllSubBiz30dayShare")
        dump(biz30day_rank, "statsAllSubBiz30dayRank")
        dump(biz30day_rerank, "statsAllSubBiz30dayReRank")
        dump(total_sold_price_share, "statsAllSubTotalSoldPriceShare")
        dump(total_sold_price_rank, "statsAllSubTotalSoldPriceRank")
        dump(total_sold_price_rerank, "statsAllSubTotalSoldPriceReRank")