Beispiel #1
0
def scanMostPositiveApps(order=-1,limit=50):
    """Print the apps ranked by the ``wilson_lower_score`` of their comments.

    Reads records from ``emotion_comment`` through ``MongoUtil.sort`` and, for
    each one, prints the matching ``app_table`` document enriched with the
    comment statistics of that record, followed by an empty line.

    Args:
        order: Sort direction forwarded to ``MongoUtil.sort`` (presumably -1
            means descending -- confirm against MongoUtil).
        limit: Record limit forwarded to ``MongoUtil.sort``.
    """
    copied_fields = (
        "comment_count",
        "pos_count",
        "neg_count",
        "applause_rate",
        "wilson_lower_score",
    )
    results = MongoUtil.sort("emotion_comment", "wilson_lower_score",
                             order=order, limit=limit)
    for result in results:
        appinfo = MongoUtil.find_one("app_table", {"_id": result["appid"]})
        if appinfo is None:
            # The statistics record refers to an app that is not stored in
            # app_table; skip it instead of failing on ``None[...]``.
            continue
        for field in copied_fields:
            appinfo[field] = result[field]
        print(appinfo)
        print()
Beispiel #2
0
def saveAppToDB(appinfo):
    """Insert one crawled app into ``app_table`` unless it is already stored.

    An app counts as stored when ``MongoUtil.isExist`` reports a document with
    the same ``catagory`` and ``appname``. The category and name are printed
    in every case.

    Args:
        appinfo: Object exposing ``cata``, ``name``, ``url``, ``descripe``
            and ``apk`` attributes.
    """
    post = {
        "catagory": appinfo.cata,
        "appname": appinfo.name,
        "url": appinfo.url,
        "descripe": appinfo.descripe,
        "apk": appinfo.apk,
        # Local calendar date at the moment of storing
        "date": time.strftime('%Y-%m-%d', time.localtime()),
    }
    identity = {"catagory": appinfo.cata, "appname": appinfo.name}
    already_stored = MongoUtil.isExist("app_table", identity)
    if not already_stored:
        MongoUtil.insert("app_table", post)
    print(appinfo.cata + appinfo.name)
Beispiel #3
0
def createDex():
    """Create the lookup indexes used by the app, word and category collections.

    Indexes ``appid`` on ``app_table`` and ``word`` on ``word_table``, then
    ``appid`` and ``wordid`` on one collection per category name read from
    ``const.WANDOUJIA_CATA_JSON_FILE``. The third argument passed to
    ``MongoUtil.create_index`` is always ``False`` (presumably a "unique"
    flag -- confirm against MongoUtil).
    """
    MongoUtil.create_index("app_table", "appid", False)
    MongoUtil.create_index("word_table", "word", False)
    # Use a context manager so the category file is closed deterministically.
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)
    for cataname in catas:
        cataname = cataname.strip()
        MongoUtil.create_index(cataname, "appid", False)
        MongoUtil.create_index(cataname, "wordid", False)
Beispiel #4
0
def scanMostFastGrownApps(order=-1,limit=50,capacity_limit = 10000,date = "2017-01-23"):
    """Print the fastest-growing apps of one day that are large enough to matter.

    Walks the ``capacity_rate_table`` records of ``date`` ordered by
    ``incre_rate`` and prints the ``app_table`` document of every app whose
    ``capacity_num`` on that day is at least ``capacity_limit``, enriched with
    ``incre_rate`` and ``wilson_lower_rate``.

    Args:
        order: Sort direction forwarded to ``MongoUtil.sort_with_values``.
        limit: Maximum number of apps to print. Only apps that are actually
            printed count against it; nothing is printed when it is <= 0.
        capacity_limit: Minimum ``capacity_num`` an app needs to be printed.
            Previously this parameter was ignored in favour of a hard-coded
            10000, which is also its default.
        date: Day (``YYYY-MM-DD`` string) whose records are inspected.
    """
    if limit <= 0:
        return
    results = MongoUtil.sort_with_values("capacity_rate_table", {"date": date},
                                         "incre_rate", order=order)
    printed = 0
    for result in results:
        appid = result["appid"]
        appinfo = MongoUtil.find_one("app_table", {"_id": appid})
        if appinfo is None:
            # Rate record without a stored app: nothing to print.
            continue
        capacityinfo = MongoUtil.find_one("capacity_table",
                                          {"appid": appid, "date": date})
        if capacityinfo is None or capacityinfo["capacity_num"] < capacity_limit:
            continue
        appinfo["incre_rate"] = result["incre_rate"]
        appinfo["wilson_lower_rate"] = result["wilson_lower_rate"]
        print(appinfo)
        print()
        printed += 1
        if printed >= limit:
            break
Beispiel #5
0
def saveAllComentEmotionData():
    """Compute and store comment sentiment data for every app of every category.

    Loads the model once via ``load_model`` and, for each category listed in
    ``const.WANDOUJIA_CATA_JSON_FILE``, calls ``saveCommentEmotionData`` for
    each app of ``app_table`` that has no ``emotion_comment`` record yet.
    A running per-category counter is printed before each app.
    """
    model, best_words = load_model()
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)
    for cataname in catas:
        apps = MongoUtil.find("app_table", {"catagory": cataname})
        for code, app in enumerate(apps, start=1):
            print(code, end=" ")
            # The original referenced the undefined names ``appid`` and
            # ``appname``; the values come from the app document itself.
            if MongoUtil.isExist("emotion_comment", {"appid": app["_id"]}):
                print(app["appname"]+"已经存在了")
                # Skip only this app. The previous ``break`` abandoned the
                # rest of the category as soon as one stored app was met.
                continue
            saveCommentEmotionData(model, best_words, app)
Beispiel #6
0
def scanAppInfo(appname,catagory=""):
    """Print the stored document and, if present, the description file of an app.

    Args:
        appname: Value matched against the ``appname`` field of ``app_table``.
        catagory: Optional category filter; when empty, apps of every
            category with that name are printed.
    """
    if catagory == "":
        query = {"appname": appname}
    else:
        query = {"catagory": catagory, "appname": appname}

    for appinfo in MongoUtil.find("app_table", query):
        print("基本信息: ")
        print(appinfo)
        detail_path = ("../file/apps_detail_descripe/" + appinfo["catagory"] +
                       "/" + appinfo["appname"] + ".json")

        if os.path.exists(detail_path):
            # Context manager: the handle used to be left open for every app.
            with open(detail_path) as detail_file:
                print("应用描述:")
                print(detail_file.read())
        print()
Beispiel #7
0
def getRecommendInfo(appinfo, date):
    """Assemble the recommendation record of one app for one day.

    Args:
        appinfo: ``app_table`` document; ``_id``, ``appname`` and ``catagory``
            are read from it.
        date: Day used to look up ``capacity_table`` and
            ``capacity_rate_table``.

    Returns:
        ``None`` when the app has no capacity record, no capacity-rate record
        for ``date``, or no ``emotion_comment`` record. Otherwise a dict with
        the collected fields and ``recommend_score``. If collecting or scoring
        fails, the app name is printed and the dict is returned with only the
        fields filled in before the failure.
        NOTE(review): such a partial dict lacks ``recommend_score``; confirm
        that callers really want to keep it.
    """
    recommend_info = {}

    capacity_info = MongoUtil.find_one("capacity_table", {
        "appid": appinfo["_id"],
        "date": date
    })
    if capacity_info is None:
        return None
    capacity_rate_info = MongoUtil.find_one("capacity_rate_table", {
        "appid": appinfo["_id"],
        "date": date
    })
    if capacity_rate_info is None:
        return None
    comment_info = MongoUtil.find_one("emotion_comment",
                                      {"appid": appinfo["_id"]})
    if comment_info is None:
        return None

    try:
        recommend_info["appname"] = appinfo["appname"]
        recommend_info["catagory"] = appinfo["catagory"]
        recommend_info["appid"] = appinfo["_id"]
        recommend_info["capacity"] = capacity_info["capacity_num"]
        recommend_info["date"] = date
        recommend_info["capacity_rate"] = capacity_rate_info["incre_rate"]
        recommend_info["comment_wilson_lower_score"] = comment_info[
            "wilson_lower_score"]
        recommend_info["comment_count"] = comment_info["comment_count"]
        # Weighted sum of the normalised signals plus a correction term.
        recommend_info["recommend_score"] = (
            getLastCapacityNormalization(recommend_info["capacity_rate"]) *
            last_capacity_rate_param +
            getCapacityNormalization(recommend_info["capacity"]) *
            capacity_param + getApplauseNormalization(
                recommend_info["comment_wilson_lower_score"]) * applause_param
            + getCommentCountNormalization(recommend_info["comment_count"]) *
            comment_count_param + correct(recommend_info))
    except Exception:
        # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
        # SystemExit still propagate. ``.get`` keeps the handler itself from
        # raising when the failure happened before "appname" was stored.
        print("-->" + str(recommend_info.get("appname", "")))

    return recommend_info
Beispiel #8
0
    def __init__(self, appname, cataname=""):
        """Load an app and its word statistics, then compute its tf-idf scores.

        Args:
            appname: Value matched against the ``appname`` field of
                ``app_table``.
            cataname: Optional category used to narrow the lookup.

        After construction ``tf_idfdict`` is ``None`` when the app was not
        found or has fewer than 100 counted words.
        """
        self.tf_idfdict = None
        # Defined up front so the attributes exist even when loading stops
        # early because the app is unknown.
        self.worddict = None
        self.wordcount = 0
        if cataname == "":
            query = {"appname": appname}
        else:
            query = {"catagory": cataname, "appname": appname}
        self.app = MongoUtil.find_one("app_table", query)

        if self.app is None:
            print("该app未存储在数据库,可能原因:查询不准确,未存储入数据库,数据未更新")
            # Without an app document there is nothing to analyse; the
            # previous code went on and computed statistics for ``None``.
            return
        print(self.app)
        self.worddict, self.wordcount = self.frequencyscore()
        if self.wordcount < 100:
            print("该app的评论数量过少,获取关键词将会不准确")
            return
        print("评论总数是:" + str(self.wordcount))
        self.tf_idfdict = self.tf_idf()
Beispiel #9
0
def saveRecommendApps(date):
    """Compute and store the recommendation records of all apps for ``date``.

    Apps for which ``getRecommendInfo`` returns ``None`` and apps that already
    have a ``recommend_table`` record for ``date`` are left out. The remaining
    records are handed to ``MongoUtil.upsert_mary`` in one call.
    """
    # Read the whole result first, then work on the in-memory list
    all_apps = list(MongoUtil.find("app_table", {}))
    pending = []
    for app in all_apps:
        info = getRecommendInfo(app, date)
        if info is None:
            continue
        record_key = {"appid": app["_id"], "date": date}
        if MongoUtil.isExist("recommend_table", record_key):
            print(date + " " + app["appname"] + " 已经存在")
            continue
        print(app["appname"])
        pending.append(info)
    MongoUtil.upsert_mary("recommend_table", pending)
Beispiel #10
0
def get_app_each_comment(appname,cataname =""):
    """Rebuild the word lists of an app, grouped by their ``location`` value.

    Args:
        appname: Value matched against the ``appname`` field of ``app_table``.
        cataname: Optional category used to narrow the lookup.

    Returns:
        ``None`` when the app is not found; otherwise a dict mapping each
        ``location`` to the list of words stored for it, in the order the
        records are returned.
    """
    query = {"appname":appname}
    if cataname != "":
        query = {"catagory":cataname, "appname":appname}
    app = MongoUtil.find_one("app_table", query)
    print(app)
    if app is None:
        return

    # Word occurrences live in a collection named after the app's category
    occurrences = MongoUtil.find(app["catagory"],{"appid":app["_id"]})
    comments = {}
    for occurrence in occurrences:
        word_id = occurrence["wordid"]
        location = occurrence["location"]
        word = MongoUtil.find_one("word_table",{"_id":word_id})["word"]
        comments.setdefault(location,[]).append(word)
    return comments
Beispiel #11
0
def savetoDB(appid,comment_count,pos_count,neg_count):
    """Store the sentiment summary of one app in ``emotion_comment``.

    Nothing is stored when ``comment_count`` is below
    ``useful_comment_threshold``; the counts are only printed. Otherwise the
    applause rate and the two values returned by
    ``WilsonScoreUtil.confidence`` are printed/saved with the counts.

    Args:
        appid: Identifier saved in the ``appid`` field.
        comment_count: Number of usable comments.
        pos_count: Number of positive comments.
        neg_count: Number of negative comments.
    """
    summary = ("总的有效评论数量:"+str(comment_count)+" 好评数量:"+str(pos_count)
               +" 差评数量:"+str(neg_count))

    too_few_comments = comment_count < useful_comment_threshold
    if too_few_comments:
        print(summary)
        print("该app的评论数小于100,无参考意义")
        print()
        return

    applause_rate = float(pos_count / comment_count)
    top_score,lower_score = WilsonScoreUtil.confidence(pos_count,neg_count)
    print(summary+" 好评率:"+str(applause_rate))
    print()
    record = {
        "appid":appid,
        "comment_count":comment_count,
        "pos_count":pos_count,
        "neg_count":neg_count,
        "applause_rate":applause_rate,
        "wilson_top_score" :top_score,
        "wilson_lower_score":lower_score,
    }
    MongoUtil.save("emotion_comment",record)
Beispiel #12
0
    def tf_idf(self):
        """Compute and print a tf-idf score for every word of ``self.worddict``.

        tf is the word's count divided by ``self.wordcount``. idf is
        ``log(other_apps / (other_apps_containing_word + 1))``, where both
        counts come from ``MongoUtil.distinct_count`` on the app's category
        collection with this app itself subtracted.

        Returns:
            ``None`` (after printing a hint) when ``self.worddict`` is empty or
            ``self.wordcount`` is below 100; otherwise a dict mapping each word
            to its score.
        """
        if not self.worddict:
            print("请初始化词频统计")
            return
        if self.wordcount < 100:
            print("该app的评论数量过少,获取关键词将会不准确")
            return

        catagory = self.app["catagory"]
        # Total number of documents (apps) in the category, minus this app
        docu_count = len(
            MongoUtil.distinct_count(catagory, "appid", value=None)) - 1

        tf_idfdict = {}
        for word, frequency in self.worddict.items():
            result = MongoUtil.find_one("word_table", {"word": word})
            if result is None:
                # Word missing from word_table: treat it as contained in no
                # other app instead of failing on ``None["_id"]``.
                include_count = 0
            else:
                # Apps containing the word, minus this app; never negative so
                # the denominator below stays >= 1.
                include_count = max(
                    len(
                        MongoUtil.distinct_count(
                            catagory, "appid",
                            value={"wordid": result["_id"]})) - 1, 0)

            if docu_count > 0:
                wordidf = math.log(docu_count / (include_count + 1))
            else:
                # No other app to compare against. The previous code clamped
                # the count to 0 and then crashed in math.log(0).
                wordidf = 0.0
            wordtf = frequency / self.wordcount
            tf_idfdict[word] = wordtf * wordidf

        for word, score in tf_idfdict.items():
            print(word + "    出现的次数:" + str(self.worddict[word]) +
                  "     tf-idf计算值:" + str(score))

        return tf_idfdict
Beispiel #13
0
def scanCatagorys():
    """Print every category with its app count, then the total number of apps.

    Category names are read from ``const.WANDOUJIA_CATA_JSON_FILE``; the
    per-category line is completed by ``scanCatagoryInfo``.
    """
    # Context manager so the category file does not stay open.
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)
    print("所有目录信息:")
    for code, cataname in enumerate(catas, start=1):
        cataname = cataname.strip()
        print(str(code)+". "+cataname,end=" ")
        scanCatagoryInfo(cataname)
    print()
    count = MongoUtil.find("app_table",{}).count()
    print("总数:" + str(count))
def delivery_words(appid,content):
    """Segment ``content`` and queue one ``{appid, wordid}`` record per kept word.

    Words found in ``stopWords`` or ``punctuations``, newlines, single spaces
    and purely numeric tokens are dropped. Unknown words are inserted into
    ``word_table`` first. The records are appended to the module-level
    ``posts`` list; nothing is returned.
    """
    # Replace control characters with spaces before segmenting
    cleaned = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]').sub(' ', content)
    # jieba is called with cut_all=False, i.e. full mode is not enabled
    for word in jieba.cut(cleaned,cut_all=False):
        rejected = (word in stopWords or word in punctuations
                    or word == '\n' or word == ' ' or word.isdigit())
        if rejected:
            continue

        post_word = {"word": word}
        if not MongoUtil.isExist("word_table", post_word):
            MongoUtil.insert("word_table", post_word)

        stored_word = MongoUtil.find_one("word_table", post_word)
        wordid = stored_word['_id']
        if wordid is None:
            print(post_word)

        posts.append({"appid": appid, "wordid": wordid})
Beispiel #15
0
def getRecommendApps(limit=10, date="2016-12-08"):
    """Return the ``recommend_table`` records of ``date`` ranked by score.

    Args:
        limit: Record limit forwarded to ``MongoUtil.sort_with_values``.
        date: Day whose records are selected.

    Returns:
        Whatever ``MongoUtil.sort_with_values`` returns for the query, sorted
        on ``recommend_score`` with ``order=-1``.
    """
    date_filter = {"date": date}
    ranked = MongoUtil.sort_with_values(
        "recommend_table", date_filter, "recommend_score",
        limit=limit, order=-1)
    return ranked
Beispiel #16
0
def getChainRateStore(appinfo):
    """Derive the per-date growth records of one app and upsert them.

    For every ``date -> incre_rate`` pair returned by
    ``getChainIncreRateCapacity`` a record with ``appid``, ``date``,
    ``incre_rate`` and ``wilson_lower_rate`` is appended to the module-level
    ``posts`` list, which is then passed to ``MongoUtil.upsert_mary``.

    When no growth data exists the app id is appended to the module-level
    ``is_not_exist`` list and its category and name are appended to
    ``../file/not_exist/not_exist_apps``.

    NOTE(review): a negative rate makes the function return immediately,
    before the upsert; records already appended to ``posts`` stay there.
    Confirm that this is intended.

    Args:
        appinfo: ``app_table`` document; ``_id``, ``appname`` and ``catagory``
            are read from it.
    """
    appid = appinfo["_id"]
    appname = appinfo["appname"]
    cataname = appinfo["catagory"]
    begin_date, end_date, app_incre = getChainIncreRateCapacity(
        appname, cataname)
    if app_incre is None:
        # ``with`` closes the log file even if a write fails.
        with open("../file/not_exist/not_exist_apps", "a") as missing_file:
            is_not_exist.append(appid)
            missing_file.write(cataname + " ")
            missing_file.write(appname + "\n")
        return

    capacitys = getDowloadCapacity(appname, cataname=cataname)
    for date, rate in app_incre.items():
        capacity = capacitys[date]

        if rate < 0:
            return

        if rate <= 0:
            # Only reached for rate == 0, since negatives returned above.
            wilson_lower_rate = -WilsonScoreUtil.confidence_2(-rate, capacity)
        else:
            wilson_lower_rate = WilsonScoreUtil.confidence_2(rate, capacity)
        posts.append({
            "appid": appid,
            "date": date,
            "incre_rate": rate,
            "wilson_lower_rate": wilson_lower_rate,
        })
    MongoUtil.upsert_mary("capacity_rate_table", posts)
def scan_cata_app(cata):
    """Segment the description of every app of ``cata`` and store the words.

    Apps that already have records in ``app_detail_descripe`` are skipped.
    For the others the description returned by ``read_descripe`` is passed to
    ``delivery_words``, which fills the module-level ``posts`` list; a
    non-empty list is written with ``MongoUtil.upsert_mary``.
    """
    posts.clear()
    # Materialise the query result before the per-app work starts
    apps = list(MongoUtil.find("app_table",{"catagory":cata}))
    for code, app in enumerate(apps, start=1):
        posts.clear()

        print(code,end=" ")
        print(app["appname"])

        already_done = MongoUtil.isExist("app_detail_descripe",{"appid":app["_id"]})
        if already_done:
            continue
        content = read_descripe(cata,app["appname"])
        if content is not None:
            delivery_words(app["_id"],content)
        print(len(posts))
        print()
        if posts:
            MongoUtil.upsert_mary("app_detail_descripe",posts)
Beispiel #18
0
def deliveryWords(appinfo,filename):
    """Segment a comment file line by line and store every kept word occurrence.

    Each kept word is inserted into ``word_table`` if unknown, and one
    ``{appid, wordid, location}`` record (``location`` is the 1-based line
    number) is inserted into the collection named ``appinfo.cata``. Nothing is
    done when the app is not stored in ``app_table`` or already has records in
    that collection.

    Args:
        appinfo: Object exposing ``name`` and ``cata`` attributes.
        filename: Path of the text file to segment, one comment per line.
    """
    print(appinfo.name)
    # Read everything up front and close the file right away.
    with open(filename) as comment_file:
        contents = [line.strip() for line in comment_file]

    app_doc = MongoUtil.find_one("app_table", {"catagory":appinfo.cata, "appname":appinfo.name})
    if app_doc is None:
        print("\""+appinfo.cata+" "+appinfo.name+"\" 未存入数据库中,请先存储")
        return
    appid = app_doc['_id']

    if MongoUtil.find_one(appinfo.cata, {"appid":appid}) is not None:
        print("\""+appinfo.cata+" "+appinfo.name+"\" 已经分词存入数据库,不必重复")
        return

    # Compiled once instead of once per line; matches control characters.
    control_chars = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]')
    for line_num, line in enumerate(contents, start=1):
        # Pause kept from the original (presumably to throttle DB load --
        # confirm whether it is still needed).
        time.sleep(0.1)
        line = control_chars.sub(' ', line)
        # jieba is called with cut_all=False, i.e. full mode is not enabled
        for word in jieba.cut(line,cut_all=False):
            if (word in stopWords or word in punctuations or word == '\n'
                    or word == ' ' or word.isdigit()):
                continue
            post_word = {"word": word}
            if not MongoUtil.isExist("word_table", post_word):
                MongoUtil.insert("word_table", post_word)

            result = MongoUtil.find_one("word_table", post_word)

            wordid = result['_id']
            if wordid is None:
                print(post_word)
            post_location = {
                "appid": appid,
                "wordid": wordid,
                "location": line_num,
            }
            MongoUtil.insert(appinfo.cata, post_location)
Beispiel #19
0
def getCataAppsInfo(filename):
    """Store the install text of every app listed in one crawl directory.

    ``filename`` is used both as the argument of ``readInfoFile`` (which
    yields the crawl date) and as the directory holding one
    ``<category>.json`` file per category. Apps missing from ``app_table`` are
    appended, as ``(name, details)`` pairs, to the module-level
    ``app_not_exist`` list; for the others ``saveAppCapacityToDB`` is called
    with the app id, the date and the ``install`` value.
    """
    print(filename)
    date, _count = readInfoFile(filename)
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)
    for cataname in catas:
        cataname = cataname.strip()
        catafilename = filename + "/" + cataname + ".json"
        # Each category file is closed as soon as it has been parsed.
        with open(catafilename) as apps_file:
            apps = json.load(apps_file)
        for app in apps.items():
            raw_name, details = app
            # "/" is replaced, presumably because names double as file names
            # elsewhere -- confirm.
            name = raw_name.strip().replace("/", " ")
            time.sleep(0.2)
            post = {"catagory": cataname, "appname": name}
            result = MongoUtil.find_one("app_table", post)
            print("存入的app" + name)
            if result is None:
                print(cataname + "->" + name + "   ->该app未存入数据库")
                app_not_exist.append(app)
                continue
            saveAppCapacityToDB(result["_id"], date, details["install"])
Beispiel #20
0
def saveAppCapacityToDB(appid, date, capacity):
    """Save the capacity of an app for a day unless that day is already stored.

    Args:
        appid: Identifier stored in the ``appid`` field.
        date: Day stored in the ``date`` field.
        capacity: Value stored in the ``capacity`` field.
    """
    record = {"appid": appid, "date": date}
    if MongoUtil.isExist("capacity_table", record):
        return
    record["capacity"] = capacity
    MongoUtil.save("capacity_table", record)
Beispiel #21
0
def getCapacityCount(date):
    """Return the ``count()`` of the ``capacity_table`` records of ``date``."""
    records_of_day = MongoUtil.find("capacity_table", {"date": date})
    return records_of_day.count()
Beispiel #22
0
#{'appid': ObjectId('58648f1282939b10b3d46b88'), 'wilson_lower_score': 0.3208923096194997, 'comment_count': 499, 'neg_count': 254, 'applause_rate': 0.31956521739130433, 'pos_count': 147, 'wilson_top_score': 0.4148067884968993}


# Convert an install-count text into a number of installs
def install2num(install):
    """Convert an install-count text such as ``"1.5万"`` into an integer.

    The first decimal number found in ``install`` is scaled by 10**8 when the
    text contains '亿' and by 10**4 when it contains '万'. Both factors apply
    if both characters are present.

    Args:
        install: Text containing a number and optional unit characters.

    Returns:
        The install count as ``int``, rounded to the nearest integer so that
        binary floating-point error cannot drop a unit (``"0.57万"`` is 5700,
        not 5699). ``0`` when the text contains no number.
    """
    numbers = re.findall(r"\d+\.?\d*", install)
    if not numbers:
        # No digits at all; the previous code raised IndexError here.
        return 0
    result = float(numbers[0])
    if '亿' in install:
        result *= 100000000
    if '万' in install:
        result *= 10000
    return int(round(result))

# Walk every capacity record and pair its install text with a numeric value
allApps = MongoUtil.find("capacity_table",{})

datas = []
code = 0
for code, appinfo in enumerate(allApps, start=1):
    _id = appinfo["_id"]
    appid = appinfo["appid"]
    date = appinfo["date"]
    capacity = appinfo["capacity"]
    capacity_num = install2num(capacity)

    # Same fields as the stored record plus the derived "capacity_num"
    data = dict(_id=_id, appid=appid, date=date, capacity=capacity,
                capacity_num=capacity_num)
    print(data)
    datas.append(data)
Beispiel #23
0
            return

        if incre[1] <= 0:
            post["wilson_lower_rate"] = -WilsonScoreUtil.confidence_2(
                -incre[1], capacity)
        else:
            post["wilson_lower_rate"] = WilsonScoreUtil.confidence_2(
                incre[1], capacity)
        # print(post)
        posts.append(post)
    MongoUtil.upsert_mary("capacity_rate_table", posts)


if __name__ == '__main__':
    # Compute and store the growth records of every app, category by category.
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)
    for cata in catas:
        posts.clear()
        is_not_exist.clear()
        # Manual skip list: put category names here to leave them out.
        if cata in []:
            continue

        print("目录:" + cata)
        appinfo_list = MongoUtil.find("app_table", {"catagory": cata})
        for appinfo in appinfo_list:
            getChainRateStore(appinfo)

        print(len(posts))
        # NOTE(review): the file is rewritten for every category, so it ends
        # up holding only the ids collected for the last one -- confirm.
        with open('../file/not_exist/not_exist_appid', 'wb') as dump_file:
            pickle.dump(is_not_exist, dump_file)
Beispiel #24
0
def getMaxDownLoadCapacity(limit=10):
    """Return the result of ``MongoUtil.capacity_find_most`` for ``limit``."""
    return MongoUtil.capacity_find_most(limit)
Beispiel #25
0
def deleteAppDieveryWord(cataname,appname):
    """Remove the stored word records of one app from its category collection.

    The app is looked up by category AND name, as the other lookups in this
    code do, so that an app with the same name in another category cannot be
    picked by mistake.

    Args:
        cataname: Category of the app; also the name of the collection the
            word records are removed from.
        appname: Value matched against the ``appname`` field of ``app_table``.
    """
    app = MongoUtil.find_one("app_table", {"catagory":cataname, "appname":appname})
    if app is None:
        # Nothing to delete; previously this failed on ``None["_id"]``.
        print("\""+cataname+" "+appname+"\" 未存入数据库中,请先存储")
        return
    MongoUtil.remove(cataname, {"appid":app["_id"]})
    print("已从“"+cataname+"”数据库中删除“"+appname+"”应用的分词信息")
Beispiel #26
0
def scanCatagoryInfo(catagory):
    """Print the ``count()`` of the ``app_table`` records of ``catagory``."""
    apps_of_category = MongoUtil.find("app_table",{"catagory":catagory})
    print("app数量: "+str(apps_of_category.count()))