def scanMostPositiveApps(order=-1, limit=50):
    """Print apps ranked by the ``wilson_lower_score`` of their comments.

    Reads ``emotion_comment`` rows through ``MongoUtil.sort`` (sorted on
    ``wilson_lower_score``), looks up the matching ``app_table`` document for
    each row and prints it with the row's comment statistics merged in.

    Args:
        order: sort direction forwarded to ``MongoUtil.sort``
            (presumably -1 means descending; confirm against MongoUtil).
        limit: maximum number of rows requested from ``MongoUtil.sort``.
    """
    copied_fields = (
        "comment_count",
        "pos_count",
        "neg_count",
        "applause_rate",
        "wilson_lower_score",
    )
    results = MongoUtil.sort("emotion_comment", "wilson_lower_score",
                             order=order, limit=limit)
    for result in results:
        appinfo = MongoUtil.find_one("app_table", {"_id": result["appid"]})
        if appinfo is None:
            # The statistics row refers to an app that is no longer in
            # app_table; there is nothing to annotate or print for it.
            continue
        for field in copied_fields:
            appinfo[field] = result[field]
        print(appinfo)
        print()
def saveAppToDB(appinfo):
    """Insert ``appinfo`` into ``app_table`` unless it is already stored.

    An app counts as stored when ``app_table`` already has a document with
    the same ``catagory`` and ``appname``. The inserted document also carries
    the url, description, apk and today's date (``YYYY-MM-DD``, local time).

    Args:
        appinfo: object exposing ``cata``, ``name``, ``url``, ``descripe``
            and ``apk`` attributes.
    """
    today = time.strftime('%Y-%m-%d', time.localtime())
    post = {
        "catagory": appinfo.cata,
        "appname": appinfo.name,
        "url": appinfo.url,
        "descripe": appinfo.descripe,
        "apk": appinfo.apk,
        "date": today,
    }
    identity = {"catagory": appinfo.cata, "appname": appinfo.name}
    if MongoUtil.isExist("app_table", identity):
        return
    MongoUtil.insert("app_table", post)
    print(appinfo.cata + appinfo.name)
def createDex():
    """Create the lookup indexes used by the word/app collections.

    Indexes ``app_table.appid`` and ``word_table.word``, then, for every
    category name listed in ``const.WANDOUJIA_CATA_JSON_FILE``, indexes the
    ``appid`` and ``wordid`` fields of the collection named after that
    category.
    """
    # The trailing False is forwarded to MongoUtil.create_index unchanged
    # (presumably the "unique" flag -- TODO confirm against MongoUtil).
    MongoUtil.create_index("app_table", "appid", False)
    MongoUtil.create_index("word_table", "word", False)

    # Use a context manager so the category file is closed after loading.
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)

    for cataname in catas:
        collection = cataname.strip()
        for field in ("appid", "wordid"):
            MongoUtil.create_index(collection, field, False)
def scanMostFastGrownApps(order=-1, limit=50, capacity_limit=10000, date="2017-01-23"):
    """Print the apps with the highest install growth rate on ``date``.

    Walks the ``capacity_rate_table`` rows for ``date`` sorted by
    ``incre_rate`` and prints the matching ``app_table`` document, annotated
    with ``incre_rate`` and ``wilson_lower_rate``, for each app whose
    ``capacity_num`` on that date is at least ``capacity_limit``.

    Args:
        order: sort direction forwarded to ``MongoUtil.sort_with_values``.
        limit: maximum number of apps to print. Only apps that are actually
            printed count against it; rows filtered out do not.
        capacity_limit: minimum ``capacity_num`` an app needs to be printed.
        date: date string used to select rate and capacity rows.
    """
    results = MongoUtil.sort_with_values(
        "capacity_rate_table", {"date": date}, "incre_rate", order=order)
    shown = 0
    for result in results:
        if shown >= limit:
            break
        appid = result["appid"]
        capacityinfo = MongoUtil.find_one(
            "capacity_table", {"appid": appid, "date": date})
        # Skip apps without a capacity row for this date or below the
        # requested threshold (previously a hard-coded 10000 was used and
        # the capacity_limit argument was ignored).
        if capacityinfo is None or capacityinfo["capacity_num"] < capacity_limit:
            continue
        appinfo = MongoUtil.find_one("app_table", {"_id": appid})
        if appinfo is None:
            # Rate row without a matching app document: nothing to print.
            continue
        appinfo["incre_rate"] = result["incre_rate"]
        appinfo["wilson_lower_rate"] = result["wilson_lower_rate"]
        print(appinfo)
        print()
        shown += 1
def saveAllComentEmotionData():
    """Compute and store comment-emotion data for every app not yet processed.

    Loads the classifier through ``load_model()``, then for each category in
    ``const.WANDOUJIA_CATA_JSON_FILE`` iterates the ``app_table`` documents
    of that category and calls ``saveCommentEmotionData`` for every app that
    has no ``emotion_comment`` document yet.
    """
    model, best_words = load_model()
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)

    for cataname in catas:
        apps = MongoUtil.find("app_table", {"catagory": cataname})
        for code, app in enumerate(apps, 1):
            # Running counter printed as lightweight progress output.
            print(code, end=" ")
            # The id and name come from the current document; the original
            # referenced undefined names `appid` / `appname` here.
            if MongoUtil.isExist("emotion_comment", {"appid": app["_id"]}):
                print(app["appname"] + "已经存在了")
                # Skip only this app; the remaining apps of the category
                # still need processing.
                continue
            saveCommentEmotionData(model, best_words, app)
def scanAppInfo(appname, catagory=""):
    """Print the stored record and description file of every matching app.

    Args:
        appname: value matched against ``app_table.appname``.
        catagory: optional category filter; when empty, apps from every
            category with that name are printed.
    """
    if catagory == "":
        query = {"appname": appname}
    else:
        query = {"catagory": catagory, "appname": appname}

    for appinfo in MongoUtil.find("app_table", query):
        print("基本信息: ")
        print(appinfo)
        descripe_path = ("../file/apps_detail_descripe/" + appinfo["catagory"]
                         + "/" + appinfo["appname"] + ".json")
        if os.path.exists(descripe_path):
            # Context manager guarantees the description file is closed.
            with open(descripe_path) as descripe_file:
                print("应用描述:")
                print(descripe_file.read())
        print()
def getRecommendInfo(appinfo, date):
    """Assemble the recommendation record of one app for ``date``.

    Args:
        appinfo: ``app_table`` document (needs ``_id``, ``appname``,
            ``catagory``).
        date: date string used to select the capacity and capacity-rate rows.

    Returns:
        ``None`` when the app has no ``capacity_table`` row, no
        ``capacity_rate_table`` row for ``date``, or no ``emotion_comment``
        row. Otherwise a dict with the app identity, capacity, growth rate,
        comment statistics and the weighted ``recommend_score``.
        NOTE(review): if building the record fails part-way, the partially
        filled dict is still returned (original best-effort behaviour), so
        callers may receive a record without ``recommend_score``.
    """
    app_key = appinfo["_id"]
    capacity_info = MongoUtil.find_one(
        "capacity_table", {"appid": app_key, "date": date})
    if capacity_info is None:
        return None
    capacity_rate_info = MongoUtil.find_one(
        "capacity_rate_table", {"appid": app_key, "date": date})
    if capacity_rate_info is None:
        return None
    comment_info = MongoUtil.find_one("emotion_comment", {"appid": app_key})
    if comment_info is None:
        return None

    recommend_info = {}
    try:
        recommend_info["appname"] = appinfo["appname"]
        recommend_info["catagory"] = appinfo["catagory"]
        recommend_info["appid"] = app_key
        recommend_info["capacity"] = capacity_info["capacity_num"]
        recommend_info["date"] = date
        recommend_info["capacity_rate"] = capacity_rate_info["incre_rate"]
        recommend_info["comment_wilson_lower_score"] = comment_info[
            "wilson_lower_score"]
        recommend_info["comment_count"] = comment_info["comment_count"]
        # Weighted sum of the normalised signals plus a correction term.
        recommend_info["recommend_score"] = (
            getLastCapacityNormalization(recommend_info["capacity_rate"]) *
            last_capacity_rate_param +
            getCapacityNormalization(recommend_info["capacity"]) *
            capacity_param +
            getApplauseNormalization(
                recommend_info["comment_wilson_lower_score"]) *
            applause_param +
            getCommentCountNormalization(recommend_info["comment_count"]) *
            comment_count_param +
            correct(recommend_info))
    except Exception as exc:
        # `except Exception` lets KeyboardInterrupt/SystemExit propagate
        # (the bare `except:` swallowed them). `.get` keeps the handler from
        # raising when the failure happened before "appname" was copied.
        print("-->" + str(recommend_info.get("appname", "")) + " " + repr(exc))
    return recommend_info
def __init__(self, appname, cataname=""):
    """Load the app record and compute its keyword statistics.

    Looks the app up in ``app_table`` (by name, and by category too when
    ``cataname`` is given), stores the word frequencies returned by
    ``self.frequencyscore()`` and, when at least 100 words were counted,
    the result of ``self.tf_idf()`` in ``self.tf_idfdict`` (left as ``None``
    otherwise).
    """
    self.tf_idfdict = None

    if cataname == "":
        query = {"appname": appname}
    else:
        query = {"catagory": cataname, "appname": appname}
    self.app = MongoUtil.find_one("app_table", query)

    if self.app is None:
        print("该app未存储在数据库,可能原因:查询不准确,未存储入数据库,数据未更新")
    print(self.app)

    frequency = self.frequencyscore()
    self.worddict, self.wordcount = frequency

    if self.wordcount < 100:
        # Too few words for meaningful keywords: warn and stop here.
        print("该app的评论数量过少,获取关键词将会不准确")
    else:
        print("评论总数是:" + str(self.wordcount))
        self.tf_idfdict = self.tf_idf()
def saveRecommendApps(date):
    """Compute and store recommendation records for ``date``.

    Every ``app_table`` document is passed to ``getRecommendInfo``; records
    that come back non-``None`` and are not yet in ``recommend_table`` for
    this date are collected and written with ``MongoUtil.upsert_mary``.
    """
    # Materialise the query result before the loop issues further queries.
    all_apps = list(MongoUtil.find("app_table", {}))
    pending = []
    for app in all_apps:
        recommend_info = getRecommendInfo(app, date)
        if recommend_info is None:
            continue
        stored = MongoUtil.isExist("recommend_table", {
            "appid": app["_id"],
            "date": date
        })
        if stored:
            print(date + " " + app["appname"] + " 已经存在")
        else:
            print(app["appname"])
            pending.append(recommend_info)
    MongoUtil.upsert_mary("recommend_table", pending)
def get_app_each_comment(appname, cataname=""):
    """Rebuild the segmented comments of an app from the word-location rows.

    Args:
        appname: app name to look up in ``app_table``.
        cataname: optional category used to narrow the lookup.

    Returns:
        ``None`` when the app is not found; otherwise a dict mapping each
        ``location`` value to the list of words stored for that location, in
        the order the rows are returned.
    """
    query = {"appname": appname}
    if cataname != "":
        query = {"catagory": cataname, "appname": appname}
    app = MongoUtil.find_one("app_table", query)
    print(app)
    if app is None:
        return

    # Word locations live in a collection named after the app's category.
    rows = MongoUtil.find(app["catagory"], {"appid": app["_id"]})
    comments = {}
    for row in rows:
        word_id = row["wordid"]
        location = row["location"]
        word_doc = MongoUtil.find_one("word_table", {"_id": word_id})
        comments.setdefault(location, []).append(word_doc["word"])
    return comments
def savetoDB(appid, comment_count, pos_count, neg_count):
    """Store the comment sentiment summary of one app in ``emotion_comment``.

    Apps with fewer than ``useful_comment_threshold`` comments are only
    reported on stdout and not stored. Otherwise the positive rate and the
    two values returned by ``WilsonScoreUtil.confidence`` are saved together
    with the raw counts.
    """
    summary = ("总的有效评论数量:" + str(comment_count) + " 好评数量:" +
               str(pos_count) + " 差评数量:" + str(neg_count))

    if comment_count < useful_comment_threshold:
        print(summary)
        print("该app的评论数小于100,无参考意义")
        print()
        return

    applause_rate = float(pos_count / comment_count)
    top_score, lower_score = WilsonScoreUtil.confidence(pos_count, neg_count)
    print(summary + " 好评率:" + str(applause_rate))
    print()

    record = {
        "appid": appid,
        "comment_count": comment_count,
        "pos_count": pos_count,
        "neg_count": neg_count,
        "applause_rate": applause_rate,
        "wilson_top_score": top_score,
        "wilson_lower_score": lower_score,
    }
    MongoUtil.save("emotion_comment", record)
def tf_idf(self):
    """Compute a TF-IDF value for every word in ``self.worddict``.

    Term frequency is the word's count divided by ``self.wordcount``. The
    inverse document frequency is ``log(D / (d + 1))`` where ``D`` is the
    number of *other* apps in this app's category collection and ``d`` the
    number of other apps containing the word.

    Returns:
        ``None`` (after printing a hint) when the word frequencies are
        missing/empty or fewer than 100 words were counted; otherwise a dict
        mapping each word to its TF-IDF value.
    """
    if not self.worddict:
        print("请初始化词频统计")
        return
    if self.wordcount < 100:
        print("该app的评论数量过少,获取关键词将会不准确")
        return

    collection = self.app["catagory"]
    # Total number of documents (apps) in the category, minus this app.
    docu_count = len(
        MongoUtil.distinct_count(collection, "appid", value=None)) - 1

    tf_idfdict = {}
    for word, occurrences in self.worddict.items():
        if docu_count > 0:
            result = MongoUtil.find_one("word_table", {"word": word})
            wordid = result["_id"]
            # Apps containing the word, minus this app itself.
            include_count = len(
                MongoUtil.distinct_count(collection, "appid",
                                         value={"wordid": wordid})) - 1
            # Keep the denominator at 1 or more so the ratio is defined.
            wordidf = float(
                math.log(docu_count / max(include_count + 1, 1)))
        else:
            # No other app to compare against: the original clamped
            # docu_count to 0 and then crashed in math.log(0). Without
            # other documents a word has no discriminating power.
            wordidf = 0.0
        wordtf = float(occurrences / self.wordcount)
        tf_idfdict[word] = wordtf * wordidf

    for word, score in tf_idfdict.items():
        print(word + " 出现的次数:" + str(self.worddict[word]) +
              " tf-idf计算值:" + str(score))
    return tf_idfdict
def scanCatagorys():
    """Print every category with its app count, then the overall app count.

    Category names come from ``const.WANDOUJIA_CATA_JSON_FILE``; the
    per-category count is printed by ``scanCatagoryInfo``.
    """
    # Context manager so the category file is closed after loading.
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)

    print("所有目录信息:")
    for code, cataname in enumerate(catas, 1):
        cataname = cataname.strip()
        print(str(code) + ". " + cataname, end=" ")
        scanCatagoryInfo(cataname)
        print()

    count = MongoUtil.find("app_table", {}).count()
    print("总数:" + str(count))
def delivery_words(appid, content):
    """Segment ``content`` and queue one word-location record per kept word.

    Control characters are replaced by spaces, the text is segmented with
    jieba, and every word that is not a stop word, punctuation, newline,
    single space or pure digit string is registered in ``word_table`` (if
    new) and appended to the module-level ``posts`` list as
    ``{"appid": ..., "wordid": ...}``.
    """
    # Strip garbled control characters.
    cleaned = re.sub('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]', ' ', content)
    # cut_all=False selects jieba's precise (non-full) segmentation mode.
    for word in jieba.cut(cleaned, cut_all=False):
        if (word in stopWords or word in punctuations or word == '\n'
                or word == ' ' or word.isdigit()):
            continue
        post_word = {"word": word}
        if not MongoUtil.isExist("word_table", post_word):
            MongoUtil.insert("word_table", post_word)
        wordid = MongoUtil.find_one("word_table", post_word)['_id']
        if wordid is None:
            print(post_word)
        posts.append({"appid": appid, "wordid": wordid})
def getRecommendApps(limit=10, date="2016-12-08"):
    """Return the ``recommend_table`` rows for ``date`` ranked by score.

    Args:
        limit: maximum number of rows, forwarded to
            ``MongoUtil.sort_with_values``.
        date: date string selecting the rows.

    Returns:
        Whatever ``MongoUtil.sort_with_values`` returns for a sort on
        ``recommend_score`` with ``order=-1``.
    """
    return MongoUtil.sort_with_values(
        "recommend_table",
        {"date": date},
        "recommend_score",
        limit=limit,
        order=-1,
    )
def getChainRateStore(appinfo):
    """Queue and upsert the per-date growth-rate records of one app.

    For every ``(date, incre_rate)`` pair returned by
    ``getChainIncreRateCapacity`` a record with the Wilson-adjusted rate is
    appended to the module-level ``posts`` list, which is then written to
    ``capacity_rate_table``. Apps without growth data are logged to
    ``../file/not_exist/not_exist_apps`` and remembered in ``is_not_exist``.

    Args:
        appinfo: ``app_table`` document (``_id``, ``appname``, ``catagory``).
    """
    appid = appinfo["_id"]
    appname = appinfo["appname"]
    cataname = appinfo["catagory"]
    _begin_date, _end_date, app_incre = getChainIncreRateCapacity(
        appname, cataname)
    if app_incre is None:
        # `with` closes the log even if a write fails.
        with open("../file/not_exist/not_exist_apps", "a") as missing_log:
            is_not_exist.append(appid)
            missing_log.write(cataname + " ")
            missing_log.write(appname + "\n")
        return

    capacitys = getDowloadCapacity(appname, cataname=cataname)
    for incre_date, incre_rate in app_incre.items():
        capacity = capacitys[incre_date]
        if incre_rate < 0:
            # NOTE(review): a negative rate aborts the whole app before the
            # upsert below; records already appended to `posts` stay queued.
            return
        post = {"appid": appid, "date": incre_date, "incre_rate": incre_rate}
        if incre_rate <= 0:
            # Only reachable for a rate of exactly 0, given the return above.
            post["wilson_lower_rate"] = -WilsonScoreUtil.confidence_2(
                -incre_rate, capacity)
        else:
            post["wilson_lower_rate"] = WilsonScoreUtil.confidence_2(
                incre_rate, capacity)
        posts.append(post)
    MongoUtil.upsert_mary("capacity_rate_table", posts)
def scan_cata_app(cata):
    """Segment and store the description words of every app in ``cata``.

    Apps that already have rows in ``app_detail_descripe`` are skipped. For
    the others the description returned by ``read_descripe`` is segmented by
    ``delivery_words`` (which fills the module-level ``posts`` list) and the
    resulting records are upserted into ``app_detail_descripe``.
    """
    posts.clear()
    # Materialise the query result before issuing further queries.
    apps = list(MongoUtil.find("app_table", {"catagory": cata}))
    for code, app in enumerate(apps, 1):
        posts.clear()
        print(code, end=" ")
        print(app["appname"])
        if MongoUtil.isExist("app_detail_descripe", {"appid": app["_id"]}):
            continue
        content = read_descripe(cata, app["appname"])
        if content is not None:
            delivery_words(app["_id"], content)
        print(len(posts))
        print()
        if posts:
            MongoUtil.upsert_mary("app_detail_descripe", posts)
def deliveryWords(appinfo, filename):
    """Segment the lines of ``filename`` and store each word's location.

    Every kept word is registered in ``word_table`` (if new) and a
    ``{"appid", "wordid", "location"}`` record, where ``location`` is the
    1-based line number, is inserted into the collection named after the
    app's category. Nothing is written when the app is not in ``app_table``
    or already has word records.

    Args:
        appinfo: object exposing ``cata`` and ``name`` attributes.
        filename: path of the text file to segment, one entry per line.
    """
    print(appinfo.name)
    # Read everything up front and close the file right away.
    with open(filename) as source:
        contents = [line.strip() for line in source]

    result = MongoUtil.find_one(
        "app_table", {"catagory": appinfo.cata, "appname": appinfo.name})
    if result is None:
        print("\"" + appinfo.cata + " " + appinfo.name + "\" 未存入数据库中,请先存储")
        return
    appid = result['_id']
    if MongoUtil.find_one(appinfo.cata, {"appid": appid}) is not None:
        print("\"" + appinfo.cata + " " + appinfo.name + "\" 已经分词存入数据库,不必重复")
        return

    # Compiled once instead of once per line: strips garbled control chars.
    control_chars = re.compile('[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f]')
    for line_num, line in enumerate(contents, 1):
        time.sleep(0.1)
        line = control_chars.sub(' ', line)
        # cut_all=False selects jieba's precise (non-full) segmentation mode.
        for word in jieba.cut(line, cut_all=False):
            if (word in stopWords or word in punctuations or word == '\n'
                    or word == ' ' or word.isdigit()):
                continue
            post_word = {"word": word}
            if not MongoUtil.isExist("word_table", post_word):
                MongoUtil.insert("word_table", post_word)
            wordid = MongoUtil.find_one("word_table", post_word)['_id']
            if wordid is None:
                print(post_word)
            MongoUtil.insert(appinfo.cata, {
                "appid": appid,
                "wordid": wordid,
                "location": line_num,
            })
def getCataAppsInfo(filename):
    """Store the install counts found in one crawl directory.

    ``filename`` is a directory holding one ``<category>.json`` file per
    category listed in ``const.WANDOUJIA_CATA_JSON_FILE``. For each app in
    those files that exists in ``app_table`` the ``install`` value is saved
    through ``saveAppCapacityToDB`` under the date returned by
    ``readInfoFile``; unknown apps are appended to the module-level
    ``app_not_exist`` list.
    """
    print(filename)
    date, _count = readInfoFile(filename)
    with open(const.WANDOUJIA_CATA_JSON_FILE) as cata_file:
        catas = json.load(cata_file)

    for cataname in catas:
        cataname = cataname.strip()
        # Context manager closes each category file after loading.
        with open(filename + "/" + cataname + ".json") as apps_file:
            apps = json.load(apps_file)
        for app in apps.items():
            # "/" in the crawled name is replaced by a space for the lookup.
            name = app[0].strip().replace("/", " ")
            time.sleep(0.2)
            result = MongoUtil.find_one(
                "app_table", {"catagory": cataname, "appname": name})
            print("存入的app" + name)
            if result is None:
                print(cataname + "->" + name + " ->该app未存入数据库")
                app_not_exist.append(app)
            else:
                saveAppCapacityToDB(result["_id"], date, app[1]["install"])
def saveAppCapacityToDB(appid, date, capacity):
    """Save the install count of an app for ``date`` unless already stored.

    A ``capacity_table`` document with the same ``appid`` and ``date`` counts
    as already stored; in that case nothing is written.
    """
    record = {"appid": appid, "date": date}
    if MongoUtil.isExist("capacity_table", record):
        return
    record["capacity"] = capacity
    MongoUtil.save("capacity_table", record)
def getCapacityCount(date):
    """Return the number of ``capacity_table`` rows recorded for ``date``."""
    rows_for_date = MongoUtil.find("capacity_table", {"date": date})
    return rows_for_date.count()
# Sample document:
# {'appid': ObjectId('58648f1282939b10b3d46b88'), 'wilson_lower_score': 0.3208923096194997, 'comment_count': 499, 'neg_count': 254, 'applause_rate': 0.31956521739130433, 'pos_count': 147, 'wilson_top_score': 0.4148067884968993}


def install2num(install):
    """Convert an install-count label into an integer number of installs.

    The first decimal number found in ``install`` is multiplied by
    100000000 when the text contains '亿' and by 10000 when it contains '万'
    (both factors apply if both characters are present), then truncated to
    ``int``.

    Raises:
        IndexError: if ``install`` contains no digits.
    """
    number = float(re.findall(r"\d+\.?\d*", install)[0])
    if number == 0:
        return 0
    for unit, factor in (('亿', 100000000), ('万', 10000)):
        if unit in install:
            number *= factor
    return int(number)


# Build, for every capacity_table document, a copy that additionally carries
# the numeric install count parsed from its textual "capacity" field.
allApps = MongoUtil.find("capacity_table", {})
datas = []
code = 0
for appinfo in allApps:
    code += 1
    _id, appid, date, capacity = (
        appinfo[key] for key in ("_id", "appid", "date", "capacity"))
    capacity_num = install2num(capacity)
    data = {
        "_id": _id,
        "appid": appid,
        "date": date,
        "capacity": capacity,
        "capacity_num": capacity_num,
    }
    print(data)
    datas.append(data)
return if incre[1] <= 0: post["wilson_lower_rate"] = -WilsonScoreUtil.confidence_2( -incre[1], capacity) else: post["wilson_lower_rate"] = WilsonScoreUtil.confidence_2( incre[1], capacity) # print(post) posts.append(post) MongoUtil.upsert_mary("capacity_rate_table", posts) if __name__ == '__main__': #聊天社交 catas = json.load(open(const.WANDOUJIA_CATA_JSON_FILE)) for cata in catas: # cata = "生活服务" posts.clear() is_not_exist.clear() if cata in []: continue print("目录:" + cata) appinfo_list = MongoUtil.find("app_table", {"catagory": cata}) for appinfo in appinfo_list: getChainRateStore(appinfo) print(len(posts)) pickle.dump(is_not_exist, open('../file/not_exist/not_exist_appid', 'wb'))
def getMaxDownLoadCapacity(limit=10):
    """Return the result of ``MongoUtil.capacity_find_most(limit)``.

    Args:
        limit: forwarded unchanged to ``MongoUtil.capacity_find_most``
            (presumably the number of top entries wanted -- confirm there).
    """
    return MongoUtil.capacity_find_most(limit)
def deleteAppDieveryWord(cataname, appname):
    """Delete the word-segmentation records of one app.

    The app is looked up in ``app_table`` by category *and* name (the same
    key the other lookups in this file use), so an app with the same name in
    another category is never picked by mistake. Its records are then
    removed from the collection named ``cataname``.

    Args:
        cataname: category of the app; also the name of the collection that
            holds its word records.
        appname: name of the app.
    """
    app = MongoUtil.find_one(
        "app_table", {"catagory": cataname, "appname": appname})
    if app is None:
        # Nothing to delete: report instead of failing on a None subscript.
        print("\"" + cataname + " " + appname + "\" 未存入数据库中,请先存储")
        return
    MongoUtil.remove(cataname, {"appid": app["_id"]})
    print("已从“"+cataname+"”数据库中删除“"+appname+"”应用的分词信息")
def scanCatagoryInfo(catagory):
    """Print how many ``app_table`` documents belong to ``catagory``."""
    matching_apps = MongoUtil.find("app_table", {"catagory": catagory})
    print("app数量: ", matching_apps.count(), sep="")