def get_msg_type_aggs(data_dict, date):
    """Aggregate each user's message behaviour for one day and store it."""
    ta = time.strptime(date + " 00:00:00", "%Y-%m-%d %H:%M:%S")
    ts = int(time.mktime(ta))  # start of the day, epoch seconds
    ta = time.strptime(date + " 23:59:59", "%Y-%m-%d %H:%M:%S")
    te = int(time.mktime(ta))  # end of the day, epoch seconds
    user_behavior_dict = {}
    for uid in data_dict:
        df = DataFrame(data_dict[uid])
        behavior_dict = get_msg_aggs(df)
        behavior_dict["sensitivenum"] = Information.objects.filter(
            uid=uid, timestamp__gte=ts, timestamp__lt=te).count()
        behavior_dict["timestamp"] = ts
        behavior_dict["uid"] = uid
        behavior_dict["store_date"] = date
        user_behavior_dict["%s_%s" % (ts, uid)] = behavior_dict
    sql_insert_many(cursor, "UserBehavior", "ub_id", user_behavior_dict)

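# The "YYYY-MM-DD" -> epoch-seconds conversion above recurs in wordcount()
# and get_user_keywords() below. A minimal sketch of a helper it could be
# factored into (day_bounds is a hypothetical name, not part of this
# module; it reuses the module's existing time import):
def day_bounds(date):
    """Return (start, end) epoch seconds for a 'YYYY-MM-DD' day string."""
    start = int(time.mktime(time.strptime(date + " 00:00:00", "%Y-%m-%d %H:%M:%S")))
    end = int(time.mktime(time.strptime(date + " 23:59:59", "%Y-%m-%d %H:%M:%S")))
    return start, end
# Usage sketch: ts, te = day_bounds("2020-01-01")
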
def get_user_domain(word_dict, date):
    """Infer each user's domain distribution and store the top-ranked domain."""
    domain_dict = domain_tfidf()
    user_domain = {}
    ts = date.timestamp()
    # Per-user ranked (domain, probability) pairs.
    domain_p = get_p(domain_dict, word_dict)
    for k in word_dict.keys():
        domain_json = json.dumps(domain_p[k])
        # The first entry is the best-ranked domain; fall back to "other"
        # when no domain matched.
        md = domain_p[k][0][0] if len(domain_p[k]) else "other"
        user_domain["%s_%s" % (ts, k)] = {
            "uid": k,
            "timestamp": ts,
            "main_domain": md,
            "domains": domain_json,
            "store_date": date
        }
    sql_insert_many(cursor, "UserDomain", "ud_id", user_domain)
    update(user_domain)

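# For reference: the indexing above assumes get_p returns, per uid, a list
# of (label, probability) pairs sorted best-first, e.g. (hypothetical data)
#   {"uid1": [["sports", 0.62], ["tech", 0.38]], "uid2": []}
# so domain_p[k][0][0] is the top domain and an empty list means no signal,
# hence the "other" fallback.
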
def wordcount(text_dict, date):
    """Count word frequencies per user; returns {uid: {word: frequency}}."""
    stopwords = stopwordslist()
    word_dict = {}  # {uid: {word: frequency}}
    user_wc = {}
    for k, v in text_dict.items():
        word_list = {}
        count = 0
        for tokens in v:
            for word in tokens:
                if word not in stopwords and word != " ":
                    count += 1
                    word_list[word] = word_list.get(word, 0) + 1
        word_list["count"] = count
        word_dict[k] = word_list
    ta = time.strptime(date + " 00:00:00", "%Y-%m-%d %H:%M:%S")
    ts = int(time.mktime(ta))
    for k in word_dict.keys():
        word_json = json.dumps(word_dict[k])
        uwc_id = "%s_%s" % (int(time.time()), k)
        user_wc[uwc_id] = {
            "uid": k,
            "timestamp": ts,
            "wordcount": word_json,
            "store_date": date
        }
    sql_insert_many(cursor, "WordCount", "uwc_id", user_wc)
    return word_dict

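# A self-contained sketch of the counting idiom above, with hypothetical
# tokenized input (stopword filtering omitted):
#   tokens_per_post = [["big", "data", "data"]]
#   counts = {}
#   for tokens in tokens_per_post:
#       for w in tokens:
#           counts[w] = counts.get(w, 0) + 1
#   # counts == {"big": 1, "data": 2}
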
def get_user_activity_aggs(data_dict, date):
    """Aggregate each user's activity (status/sensitive counts) by location."""
    end_time = datetime.datetime.strptime(
        date + " 23:59:59", '%Y-%m-%d %H:%M:%S').timestamp()
    start_time = end_time - 24 * 60 * 60
    user_activity_dict = {}
    ip_dict = {}
    for uid in data_dict:
        df = DataFrame(data_dict[uid])
        df = df.astype(object).where(pd.notnull(df), None)
        geo_dict = df.groupby(df["geo"]).size().to_dict()
        # Keep the last IP seen for each location. Per-status IP data is
        # not available yet and is to be added later.
        for index, row in df.iterrows():
            ip_dict[row["geo"]] = row["ip"]
        for k in geo_dict:
            ip = ip_dict[k]
            if ip is None:
                ip = "未知"  # "unknown"
            sensitivenum = Information.objects.filter(
                uid=uid,
                timestamp__gte=start_time,
                timestamp__lt=end_time,
                geo=k).count()
            user_activity_dict["%s_%s_%s" % (end_time, uid, k)] = {
                "uid": uid,
                "timestamp": end_time,
                "geo": k,
                "send_ip": ip,
                "statusnum": geo_dict[k],
                "sensitivenum": sensitivenum,
                "store_date": date
            }
    sql_insert_many(cursor, "UserActivity", "ua_id", user_activity_dict)

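# A self-contained sketch of the pandas groupby-size idiom above, with
# hypothetical rows:
#   df = DataFrame([{"geo": "Beijing", "ip": "1.2.3.4"},
#                   {"geo": "Beijing", "ip": None},
#                   {"geo": "Shanghai", "ip": "5.6.7.8"}])
#   df.groupby(df["geo"]).size().to_dict()
#   # -> {"Beijing": 2, "Shanghai": 1}
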
def cal_user_emotion(word_dict, thedate):
    """
    Compute each user's sentiment counts.
    Labels: 0 = negative, 1 = neutral, 2 = positive.
    :param word_dict: {uid: [weibo, ...]}
    :return: None
    """
    # Load the word vectors.
    with open('../profile_cal/sentiment_model_data/weibo_vector.pkl', 'rb') as f:
        weibo_dic = pickle.load(f)
    # Load the trained sentiment model.
    l_m = joblib.load(
        '../profile_cal/sentiment_model_data/sentiment_logical.model')
    user_sentiment_dict = {}
    ts = date2ts(thedate)
    for uid, weibo_list in word_dict.items():
        c = {}
        if weibo_list:  # the user posted on this day
            sentiment = triple_classifier(weibo_list, weibo_dic, l_m)
            c = dict(Counter(sentiment))
        else:
            print("no data for uid %s" % uid)
        # Field names follow the UserSentiment schema, including the
        # original "negtive"/"nuetral" spellings.
        user_sentiment_dict["%s_%s" % (ts, uid)] = {
            "timestamp": ts,
            "uid": uid,
            "negtive": c.get('0', 0),
            "nuetral": c.get('1', 0),
            "positive": c.get('2', 0),
            "store_date": thedate
        }
    sql_insert_many(cursor, "UserSentiment", "us_id", user_sentiment_dict)

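# A self-contained sketch of the Counter tally above, with hypothetical
# classifier output (the .get('0'/'1'/'2') lookups imply string labels):
#   c = dict(Counter(['2', '0', '2', '1']))
#   c.get('2', 0)  # -> 2 positive posts; .get() defaults absent labels to 0
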
def get_user_topic(word_dict, date):
    """Compute each user's topic distribution and store it."""
    topic_dict = topic_tfidf()
    ts = date.timestamp()
    user_topic = {}
    # Per-user ranked (topic, probability) pairs.
    topic_p = get_p(topic_dict, word_dict)
    for k in word_dict.keys():
        topic_json = json.dumps(topic_p[k])
        user_topic["%s_%s" % (ts, k)] = {
            "uid": k,
            "timestamp": ts,
            "topics": topic_json,
            "store_date": date
        }
    sql_insert_many(cursor, "UserTopic", "ut_id", user_topic)

# Matches #hashtag# segments containing ASCII word characters or CJK
# ideographs (including radicals and compatibility ranges).
HASHTAG_RE = re.compile(
    r'#([a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)#',
    re.UNICODE)


def get_user_keywords(text_list, word_dict, date, keywords_num=5):
    """Extract TextRank keywords, #hashtags# and sensitive words per user."""
    hastag_dict = defaultdict(dict)
    keywords_dict = defaultdict(dict)
    user_kw = {}
    tr4w = TextRank4Keyword()
    ta = time.strptime(date + " 00:00:00", "%Y-%m-%d %H:%M:%S")
    ts = int(time.mktime(ta))
    for k, v in text_list.items():
        hastag = {}
        for text in v:
            if not isinstance(text, str):
                continue
            for h in HASHTAG_RE.findall(text):
                hastag[h] = hastag.get(h, 0) + 1
            # In Python 3 the text passed to analyze() must be a str
            # (in Python 2 it had to be a utf-8 encoded str or unicode).
            tr4w.analyze(text=text, lower=True, window=2)
            for item in tr4w.get_keywords(keywords_num, word_min_len=1):
                keywords_dict[k][item['word']] = \
                    keywords_dict[k].get(item['word'], 0) + item['weight']
        hastag_dict[k] = hastag
    sensitive_words_weight = sensitive_word()
    stw_dict = get_p(sensitive_words_weight, word_dict)
    for k in word_dict:
        user_kw["%s_%s" % (ts, k)] = {
            "uid": k,
            "timestamp": ts,
            "keywords": json.dumps(keywords_dict[k], ensure_ascii=False),
            "hastags": json.dumps(hastag_dict[k], ensure_ascii=False),
            "sensitive_words": json.dumps(stw_dict[k], ensure_ascii=False),
            "store_date": date
        }
    sql_insert_many(cursor, "UserKeyWord", "ukw_id", user_kw)

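# A minimal sketch of the textrank4zh calls used above, with a hypothetical
# sentence (run standalone, not at import time):
#   from textrank4zh import TextRank4Keyword
#   tr4w = TextRank4Keyword()
#   tr4w.analyze(text="这是一段用于测试关键词提取的示例文本", lower=True, window=2)
#   # ("a sample sentence for testing keyword extraction")
#   for item in tr4w.get_keywords(5, word_min_len=1):
#       print(item.word, item.weight)   # items also support item['word']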