def coll_sentence_hash():
    logger_9965.info("Begin to collect sentence...")
    exist_set = get_exist_nids()
    limit = 10000
    offset = 10000
    pool = Pool(30)
    while True:
        conn, cursor = get_postgredb_query()
        cursor.execute(cal_sql2, (ignore_cname, limit, offset))
        rows = cursor.fetchall()
        conn.close()
        offset += limit
        if len(rows) == 0:
            break
        all_set = set()
        for r in rows:
            all_set.add(r[0])
        need_to_cal_set = all_set - exist_set
        if len(need_to_cal_set) == 0:
            continue
        same_dict = get_relate_same_news(need_to_cal_set)
        pool.apply_async(cal_process, args=(need_to_cal_set, None, 3, 3, same_dict))  # duplicate threshold is 3; take news from the last 2 days
    pool.close()
    pool.join()
    logger_9965.info("Congratulations! Finish to collect sentences.")
def check_del_sub_subject(sub_id1, sub_id2):
    nids1 = set()
    nids2 = set()
    conn, cursor = get_postgredb_query()
    cursor.execute(check_sql, (sub_id1, ))
    rows = cursor.fetchall()
    for r in rows:
        nids1.add(r[0])
    cursor.execute(check_sql, (sub_id2, ))
    rows = cursor.fetchall()
    for r in rows:
        nids2.add(r[0])
    cursor.close()
    conn.close()
    if (nids1 | nids2 == nids1) or (nids1 | nids2 == nids2):  # one set of nids contains the other
        delete_id = sub_id1 if len(nids1) < len(nids2) else sub_id2
        logger_sub.info('delete subject {}.'.format(delete_id))
        logger_sub.info(' {} :{}'.format(sub_id1, nids1))
        logger_sub.info(' {} :{}'.format(sub_id2, nids2))
        data = {'id': delete_id}
        requests.post(delete_sub_url, data=data, cookies=cookie)
        return delete_id
    return None
def random_predict_nids():
    sql = "select nid from newslist_v2 nv inner join channellist_v2 cl on nv.chid=cl.id " \
          "where cl.cname in %s order by nid desc limit 50"
    conn, cursor = doc_process.get_postgredb_query()
    #print cursor.mogrify(sql, (tuple(chnl_newsnum_dict.keys()),))
    cursor.execute(sql, (tuple(chnl_newsnum_dict.keys()), ))
    rows = cursor.fetchall()
    conn.close()
    return kmeans_predict([r[0] for r in rows])  # unpack the nid from each 1-tuple row
def get_exist_nids():
    conn, cursor = get_postgredb_query()
    cursor.execute(s_nid_sql)
    rows = cursor.fetchall()
    nid_set = set()
    for r in rows:
        nid_set.add(r[0])
    conn.close()
    return nid_set
def get_chname_id_dict():
    global chname_id_dict
    chname_id_sql = "select id, cname from channellist_v2"
    conn, cursor = doc_process.get_postgredb_query()
    cursor.execute(chname_id_sql)
    rows = cursor.fetchall()
    for r in rows:
        chname_id_dict[r[1]] = r[0]
    cursor.close()
    conn.close()
def get_newest_topic_v():
    topic_sql = "select model_v from user_topics_v2 group by model_v"
    conn, cursor = get_postgredb_query()
    cursor.execute(topic_sql)
    rows = cursor.fetchall()
    topic_vs = []
    for row in rows:
        topic_vs.append(row[0])
    conn.close()
    return max(topic_vs)
def add_cover_to_sub():
    conn, cursor = get_postgredb_query()
    cover_sql = "select id from topiclist where type=1 and cover=''"
    cursor.execute(cover_sql)
    rows = cursor.fetchall()
    for r in rows:
        data = {'id': r[0], 'cover': subject_cover}
        requests.put(modify_url, data=data, cookies=cookie)
    cursor.close()
    conn.close()
def get_active_user_info(min_interval=1, min_click=1):
    '''
    Collect information about active users.
    :param min_interval: only users who clicked within this many days count as active
    :param min_click: minimum number of clicks per user
    :return:
    '''
    nt = datetime.datetime.now()
    t = nt.strftime('%Y-%m-%d %H:%M:%S')
    # get active users
    user_sql = "select uid from newsrecommendclick " \
               "where ctime > to_timestamp('{}', 'yyyy-mm-dd hh24:mi:ss') - interval '{} day' " \
               "group by uid HAVING \"count\"(*)>={}"
    conn, cursor = get_postgredb_query()
    cursor.execute(user_sql.format(t, min_interval, min_click))
    rows = cursor.fetchall()
    active_users = [r[0] for r in rows]
    # fetch user device features first
    user_device_sql = "select uid, brand,device_size,network,ctype,province,city,area " \
                      "from user_device " \
                      "where uid in ({})"
    user_raw_info = dict()
    cursor.execute(user_device_sql.format(','.join(str(u) for u in active_users)))
    user_raw = cursor.fetchall()
    for u in user_raw:
        user_raw_info[u[0]] = [u[1], u[2], u[3], u[4], u[5], u[6], u[7]]
    # get each user's active hours and the news he/she clicked
    user_time_sql = "select nid, ctime from newsrecommendclick " \
                    "where uid={} and " \
                    "ctime > to_timestamp('{}', 'yyyy-mm-dd hh24:mi:ss') - interval '{} day' "
    user_click_dict = dict()
    for u in user_raw_info.keys():
        hour_dict = dict()  # click count per hour
        cursor.execute(user_time_sql.format(u, t, min_interval))
        rows = cursor.fetchall()
        user_click_dict[u] = []
        for r in rows:
            h = r[1].hour
            hour_dict[h] = 1 if h not in hour_dict else hour_dict[h] + 1
            user_click_dict[u].append(r[0])
        user_raw_info[u].append(hour_dict.keys())
    pd.Series(user_raw_info).to_csv('user_feature.csv')
    pd.Series(user_click_dict).to_csv('user_click.csv')
    print 'finished!!'
    # take negative samples
    cursor.close()
    conn.close()
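# Illustrative sketch only (not called by the pipeline): how the per-hour click histogram inside
# get_active_user_info is built. The click tuples below are made-up stand-ins for the
# (nid, ctime) rows returned by the newsrecommendclick query.
def _demo_hour_histogram():
    import datetime
    clicks = [(101, datetime.datetime(2017, 6, 13, 8, 5)),
              (102, datetime.datetime(2017, 6, 13, 8, 40)),
              (103, datetime.datetime(2017, 6, 13, 21, 12))]
    hour_dict = dict()  # click count per hour
    for nid, ctime in clicks:
        h = ctime.hour
        hour_dict[h] = 1 if h not in hour_dict else hour_dict[h] + 1
    return hour_dict  # {8: 2, 21: 1}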
def get(self):
    try:
        print '-------------deal old clicks begin!-------------'
        from graphlab_lda import topic_model_model
        s = "select uid, nid, ctime from newsrecommendclick where ctime > now() - interval '30 day'"
        from util import doc_process
        conn, cursor = doc_process.get_postgredb_query()
        cursor.execute(s)
        rows = cursor.fetchall()
        clicks = tuple(rows)
        topic_model_model.predict_clicks(clicks)
        print '-------------deal old clicks finish!-------------'
    except:
        traceback.print_exc()
def deal_old_nids(nid_list):
    try:
        s = 'select nid from news_topic_v2 where nid in {}'
        conn, cursor = get_postgredb_query()
        cursor.execute(s.format(tuple(nid_list)))
        rows = cursor.fetchall()
        nid_set = set(nid_list)
        exist_set = set(r[0] for r in rows)  # rows are 1-tuples, so unpack the nid
        to_deal_set = nid_set - exist_set
        predict_nids(list(to_deal_set))
        conn.close()
    except:
        conn.close()
        raise
def get_old_news(interval=2.0):
    old_news_sql = "select ns.nid, hash_val from news_simhash ns " \
                   "inner join newslist_v2 nv on ns.nid=nv.nid " \
                   "where (ns.ctime > now() - interval '{0} day') and nv.state=0 " \
                   "and nv.chid != 44"
    conn, cursor = doc_process.get_postgredb_query()
    cursor.execute(old_news_sql.format(interval))
    rows = cursor.fetchall()
    nids_hash_dict = dict()
    for r in rows:
        nids_hash_dict[r[0]] = long(r[1])
    cursor.close()
    conn.close()
    return nids_hash_dict
def choose_subject_name(name_list):
    # first, drop subject names that already exist
    check_exist_sql = "select id from topiclist where " \
                      "create_time > now() - interval '7 day' and " \
                      "type = 1 and name=%s"
    conn, cursor = get_postgredb_query()
    for n in name_list:
        logger_sub.info(' before name_list {}'.format(n))
    i = 0
    while i < len(name_list):
        cursor.execute(check_exist_sql, (name_list[i], ))
        row = cursor.fetchone()
        if row:
            logger_sub.info(' {} remove {}'.format(row[0], name_list[i]))
            name_list.remove(name_list[i])
            continue
        i += 1
    conn.close()
    if len(name_list) == 0:
        logger_sub.info('all invalid!!!!')
        #raise ValueError('all subject names have existed!')
        return None
    for n in name_list:
        logger_sub.info(' after name_list {}'.format(n))
    word_doc_freq = dict()  # document frequency of each word
    name_ws = []
    name_num = len(name_list)
    for name in name_list:
        ws = set(cut_pos_ltp(name, return_str=False))
        name_ws.append(ws)
        for w in ws:
            if w in word_doc_freq:
                word_doc_freq[w] += 1
            else:
                word_doc_freq[w] = 1
    words_matter = []
    for item in word_doc_freq.items():
        if item[1] > name_num / 2:
            words_matter.append(item[0])
    words_matter_ratio = []
    for name in name_ws:
        name_matter = name & set(words_matter)
        words_matter_ratio.append(len(name_matter) / float(len(name)))
    index, value = max(enumerate(words_matter_ratio), key=operator.itemgetter(1))
    return name_list[index]
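# Illustrative sketch only (not called by the pipeline): the name-selection heuristic used by
# choose_subject_name, with a plain whitespace tokenizer standing in for cut_pos_ltp. Words that
# appear in more than half of the candidate names are the "words that matter"; the name whose
# tokens have the highest share of such words is chosen.
def _demo_choose_name(name_list):
    import operator
    name_ws = [set(name.split()) for name in name_list]
    word_doc_freq = dict()  # document frequency of each word across the candidate names
    for ws in name_ws:
        for w in ws:
            word_doc_freq[w] = word_doc_freq.get(w, 0) + 1
    words_matter = set(w for w, c in word_doc_freq.items() if c > len(name_list) / 2)
    ratios = [len(ws & words_matter) / float(len(ws)) if ws else 0.0 for ws in name_ws]
    index, _ = max(enumerate(ratios), key=operator.itemgetter(1))
    return name_list[index]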
def get_hashval():
    sql = "select nid, hash_val from news_simhash where ctime > now() - interval '2 day'"
    conn, cursor = get_postgredb_query()
    cursor.execute(sql)
    rows = cursor.fetchall()
    t0 = datetime.datetime.now()
    print 'compare with {}'.format(len(rows))
    hashval = 3255685376439667788
    same = []
    for r in rows:
        if simhash.dif_bit(hashval, long(r[1])) <= 12:
            same.append(r[0])
    t1 = datetime.datetime.now()
    print len(same)
    print 'it takes {} sec'.format((t1 - t0).total_seconds())
def get_news_interval(h, interval=9999):
    '''
    Find news that may be duplicates within a given time interval.
    :param h:
    :param interval:
    :return:
    '''
    fir, sec, thi, fou, fir2, sec2, thi2, fou2 = get_4_segments(h.__long__())
    conn, cursor = doc_process.get_postgredb_query()
    cursor.execute(hash_sql.format(interval, fir, sec, thi, fou, fir2, sec2, thi2, fou2))
    rows = cursor.fetchall()
    nid_hv_list = []
    for r in rows:
        nid_hv_list.append((r[0], r[1]))
    conn.close()
    return nid_hv_list
def coll_news_proc(save_dir, chnl, doc_num_per_chnl, csv_path):
    try:
        logger.info(' start to collect {} ......'.format(chnl))
        #f = open(os.path.join(save_dir, chnl), 'w')  # one file per channel
        conn, cursor = doc_process.get_postgredb_query()
        if chnl in channel_for_topic_dict.keys():
            num = channel_for_topic_dict[chnl]
        else:
            num = doc_num_per_chnl
        logger.info(' {} num is {}'.format(chnl, num))
        cursor.execute(channle_sql, (chnl, num))
        logger.info(' finish to query {} '.format(chnl))
        rows = cursor.fetchall()
        print len(rows)
        df = pd.DataFrame(columns=csv_columns)
        for row in rows:
            title = row[0]
            content_list = row[1]
            txt = ''
            for content in content_list:
                if 'txt' in content.keys():
                    txt += content['txt'].encode('utf-8')
            total_txt = title * 3 + txt
            # split() mainly removes carriage returns (\r); otherwise pandas.read_csv fails
            data = {'nid': [row[2]], 'doc': [''.join(total_txt.split())]}
            df_local = pd.DataFrame(data, columns=csv_columns)
            df = df.append(df_local, ignore_index=True)
            '''
            total_list = doc_process.filter_html_stopwords_pos(total_txt, remove_num=True, remove_single_word=True)
            if len(total_list) < doc_min_len:  # drop documents with too few words
                continue
            # second-pass filtering by tf-idf
            total_list = doc_process.jieba_extract_keywords(' '.join(total_list), min(50, len(total_list)/5))
            for w in total_list:
                f.write(w.encode('utf-8') + ' ')
            f.write('\n')
            #f.write(' '.join(total_list).encode('utf-8') + '\n')
            del content_list
            '''
        df.to_csv(csv_path, index=False)
        cursor.close()
        conn.close()
        #f.close()
        logger.info(' finished to collect {} ......'.format(chnl))
    except:
        traceback.print_exc()
        logger.exception(traceback.format_exc())
def get_relate_same_news(nid_set):
    if len(nid_set) == 0:
        return dict()
    conn, cursor = get_postgredb_query()
    nid_tuple = tuple(nid_set)
    cursor.execute(same_sql, (nid_tuple, nid_tuple))
    same_dict = {}
    rows = cursor.fetchall()
    for r in rows:
        if r[0] not in same_dict.keys():
            same_dict[r[0]] = []
        if r[1] not in same_dict.keys():
            same_dict[r[1]] = []
        same_dict[r[0]].append(r[1])
        same_dict[r[1]].append(r[0])
    conn.close()
    return same_dict
def is_sentence_ads(hash_val, fir_16, sec_16, thi_16, fou_16, fir2_16, sec2_16, thi2_16, fou2_16, pname):
    conn, cursor = get_postgredb_query()
    cursor.execute(check_ads_sql, (fir_16, sec_16, thi_16, fou_16, fir2_16, sec2_16, thi2_16, fou2_16))
    rows = cursor.fetchall()
    for r in rows:
        if hash_val.hamming_distance_with_val(long(r[1])) <= 3:
            exist = False
            if r[2]:
                spnames = r[2].split(',')
                if len(spnames) == 0 or (pname in spnames):
                    exist = True
            else:
                exist = True
            if exist:
                conn.close()
                return True
    conn.close()
    return False
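# Illustrative sketch only, based on an assumption about simhash.get_4_segments (defined elsewhere):
# a 64-bit simhash is split into 16-bit segments so that near-duplicates can be pre-filtered in SQL.
# By the pigeonhole principle, two 64-bit hashes within Hamming distance 3 agree exactly on at least
# one of four disjoint 16-bit segments, so matching on any segment is a cheap candidate filter before
# the exact hamming_distance_with_val check above. The real helper also returns a second set of
# segments (fir2 ... fou2); only the basic idea is shown here.
def _demo_four_segments(hash_val):
    mask = 0xFFFF
    fir = (hash_val >> 48) & mask
    sec = (hash_val >> 32) & mask
    thi = (hash_val >> 16) & mask
    fou = hash_val & mask
    return fir, sec, thi, fou

def _demo_hamming_distance(a, b):
    return bin(a ^ b).count('1')  # number of differing bits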
def update_sub_name_on_nids(sub_id, nids):
    conn, cursor = get_postgredb_query()
    sql = "select title from newslist_v2 where nid in ({}) and nid not in (select news from topicnews where topic={})"
    nid_str = ', '.join(str(i) for i in nids)
    cursor.execute(sql.format(nid_str, sub_id))
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    logger_sub.info(' choose from {}'.format(nids))
    for r in rows:
        logger_sub.info(' choose from {}'.format(r[0]))
    mod_name = choose_subject_name([r[0] for r in rows])
    if not mod_name:
        return
    data = {'id': sub_id, 'name': mod_name}
    respond = requests.put(modify_url, data=data, cookies=cookie)
    logger_sub.info('response: {}'.format(respond.content))
    logger_sub.info('update {} sub name to {}'.format(sub_id, mod_name))
def del_nid_of_fewer_comment(nid, n, log=logger):
    try:
        conn, cursor = doc_process.get_postgredb_query()
        # first check whether either news item has been manually recommended;
        # if so, take down the one that was not manually recommended
        cursor.execute(recommend_sql, (nid, n))
        rs = cursor.fetchall()
        if len(rs) == 1:  # exactly one of the two is manually online
            for r in rs:
                rnid = r[0]
            if rnid == n:
                del_nid = nid
                stay_nid = n
            else:
                del_nid = n
                stay_nid = nid
            #cursor.execute(offonline_sql.format(del_nid))
            #conn.commit()
            data = {}
            data['nid'] = del_nid
            response = requests.post(url, data=data)
            cursor.close()
            conn.close()
            log.info('{0} has been recommended, so offline {1}'.format(stay_nid, del_nid))
            return del_nid
        cursor.execute(get_comment_num_sql.format(nid, n))
        rows = cursor.fetchall()
        nid_goal = []
        for r in rows:
            nid_goal.append((r[0], goal_to_del(r[2], r[1])))  # score both news items
        if len(nid_goal) == 0:  # the query failed; just delete the older news item
            return n
        sorted_goal = sorted(nid_goal, key=lambda goal: goal[1])
        del_nid = sorted_goal[0][0]
        data = {}
        data['nid'] = del_nid
        response = requests.post(url, data=data)
        cursor.close()
        conn.close()
        log.info('{0} vs {1}, offline {2}'.format(nid, n, del_nid))
        return del_nid
    except Exception as e:
        log.error(traceback.format_exc())
def predict_click(click_info, model_v=None):
    try:
        if not model_v:
            model_v = os.path.split(get_newest_dir(model_base_path))[-1]
        uid = click_info[0]
        nid = click_info[1]
        if isinstance(click_info[2], basestring):
            time_str = click_info[2]
            ctime = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        else:
            ctime = click_info[2]
            time_str = ctime.strftime('%Y-%m-%d %H:%M:%S')
        logger_9990.info("consume click: uid={}, nid={}, time_str={}".format(uid, nid, time_str))
        valid_time = ctime + timedelta(days=30)  # clicks stay valid for 30 days
        fail_time = valid_time.strftime('%Y-%m-%d %H:%M:%S')
        conn, cursor = get_postgredb_query()
        cursor.execute(nt_sql.format(nid, model_v))  # fetch the candidate topics of this nid
        rows = cursor.fetchall()
        for r in rows:
            topic_id = r[0]
            probability = r[1]
            conn2, cursor2 = get_postgredb()
            cursor2.execute(ut_sql.format(uid, model_v, topic_id))
            rows2 = cursor2.fetchone()
            if rows2:
                # the user already follows this topic_id; just update the probability
                new_prop = probability + rows2[0]
                logger_9990.info('update: uid={}, topic_id={}'.format(uid, topic_id))
                cursor2.execute(ut_update_sql.format(new_prop, time_str, fail_time, uid, model_v, topic_id))
            else:
                cursor2.execute(user_topic_insert_sql.format(uid, model_v, topic_id, probability, time_str, fail_time))
            conn2.commit()
            conn2.close()
        cursor.close()
        conn.close()
    except:
        traceback.print_exc()
def get_news_words(nid_list):
    conn, cursor = doc_process.get_postgredb_query()
    nids_str = ','.join([str(i) for i in nid_list])
    cursor.execute(news_word_sql.format(nids_str))
    rows = cursor.fetchall()
    conn.close()
    nid_words_dict = {}
    for r in rows:
        nid = r[0]
        title = r[1]
        paragraphs = r[2]
        txt = ''
        for para in paragraphs:
            if 'txt' in para.keys():
                txt += para['txt']
        total_txt = title + txt.encode('utf-8')
        word_list = doc_process.filter_html_stopwords_pos(total_txt, remove_num=True, remove_single_word=True)
        nid_words_dict[nid] = ' '.join(word_list)
    return nid_words_dict
def coll_user_topics(model_v):
    # uid=0 comes from the old app version, which has no real uid; every old-app user shows up as uid 0
    if TEST_FLAG:
        user_topic_prop_sql = '''select uid, topic_id, probability from user_topics_v2
                                 where model_v = '{}' and uid != 0 and create_time > now() - interval '10 minute' '''
    else:
        user_topic_prop_sql = '''select uid, topic_id, probability from user_topics_v2
                                 where model_v = '{}' and uid != 0 and create_time > now() - interval '3 day' '''
    try:
        log_cf.info(' coll_user_topics begin ...')
        conn, cursor = get_postgredb_query()
        cursor.execute(user_topic_prop_sql.format(model_v))
        rows = cursor.fetchall()
        user_ids = []
        topic_ids = []
        props = []
        log_cf.info(' query user topic finished. {} item found.'.format(len(rows)))
        user_topic_prop_dict = {}
        for r in rows:
            user_ids.append(r[0])
            topic_ids.append(r[1])
            props.append(r[2])
            if r[0] not in user_topic_prop_dict:
                user_topic_prop_dict[r[0]] = dict()
            user_topic_prop_dict[r[0]][r[1]] = r[2]
        log_cf.info(' coll_user_topics end')
        del rows
        cursor.close()
        conn.close()
        if TEST_FLAG:
            f = os.path.join(real_dir_path, 'data', 'user_topic.csv')
            df = {'user': user_ids, 'topic': topic_ids, 'prop': props}
            pd.DataFrame(df).to_csv(f, columns=('user', 'topic', 'prop'))
        return user_topic_prop_dict, user_ids, topic_ids, props
    except:
        traceback.print_exc()
        log_cf.exception(traceback.format_exc())
def predict_chnl_news(chnl_name, num_limit=None):
    '''
    Predict news of a specific channel.
    :param chnl_name: channel name
    :param num_limit: maximum number of news items
    :return:
    '''
    logger_chnl.info('begin to predict {}'.format(chnl_name))
    conn, cursor = get_postgredb_query()
    if num_limit:
        chnl_sql = '''select nid from info_news a inner join channellist_v2 cl on a.chid=cl.id
                      where cl.cname='{}' limit {}'''
        logger_chnl.info(cursor.mogrify(chnl_sql.format(chnl_name, num_limit)))
        cursor.execute(chnl_sql.format(chnl_name, num_limit))
    else:
        chnl_sql = '''select nid from info_news a inner join channellist_v2 cl on a.chid=cl.id
                      where cl.cname='{}' '''
        logger_chnl.info(cursor.mogrify(chnl_sql.format(chnl_name)))
        cursor.execute(chnl_sql.format(chnl_name))
    rows = cursor.fetchall()
    nids = [r[0] for r in rows]
    l = len(nids)
    logger_chnl.info('len of nids is {}'.format(l))
    # predict in chunks
    if l < 1000:
        kmeans_predict(nids, logger_chnl)
    else:
        n = 0
        while (n + 1000) < l:
            kmeans_predict(nids[n:n + 1000], logger_chnl)
            n += 1000
            logger_chnl.info('{} of {} finished!'.format(n, l))
        kmeans_predict(nids[n:l], logger_chnl)
    logger_chnl.info('predict {} finished!'.format(chnl_name))
    cursor.close()
    conn.close()
def get(self):
    try:
        print '----deal old news and click----'
        from graphlab_lda import topic_model_model
        from redis_process import nid_queue
        from util import doc_process
        conn, cursor = doc_process.get_postgredb_query()
        nid_queue.clear_queue_click()
        nid_queue.clear_queue_lda()  # clear the old nids
        s_new = "select nid from newslist_v2 where ctime > now() - interval '10 day' and chid not in (28, 23, 21, 44) and state=0"
        cursor.execute(s_new)
        rows = cursor.fetchall()
        nids = []
        for r in rows:
            nids.append(r[0])
        l = len(nids)
        if len(nids) < 1000:
            topic_model_model.predict_nids(nids)
        else:
            n = 0
            while (n + 1000) < len(nids):
                topic_model_model.predict_nids(nids[n:n + 1000])
                n += 1000
                print('{} of {} finished!'.format(n, l))
            topic_model_model.predict_nids(nids[n:len(nids)])  # remaining tail chunk
        print ' ----- finish to predict news, begin to predict click-----'
        s_click = "select uid, nid, ctime from newsrecommendclick where (ctime > now() - interval '10 day') and (ctime < now() - interval '1.5 day') "
        cursor.execute(s_click)
        clicks = tuple(cursor.fetchall())
        topic_model_model.predict_clicks(clicks)
        print '----------- finish to predict clicks--------'
        conn.close()
    except:
        traceback.print_exc()
def test_special_space():
    from util.doc_process import get_postgredb_query
    sql = "select title, content from newslist_v2 where nid = 13282986"
    conn, cursor = get_postgredb_query()
    cursor.execute(sql)
    rows = cursor.fetchall()
    for row in rows:
        title = row[0]
        content_list = row[1]
        txt = ''
        for content in content_list:
            if 'txt' in content.keys():
                txt += content['txt'] + ' '  # unicode
        soup = BeautifulSoup(txt, 'lxml')
        txt = soup.get_text()
        total_txt = title + ' ' + txt.encode('utf-8')
        print total_txt
        total_txt = ''.join(total_txt.split())
        print total_txt
        total_txt = total_txt.replace('\xe2\x80\x8b', '')
        total_txt = total_txt.replace('\xe2\x80\x8c', '')
        total_txt = total_txt.replace('\xe2\x80\x8d', '')
        from pyltp import Postagger
        poser = Postagger()
        poser.load('/Users/a000/git/ltp_data/pos.model')
        from pyltp import Segmentor
        segmentor = Segmentor()
        segmentor.load('/Users/a000/git/ltp_data/cws.model')
        ws = segmentor.segment(total_txt)
        wspos = poser.postag(ws)
        for k, i in enumerate(wspos):
            print ws[k]
            print i
            if k > 300:
                break
def create_subject_class(sub_id):
    time = datetime.datetime.now()
    #class_name = str(time.month) + '.' + str(time.day) + '.' + str(time.hour) + '.' + str(time.minute)
    class_name = str(time.month) + '.' + str(time.day)
    # check whether the class already exists
    #check_class_ex = "select id, name, order from topicclasslist where topic=%s and name=%s"
    check_class_ex = "select id, name, \"order\" from topicclasslist where topic=%s"
    conn, cursor = get_postgredb_query()
    cursor.execute(check_class_ex, (sub_id, ))
    rows = cursor.fetchall()
    conn.close()
    new_order = -1
    for row in rows:
        if row[1] == class_name:
            return row[0]
        new_order = max(new_order, row[2])
    data = {'topic': sub_id, 'name': class_name, 'order': new_order + 1}
    try:
        response = requests.post(topic_class_url, data=data, cookies=cookie)
        return json.loads(response.content)['id']
    except:
        logger_sub.exception(response.content)
        raise
def kmeans_predict(nid_list):
    global g_channel_kmeans_model_dict, chname_id_dict
    print "****************************************************" + model_v
    if len(g_channel_kmeans_model_dict) == 0:
        load_newest_models()
    if len(chname_id_dict) == 0:
        get_chname_id_dict()
    nid_info = {}
    conn, cursor = doc_process.get_postgredb_query()  # one query connection shared across all nids
    for nid in nid_list:
        cursor.execute(nid_sql, [nid])
        row = cursor.fetchone()
        if not row:
            print 'Error: do not get info of nid: ' + str(nid)
            continue
        title = row[0]
        content_list = row[1]
        chanl_name = row[2]
        if chanl_name not in g_channel_kmeans_model_dict:
            continue
        txt = ''
        for content in content_list:
            if 'txt' in content.keys():
                txt += content['txt']
        total_txt = title + txt.encode('utf-8')
        #word_list = doc_process.filter_html_stopwords_pos(total_txt, remove_num=True, remove_single_word=True)
        total_txt = cut_pos_ltp(total_txt)
        nid_info[nid] = [chanl_name, total_txt]
    cursor.close()
    conn.close()
    ch_pred_dict = {}
    for chname in g_channel_kmeans_model_dict.keys():
        clstid_nid_dict = {}
        print 'predict ---- ' + chname
        nids = []
        doc_list = []
        for nid in nid_info.keys():
            if nid_info[nid][0] == chname:
                nids.append(nid)
                doc_list.append(nid_info[nid][1])
        print 'news num of ' + chname + ' is ' + str(len(nids))
        if len(nids) == 0:
            continue
        logger_update.info('type of doc_list is {}'.format(type(doc_list[0])))
        ws = gl.SArray(doc_list)
        docs = gl.SFrame(data={'X1': ws})
        docs = gl.text_analytics.count_words(docs['X1'])
        docs = gl.SFrame(docs)
        pred = g_channel_kmeans_model_dict[chname].predict(docs, output_type='cluster_id')
        print pred
        logger_update.info('result : {}'.format(pred))
        if len(nids) != len(pred):
            print 'len(nids) != len(pred)'
            return
        for i in xrange(0, len(pred)):
            if pred[i] not in clstid_nid_dict.keys():
                clstid_nid_dict[pred[i]] = []
            clstid_nid_dict[pred[i]].append(nids[i])
        ch_pred_dict[chname] = clstid_nid_dict
        #print clstid_nid_dict
    return ch_pred_dict
def cal_process(nid_set, log=None, same_t=3, news_interval=3, same_dict={}):
    log = logger_9965
    log.info('there are {} news to calculate'.format(len(nid_set)))
    ttt1 = datetime.datetime.now()
    try:
        nid_sents_dict, nid_para_links_dict, nid_pname_dict = get_nids_sentences(nid_set)
        kkkk = 0
        for item in nid_sents_dict.items():  # each news item
            # holds subjects; each element contains a list of key sentences and a list of nids,
            # e.g. [[['abc', 'aaa'], [123, 231]], [['bcd', 'bbb'], [542, 126]]]
            subject_sentence_nids = []
            kkkk += 1
            n = 0
            nid = item[0]
            log.info(' cal {} sentences...'.format(nid))
            t = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            para_sent_dict = item[1]
            sen_len = 0  # total number of sentences in the article
            for pa in para_sent_dict.items():  # each paragraph
                sen_len += len(pa[1])
            for pa in para_sent_dict.items():
                para_num = pa[0]  # paragraph number
                sents = pa[1]
                conn, cursor = get_postgredb()
                conn_query, cursor_query = get_postgredb_query()
                for s in sents:  # each sentence
                    n += 1
                    str_no_html, wl = filter_html_stopwords_pos(s, False, True, True, False)
                    if len(wl) <= 10:  # drop very short sentences; many of them are just special characters
                        continue
                    h = simhash.simhash(wl)
                    # the sentence already exists in this news item, i.e. it is a duplicate within the article
                    check_exist_sql = "select nid from news_sentence_hash_cache where nid=%s and hash_val=%s"
                    cursor_query.execute(check_exist_sql, (nid, h.__str__()))
                    if len(cursor_query.fetchall()) != 0:
                        continue
                    fir, sec, thi, fou, fir2, sec2, thi2, fou2 = simhash.get_4_segments(h.__long__())
                    if is_sentence_ads(h, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nid_pname_dict[nid]):
                        # it is in the ads DB; drop the ad sentence
                        continue
                    cursor_query.execute(query_sen_sql_interval,
                                         (str(fir), str(sec), str(thi), str(fou),
                                          str(fir2), str(sec2), str(thi2), str(fou2), news_interval))
                    rows = cursor_query.fetchall()  # all potentially identical sentences
                    if len(rows) == 0:  # no similar sentences; store the sentence hash
                        cursor.execute(insert_sentence_hash,
                                       (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                        continue
                    same_sentence_sql_para = []
                    nids_for_ads = set()
                    for r in rows:
                        # the distance is too large, or it is the same news item
                        if h.hamming_distance_with_val(long(r[1])) > same_t \
                                or (nid in same_dict.keys() and r[0] in same_dict[nid]) or nid == r[0]:
                            continue
                        cursor_query.execute(same_sql2, (r[0], r[1]))
                        rs = cursor_query.fetchall()
                        for r2 in rs:
                            sen = r2[0].decode('utf-8')
                            sen_without_html = filter_tags(sen)
                            if len(sen) == 1 or len(sen_without_html) > len(str_no_html) * 1.5 \
                                    or len(str_no_html) > len(sen_without_html) * 1.5:
                                continue
                            wl1 = jieba.cut(str_no_html)
                            set1 = set(wl1)
                            l1 = len(set1)
                            wl2 = jieba.cut(sen_without_html)
                            set2 = set(wl2)
                            set_same = set1 & set2
                            l2 = len(set2)
                            l3 = len(set_same)
                            if l3 < max(l1, l2) * 0.6:  # the overlap ratio must reach 0.6
                                continue
                            nids_for_ads.add(str(r[0]))
                            same_sentence_sql_para.append((nid, r[0], str_no_html, sen, t))
                    if len(nids_for_ads) == 0:
                        # no potentially identical sentence survived; store the hash so it can serve ad detection later
                        cursor.execute(insert_sentence_hash,
                                       (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                        conn.commit()
                        continue
                    is_new_ads = False
                    not_ads_but_ignore = False  # not an ad, but skip the duplicate computation
                    PNAME_T = 3
                    nid_pn = {}
                    pname_set = set()
                    chid_set = set()
                    ctime_list = []
                    cursor_query.execute(get_pname, (tuple(nids_for_ads),))
                    rows2 = cursor_query.fetchall()
                    for rk in rows2:
                        pname_set.add(rk[0])
                        chid_set.add(rk[1])
                        ctime_list.append(rk[2])
                        nid_pn[rk[3]] = rk[0]
                    if len(nids_for_ads) / float(len(pname_set)) > 3:  # added 2017-06-13
                        is_new_ads = True
                    if len(nids_for_ads) >= 10:
                        # handle potential same-source ads first
                        if len(pname_set) <= PNAME_T or (len(pname_set) > 5 and len(chid_set) < 4):
                            #if n > sen_len * .2 and n < sen_len * .8:
                            if float(n) < float(sen_len * .2) or float(n) > float(sen_len * .8):
                                min_time = ctime_list[0]
                                max_time = ctime_list[0]
                                for kkk in xrange(1, len(ctime_list)):
                                    if ctime_list[kkk] > max_time:
                                        max_time = ctime_list[kkk]
                                    if ctime_list[kkk] < min_time:
                                        min_time = ctime_list[kkk]
                                if (max_time - min_time).days > 2:  # not hot news concentrated within two days
                                    is_new_ads = True
                            '''
                            nid_links = nid_para_links_dict[nid]
                            sum_own_links = 0  # number of paragraphs that contain links
                            for kk in xrange(para_num, len(nid_links)):
                                if len(nid_links[kk]):
                                    sum_own_links += 1
                            if sum_own_links > (len(nid_links) - para_num) * 0.8:  # many links in the later paragraphs, treat it as an ad
                                is_new_ads = True
                            elif len(pname_set) > 5 and len(chid_set) < 4:
                                # comes from many sources; if it is concentrated in a few channels, treat it as an ad,
                                # but only if the news items were not all inserted within 3 days, otherwise it may not be an ad
                                min_time = ctime_list[0]
                                max_time = ctime_list[0]
                                for kkk in xrange(1, len(ctime_list)):
                                    if ctime_list[kkk] > max_time:
                                        max_time = ctime_list[kkk]
                                    if ctime_list[kkk] < min_time:
                                        min_time = ctime_list[kkk]
                                if (max_time - min_time).days > 2:  # not hot news within three days
                                    is_new_ads = True
                            '''
                        else:
                            not_ads_but_ignore = True
                    nids_str = ','.join(nids_for_ads)
                    if is_new_ads:  # a new ad
                        if len(pname_set) <= PNAME_T:  # sources
                            pname_str = ','.join(pname_set)
                        else:
                            pname_str = ""
                        cursor.execute(ads_insert,
                                       (str_no_html, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2,
                                        nids_str, 0, pname_str))
                    else:
                        if not_ads_but_ignore:
                            # too many identical sentences, treat it as a false positive; insert it into the ads table
                            # with state=1 (not a real ad) so the next run can skip the computation
                            cursor.execute(ads_insert,
                                           (str_no_html, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2,
                                            nids_str, 1, ""))
                        else:
                            cursor.executemany(insert_same_sentence, same_sentence_sql_para)  # valid duplicate sentences
                            # multiple viewpoints: 1. sentence length > 30  2. different sources  3. drop head and tail sentences
                            if len(str_no_html) > 15 and n > 2 and (n < sen_len - 2):
                                sub_nids_set = set()
                                for same in same_sentence_sql_para:
                                    nn = same[1]  # nid
                                    if nid_pname_dict[nid] != nid_pn[nn]:
                                        ctime_sql = "select nid, ctime from info_news where nid = %s or nid=%s"
                                        cursor_query.execute(ctime_sql, (same[0], same[1]))
                                        ctimes = cursor_query.fetchall()
                                        ctime_dict = {}
                                        for ct in ctimes:
                                            ctime_dict[str(ct[0])] = ct[1]
                                        cursor.execute(multo_vp_insert_sql,
                                                       (str(same[0]), same[2], str(same[1]), same[3], t,
                                                        ctime_dict[str(same[0])], ctime_dict[str(same[1])]))
                                        log.info(' get multi viewpoint :{}'.format(str_no_html.encode('utf-8')))
                                        sub_nids_set.add(same[0])
                                        sub_nids_set.add(same[1])
                                        subject_queue.product_simhash2((same[0], same[1]))
                                if len(sub_nids_set) >= 2:  # enqueue the subject news
                                    log.info(' generate subject for {}'.format(sub_nids_set))
                                    key_sents = [str_no_html.encode('utf-8'), ]
                                    sub_nids = []
                                    for i in sub_nids_set:
                                        sub_nids.append(i)
                                    subject_sentence_nids.append([key_sents, sub_nids])
                    # store every sentence hash in the DB
                    cursor.execute(insert_sentence_hash,
                                   (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                    conn.commit()
                cursor.close()
                conn.close()
                cursor_query.close()
                conn_query.close()
            if len(subject_sentence_nids) > 0 and len(subject_sentence_nids) < 3:
                subs = merge_subs(subject_sentence_nids)
                for sub in subs:
                    subject_queue.product_subject(sub)
        ttt2 = datetime.datetime.now()
        log.info('it takes {}'.format((ttt2 - ttt1).total_seconds()))
        del nid_sents_dict
        del nid_para_links_dict
    except:
        log.exception(traceback.format_exc())
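# Illustrative sketch only (not called by the pipeline): the word-overlap test applied in cal_process
# after the simhash pre-filter, using whitespace tokenization instead of jieba.cut. Two sentences are
# accepted as the same sentence only if their shared words cover at least 60% of the larger word set.
def _demo_word_overlap_same(sent1, sent2, ratio=0.6):
    set1 = set(sent1.split())
    set2 = set(sent2.split())
    if not set1 or not set2:
        return False
    same = set1 & set2
    return len(same) >= max(len(set1), len(set2)) * ratio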