'''
@author: hao
'''
import os, sys
from sys import path
path.append(path[0] + '/tools/')
import MySQLdb
import pymongo
import jieba.analyse
import time
import datetime
# Project-local utility class (under ./tools/).
from remove import removeIrrelevant
# Python 2 only: force the process-wide default encoding to UTF-8 so the
# mixed str/unicode handling in this module does not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
rmIrr = removeIrrelevant()


def addTwitterTable(mongodbIP, mysqlhostIP, mysqlUserName='******',
                    mysqlPassword='******', dbname='cctvTimer'):
    """Open a MySQL connection/cursor for the twitter table job.

    NOTE(review): this function only establishes the connection and cursor;
    the rest of its body appears to be missing from this revision — confirm
    against the original source before relying on it.
    """
    sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName,
                              passwd=mysqlPassword, db=dbname,
                              charset='utf8')
    sqlcursor = sqlConn.cursor()
def addcustomerEvaluation_informal(baseurl, mysqlhostIP, mysqlUserName='******',
                                   mysqlPassword='', dbname='btv_v2'):
    """Pull one day's weibo posts per program from HBase (REST + Kerberos),
    score their sentiment, and insert rows into MySQL ``customer_evaluation``.

    NOTE(review): relies on module-level names (requests, json, base64,
    HTTPKerberosAuth, OPTIONAL, issuccessful, emotionProcess) that are not
    imported in the visible header — confirm against the full source.
    """
    # Load the stopword list (built but not used below — kept for parity).
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r')
    stopwords = dicFile.readlines()
    stopwordList = []
    stopwordList.append(' ')
    for stopword in stopwords:
        temp = stopword.strip().replace('\r\n', '').decode('utf8')
        stopwordList.append(temp)
    dicFile.close()
    # Connect to MySQL and make sure the target table exists.
    sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName,
                              passwd=mysqlPassword, db=dbname, charset='utf8')
    sqlcursor = sqlConn.cursor()
    sqlcursor.execute(
        '''CREATE TABLE IF NOT EXISTS customer_evaluation(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, flag int(1), evaluation bigint(20), content varchar(200), date Date, program_id varchar(200), program varchar(200)) DEFAULT CHARSET=utf8;'''
    )
    print '新建库成功'
    # Target day: inter == 0 means today, 1 means yesterday.
    inter = 1
    now = int(time.time()) - 86400 * inter
    timeArray = time.localtime(now)
    otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
    print otherStyleTime
    # Obtain a Kerberos ticket for the HBase REST gateway.
    os.popen('kinit -k -t ctvit.keytab ctvit')
    kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL)
    # First scan the index table to collect the day's row-key suffixes.
    tablename_index = "DATA:WEIBO_POST_Keywords_INDEX"
    prefix = otherStyleTime + "*"
    r_index = requests.get(baseurl + "/" + tablename_index + "/" + prefix,
                           auth=kerberos_auth,
                           headers={"Accept": "application/json"})
    if issuccessful(r_index) == False:
        print "Could not get messages from HBase. Text was:\n" + r_index.text
    bleats_0 = json.loads(r_index.text)
    row_key_name_list = list()
    for row in bleats_0['Row']:
        row_key = base64.b64decode(row['key'])
        # Row key presumably formatted as date-x-y-name; take the 4th field
        # — TODO confirm the key layout.
        row_key_name = row_key.split("-")[3]
        row_key_name_list.append(row_key_name)
    print len(row_key_name_list)
    # Sentiment helpers.
    emProcess = emotionProcess()
    rmIrr = removeIrrelevant()
    sqlcursor.execute(
        "SELECT DISTINCT(program_name) from ini_app_program_source_rel")
    bufferTemp = sqlcursor.fetchall()
    os.popen('kinit -k -t ctvit.keytab ctvit')
    kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL)
    for one_program in bufferTemp:
        commentsData = []
        tempData = []
        one_program = one_program[0].encode('utf8')
        # Special case: this program's data lives under a fixed date.
        if one_program == '中国好歌曲第三季':
            otherStyleTime = '2015-10-26'
        emotionsWord = []
        emotionsScore = 0
        count = 0
        printCount = 0
        # The HBase table name for this program is stored in MySQL.
        sqlcursor.execute(
            """select source_path from ini_app_program_source_rel where app_name = '观众口碑' and program_name = %s""",
            (one_program, ))
        source_path = sqlcursor.fetchone()
        tablename = source_path[0]
        print one_program
        sqlcursor.execute(
            '''SELECT program_id from ini_app_program_source_rel where program_name = %s limit 1;''',
            (one_program, ))
        bufferTemp = sqlcursor.fetchone()
        program_id = bufferTemp[0]
        for single_rowkey in row_key_name_list:
            r = requests.get(baseurl + "/" + tablename + "/" + single_rowkey,
                             auth=kerberos_auth,
                             headers={"Accept": "application/json"})
            if issuccessful(r) == False:
                print "Could not get messages from HBase. Text was:\n" + r.text
            bleats = json.loads(r.text)
            for row in bleats['Row']:
                # flag stays True only while the row matches this program.
                flag = True
                for cell in row['Cell']:
                    columnname = base64.b64decode(cell['column'])
                    value = cell['$']
                    if value == None:
                        print 'none'
                        continue
                    if columnname == "base_info:match":
                        column = base64.b64decode(value)
                        if (one_program not in column):
                            flag = False
                            break
                    if columnname == "base_info:cdate":
                        cdate = base64.b64decode(value)
                        cdate = cdate.split('T')[0]
                        # if cdate != otherStyleTime:
                        #     flag = False
                        #     break
                    if columnname == "base_info:text":
                        content = base64.b64decode(value)
                # NOTE(review): if a matching row lacks base_info:text,
                # `content` carries the previous row's value (or is unbound
                # on the first row) — confirm the data always has it.
                if flag:
                    count += 1
                    printCount += 1
                    # Sentiment keywords + score for this post.
                    (emotionsWord, emotionsScore) = emProcess.processSentence(
                        rmIrr.removeEverythingButEmotion(content))
                    # Polarity flag: '1' positive, '0' neutral, '-1' negative.
                    if emotionsScore > 0:
                        tempData.append('1')
                    elif emotionsScore == 0:
                        tempData.append('0')
                    elif emotionsScore < 0:
                        tempData.append('-1')
                    # Sentiment score.
                    tempData.append(emotionsScore)
                    # Post text.
                    tempData.append(content)
                    # Date (recomputed from the same timeArray each time).
                    otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
                    tempData.append(otherStyleTime)
                    # Program id and name.
                    tempData.append(program_id)
                    tempData.append(one_program)
                    sqlcursor.execute(
                        '''insert into customer_evaluation(flag, evaluation, content, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''',
                        tempData)
                    sqlConn.commit()
                    tempData = []
    sqlConn.close()
def mentioned_trend(baseurl, mysqlhostIP, mysqlUserName='******', mysqlPassword='', dbname='btv'): # 连接数据库 sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName, passwd=mysqlPassword, db=dbname, charset='utf8') sqlcursor = sqlConn.cursor() sqlcursor.execute( '''CREATE TABLE IF NOT EXISTS gala_discussion(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, disscussion bigint(50), specific_time datetime, neutral double, positive double, program_id varchar(50), program varchar(50)) DEFAULT CHARSET=utf8;''' ) print '新建库成功' emProcess = emotionProcess() rmIrr = removeIrrelevant() # 连接hbase数据库 os.popen('kinit -k -t ctvit.keytab ctvit') kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL) # 表名 tablename = "DATA:WEIBO_POST_Keywords" r = requests.get(baseurl + "/" + tablename + "/*", auth=kerberos_auth, headers={"Accept": "application/json"}) if issuccessful(r) == False: print "Could not get messages from HBase. Text was:\n" + r.text quit() bleats = json.loads(r.text) tempData = [] positive_count = 0 neutral_count = 0 total_count = 0 # 时间属性 # inter为0,即为当日 inter = 0 now = int(time.time()) - 86400 * inter timeArray = time.localtime(now) otherStyleTime = time.strftime("%Y-%m-%d", timeArray) print otherStyleTime # bleats is json file for row in bleats['Row']: for cell in row['Cell']: columnname = base64.b64decode(cell['column']) value = cell['$'] if value == None: print 'none' continue if columnname == "base_info:match": column = base64.b64decode(value) if column == "春晚": if columnname == "'base_info:text'": content = base64.b64decode(value) if columnname == "'base_info:cdate'": cdate = base64.b64decode(value) ddate = cdate ddate = ddate.split(' ')[0] if ddate == otherStyleTime: cdate = str(cdate) + ':00' # cdate = cdate.split(' ')[0] total_count += 1 (emotionsWord, emotionsScore) = emProcess.processSentence( rmIrr.removeEverythingButEmotion(content)) # 倾向性判断flag:1是正面,0是中性,-1是负面 # 情感极性判断,这里我限制了更严格的条件 if emotionsScore > 0: positive_count += 1 elif emotionsScore == 
0: neutral_count += 1 print "po", positive_count, 'ne', neutral_count, "total", total_count positive_count = float(positive_count) / float(total_count) neutral_count = float(neutral_count) / float(total_count) print 'tt', type(neutral_count), neutral_count # disscussion tempData.append(total_count) # specific_time???需要修改 tempData.append(cdate) # neutral tempData.append(neutral_count) # positive tempData.append(positive_count) # program_id tempData.append('100') # program tempData.append('2016年北京卫视春节联欢晚会') sqlcursor.execute( '''insert into gala_discussion(disscussion, specific_time, neutral, positive, program_id, program) values (%s, %s, %s, %s, %s, %s)''', tempData) sqlConn.commit() tempData = [] sqlConn.close()
def addcustomerEvaluation_informal(baseurl, mysqlhostIP, mysqlUserName='******', mysqlPassword='', dbname='btv'): # 读停用词 path = os.path.abspath(os.path.dirname(sys.argv[0])) dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r') stopwords = dicFile.readlines() stopwordList = [] stopwordList.append(' ') for stopword in stopwords: temp = stopword.strip().replace('\r\n', '').decode('utf8') stopwordList.append(temp) dicFile.close() # 连接数据库 sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName, passwd=mysqlPassword, db=dbname, charset='utf8') sqlcursor = sqlConn.cursor() sqlcursor.execute( '''CREATE TABLE IF NOT EXISTS customer_evaluation(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, flag int(1), evaluation bigint(20), content varchar(200), date Date, program_id varchar(200), program varchar(200)) DEFAULT CHARSET=utf8;''' ) print '新建库成功' # 时间属性 # inter为0,即为当日 # 库中是2.29 inter = 37 now = int(time.time()) - 86400 * inter timeArray = time.localtime(now) otherStyleTime = time.strftime("%Y-%m-%d", timeArray) print otherStyleTime # 存储评论数据 source_1 = 'weibo' # 处理情感 emProcess = emotionProcess() rmIrr = removeIrrelevant() # 首先获取栏目,注意栏目及相关hbase存储信息有更新,请在hbase_info做同步 # print 'SELECT DISTINCT(program) from hbase_info where source = %s' %source_1 sqlcursor.execute("SELECT DISTINCT(program) from hbase_info;") bufferTemp = sqlcursor.fetchall() # print len(bufferTemp) os.popen('kinit -k -t ctvit.keytab ctvit') kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL) tablename = "DATA:WEIBO_POST_Keywords" r = requests.get(baseurl + "/" + tablename + "/*", auth=kerberos_auth, headers={"Accept": "application/json"}) if issuccessful(r) == False: print "Could not get messages from HBase. 
Text was:\n" + r.text # quit() bleats = json.loads(r.text) for one_program in bufferTemp: commentsData = [] tempData = [] one_program = one_program[0].encode('utf8') print type(one_program), one_program emotionsWord = [] emotionsScore = 0 count = 0 printCount = 0 sqlcursor.execute( '''SELECT program_id from competition_analysis where program = %s''', (one_program, )) bufferTemp = sqlcursor.fetchone() program_id = bufferTemp[0] print program_id # row_prefix, limit可以限定次数 # bleats is json file for row in bleats['Row']: for cell in row['Cell']: columnname = base64.b64decode(cell['column']) value = cell['$'] if value == None: print 'none' continue if columnname == "base_info:match": column = base64.b64decode(value) if column == one_program: if columnname == "'base_info:cdate'": date_created = base64.b64decode(value) if date_created == otherStyleTime: if columnname == "'base_info:text'": content = base64.b64decode(value) print 'q', content # 暂时没有program_id # program_id = data['base_info:program_id'] count += 1 printCount += 1 # 处理每一条 # 情感关键词 (emotionsWord, emotionsScore) = emProcess.processSentence( rmIrr.removeEverythingButEmotion(content)) # 倾向性判断flag:1是正面,0是中性,-1是负面 # 情感极性判断,这里我限制了更严格的条件 if emotionsScore > 0: tempData.append('1') elif emotionsScore == 0: tempData.append('0') elif emotionsScore < 0: tempData.append('-1') # 情感得分sentimentScore tempData.append(emotionsScore) # 评论内容 tempData.append(content) # 日期时间 tempData.append(otherStyleTime) # 栏目id tempData.append(program_id) # 栏目名称 tempData.append(one_program) # 转换为元组 commentsData.append(tuple(tempData)) tempData = [] sqlcursor.execute( '''insert into customer_evaluation(flag, evaluation, content, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''', commentsData) sqlConn.commit() commentsData = [] sqlConn.close()
def addcustomerEvaluation_informal(baseurl, mysqlhostIP, mysqlUserName='******',
                                   mysqlPassword='', dbname='btv_v2'):
    """Compute a daily video index (play/comment counts) per program from the
    HBase variety-show tables and insert one row per program into MySQL
    ``vedio_index``.

    NOTE(review): relies on module-level names (requests, json, base64,
    HTTPKerberosAuth, OPTIONAL, issuccessful, emotionProcess) that are not
    imported in the visible header — confirm against the full source.
    """
    # Load the stopword list (built but not used below — kept for parity).
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r')
    stopwords = dicFile.readlines()
    stopwordList = []
    stopwordList.append(' ')
    for stopword in stopwords:
        temp = stopword.strip().replace('\r\n', '').decode('utf8')
        stopwordList.append(temp)
    dicFile.close()
    # Connect to MySQL and make sure the target table exists.
    sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName,
                              passwd=mysqlPassword, db=dbname, charset='utf8')
    sqlcursor = sqlConn.cursor()
    sqlcursor.execute(
        '''CREATE TABLE IF NOT EXISTS vedio_index(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, total_index bigint(20), play_count bigint(20), comment_count bigint(20), date Date, program_id varchar(200), program varchar(200)) DEFAULT CHARSET=utf8;'''
    )
    print '新建库成功'
    # Target day: inter == 0 means today, 1 means yesterday.
    inter = 1
    now = int(time.time()) - 86400 * inter
    timeArray = time.localtime(now)
    otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
    print otherStyleTime
    # Kerberos ticket, then scan the index table for the day's row keys.
    os.popen('kinit -k -t ctvit.keytab ctvit')
    kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL)
    tablename_index = "DATA:VIDEO_ZONGYI_INDEX"
    prefix = otherStyleTime + "*"
    r_index = requests.get(baseurl + "/" + tablename_index + "/" + prefix,
                           auth=kerberos_auth,
                           headers={"Accept": "application/json"})
    if issuccessful(r_index) == False:
        print "Could not get messages from HBase table VIDEO_ZONGYI_INDEX. Text was:\n" + r_index.text
    bleats_0 = json.loads(r_index.text)
    row_key_name_list = list()
    for row in bleats_0['Row']:
        row_key = base64.b64decode(row['key'])
        # Row key presumably date-x-y-name; take the 4th field (dedup).
        row_key_name = row_key.split("-")[3]
        if row_key_name not in row_key_name_list:
            row_key_name_list.append(row_key_name)
    print len(row_key_name_list)
    emProcess = emotionProcess()
    rmIrr = removeIrrelevant()
    # Currently restricted to a single program — TODO confirm this is intended.
    sqlcursor.execute(
        "SELECT program_name from ini_app_program_source_rel where program_name = '跨界喜剧王' limit 1;"
    )
    bufferTemp = sqlcursor.fetchall()
    os.popen('kinit -k -t ctvit.keytab ctvit')
    kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL)
    for one_program in bufferTemp:
        commentsData = []
        tempData = []
        play_count = 0
        comment_count = 0
        play_count_box = list()
        comment_count_box = list()
        one_program = one_program[0].encode('utf8')
        # Special case: this program's data lives under a fixed date.
        if one_program == '中国好歌曲第三季':
            otherStyleTime = '2015-10-26'
        tablename = "DATA:VIDEO_ZONGYI"
        print one_program
        sqlcursor.execute(
            '''SELECT program_id from ini_app_program_source_rel where program_name = %s limit 1;''',
            (one_program, ))
        bufferTemp = sqlcursor.fetchone()
        program_id = bufferTemp[0]
        for single_rowkey in row_key_name_list:
            r = requests.get(baseurl + "/" + tablename + "/" + single_rowkey + "*",
                             auth=kerberos_auth,
                             headers={"Accept": "application/json"})
            if issuccessful(r) == False:
                print "Could not get messages from HBase. Text was:\n" + r.text
            # NOTE(review): bare except — on bad JSON the previous rowkey's
            # `bleats` is silently reprocessed (or NameError on the first
            # iteration); consider `continue` here instead.
            try:
                bleats = json.loads(r.text)
            except:
                pass
            vedio_name = ""
            for row in bleats['Row']:
                # flag stays True only while the row matches this program.
                flag = True
                for cell in row['Cell']:
                    columnname = base64.b64decode(cell['column'])
                    value = cell['$']
                    if value == None:
                        print 'none'
                        continue
                    if columnname == "base_info:column_name":
                        column = base64.b64decode(value)
                        if (one_program not in column):
                            flag = False
                            break
                    if columnname == "base_info:play_count":
                        play_count = base64.b64decode(value)
                        play_count = int(play_count)
                    if columnname == "base_info:comment_count":
                        comment_count = base64.b64decode(value)
                        comment_count = int(comment_count)
                    if columnname == "base_info:name":
                        vedio_name = base64.b64decode(value)
                if flag:
                    play_count_box.append(play_count)
                    comment_count_box.append(comment_count)
        # Composite index: integer division — 2 points per 10M plays plus
        # 1 point per 100 comments.
        total_play_count = sum(play_count_box)
        total_comment_count = sum(comment_count_box)
        total_index = total_play_count / 10000000 * 2 + total_comment_count / 100
        print "total_index", total_index
        tempData.append(total_index)
        # Play / comment totals.
        tempData.append(total_play_count)
        tempData.append(total_comment_count)
        # Date.
        otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
        tempData.append(otherStyleTime)
        # Program id and name.
        tempData.append(program_id)
        tempData.append(one_program)
        sqlcursor.execute(
            '''insert into vedio_index(total_index, play_count, comment_count, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''',
            tempData)
        sqlConn.commit()
        tempData = []
    sqlConn.close()
def addcustomerEvaluation_informal(baseurl,mysqlhostIP, mysqlUserName = '******', mysqlPassword = '', dbname = 'btv_v2'): path = os.path.abspath(os.path.dirname(sys.argv[0])) dicFile = open(path+'/tools/NTUSD_simplified/stopwords.txt','r') stopwords = dicFile.readlines() stopwordList = [] stopwordList.append(' ') for stopword in stopwords: temp = stopword.strip().replace('\r\n','').decode('utf8') stopwordList.append(temp) dicFile.close() sqlConn=MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName, passwd=mysqlPassword, db = dbname, charset='utf8') sqlcursor = sqlConn.cursor() sqlcursor.execute('''CREATE TABLE IF NOT EXISTS media_evaluation(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, flag int(1), evaluation bigint(20), content varchar(200), date Date, program_id varchar(200), program varchar(200)) DEFAULT CHARSET=utf8;''') print '新建库成功' inter = 2 now = int(time.time())-86400*inter timeArray = time.localtime(now) otherStyleTime = time.strftime("%Y-%m-%d", timeArray) print otherStyleTime os.popen('kinit -k -t ctvit.keytab ctvit') kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL) emProcess = emotionProcess() rmIrr = removeIrrelevant() sqlcursor.execute("SELECT DISTINCT(program_name) from ini_app_program_source_rel") bufferTemp = sqlcursor.fetchall() os.popen('kinit -k -t ctvit.keytab ctvit') kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL) tibeid_name = dict() tiba_id_box = list() for one_program in bufferTemp: commentsData = [] tempData = [] one_program = one_program[0].encode('utf8') # print one_program emotionsWord = [] emotionsScore = 0 count = 0 printCount = 0 tablename = "DATA:TIEBA_SITE" prefix_0 = otherStyleTime + "*" r = requests.get(baseurl + "/" + tablename + "/" + prefix_0, auth=kerberos_auth, headers = {"Accept" : "application/json"}) if issuccessful(r) == False: print "Could not get messages from HBase. 
Text was:\n" + r.text bleats = json.loads(r.text) for row in bleats['Row']: flag = True row_key = base64.b64decode(row['key']) tiba_id = row_key.split("-")[3] for cell in row['Cell']: columnname = base64.b64decode(cell['column']) value = cell['$'] if value == None: print 'none' continue if columnname == "base_info:name": tieba_name = base64.b64decode(value) if (one_program not in tieba_name): flag = False break if flag: tibeid_name[tiba_id] = tieba_name if tiba_id not in tiba_id_box: tiba_id_box.append(tiba_id_box) print "len(tiba_id_box)", len(tiba_id_box) tablename = "DATA:TIEBA_SITE_Reply" for one_tieba_id in tiba_id_box: for i,j in tibeid_name.iteritems(): if one_tieba_id == i: one_program = j prefix_1 = otherStyleTime + "-" + str(i) + "*" r = requests.get(baseurl + "/" + tablename + "/" + prefix_1, auth=kerberos_auth, headers = {"Accept" : "application/json"}) if issuccessful(r) == False: print "Could not get messages from HBase. Text was:\n" + r.text bleats = json.loads(r.text) sqlcursor.execute('''SELECT program_id from ini_app_program_source_rel where program_name = %s limit 1;''', (one_program,)) bufferTemp = sqlcursor.fetchone() program_id = bufferTemp[0] for row in bleats['Row']: flag = True for cell in row['Cell']: columnname = base64.b64decode(cell['column']) value = cell['$'] if value == None: print 'none' continue if columnname == "base_info:cdate": cdate = base64.b64decode(value) cdate = cdate.split('T')[0] if columnname == "base_info:content": content = base64.b64decode(value) if flag: (emotionsWord,emotionsScore) = emProcess.processSentence(rmIrr.removeEverythingButEmotion(content)) if emotionsScore>0: tempData.append('1') elif emotionsScore==0: tempData.append('0') elif emotionsScore<0: tempData.append('-1') # 情感得分sentimentScore tempData.append(emotionsScore) # 评论内容 tempData.append(content) # 日期时间 otherStyleTime = time.strftime("%Y-%m-%d", timeArray) tempData.append(otherStyleTime) # 栏目id tempData.append(program_id) # 栏目名称 
tempData.append(one_program) # 转换为元组 sqlcursor.execute('''insert into media_evaluation(flag, evaluation, content, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''',tempData) sqlConn.commit() tempData = [] sqlConn.close()
def addcustomerEvaluation_informal(hbaseIP, mysqlhostIP, mysqlUserName='******',
                                   mysqlPassword='', dbname='btv'):
    """Scan per-program HBase tables via happybase (Thrift), score sentiment
    for the target day's posts, and batch-insert into MySQL
    ``media_evaluation``.

    NOTE(review): relies on module-level names (happybase, emotionProcess)
    that are not imported in the visible header — confirm against the full
    source.
    """
    # Load the stopword list (built but not used below — kept for parity).
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r')
    stopwords = dicFile.readlines()
    stopwordList = []
    stopwordList.append(' ')
    for stopword in stopwords:
        temp = stopword.strip().replace('\r\n', '').decode('utf8')
        stopwordList.append(temp)
    dicFile.close()
    # Pre-load the jieba dictionaries up front.
    jieba.initialize()
    source_1 = 'weibo'
    # Connect to MySQL and make sure the target table exists.
    sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName,
                              passwd=mysqlPassword, db=dbname, charset='utf8')
    sqlcursor = sqlConn.cursor()
    sqlcursor.execute(
        '''CREATE TABLE IF NOT EXISTS media_evaluation(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, flag int(1), evaluation bigint(20), content varchar(200), date Date, program_id varchar(200), program varchar(200)) DEFAULT CHARSET=utf8;'''
    )
    print '新建库成功'
    # Target day: 37 days back (historical backfill).
    inter = 37
    now = int(time.time()) - 86400 * inter
    timeArray = time.localtime(now)
    otherStyleTime = time.strftime("%Y-%m-%d", timeArray)
    print otherStyleTime
    # HBase connection over Thrift.
    conn = happybase.Connection(hbaseIP)
    conn.open()
    # Sentiment helpers.
    emProcess = emotionProcess()
    rmIrr = removeIrrelevant()
    # Programs to process; keep hbase_info in sync with HBase.
    sqlcursor.execute(
        "SELECT DISTINCT(program) from hbase_info where source = 'buzz';")
    bufferTemp = sqlcursor.fetchall()
    for one_program in bufferTemp:
        commentsData = []
        tempData = []
        one_program = one_program[0].encode('utf8')
        print type(one_program), one_program
        # Each program has its own HBase table, recorded in hbase_info.
        sqlcursor.execute(
            '''SELECT hbase_table from hbase_info where program = %s and source = %s;''',
            (one_program, source_1))
        bufferTemp = sqlcursor.fetchone()
        program_hbase_table = bufferTemp[0]
        print program_hbase_table
        # Using weibo posts keyed by "JQJM" as a stand-in for the comments.
        table = conn.table(str(program_hbase_table))
        emotionsWord = []
        emotionsScore = 0
        count = 0
        printCount = 0
        sqlcursor.execute(
            '''SELECT program_id from competition_analysis where program = %s''',
            (one_program, ))
        bufferTemp = sqlcursor.fetchone()
        program_id = bufferTemp[0]
        print program_id
        # row_prefix / limit can cap the scan size.
        for key, data in table.scan(limit=10, batch_size=10):
            date_created = data['base_info:cdate']
            if date_created == otherStyleTime:
                content = data['base_info:text']
                print 'q', content
                count += 1
                printCount += 1
                # Sentiment keywords + score for this post.
                (emotionsWord, emotionsScore) = emProcess.processSentence(
                    rmIrr.removeEverythingButEmotion(content))
                # Polarity flag: '1' positive, '0' neutral, '-1' negative.
                if emotionsScore > 0:
                    tempData.append('1')
                elif emotionsScore == 0:
                    tempData.append('0')
                elif emotionsScore < 0:
                    tempData.append('-1')
                tempData.append(emotionsScore)
                tempData.append(content)
                tempData.append(otherStyleTime)
                tempData.append(program_id)
                tempData.append(one_program)
                commentsData.append(tuple(tempData))
                tempData = []
                # Flush to MySQL every 10 rows.
                if count >= 10:
                    sqlcursor.executemany(
                        '''insert into media_evaluation(flag, evaluation, content, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''',
                        commentsData)
                    sqlConn.commit()
                    commentsData = []
                    count = 0
                    print '插入' + str(printCount) + '个'
    # Final flush — NOTE(review): commentsData is re-bound per program, so
    # this only flushes the last program's remainder; earlier programs'
    # partial batches under 10 rows are flushed inside the loop only when
    # count reaches 10 — confirm this is intended.
    sqlcursor.executemany(
        '''insert into media_evaluation(flag, evaluation, content, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''',
        commentsData)
    sqlConn.commit()
    sqlConn.close()
def addcompetitiveProgram(baseurl,mysqlhostIP, mysqlUserName = '******', mysqlPassword = '', dbname = 'btv'): # 连接数据库 sqlConn=MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName, passwd=mysqlPassword, db = dbname, charset='utf8') sqlcursor = sqlConn.cursor() sqlcursor.execute('''CREATE TABLE IF NOT EXISTS competition_analysis(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, program varchar(50), host varchar(50), time varchar(50), channel varchar(50), introduction varchar(300), program_id varchar(50), image varchar(200), url varchar(200), length varchar(50), program_type_id varchar(2)) DEFAULT CHARSET=utf8;''') print '新建库成功' os.popen('kinit -k -t ctvit.keytab ctvit') kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL) # 表名 tablename = "DATA:WEIBO_POST_chunwan" r = requests.get(baseurl + "/" + tablename + "/*", auth=kerberos_auth, headers = {"Accept" : "application/json"}) if issuccessful(r) == False: print "Could not get messages from HBase. Text was:\n" + r.text quit() bleats = json.loads(r.text) # 连接hbase数据库 # 存储评论数据 commentsData = [] tempData = [] # 处理情感 emProcess = emotionProcess() rmIrr = removeIrrelevant() # table=conn.table('competitiveProgram') # customerEvaluation_informal需要有program标识 count = 0 printCount = 0 # 时间属性 # inter为0,即为当日 inter = 0 now = int(time.time())-86400*inter timeArray = time.localtime(now) otherStyleTime = time.strftime("%Y-%m-%d", timeArray) print otherStyleTime # row_prefix, limit可以限定次数 # bleats is json file for row in bleats['Row']: for cell in row['Cell']: columnname = base64.b64decode(cell['column']) value = cell['$'] if value == None: print 'none' continue # 播出频道 if columnname == "'base_info:channel'": channel = base64.b64decode(value) # 因为是基本信息的抓取,不需要date # 播出时间 if columnname == "'base_info:Broadcast_time'": Broadcast_time = base64.b64decode(value) # 每集长度 if columnname == "'base_info:period'": period = base64.b64decode(value) # 主持人 if columnname == "'base_info:host'": host = base64.b64decode(value) # 简介 if columnname == 
"'base_info:Info'": Info = base64.b64decode(value) # 竞争栏目id if columnname == "'base_info:program_id'": program_id = base64.b64decode(value) # 竞争栏目名称 if columnname == "'base_info:program'": program = base64.b64decode(value) # url if columnname == "'base_info:url'": url = base64.b64decode(value) count += 1 printCount+=1 # 处理每一条 tempData.append(program) tempData.append(host) tempData.append(Broadcast_time) tempData.append(channel) tempData.append(Info) tempData.append(program_id) # image tempData.append('') tempData.append(url) tempData.append(period) # program_type tempData.append('1') # 转换为元组 commentsData.append(tuple(tempData)) tempData = [] sqlcursor.executemany('''insert into competition_analysis(program, host, time, channel, introduction, program_id, image, url, length, program_type_id) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',commentsData) sqlConn.commit() commentsData = [] sqlConn.close()
def addTVSeriesTable(mongodbIP, mysqlhostIP, timeRangeBeginning,
                     mysqlUserName='******', mysqlPassword='******',
                     dbname='cctvTimer'):
    """Copy TV-series weibo posts from MongoDB into MySQL ``TVSeriesTable``,
    adding word segmentation, keywords, and sentiment per post; inserts are
    batched 100 rows at a time with a final flush.

    NOTE(review): `timeRangeBeginning` is accepted but never used here, and
    the module-level name `emotionProcess` is not imported in the visible
    header — confirm against the full source.
    """
    rmIrr = removeIrrelevant()
    emProcess = emotionProcess()
    # MySQL connection; make sure the target table exists.
    sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName,
                              passwd=mysqlPassword, db=dbname, charset='utf8')
    sqlcursor = sqlConn.cursor()
    # sqlcursor.execute('''DROP TABLE IF EXISTs TVSeriesTable;''')
    sqlcursor.execute(
        '''CREATE TABLE IF NOT EXISTS TVSeriesTable(countIndex bigint(64) primary key, weiboId bigint(64), TvSeriesName varchar(64), userId bigint(64), weiboContent varchar(1024), contentWords varchar(1024), weiboKeywords varchar(1024), sentiment varchar(64), sentimentScore int(16), sentimentKeywords varchar(1024), createdTime varchar(128)) DEFAULT CHARSET=utf8;'''
    )
    print '新建库成功'
    # MongoDB source; skip resumes a previous partially-completed run —
    # TODO confirm the hard-coded offset is still correct.
    mongoConn = pymongo.Connection(host=mongodbIP, port=27017)
    mongoCollection = mongoConn.weibo_db_cctv.weibo
    mongoCursor = mongoCollection.find().batch_size(30).skip(2495500)
    # Load the stopword list used to filter segmented words.
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r')
    stopwords = dicFile.readlines()
    stopwordList = []
    stopwordList.append(' ')
    for stopword in stopwords:
        temp = stopword.strip().replace('\r\n', '').decode('utf8')
        stopwordList.append(temp)
    dicFile.close()
    # Batch buffers.
    tvseriesData = []
    tempData = []
    # Continue countIndex from the rows already inserted.
    sqlcursor.execute('select count(*) from TVSeriesTable;')
    totalCount = sqlcursor.fetchall()
    totalCount = list(list(totalCount)[0])[0]
    count = 0
    printCount = totalCount
    for tvWeibo in mongoCursor:
        count += 1
        printCount += 1
        # countIndex (primary key, monotonically increasing)
        tempData.append(printCount)
        # weiboId
        tempData.append(tvWeibo['wid'])
        # TvSeriesName
        tempData.append(tvWeibo['keyword'])
        # userId
        tempData.append(tvWeibo['uid'])
        # weiboContent
        tempData.append(tvWeibo['content'])
        # contentWords: segmented content minus stopwords.
        tempcut_out = jieba.cut(rmIrr.removeEverything(tvWeibo['content']))
        cut_out = []
        for i in tempcut_out:
            if i not in stopwordList:
                cut_out.append(i)
        tempData.append(','.join(cut_out))
        # weiboKeywords: top-5 TF-IDF tags.
        temp = jieba.analyse.extract_tags(
            rmIrr.removeEverything(tvWeibo['content']), 5)
        weiboKeywords = ','.join(temp)
        tempData.append(weiboKeywords)
        # sentiment label from the emotion score.
        (emotionsWord, emotionsScore) = emProcess.processSentence(
            rmIrr.removeEverythingButEmotion(tvWeibo['content']))
        if emotionsScore > 0:
            tempData.append('正面')
        elif emotionsScore == 0:
            tempData.append('中立')
        else:
            tempData.append('负面')
        # sentimentScore
        tempData.append(emotionsScore)
        # sentimentKeywords
        emotionsWord = ','.join(emotionsWord)
        tempData.append(emotionsWord)
        # createdTime (human-readable, from the epoch timestamp)
        createdTime = time.ctime(tvWeibo['time'])
        tempData.append(createdTime)
        tvseriesData.append(tuple(tempData))
        tempData = []
        # Flush to MySQL every 100 rows.
        if count >= 100:
            sqlcursor.executemany(
                '''insert into TVSeriesTable(countIndex, weiboId, TvSeriesName, userId, weiboContent, contentWords, weiboKeywords, sentiment, sentimentScore, sentimentKeywords, createdTime) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
                tvseriesData)
            sqlConn.commit()
            tvseriesData = []
            count = 0
            print '插入' + str(printCount) + '个'
    # Final flush of the remaining partial batch.
    sqlcursor.executemany(
        '''insert into TVSeriesTable(countIndex, weiboId, TvSeriesName, userId, weiboContent, contentWords, weiboKeywords, sentiment, sentimentScore, sentimentKeywords, createdTime) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
        tvseriesData)
    sqlConn.commit()
    mongoConn.close()
    sqlConn.close()
def addCommentTable(mongodbIP, mysqlhostIP, mysqlUserName='******', mysqlPassword='******', dbname='cctvTimer'):
    """Incrementally import weibo comments from MongoDB into MySQL commentTable.

    Reads the last-processed timestamp from the ``weibo.timestamp`` collection
    (type='comment'), imports every comment with a newer ``task_time`` —
    segmenting, sentiment-scoring and spammer-classifying each one — then
    writes the newest seen timestamp back so the next run resumes from there.

    Parameters:
        mongodbIP   -- host of the source MongoDB (port fixed at 27017)
        mysqlhostIP -- host of the destination MySQL server
        mysqlUserName, mysqlPassword, dbname -- MySQL credentials / schema

    Side effects only (DDL, batched inserts, Mongo timestamp update, progress
    prints); returns None.
    """
    # Load the stopword list (one word per line, utf8) used to filter
    # jieba's output; a single space counts as a stopword too.
    path = os.path.abspath(os.path.dirname(sys.argv[0]))
    dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r')
    stopwords = dicFile.readlines()
    stopwordList = []
    stopwordList.append(' ')
    for stopword in stopwords:
        temp = stopword.strip().replace('\r\n', '').decode('utf8')
        stopwordList.append(temp)
    dicFile.close()
    # Eagerly build jieba's dictionary so the first cut() in the loop is fast.
    jieba.initialize()
    # MySQL destination connection.
    sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName, passwd=mysqlPassword, db=dbname, charset='utf8')
    sqlcursor = sqlConn.cursor()
    # Kept for manual resets: drop the table before a full reload.
    # sqlcursor.execute('''DROP TABLE IF EXISTs commentTable;''')
    # print '删库成功'
    sqlcursor.execute(
        '''CREATE TABLE IF NOT EXISTS commentTable(countIndex bigint(64) primary key, commentId bigint(64), weiboId bigint(64), userId bigint(64), comment varchar(1024), sentimentKeywords varchar(128), contentKeywords varchar(1024), sentiment varchar(16), sentimentScore int(16), userName varchar(64), userSex varchar(16), userLocation varchar(64), userFollowerCount int(64), userFriendCount int(64), userStatusCount int(64), userType varchar(32), spammerJudge varchar(16), replyTime varchar(128)) DEFAULT CHARSET=utf8;'''
    )
    print '新建库成功'
    # MongoDB source connection (legacy pymongo.Connection API).
    mongoConn = pymongo.Connection(host=mongodbIP, port=27017)
    # Read the checkpoint timestamp for comments; default to ~27 years ago
    # (i.e. "import everything") when no checkpoint document exists.
    mongoCursor = mongoConn.weibo.timestamp.find({'type': 'comment'}).limit(1)
    timeRangeBeginning = datetime.datetime.now() - datetime.timedelta(
        days=9999)
    # print timeRangeBeginning
    # One-time bootstrap of the checkpoint documents, kept for reference:
    # a=dict()
    # a['type']='comment'
    # a['time']=timeRangeBeginning
    # mongoConn.weibo.timestamp.insert(a)
    # a=dict()
    # a['type']='repost'
    # a['time']=timeRangeBeginning
    # mongoConn.weibo.timestamp.insert(a)
    # a=dict()
    # a['type']='weibo'
    # a['time']=timeRangeBeginning
    # mongoConn.weibo.timestamp.insert(a)
    for i in mongoCursor:
        timeRangeBeginning = i['time']
    # newTimestamp tracks the latest task_time seen; written back at the end.
    newTimestamp = timeRangeBeginning
    # Re-point the cursor variable at the actual comment query (everything
    # newer than the checkpoint, in task_time order).
    mongoCursor = mongoConn.weibo.comment.find({
        'task_time': {
            '$gt': timeRangeBeginning
        }
    }).sort('task_time').batch_size(30)
    print '查询mongoDB成功'
    # Continue countIndex numbering from the rows already in MySQL.
    sqlcursor.execute('select count(*) from commentTable;')
    totalCount = sqlcursor.fetchall()
    totalCount = list(list(totalCount)[0])[0]
    # Row buffers: tempData is one row's columns, commentsData the pending batch.
    commentsData = []
    tempData = []
    # Project-local helpers: sentiment, text cleanup, spammer heuristic.
    emProcess = emotionProcess()
    rmIrr = removeIrrelevant()
    spamDet = spammerdetect()
    emotionsWord = []
    emotionsScore = 0
    count = 0
    printCount = totalCount
    # try:
    for comment in mongoCursor:
        # if comment['task_time']>timeRangeBeginning+ datetime.timedelta(days=1):
        #     continue
        count += 1
        printCount += 1
        # countIndex (primary key, monotonically increasing)
        tempData.append(printCount)
        # commentId
        tempData.append(comment['comment_id'])
        # weiboId of the post being commented on
        tempData.append(comment['weibo_id'])
        # userId of the commenter
        tempData.append(comment['comment_user_id'])
        # raw comment text
        tempData.append(comment['comment_text'])
        # sentimentKeywords: emotion-bearing words found in the comment
        (emotionsWord, emotionsScore) = emProcess.processSentence(
            rmIrr.removeEverythingButEmotion(comment['comment_text']))
        emotionsWord = ','.join(emotionsWord)
        tempData.append(emotionsWord)
        # print comment['mid']
        # print comment['status']['mid']
        # print comment['user']['id']
        # print comment['text']
        # contentKeywords: jieba segmentation with stopwords removed
        tempcut_out = jieba.cut(rmIrr.removeEverything(
            comment['comment_text']))
        cut_out = []
        for i in tempcut_out:
            if i not in stopwordList:
                cut_out.append(i)
        tempData.append(','.join(cut_out))
        # sentiment label: score > 0 positive / == 0 neutral / < 0 negative
        if emotionsScore > 0:
            tempData.append('正面')
        elif emotionsScore == 0:
            tempData.append('中立')
        else:
            tempData.append('负面')
        # sentimentScore (signed integer)
        tempData.append(emotionsScore)
        # userName
        tempData.append(comment['comment_user_name'])
        # userSex
        tempData.append(comment['comment_gender'])
        # userLocation
        tempData.append(comment['comment_location'])
        # userFollowerCount
        tempData.append(comment['comment_followers_count'])
        # userFriendCount
        tempData.append(comment['comment_friends_count'])
        # userStatusCount
        tempData.append(comment['comment_statuses_count'])
        # userType from weibo's verified_type code:
        # -1 ordinary user, 220/200 "weibo expert", 0 personal verified,
        # anything else treated as corporate verified.
        if (comment['comment_verified_type'] == -1):
            tempData.append('普通用户')
        elif (comment['comment_verified_type'] == 220) or (comment['comment_verified_type'] == 200):
            tempData.append('微博达人')
        elif (comment['comment_verified_type'] == 0):
            tempData.append('个人认证')
        else:
            tempData.append('企业认证')
        # spammerJudge: feed the profile stats to the spammer detector;
        # positive score means a normal account, otherwise flagged as spammer.
        userInfo = {}
        userInfo['statuses_count'] = comment['comment_statuses_count']
        userInfo['followers_count'] = comment['comment_followers_count']
        userInfo['friends_count'] = comment['comment_friends_count']
        userInfo['bi_followers_count'] = comment['comment_bi_followers_count']
        userInfo['domain'] = comment['comment_user_domain']
        userInfo['url'] = comment['comment_url']
        userInfo['description'] = comment['comment_description']
        userInfo['location'] = comment['comment_location']
        userInfo['verified'] = comment['comment_verified']
        userInfo['verified_type'] = comment['comment_verified_type']
        # Advance the checkpoint candidate (cursor is sorted by task_time).
        newTimestamp = comment['task_time']
        spamScore = spamDet.detectSpammer(userInfo)
        if spamScore > 0:
            tempData.append("正常")
        else:
            tempData.append("水军")
        # replyTime: reformat 'YYYY-mm-dd HH:MM:SS' into ctime-like form
        hh = time.strptime(str(comment['comment_created_at']), '%Y-%m-%d %H:%M:%S')
        commentTime = time.strftime("%a %b %d %H:%M:%S %Y", hh)
        tempData.append(commentTime)
        # Queue the finished row; flush to MySQL every 10 rows.
        commentsData.append(tuple(tempData))
        tempData = []
        if count >= 10:
            sqlcursor.executemany(
                '''insert into commentTable(countIndex, commentId, weiboId, userId, comment, sentimentKeywords, contentKeywords, sentiment, sentimentScore, userName, userSex,userLocation, userFollowerCount, userFriendCount, userStatusCount, userType,spammerJudge, replyTime) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
                commentsData)
            sqlConn.commit()
            commentsData = []
            count = 0
            print '插入' + str(printCount) + '个'
    # # except:
    # #     print tempData
    # Flush the final partial batch.
    sqlcursor.executemany(
        '''insert into commentTable(countIndex, commentId, weiboId, userId, comment, sentimentKeywords, contentKeywords, sentiment, sentimentScore, userName, userSex, userLocation, userFollowerCount, userFriendCount, userStatusCount, userType,spammerJudge, replyTime) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''',
        commentsData)
    sqlConn.commit()
    sqlConn.close()
    # Persist the checkpoint only after the SQL inserts committed, so a crash
    # above re-imports rather than skips comments.
    mongoConn.weibo.timestamp.update({'type': 'comment'}, {'$set': {
        'time': newTimestamp
    }})
    mongoConn.close()
def mentioned_trend(baseurl, mysqlhostIP, mysqlUserName='******', mysqlPassword='', dbname='btv_v2'): # 连接数据库 sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName, passwd=mysqlPassword, db=dbname, charset='utf8') sqlcursor = sqlConn.cursor() sqlcursor.execute( '''CREATE TABLE IF NOT EXISTS gala_likability_mentioned(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, channel varchar(20), mentioned bigint(50), likability bigint(50), date date, program_id varchar(50), program varchar(50)) DEFAULT CHARSET=utf8;''' ) print '新建库成功' # 连接hbase数据库 os.popen('kinit -k -t ctvit.keytab ctvit') kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL) tablename = "DATA:WEIBO_POST_Keywords" r = requests.get(baseurl + "/" + tablename + "/*", auth=kerberos_auth, headers={"Accept": "application/json"}) if issuccessful(r) == False: print "Could not get messages from HBase. Text was:\n" + r.text # quit() bleats = json.loads(r.text) # 北京卫视 2.8 # 时间属性 # inter为0,即为当日 # inter = 91 # now = int(time.time())-86400*inter # timeArray = time.localtime(now) # otherStyleTime = time.strftime("%Y-%m-%d", timeArray) # print otherStyleTime # # # 存储评论数据 # sqlcursor.execute('''SELECT mentioned from gala_mentioned_trend where date = %s and program = '2016年北京卫视春节联欢晚会';''',(otherStyleTime,)) # bufferTemp = sqlcursor.fetchone() # chunwan_times = bufferTemp[0] # print chunwan_times # emProcess = emotionProcess() # rmIrr = removeIrrelevant() # tempData = [] # sum_emotionsScore = 0 # # bleats is json file # for row in bleats['Row']: # flag = True # for cell in row['Cell']: # columnname = base64.b64decode(cell['column']) # value = cell['$'] # if value == None: # print 'none' # continue # if columnname == "base_info:match": # column = base64.b64decode(value) # if ("北京卫视春晚" not in column) and ("北京台的春晚" not in column) and ("BTV春晚" not in column) and ("BTV春晚" not in column) and ("bTV春晚" not in column): # flag = False # break # if columnname == "base_info:cdate": # cdate = base64.b64decode(value) # 
cdate = cdate.split('T')[0] # if cdate != otherStyleTime: # flag = False # break # if columnname == "base_info:text": # content = base64.b64decode(value) # # print 'q',content # # 情感关键词 # if flag: # (emotionsWord,emotionsScore) = emProcess.processSentence(rmIrr.removeEverythingButEmotion(content)) # # 倾向性判断flag:1是正面,0是中性,-1是负面 # # 情感极性判断,这里我限制了更严格的条件 # # channel # # if emotionsScore > 0: # sum_emotionsScore += emotionsScore # tempData.append('北京卫视') # # mentioned # tempData.append(chunwan_times) # # likability # tempData.append(sum_emotionsScore) # # date # tempData.append(otherStyleTime) # # program_id # tempData.append('100') # # program # tempData.append('2016年北京卫视春节联欢晚会') # sqlcursor.execute('''insert into gala_likability_mentioned(channel, mentioned, likability, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''',tempData) # sqlConn.commit() # tempData = [] # 湖南卫视 2.2 # inter为0,即为当日 inter_hn = 97 now = int(time.time()) - 86400 * inter_hn timeArray = time.localtime(now) otherStyleTime = time.strftime("%Y-%m-%d", timeArray) print 'bb', otherStyleTime # 存储评论数据 sqlcursor.execute( '''SELECT mentioned from gala_mentioned_trend where date = %s and program = '2016湖南卫视小年夜春晚';''', (otherStyleTime, )) bufferTemp = sqlcursor.fetchone() chunwan_times = bufferTemp[0] print chunwan_times emProcess = emotionProcess() rmIrr = removeIrrelevant() tempData = [] sum_emotionsScore = 0 # bleats is json file for row in bleats['Row']: flag = True for cell in row['Cell']: columnname = base64.b64decode(cell['column']) value = cell['$'] if value == None: print 'none' continue if columnname == "base_info:match": column = base64.b64decode(value) if ("湖南" not in column) and ("芒果" not in column): flag = False break if columnname == "base_info:cdate": cdate = base64.b64decode(value) cdate = cdate.split('T')[0] if cdate != otherStyleTime: flag = False break if columnname == "base_info:text": content = base64.b64decode(value) # print 'q',content # 情感关键词 if flag: (emotionsWord, 
emotionsScore) = emProcess.processSentence( rmIrr.removeEverythingButEmotion(content)) # 倾向性判断flag:1是正面,0是中性,-1是负面 # 情感极性判断,这里我限制了更严格的条件 # channel # if emotionsScore > 0: sum_emotionsScore += emotionsScore tempData.append('湖南卫视') # mentioned tempData.append(chunwan_times) # likability tempData.append(sum_emotionsScore) # date tempData.append(otherStyleTime) # program_id tempData.append('100') # program tempData.append('2016湖南卫视小年夜春晚') sqlcursor.execute( '''insert into gala_likability_mentioned(channel, mentioned, likability, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''', tempData) sqlConn.commit() tempData = [] # 辽宁卫视2.6 # inter_ln = 93 # now = int(time.time())-86400*inter_ln # timeArray = time.localtime(now) # otherStyleTime = time.strftime("%Y-%m-%d", timeArray) # print otherStyleTime # # # 存储评论数据 # sqlcursor.execute('''SELECT mentioned from gala_mentioned_trend where date = %s and program = '2016年辽宁卫视春晚';''',(otherStyleTime,)) # bufferTemp = sqlcursor.fetchone() # chunwan_times = bufferTemp[0] # print chunwan_times # emProcess = emotionProcess() # rmIrr = removeIrrelevant() # tempData = [] # sum_emotionsScore = 0 # # bleats is json file # for row in bleats['Row']: # flag = True # for cell in row['Cell']: # columnname = base64.b64decode(cell['column']) # value = cell['$'] # if value == None: # print 'none' # continue # if columnname == "base_info:match": # column = base64.b64decode(value) # if ("辽宁" not in column): # flag = False # break # if columnname == "base_info:cdate": # cdate = base64.b64decode(value) # cdate = cdate.split('T')[0] # if cdate != otherStyleTime: # flag = False # break # if columnname == "base_info:text": # content = base64.b64decode(value) # # print 'q',content # # 情感关键词 # if flag: # (emotionsWord,emotionsScore) = emProcess.processSentence(rmIrr.removeEverythingButEmotion(content)) # # 倾向性判断flag:1是正面,0是中性,-1是负面 # # 情感极性判断,这里我限制了更严格的条件 # # channel # # if emotionsScore > 0: # sum_emotionsScore += emotionsScore # 
tempData.append('辽宁卫视') # # mentioned # tempData.append(chunwan_times) # # likability # tempData.append(sum_emotionsScore) # # date # tempData.append(otherStyleTime) # # program_id # tempData.append('100') # # program # tempData.append('2016年辽宁卫视春晚') # sqlcursor.execute('''insert into gala_likability_mentioned(channel, mentioned, likability, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''',tempData) # sqlConn.commit() # tempData = [] sqlConn.close()