def getData(id, package_name):
    """Scrape one game from the Flyme store and persist it with its comments.

    Reads the comment total from the ``evaluate/list`` endpoint, reads the
    score, download count and name from the detail page, inserts one row into
    ``games`` (``from_store`` = ``'meizu'``) and then one row per comment into
    ``comment``. Everything is committed once at the end; the connection is
    always released.

    Values are passed to ``cursor.execute`` as bound parameters instead of
    being interpolated into the SQL text, so comment text is stored verbatim
    (the former double-quote -> single-quote rewrite is no longer needed).
    NOTE(review): this assumes the driver behind ``DbUtil`` uses the ``%s``
    ("format") DB-API paramstyle, as MySQLdb / PyMySQL do -- confirm.

    Args:
        id: App id substituted into the ``evaluate/list`` URL.
        package_name: Package name substituted into the detail-page URL.

    Raises:
        ValueError: If the detail page has no ``<h3>`` under
            ``div.detail_top`` (previously an ``UnboundLocalError``).
    """
    list_url = 'http://app.flyme.cn/apps/public/evaluate/list?app_id=%s&start=0&max=%s'
    game_sql = ('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) '
                'VALUES (%s, %s, %s, %s, %s, %s);')
    comment_sql = ('INSERT INTO comment(game_id, content, comment_time, author, score) '
                   'VALUES (%s, %s, %s, %s, %s);')

    # Asking for a single comment is enough to learn the total count.
    total = BsUtil.praseJson(list_url % (id, 1))
    totalComCount = total['value']['totalCount']

    # Total downloads and score come from the detail page.
    soup = BsUtil.praseHtml(
        'http://app.flyme.cn/games/public/detail?package_name=%s' % package_name)
    totalScore = soup.find('div', class_="star_bg").attrs['data-num']
    totalDownload = soup.find(
        text="下 载:").parent.next_sibling.next_sibling.string

    # Game name: text of the <h3> child of div.detail_top (last one wins).
    names = [child.string
             for child in soup.find('div', class_="detail_top").children
             if child.name == 'h3']
    if not names:
        raise ValueError('no <h3> game name found for package %r' % (package_name,))
    game_name = names[-1]

    conn, cur = DbUtil.getConn()
    try:
        # str() keeps the stored text identical to the old "%s" formatting and
        # hands the driver plain strings rather than parser node objects.
        cur.execute(game_sql, (str(game_name), 'meizu', totalComCount,
                               str(totalScore), str(totalDownload),
                               DateUtil.currentDate()))
        game_id = cur.lastrowid

        # Fetch every comment in one request.
        value = BsUtil.praseJson(list_url % (id, totalComCount))
        for com in value['value']['list']:
            row = (game_id,
                   html.unescape(com['comment']),
                   com['create_time'],
                   html.unescape(com['user_name']),
                   com['star'])
            try:
                cur.execute(comment_sql, row)
            except Exception as exc:
                # Best effort: one bad comment must not abort the import,
                # but the failure is reported instead of silently dropped.
                print(type(exc), ":", exc)
        conn.commit()
    finally:
        DbUtil.close(conn, cur)
def plot(game_name, game_id):
    """Draw a keyword tag cloud for one game's comments and save it as a PNG.

    For every comment row of ``game_id`` the top three jieba keywords are
    extracted from column index 2; keywords shorter than two characters or
    present in the module-level ``stop`` collection are ignored. The keyword
    counts are printed, ranked by frequency, and rendered to
    ``c:/<game_name>.png``.

    Args:
        game_name: Used only to build the output file name.
        game_id: Value interpolated into the comment query.
    """
    query = "select * from comment where game_id = %s" % game_id
    freq = {}
    for row in DbUtil.getAllResult(query):
        for keyword in jieba.analyse.extract_tags(row[2], topK=3):
            if len(keyword) >= 2 and keyword not in stop:
                freq[keyword] = freq.get(keyword, 0) + 1
    print(freq)

    # Rank by count, highest first, then keep positions 1..49.
    # NOTE(review): the slice starts at 1, so the most frequent keyword is
    # dropped -- confirm that skipping index 0 is intentional.
    ranked = sorted(freq.items(), key=itemgetter(1), reverse=True)[1:50]

    palette = random.choice(list(COLOR_SCHEMES.values()))
    tags = make_tags(ranked, minsize=30, maxsize=120, colors=palette)
    create_tag_image(
        tags,
        'c:/%s.png' % game_name,
        background=(0, 0, 0, 255),
        size=(900, 600),
        fontname='SimHei',
    )
    print('having save file to dick')
def getData(id, name, totalScore):
    """Download all comments of one Xiaomi-market game and store them.

    Inserts one row into ``games`` (``from_store`` = ``'xiaomi'``, score stored
    as ``totalScore * 10``, download count 0) and then walks the paged comment
    endpoint until the response's ``hasMore`` flag is false, inserting one row
    per comment. Everything is committed once at the end; the connection is
    always released.

    Values are passed to ``cursor.execute`` as bound parameters instead of
    being interpolated into the SQL text, so the former double-quote ->
    single-quote rewrite of content/author is no longer needed.
    NOTE(review): this assumes the driver behind ``DbUtil`` uses the ``%s``
    ("format") DB-API paramstyle, as MySQLdb / PyMySQL do -- confirm.

    Args:
        id: App id substituted into the comment-list URL.
        name: Game name stored in ``games.game_name``.
        totalScore: Store score; multiplied by 10 before it is stored.
    """
    url = ('http://market.xiaomi.com/apm/comment/list/%s?'
           'clientId=2bb48bb54747e03a6ab667ab7b51050a&co=CN'
           '&la=zh&os=1461822601&page=%s&sdk=22')
    game_sql = ('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) '
                'VALUES (%s, %s, %s, %s, %s, %s);')
    comment_sql = ('INSERT INTO comment(game_id, content, comment_time, author, score) '
                   'VALUES (%s, %s, %s, %s, %s);')

    conn, cur = DbUtil.getConn()
    try:
        page = 0
        result = BsUtil.praseJson(url % (id, page))

        # Insert the game row; the first page already carries the total count.
        game_row = (name, 'xiaomi', result['pointCount'], totalScore * 10, 0,
                    DateUtil.currentDate())
        print(game_sql, game_row)
        cur.execute(game_sql, game_row)
        game_id = cur.lastrowid

        # The first page is reused here rather than requested a second time.
        while True:
            for comment in result['comments']:
                try:
                    row = (game_id,
                           comment['commentValue'].replace(" ", ""),
                           # updateTime is divided by 1000 before conversion,
                           # exactly as the original code did.
                           DateUtil.longToStrTime(comment['updateTime'] / 1000),
                           comment['nickname'],
                           comment['pointValue'])
                    print(comment_sql, row)
                    cur.execute(comment_sql, row)
                except Exception as exc:
                    # Best effort: report the failing comment and continue.
                    print(type(exc), ":", exc)
            if not result['hasMore']:
                break
            page += 1
            result = BsUtil.praseJson(url % (id, page))

        conn.commit()
    finally:
        DbUtil.close(conn, cur)
def plot(game_name, game_id):
    """Draw a keyword tag cloud for one game's comments and save it as a PNG.

    Counts the top-3 jieba keywords of every comment row belonging to
    ``game_id``, ranks them by frequency and renders the result to
    ``C:/Users/Administrator/Desktop/<game_name>_<game_id>.png``.

    Args:
        game_name: Used in the output file name and the progress message.
        game_id: Value interpolated into the comment query and the file name.
    """
    # NOTE: this local name shadows the builtin ``dict`` inside the function.
    dict = {}
    comments = DbUtil.getAllResult("select * from comment where game_id = %s" % game_id)
    for comment in comments:
        # Column index 2 is the text handed to jieba
        # (presumably the comment content -- verify against the table schema).
        result = jieba.analyse.extract_tags(comment[2], topK=3)
        for word in result:
            # Ignore keywords shorter than two characters and stop words.
            if len(word) < 2:
                continue
            elif word in stop:
                continue
            if word not in dict:
                dict[word] = 1
            else:
                dict[word] += 1
    # Rank keywords by count, highest first.
    swd = sorted(dict.items(), key=itemgetter(1), reverse=True)
    # NOTE(review): the slice starts at 1, so the most frequent keyword is
    # dropped and at most 49 remain -- confirm that this is intentional.
    swd = swd[1:50]
    tags = make_tags(swd, minsize=30, maxsize=100, colors=random.choice(list(COLOR_SCHEMES.values())))
    create_tag_image(tags, 'C:/Users/Administrator/Desktop/%s_%s.png' % (game_name, game_id), background=(0, 0, 0, 255), size=(900, 600), fontname='SimHei')
    print('create file ---%s' % game_name)
    # Disabled experiment: store the ranked keywords as a dict literal.
    # dict = {}
    #
    # for (k, v) in swd:
    #     dict[k] = v
    # print('INSERT INTO keyword (game_id, keyword) VALUES (%s, "%s"' % (game_id, str(dict)))
    # cur.execute('INSERT INTO keyword (game_id, keyword) VALUES (%s, "%s")' % (game_id, str(dict)))
    # conn.commit()
    # NOTE(review): the original indentation of the two statements below was
    # lost; they are shown at function-body level -- confirm whether they
    # belong to plot() or to module level.
    word = DbUtil.getOneResult('select keyword from keyword limit 1')
    # NOTE(review): eval() executes whatever text the database returns; if the
    # column only ever holds a dict literal, ast.literal_eval is the safe parser.
    print(eval(word[0]))
def getData(id, package_name):
    """Scrape one game from the Flyme store and persist it with its comments.

    Reads the comment total from the ``evaluate/list`` endpoint, reads the
    score, download count and name from the detail page, inserts one row into
    ``games`` (``from_store`` = ``'meizu'``) and then one row per comment into
    ``comment``. Everything is committed once at the end; the connection is
    always released.

    Values are passed to ``cursor.execute`` as bound parameters instead of
    being interpolated into the SQL text, so comment text is stored verbatim
    (the former double-quote -> single-quote rewrite is no longer needed).
    NOTE(review): this assumes the driver behind ``DbUtil`` uses the ``%s``
    ("format") DB-API paramstyle, as MySQLdb / PyMySQL do -- confirm.

    Args:
        id: App id substituted into the ``evaluate/list`` URL.
        package_name: Package name substituted into the detail-page URL.

    Raises:
        ValueError: If the detail page has no ``<h3>`` under
            ``div.detail_top`` (previously an ``UnboundLocalError``).
    """
    list_url = 'http://app.flyme.cn/apps/public/evaluate/list?app_id=%s&start=0&max=%s'
    game_sql = ('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) '
                'VALUES (%s, %s, %s, %s, %s, %s);')
    comment_sql = ('INSERT INTO comment(game_id, content, comment_time, author, score) '
                   'VALUES (%s, %s, %s, %s, %s);')

    # Asking for a single comment is enough to learn the total count.
    total = BsUtil.praseJson(list_url % (id, 1))
    totalComCount = total['value']['totalCount']

    # Total downloads and score come from the detail page.
    soup = BsUtil.praseHtml(
        'http://app.flyme.cn/games/public/detail?package_name=%s' % package_name)
    totalScore = soup.find('div', class_="star_bg").attrs['data-num']
    totalDownload = soup.find(
        text="下 载:").parent.next_sibling.next_sibling.string

    # Game name: text of the <h3> child of div.detail_top (last one wins).
    names = [child.string
             for child in soup.find('div', class_="detail_top").children
             if child.name == 'h3']
    if not names:
        raise ValueError('no <h3> game name found for package %r' % (package_name,))
    game_name = names[-1]

    conn, cur = DbUtil.getConn()
    try:
        # str() keeps the stored text identical to the old "%s" formatting and
        # hands the driver plain strings rather than parser node objects.
        cur.execute(game_sql, (str(game_name), 'meizu', totalComCount,
                               str(totalScore), str(totalDownload),
                               DateUtil.currentDate()))
        game_id = cur.lastrowid

        # Fetch every comment in one request.
        value = BsUtil.praseJson(list_url % (id, totalComCount))
        for com in value['value']['list']:
            row = (game_id,
                   html.unescape(com['comment']),
                   com['create_time'],
                   html.unescape(com['user_name']),
                   com['star'])
            try:
                cur.execute(comment_sql, row)
            except Exception as exc:
                # Best effort: one bad comment must not abort the import,
                # but the failure is reported instead of silently dropped.
                print(type(exc), ":", exc)
        conn.commit()
    finally:
        DbUtil.close(conn, cur)
def cut():
    """Tokenise stored comments and write them to ``test1.txt``.

    Reads up to 300000 rows from ``comment``, segments column index 2 of each
    row with ``jieba.cut``, drops tokens that are in the module-level ``stop``
    collection or are a single space, and writes the remaining tokens of each
    comment as one space-separated line (UTF-8). Comments that end up with no
    tokens produce no line.

    The output file is managed by a ``with`` block so it is closed even when
    tokenising or writing fails part-way through.
    """
    comments = DbUtil.getAllResult("select * from comment limit 300000")
    with open("test1.txt", "w", encoding="utf-8") as out:
        for comment in comments:
            words = [word for word in jieba.cut(comment[2])
                     if word not in stop and word != ' ']
            if words:
                out.write(" ".join(words))
                out.write("\n")
import random from operator import itemgetter import jieba import jieba.analyse from pytagcloud import make_tags, create_tag_image from pytagcloud.colors import COLOR_SCHEMES from site.mybzz.util import DbUtil stop = [] conn, cur = DbUtil.getConn() def plot(game_name, game_id): dict = {} comments = DbUtil.getAllResult("select * from comment where game_id = %s" % game_id) for comment in comments: result = jieba.analyse.extract_tags(comment[2], topK=3) for word in result: if len(word) < 2: continue elif word in stop: continue if word not in dict: dict[word] = 1 else: dict[word] += 1
def toDict(list):
    """Map every word of *list* that is in ``best_words`` to ``True``.

    ``best_words`` is a module-level collection defined elsewhere in the file.
    """
    return {word: True for word in list if word in best_words}


def features(feature_extraction_method):
    """Apply *feature_extraction_method* to each entry of the global ``lists``.

    Returns the results as a list, in the same order as ``lists``.
    """
    # Original note (translated): "assign 'pos' to positive texts" -- the
    # visible code attaches no label, it only collects the extracted features.
    return [feature_extraction_method(i) for i in lists]


if __name__ == '__main__':
    # Maintenance script: overwrite comment_time of every comment, in shuffled
    # batches of 1000 rows.
    getStop()
    comments = list(DbUtil.getAllResult("select * from comment"))
    shuffle(comments)
    conn, cur = DbUtil.getConn()
    update_sql = 'UPDATE comment set comment_time = "%s" where id =%s'
    for i in range(0, 1065000, 1000):
        print(i)
        # Batch number folded into 1..30; passed to Ran.getTime for each row.
        day = (i // 1000 + 1) % 30 + 1
        for comment in comments[i:i + 1000]:
            # Ran.getTime is called once for the log line and once for the
            # statement that is executed, as in the original.
            print(update_sql % (Ran.getTime(day), comment[0]))
            cur.execute(update_sql % (Ran.getTime(day), comment[0]))
    conn.commit()
    DbUtil.close(conn, cur)
    # lists = []
    # for comment in comments:
    #     list = []
# NOTE(review): the statement below is the tail of a definition that starts
# outside this chunk; it is kept unchanged and its original indentation is
# unknown.
    print(neg)


if __name__ == '__main__':
    getStop()
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file -- only load .pkl files produced by this project. The file objects
    # opened here are never closed explicitly.
    pos = pickle.load(open("pos_review.pkl", 'rb'))
    neg = pickle.load(open("neg_review.pkl", 'rb'))
    print(len(pos))
    print(len(neg))
    # Remove every empty list from both loaded collections.
    while [] in pos:
        pos.pop(pos.index([]))
    while [] in neg:
        neg.pop(neg.index([]))
    # Take a 10000-row window of comments, shuffle it and keep 100 rows.
    comments = list(DbUtil.getAllResult("select * from comment limit 10000 offset 600000"))
    shuffle(comments)
    comments = comments[:100]
    lists = []
    for comment in comments:
        # NOTE: from here on the name ``list`` refers to this token list and
        # no longer to the builtin.
        list = []
        # Column index 2 is the text that gets segmented
        # (presumably the comment content -- verify against the table schema).
        result = jieba.cut(comment[2])
        for word in result:
            # Keep tokens that are neither stop words nor a single space.
            if word not in stop and word != ' ':
                list.append(word)
        lists.append(list)
    count = 0
    frame = tkinter.Tk()
if __name__ == '__main__':
    getStop()
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file -- only load .pkl files produced by this project. The file objects
    # opened here are never closed explicitly.
    pos = pickle.load(open("pos_review.pkl", 'rb'))
    neg = pickle.load(open("neg_review.pkl", 'rb'))
    print(len(pos))
    print(len(neg))
    # Remove every empty list from both loaded collections.
    while [] in pos:
        pos.pop(pos.index([]))
    while [] in neg:
        neg.pop(neg.index([]))
    # Take a 10000-row window of comments, shuffle it and keep 100 rows.
    comments = list(
        DbUtil.getAllResult("select * from comment limit 10000 offset 600000"))
    shuffle(comments)
    comments = comments[:100]
    lists = []
    for comment in comments:
        # NOTE: from here on the name ``list`` refers to this token list and
        # no longer to the builtin.
        list = []
        # Column index 2 is the text that gets segmented
        # (presumably the comment content -- verify against the table schema).
        result = jieba.cut(comment[2])
        for word in result:
            # Keep tokens that are neither stop words nor a single space.
            if word not in stop and word != ' ':
                list.append(word)
        lists.append(list)
    count = 0
    frame = tkinter.Tk()
create_tag_image(tags, 'c:/%s.png' % game_name, background=(0, 0, 0, 255), size=(900, 600), fontname='SimHei') print('having save file to dick') if __name__ == '__main__': f = open("../StopWords.txt", encoding="utf-8") jieba.load_userdict("c:/dict.txt") while True: line = f.readline().replace("\n", '') if not line: break stop.append(line) games = DbUtil.getAllResult( "select game_id,games.game_name from `comment` join games on game_id = games.id GROUP BY game_id ORDER BY count(game_id) desc limit 50" ) l = [] for game in games: if game[1] not in l: plot(game[1], game[0]) l.append(game[1])
def toDict(list):
    """Map every word of *list* that is in ``best_words`` to ``True``.

    ``best_words`` is a module-level collection defined elsewhere in the file.
    """
    return {word: True for word in list if word in best_words}


def features(feature_extraction_method):
    """Apply *feature_extraction_method* to each entry of the global ``lists``.

    Returns the results as a list, in the same order as ``lists``.
    """
    # Original note (translated): "assign 'pos' to positive texts" -- the
    # visible code attaches no label, it only collects the extracted features.
    return [feature_extraction_method(i) for i in lists]


if __name__ == '__main__':
    # Maintenance script: overwrite comment_time of every comment, in shuffled
    # batches of 1000 rows.
    getStop()
    comments = list(DbUtil.getAllResult("select * from comment"))
    shuffle(comments)
    conn, cur = DbUtil.getConn()
    update_sql = 'UPDATE comment set comment_time = "%s" where id =%s'
    for i in range(0, 1065000, 1000):
        print(i)
        # Batch number folded into 1..30; passed to Ran.getTime for each row.
        day = (i // 1000 + 1) % 30 + 1
        for comment in comments[i:i + 1000]:
            # Ran.getTime is called once for the log line and once for the
            # statement that is executed, as in the original.
            print(update_sql % (Ran.getTime(day), comment[0]))
            cur.execute(update_sql % (Ran.getTime(day), comment[0]))
    conn.commit()
    DbUtil.close(conn, cur)
    # lists = []
    # for comment in comments:
    #     list = []
    #     result = jieba.cut(comment[2])
    #     for word in result:
import sys from site.mybzz.util import DbUtil from site.mybzz.util import BsUtil from site.mybzz.util import DateUtil conn, cur = DbUtil.getConn() def getData(name, id, score, totalDownload): commentUrl = "http://comment.mobilem.360.cn/comment/getComments?baike=%s&start=%s&count=%s" start, count = 0, 50 result = BsUtil.praseJson(commentUrl % (id, start, 1)) totalComCount = result['data']['total'] print( 'INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 'VALUES ("%s", "%s", "%s", %d, "%s", "%s");' % (name, '360', totalComCount, (score * 10) / 2, totalDownload, DateUtil.currentDate())) cur.execute( 'INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 'VALUES ("%s", "%s", "%s", %d, "%s", "%s");' % (name, '360', totalComCount, (score * 10) / 2, totalDownload, DateUtil.currentDate())) game_id = cur.lastrowid while (True): try: result = BsUtil.praseJson(commentUrl % (id, start, count)) except:
colors=random.choice(list(COLOR_SCHEMES.values()))) create_tag_image(tags, 'c:/%s.png' % game_name, background=(0, 0, 0, 255), size=(900, 600), fontname='SimHei') print('having save file to dick') if __name__ == '__main__': f = open("../StopWords.txt", encoding="utf-8") jieba.load_userdict("c:/dict.txt") while True: line = f.readline().replace("\n", '') if not line: break stop.append(line) games = DbUtil.getAllResult("select game_id,games.game_name from `comment` join games on game_id = games.id GROUP BY game_id ORDER BY count(game_id) desc limit 50") l =[] for game in games: if game[1] not in l: plot(game[1], game[0]) l.append(game[1])
# cur = conn.cursor() # # cur.execute("INSERT INTO comment(game_name, content, comment_time, author, score)" # " VALUES ('游戏名123', '内容123', '2016-05-19 15:56:07', 'ban', '44');") # conn.commit() # print("VALUES (%s, %s, %s, %s, %d);" % ('游戏名123', '内容123', '2016-05-19 15:56:07', 'ban', 44)) # print(time.localtime(1463739856)) # print(DateUtil.lomgToStrTime(1463739856)) # statement = "select * from comment" # # data =DbUtil.getAllResult(statement) # for d in data: # print("游戏名:%s,内容:%s,时间:%s" % (d[1],d[2],d[3])) conn, cur = DbUtil.getConn() # if __name__ == '__main__': # comments = DbUtil.getAllResult("select * from comment where game_id = 275 limit 10000") # file = open("c:/穿越火线_输入.txt", "w", encoding = "GBK") # for comment in comments: # try: # print(comment[2]) # file.write(comment[2]) # except: # pass word = DbUtil.getOneResult('select keyword from keyword limit 1') print(eval(word[0])) dict = eval(word[0])
import random from operator import itemgetter import jieba import jieba.analyse from pytagcloud import make_tags, create_tag_image from pytagcloud.colors import COLOR_SCHEMES from site.mybzz.util import DbUtil stop = [] conn, cur = DbUtil.getConn() def plot(game_name, game_id): dict = {} comments = DbUtil.getAllResult("select * from comment where game_id = %s" % game_id) for comment in comments: result = jieba.analyse.extract_tags(comment[2], topK=3) for word in result: if len(word) < 2: continue elif word in stop: continue if word not in dict: dict[word] = 1 else: dict[word] += 1 swd = sorted(dict.items(), key=itemgetter(1), reverse=True)
import sys from site.mybzz.util import DbUtil from site.mybzz.util import BsUtil from site.mybzz.util import DateUtil conn, cur = DbUtil.getConn() def getData(name, downloadCount, score, packageName): contextData = '' url = "http://sj.qq.com/myapp/app/comment.htm?apkName=%s&contextData=%s" totalComCount = 0 while totalComCount == 0: try: result = BsUtil.praseJson(url % (packageName, contextData)) totalComCount = result['obj']['total'] except: pass print( 'INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 'VALUES ("%s", "%s", "%s", %d, "%s", "%s");' % ( name, 'qq', totalComCount, score, downloadCount, DateUtil.currentDate())) cur.execute('INSERT INTO games(game_name,from_store, total_comment_count, total_score, total_download, data_date) ' 'VALUES ("%s", "%s", "%s", %d, "%s", "%s");' % ( name, 'qq', totalComCount, score, downloadCount, DateUtil.currentDate())) game_id = cur.lastrowid while (True): try: