def classifySpeCluLsa(self, class_num):
    """Cluster the loaded documents into class_num topics using spectral
    clustering over an LSA cosine-similarity matrix, then dump and draw them."""
    from draw_data import draw_data
    draw_title = draw_data()
    lsa = models.LsiModel.load('model.lsa', mmap='r')
    logging.info("load lsa model!!")
    index = similarities.MatrixSimilarity.load('model_lsa.index')
    self.get_data(num=3000)
    (tfidf, dictionary) = self.get_tfidf(True, num=3000)
    # Map title id -> row index; consumed later by printTitleTOfile.
    hash_id2list = {tid: pos for pos, tid in enumerate(self.title_id)}
    logging.info('开始创建相似矩阵...')
    doc_count = len(tfidf)
    similar_matrix = np.zeros((doc_count, doc_count))  # pairwise similarities
    for row, doc in enumerate(tfidf):
        sims = index[lsa[doc]]
        for col, score in enumerate(sims):
            # Write both halves so the matrix stays symmetric.
            similar_matrix[row][col] = score
            similar_matrix[col][row] = score
    logging.info('done,相似矩阵建立完成,使用普聚类进行分类...')
    labels = spectral_clustering(similar_matrix, n_clusters=class_num,
                                 eigen_solver='arpack')
    # Bucket title ids by their assigned cluster label.
    self.vector_table = [[] for _ in range(class_num)]
    for label, tid in zip(labels, self.title_id):
        self.vector_table[label].append(tid)
    logging.info("print set... " + str(len(self.vector_table)))
    self.printTitleTOfile(hash_id2list)
    draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-12-25')
def classifyoptCluLsa(self):
    """Cluster documents with OPTICS over LSA-based distances (1 - cosine
    similarity), keeping only pairs whose similarity is at least 0.3."""
    from draw_data import draw_data
    from optics_cluster import optics_cly
    draw_title = draw_data()
    lsa = models.LsiModel.load('model.lsa', mmap='r')
    logging.info("load lsa model!!")
    index = similarities.MatrixSimilarity.load('model_lsa.index')
    self.get_data(num=500)
    (tfidf, dictionary) = self.get_tfidf(True, num=500)
    # Map title id -> list index, used when printing results.
    hash_id2list = {tid: pos for pos, tid in enumerate(self.title_id)}
    logging.info('开始创建相似距离...')
    similar_matrix = dict()  # (id_a, id_b) -> distance
    for row, doc in enumerate(tfidf):
        sims = index[lsa[doc]]
        for col, score in enumerate(sims):
            # Keep only reasonably similar pairs; store distance = 1 - similarity.
            if score >= 0.3:
                similar_matrix[(self.title_id[row], self.title_id[col])] = 1 - score
    logging.info('done,建立完成,使用optics进行分类...')
    opc = optics_cly(0.2, 0.3, 7, similar_matrix, self.title_id)
    (self.vector_table, n_list) = opc.runOptics()
    self.print_nlist(hash_id2list, n_list)
    self.printTitleTOfile(hash_id2list)
    draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-12-25')
def dataword(temp_k, begin_time=u"2015-09-25", last_time=u"2015-09-27"):
    """Bucket each title's comments into temp_k-sized time slots, keep the
    rising part of each curve (up to its peak), compare every pair of curves
    with compare_two, and append the result to compare_list.txt.

    :param temp_k: size of one time bucket, passed to time_between
    :param begin_time: window start (also the origin for bucket indexing)
    :param last_time: window end
    """
    import numpy as np
    getdate = draw_data()
    title_item = getdate.title_find_day(begin_time, last_time, 1000)
    news_list = list()
    tid = []
    for ti in title_item:
        # Histogram of comment counts over 350 buckets of width temp_k.
        x_time = [0 for i in range(350)]
        comment_item = getdate.db.CommentItem.find({"title_id": ti["_id"]})
        for ci in comment_item:
            tempk = time_between(begin_time, ci["time"], temp_k)
            if tempk < 350:
                x_time[tempk] += 1
        # Keep only the rising portion of the curve, up to the peak bucket.
        news_list.append(x_time[0:np.argmax(x_time)])
        tid.append((ti["_id"], comment_item.count()))
    compare_list = []
    # Compare every ordered pair of distinct titles' comment curves.
    for i in range(len(news_list)):
        for j in range(len(news_list)):
            if i != j:
                uplist, orlist = compare_two(news_list[i], news_list[j])
                if len(uplist) or len(orlist):
                    compare_list.append([(tid[i], tid[j]), uplist, orlist])
    try:
        # BUGFIX: context manager closes the file even if write() raises
        # (original leaked the handle), and "except ... as e" replaces the
        # Python-2-only "except Exception, e" syntax.
        with open('compare_list.txt', 'a') as f:
            f.write(str(compare_list) + '\n')
    except Exception as e:
        logging.info(str(e))
def dataword(temp_k, begin_time=u"2015-09-25", last_time=u"2015-09-27"):
    """Bucket each title's comments into temp_k-sized time slots, keep the
    rising part of each curve (up to its peak), compare every pair of curves
    with compare_two, and append the result to compare_list.txt.

    :param temp_k: size of one time bucket, passed to time_between
    :param begin_time: window start (also the origin for bucket indexing)
    :param last_time: window end
    """
    import numpy as np
    getdate = draw_data()
    title_item = getdate.title_find_day(begin_time, last_time, 1000)
    news_list = list()
    tid = []
    for ti in title_item:
        # Histogram of comment counts over 350 buckets of width temp_k.
        x_time = [0 for i in range(350)]
        comment_item = getdate.db.CommentItem.find({"title_id": ti["_id"]})
        for ci in comment_item:
            tempk = time_between(begin_time, ci["time"], temp_k)
            if tempk < 350:
                x_time[tempk] += 1
        # Keep only the rising portion of the curve, up to the peak bucket.
        news_list.append(x_time[0:np.argmax(x_time)])
        tid.append((ti["_id"], comment_item.count()))
    compare_list = []
    # Compare every ordered pair of distinct titles' comment curves.
    for i in range(len(news_list)):
        for j in range(len(news_list)):
            if i != j:
                uplist, orlist = compare_two(news_list[i], news_list[j])
                if len(uplist) or len(orlist):
                    compare_list.append([(tid[i], tid[j]), uplist, orlist])
    try:
        # BUGFIX: context manager closes the file even if write() raises
        # (original leaked the handle), and "except ... as e" replaces the
        # Python-2-only "except Exception, e" syntax.
        with open('compare_list.txt', 'a') as f:
            f.write(str(compare_list) + '\n')
    except Exception as e:
        logging.info(str(e))
def dataword(temp_k, all_days, each_item, begin_time=u"2015-09-25", last_time=u"2015-09-27"):
    """For each title published in the window, record the top-3 'hottest'
    earlier titles as of ~2.5 hours after its publication time.

    Mutates each_item (title id -> TOPk result) and appends this day's
    id->publish-time map to all_days.
    """
    import numpy as np
    getdate = draw_data()
    title_item = getdate.get_title_data(begin_time, last_time, 100)
    news_list = list()  # kept for parity with the original (unused)
    # Publication time per title id in the window.
    today_item = {}
    for record in title_item:
        today_item[record['_id']] = datetime.datetime.strptime(
            record['title_time'], "%Y-%m-%d %H:%M")
    for cur_id in today_item.keys():
        cutoff = today_item[cur_id] + datetime.timedelta(hours=2.5)
        earlier = getdate.return_group(begin_time, str(cutoff))
        rates = {}
        for grp in earlier:
            if grp["_id"] in today_item:
                published = today_item[grp["_id"]]
                # Age of the other title at the cutoff, in minutes.
                age_minutes = 1.0 * (cutoff - published).days * 24 * 60 + \
                    1.0 * (cutoff - published).seconds / 60
                rates[grp["_id"]] = [grp["count"] / age_minutes, grp["count"]]
        topk = TOPk(rates, 3)
        if len(topk) == 3:
            each_item[cur_id] = topk
    all_days.append(today_item)
    print(len(today_item))
    print(len(each_item))
def dbscan_lsa(self, begin='2015-09-25', end='2015-11-25', num=10000):
    """Grow DBSCAN-style clusters from LSA similarities via find_deep_dbscan;
    keep only clusters with more than 7 members."""
    from draw_data import draw_data
    draw_title = draw_data()
    lsa = models.LsiModel.load('model.lsa', mmap='r')
    logging.info("load lsa model!!")
    index = similarities.MatrixSimilarity.load('model_lsa.index')
    self.title_name = []
    self.title_content = []
    self.title_id = []
    self.get_data(num=3000)
    (tfidf, dictionary) = self.get_tfidf(True, num=3000)
    # title id -> position, required by find_deep_dbscan / printTitleTOfile.
    hash_id2list = {tid: pos for pos, tid in enumerate(self.title_id)}
    logging.info("开始创造关联树...")
    for pos, doc in enumerate(tfidf):
        tid = self.title_id[pos]
        # Skip titles already absorbed into an existing cluster.
        if any(tid in cluster for cluster in self.vector_table):
            continue
        seed = set()
        seed.add(tid)
        seed = self.find_deep_dbscan(index, tfidf, lsa, doc, seed, hash_id2list)
        if len(seed) > 7:
            print(self.title_name[pos])
            self.vector_table.append(seed)
    logging.info("print set... " + str(len(self.vector_table)))
    self.printTitleTOfile(hash_id2list)
    draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-10-10')
def get_data():
    """Sweep `level` over range(1000, 4200, 200): warm a 7-day sliding window
    of title ids, then run 80 days of train/predict, collecting one averaged
    targetIn score per level and plotting the curve against a baseline.

    Relies on module globals: draw_data, dict_id_pred, classify_dict,
    train_data, drawPosition, predict_data, targetIn, plt, datetime.
    """
    draw_mysql = draw_data()
    rmses = []  # one aggregated targetIn score per level (despite the name)
    for level in range(1000, 4200, 200):
        begin_time = datetime.datetime.strptime('2015-09-25', '%Y-%m-%d')
        end_time = datetime.datetime.strptime('2015-09-26', '%Y-%m-%d')
        queue = []  # sliding window: one list of title ids per day
        # Warm-up: pre-fill the window with 7 days of titles.
        for i in range(7):
            print begin_time
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                # Keep only titles that have both a prediction and a class.
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        birds = None
        pred_position = None
        true_predict = []
        lbestFitList = []
        baseFitList = []
        for i in range(80):
            print str(begin_time) + " " + str(level)
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            temp_position, birds, lbestfit, basefit = train_data(
                queue, birds, level)
            lbestFitList.append(lbestfit)
            baseFitList.append(basefit)
            if temp_position != None:
                # Only adopt a new position when training produced one;
                # otherwise keep predicting from the previous position.
                pred_position = temp_position
                drawPosition(pred_position)
            true_predict = predict_data(pred_position, today_day, level,
                                        true_predict)
            # Slide the 7-day window forward by one day.
            queue.pop(0)
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        # evaluation — average targetIn score of predictions vs. truth
        mape = 0.0
        for i in true_predict:
            mape += targetIn(i[0], i[2])
        mape /= len(true_predict)
        rmses.append(mape)
        print mape
    # Baseline is computed only from the final level's true_predict.
    basemape = 0.0
    for i in true_predict:
        basemape += targetIn(i[1], i[2])
    basemape /= len(true_predict)
    plt.plot(range(1000, 4200, 200), rmses, 'b-')
    plt.plot([1000, 3900], [basemape, basemape], 'r-')
    plt.show()
def process_data(begin='2015-09-25', end='2015-12-25', num=100):
    """Load titles in [begin, end] into the module-level parallel lists
    (title_name / title_content / title_id), then run tokenization."""
    import re
    from draw_data import draw_data
    source = draw_data()
    records = source.get_title_data(begin, end, num)
    logging.info("start to get data...")
    for record in records:
        title_name.append(record['title_content'])
        title_content.append(record['title_text'])
        title_id.append(record['_id'])
    use_jieba()
def user_pro():
    """For every classified title, count how many of its commenters appear in
    the saved user-profile dictionary (userlike.dict.npz).

    BUGFIX: `k` was incremented without being initialised (NameError on the
    first iteration) and the profile dict was loaded into `userlike` but
    tested as `userLike` (NameError). Also removed the dead
    `title_comment_time` call whose result was immediately overwritten.
    """
    userlike = np.load("userlike.dict.npz")['arr_0'][()]
    db = draw_data()
    k = 0  # processed-title counter
    for titleid in classify_dict.keys():
        k += 1
        comment = db.title_comment_aggregate(titleid)
        tempc = 0      # total commenters seen for this title
        user_num = 0   # commenters that have a known profile
        for ci in comment:
            tempc += 1
            if ci["_id"] in userlike:
                user_num += 1
def func(begin_time, end_time):
    """Return the Shannon entropy (bits) of the comment-count distribution
    over all titles published between begin_time and end_time."""
    mydraw = draw_data()
    title_item = mydraw.get_title_data(str(begin_time), str(end_time), 0)
    counts = [mydraw.title_comment_count(item['_id']) for item in title_item]
    total = np.sum(counts)
    entropy_i = 0.0
    for count in counts:
        p = 1.0 * count / total
        if p != 0:  # 0 * log(0) is treated as 0
            entropy_i += -p * np.log2(p)
    mydraw.close_db()
    logging.info(begin_time)
    return entropy_i
def get_data2(self, begin='2015-09-25', end='2015-11-25', num=500, update_top_num=True, contain_word=None):
    """Tokenize every comment in [begin, end] with use_jieba2, then dump the
    per-day result via printTitleTOfile2."""
    from draw_data import draw_data
    self.load_manual2(begin, end)
    source = draw_data()
    comments = source.get_comment_data(begin, end)
    logging.info("评论共:" + str(comments.count()))
    for seq, comment in enumerate(comments, 1):
        # Progress log every 10k comments.
        if seq % 10000 == 0:
            logging.info('第 ' + str(seq) + '个')
        self.use_jieba2(comment['comments'], begin, comment['time'], 30)
    self.printTitleTOfile2(begin, end, 30)
def get_data(self, begin='2015-09-25', end='2015-11-25', num=500, update_top_num=True, contain_word=None):
    """Fetch titles in [begin, end], concatenate each title's comments
    (separated by '。'), tokenize them, dump the result, and draw topics.

    PERF: builds the comment text with ''.join instead of the original
    quadratic string += loop.
    """
    from draw_data import draw_data
    self.load_manual()
    title = draw_data()
    title_item = title.get_title_data(begin, end, num)
    logging.info('总数: ' + str(title_item.count()))
    cti = 0
    for ti in title_item:
        cti += 1
        logging.info('第' + str(cti))
        titleid = ti['_id']
        parts = []
        for ci in title.title_comment(titleid):
            parts.append(ci['comments'] + '。')
        commentcontent = ''.join(parts)
        self.title_name[titleid] = ti['title_content']
        self.use_jieba(commentcontent, titleid)
    self.printTitleTOfile()
    # Draw the topic distribution over time.
    draw_title = draw_data()
    draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-11-25')
    print("文章(评论)总数:%d" % (len(self.title_name)))
def read_sourse(each_item, begin_time=u"2015-09-25", last_time=u"2015-09-27"):
    """Assemble a per-title feature vector in each_item.

    Layout per title id: [0] publish hour; [1..15] document vector (from
    sentence_vectors.txt); [16..18] early comment stats (from 2.5hour.txt);
    [19] final heat. The dict is saved to each_item.npz.

    BUGFIX: the three input files are now opened with context managers so
    they are closed even if parsing raises (original used open/close pairs).
    """
    # Publication hour is the first feature.
    getdata = draw_data()
    title_item = getdata.get_title_data(begin_time, last_time, 100)
    for i in title_item:
        each_item[i['_id']] = [float(i['title_time'].split()[1].split(':')[0])]
    logging.info(u"取出3个月的新闻,done")
    # Title ids, in the same order as the vectors file.
    id_x = []
    with open("paragraph_name.txt", 'r') as idfile:
        for raw in idfile:
            id_x.append(raw.strip().split('\t')[1])
    # One document vector per line; column 0 is a label, the rest are floats.
    vector_list = []
    with open("sentence_vectors.txt", 'r') as vectorfile:
        for raw in vectorfile:
            cols = raw.strip().split(' ')
            vector_list.append([float(c) for c in cols[1:]])
    logging.info(u"取出新闻的vector,done")
    # Append the document vector for every title we know about.
    for i in range(len(id_x)):
        if id_x[i] in each_item:
            each_item[id_x[i]] += vector_list[i]
    with open("2.5hour.txt", 'r') as hours_file:
        for raw in hours_file:
            line = raw.strip().split('\t')
            if float(line[3]) < 5:
                continue  # skip titles with too little early activity
            if line[0] in each_item:
                each_item[line[0]] += [float(line[1]), float(line[2]),
                                       float(line[3]), float(line[4])]
    np.savez("each_item", each_item)
    logging.info(u"写入文件each_item.npz,done")
def user_like(num):
    """Build a 15-dimensional topic-preference vector for each of `num` users.

    For every classified title, counts each sampled user's comments on it
    (tf) weighted by the title's rarity among all users (idf), accumulates
    the score into the title's dominant topic, then L1-normalises each user's
    vector. Intermediate and final dicts are persisted as .npz files.
    """
    userLike = dict()       # user id -> 15-element topic preference vector
    db = draw_data()
    user = db.return_user(num)
    titleUserNum = dict()   # user id -> {title id: comment count, "count": total}
    for ti in user:
        userLike[ti["_id"]] = [0 for i in range(15)]
        titleUserNum[ti["_id"]] = dict()
        titleUserNum[ti["_id"]]["count"] = 0
    print len(userLike)
    topic_set = dict()      # title id -> [dominant topic index, commenter count]
    user_all_num = 2551366  # presumably the site-wide user population — TODO confirm
    titlenum = len(classify_dict)
    k = 0
    for titleid in classify_dict.keys():
        k += 1
        printUserNum = 0    # commenters of this title that are in the sample
        comment = db.title_comment_aggregate(titleid)
        tempc = 0           # total commenters on this title
        for ci in comment:
            tempc += 1
            if ci["_id"] in userLike:
                printUserNum += 1
                titleUserNum[ci["_id"]][titleid] = ci["count"]
                titleUserNum[ci["_id"]]["count"] += ci["count"]
        # Dominant topic of this title according to the classifier output.
        topic_num = np.argmax(classify_dict[titleid])
        topic_set[titleid] = [topic_num, tempc]
        logging.info("process " + str(k) + "/" + str(titlenum) + ":" +
                     titleid + " " + str(topic_num) + " " +
                     str(printUserNum) + "/" + str(tempc))
    # Persist the intermediate results, then reload them (.npz round-trip).
    np.savez("topic_set.dict", topic_set)
    np.savez("titleUserNum.dict", titleUserNum)
    topic_set = np.load("topic_set.dict.npz")['arr_0'][()]
    titleUserNum = np.load("titleUserNum.dict.npz")['arr_0'][()]
    for uid in userLike:
        for titleid in classify_dict.keys():
            if titleid in titleUserNum[uid]:
                # tf: share of this user's comments spent on this title;
                # idf: rarity of the title among all users.
                tf = 1.0 * titleUserNum[uid][titleid] / titleUserNum[uid][
                    "count"]
                idf = np.log2(1.0 * user_all_num / topic_set[titleid][1])
                userLike[uid][topic_set[titleid][0]] += tf * idf
        # L1-normalise. NOTE(review): divides by zero when a user has no
        # scored topic (npsum == 0) — confirm inputs rule that out.
        npsum = np.sum(userLike[uid])
        for i in range(15):
            userLike[uid][i] /= npsum
    np.savez("userlike.dict", userLike)
    return userLike
def count_day(begin='2015-09-25', days=60):
    """Plot a bar chart of the daily comment volume for `days` consecutive
    days starting at `begin` (YYYY-MM-DD)."""
    import datetime
    from matplotlib import pyplot as plt
    import numpy as np
    from draw_data import draw_data
    source = draw_data()
    day_start = datetime.datetime.strptime(begin, '%Y-%m-%d')
    day_end = day_start + datetime.timedelta(days=1)
    daily_counts = []
    for _ in range(days):
        batch = source.get_comment_data(str(day_start), str(day_end))
        daily_counts.append(batch.count())
        # Slide the one-day window forward.
        day_start, day_end = day_end, day_end + datetime.timedelta(days=1)
    plt.bar(range(len(daily_counts)), daily_counts, width=1, align='edge',
            color='green')
    plt.show()
def get_data(self, begin='2015-09-25', end='2015-11-25'):
    """Fetch all comments in [begin, end] and merge them into chunks of 1000.

    Returns a list of strings, each the space-joined text of up to 1000
    comments.

    BUGFIX: the original flushed on every 1000th comment *instead of*
    including it (silently dropping one comment per chunk) and never flushed
    the final partial chunk.
    """
    from draw_data import draw_data
    Comment = draw_data()
    comments_temp = Comment.get_comment_data(begin, end)
    logging.info("评论共:" + str(comments_temp.count()))
    cci = 0
    comments_t = []
    chunk = []
    for ci in comments_temp:
        cci += 1
        chunk.append(' ' + ci['comments'])  # keep the leading-space format
        if cci % 1000 == 0:
            comments_t.append(''.join(chunk))
            chunk = []
    if chunk:
        # Flush the trailing partial chunk.
        comments_t.append(''.join(chunk))
    logging.info('合并之后: ' + str(len(comments_t)))
    return comments_t
def get_data(self, begin='2015-09-25', end='2015-11-25', num=500, update_top_num=True, contain_word=None):
    """Fetch titles in [begin, end] and, for each, its concatenated comment
    text (separated by '。'), filling self.title_name / self.title_content /
    self.title_id in parallel.

    PERF: builds the comment text with ''.join instead of the original
    quadratic string += loop.
    """
    from draw_data import draw_data
    title = draw_data()
    title_item = title.get_title_data(begin, end, num)
    logging.info('总数: ' + str(title_item.count()))
    cti = 0
    for ti in title_item:
        cti += 1
        logging.info('第' + str(cti))
        self.title_name.append(ti['title_content'])
        titleid = ti['_id']
        parts = [ci['comments'] + '。' for ci in title.title_comment(titleid)]
        self.title_content.append(''.join(parts))
        self.title_id.append(titleid)
    print("文章(评论)总数:%d" % (len(self.title_name)))
def get_data(self, begin='2015-09-25', end='2015-11-25', num=10000, update_top_num=True, contain_word=None):
    """Load titles into self.title_name / self.title_content, optionally
    keeping only titles whose text matches one of the contain_word patterns.
    When update_top_num is true, topic_num is reset to the number loaded."""
    import re
    from draw_data import draw_data
    source = draw_data()
    records = source.get_title_data(begin, end, num)
    if contain_word:
        for record in records:
            text = record['title_text']
            # Keep the title if any of the given patterns matches its text.
            if any(re.search(pattern, text) for pattern in contain_word):
                self.title_name.append(record['title_content'])
                self.title_content.append(text)
    else:
        for record in records:
            self.title_name.append(record['title_content'])
            self.title_content.append(record['title_text'])
    print("文章总数:%d" % (len(self.title_name)))
    if update_top_num:
        self.topic_num = len(self.title_name)
def similarty_lsa(self, begin='2015-09-25', end='2015-11-25', num=10000):
    """Grow similarity clusters over LSA space: iterate the full 10000-title
    corpus and expand each not-yet-clustered title into a set via find_deep,
    searching only within a smaller 3000-title index.

    Note: self.title_id / tfidf are loaded twice with different sizes; the
    first (larger) load is kept in title_old_id/title_old_name while the
    second (smaller) one backs hash_id2list and find_deep.
    """
    from draw_data import draw_data
    draw_title = draw_data()
    lsa = models.LsiModel.load('model.lsa', mmap='r')
    logging.info("load lsa model!!")
    index = similarities.MatrixSimilarity.load('model_lsa.index')
    # First pass: load the full corpus and keep copies of its ids/names.
    self.title_name = []
    self.title_content = []
    self.get_data(num=10000)
    title_old_id = self.title_id[:]
    title_old_name = self.title_name[:]
    (tfidf, dictionary) = self.get_tfidf()
    # Second pass: reload a smaller corpus used as the search space.
    self.title_name = []
    self.title_content = []
    self.title_id = []
    self.get_data(num=3000)
    (tfidf_less, dictionary) = self.get_tfidf(num=3000)
    # title id -> index into the smaller corpus.
    hash_id2list = dict()
    for i in range(len(self.title_id)):
        hash_id2list[self.title_id[i]] = i
    logging.info("开始创造关联树...")
    for i in range(len(tfidf)):
        print title_old_name[i]
        # t_set_id: -1 = not clustered yet, -2 = already in some cluster,
        # >= 0 = index of the freshly created cluster.
        t_set_id = -1
        t_set = set()
        for t in range(len(self.vector_table)):
            if title_old_id[i] in self.vector_table[t]:
                t_set_id = -2
                break
        if t_set_id == -1:
            # Start a new cluster seeded with this title.
            self.vector_table.append(t_set)
            t_set_id = len(self.vector_table)-1
            t_set.add(title_old_id[i])
        if t_set_id >= 0:
            # Expand the new cluster by similarity search (mutates t_set,
            # which is already referenced from vector_table).
            t_set = self.find_deep(index, tfidf_less, lsa, tfidf[i], t_set, hash_id2list)
    logging.info("print set... "+str(len(self.vector_table)))
    self.printTitleTOfile(hash_id2list)
    draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-10-10')
def entropy():
    """Compute the daily comment entropy for 60 consecutive days with a pool
    of 8 worker processes, then plot the series."""
    import datetime
    mydraw = draw_data()
    day_start = datetime.datetime.strptime('2015-09-25', '%Y-%m-%d')
    day_end = datetime.datetime.strptime('2015-09-26', '%Y-%m-%d')
    pool = multiprocessing.Pool(processes=8)
    pending = []
    for _ in range(60):
        pending.append(pool.apply_async(func, (day_start, day_end, )))
        # Advance the one-day window.
        day_start, day_end = day_end, day_end + datetime.timedelta(days=1)
    pool.close()
    pool.join()
    entropy_day = []
    for res in pending:
        value = res.get()
        # Clamp unusually low readings to a floor of 4.0.
        entropy_day.append(4.0 if value < 1 else value)
    print(entropy_day)
    plt.plot(entropy_day)
    plt.show()
def dataword(temp_k, all_days, each_item, begin_time=u"2015-09-25", last_time=u"2015-09-27"):
    """Record, for each title in the window, the top-3 'hottest' earlier
    titles as of ~2.5 hours after its publication.

    Mutates each_item (title id -> TOPk result) and appends this day's
    id->publish-time map to all_days.
    """
    import numpy as np
    getdate = draw_data()
    title_item = getdate.get_title_data(begin_time, last_time, 100)
    news_list = list()  # kept for parity with the original (unused)
    # Publication time per title id in the window.
    today_item = {}
    for record in title_item:
        today_item[record['_id']] = datetime.datetime.strptime(
            record['title_time'], "%Y-%m-%d %H:%M")
    for tid in today_item.keys():
        cutoff = today_item[tid] + datetime.timedelta(hours=2.5)
        candidates = {}
        for other in getdate.return_group(begin_time, str(cutoff)):
            other_id = other["_id"]
            if other_id in today_item:
                elapsed = cutoff - today_item[other_id]
                # Age of the other title at the cutoff, in minutes.
                minutes = 1.0 * elapsed.days * 24 * 60 + \
                    1.0 * elapsed.seconds / 60
                candidates[other_id] = [other["count"] / minutes, other["count"]]
        best = TOPk(candidates, 3)
        if len(best) == 3:
            each_item[tid] = best
    all_days.append(today_item)
    print(len(today_item))
    print(len(each_item))
#!/usr/bin/python
# -*- coding:utf-8 -*-
############################
# File Name: basic_onehours.py
# Author: yuxuan
# Created Time: 2016-04-24 19:00:28
############################
# For each title: write "<id>\t<first-hour comments>\t<total comments>" to
# all200.txt and a libsvm-format line "<total> 1:<first-hour>" to
# basic_train.txt.
from draw_data import draw_data
import logging

logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
                    level=logging.INFO)

load_data = draw_data()
title_list = load_data.get_title_data('2015-09-25', '2015-12-25', 100)
logging.info("总共: " + str(title_list.count()))
# BUGFIX: context managers guarantee both output files are closed/flushed
# even if a query inside the loop raises (original used open/close pairs).
with open('all200.txt', 'w') as all_file, \
        open('basic_train.txt', 'w') as svm_train_file:
    k = 0
    for i in title_list:
        logging.info("process: " + str(k))
        k += 1
        titleid = i['_id']
        # Comments within the first hour vs. total comments.
        hours_count = load_data.one_hours_count(titleid, i['title_time']).count()
        all_count = load_data.title_comment(titleid).count()
        all_file.write(titleid + '\t' + str(hours_count) + '\t' +
                       str(all_count) + '\n')
        svm_train_file.write(str(all_count) + ' ' + '1:' + str(hours_count) + ' \n')
def get_data():
    """Sweep `level` over range(500, 3500, 300): warm a 3-day sliding window
    of title ids, run 15 days of train/predict, accumulate position matrices,
    and report per-level MAPE/RMSE against the per-title baseline.

    Relies on module globals: draw_data, dict_id_pred, classify_dict,
    train_data, drawPosition, predict_data, np, datetime.
    """
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    draw_mysql = draw_data()
    rmses = []
    mapes = []
    for level in range(500, 3500, 300):
        allposition = np.zeros(shape=(15, 15))
        begin_time = datetime.datetime.strptime('2015-09-27', '%Y-%m-%d')
        end_time = datetime.datetime.strptime('2015-09-28', '%Y-%m-%d')
        queue = []
        # Warm-up: seed the sliding window with 3 days of titles.
        for i in range(3):
            print(begin_time)
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                # Keep only titles with both a prediction and a class.
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        birds = None
        pred_position = None
        true_predict = []
        lbestFitList = []
        baseFitList = []
        xdate = []
        travel = np.zeros(shape=(0, 15))
        for i in range(15):  # 87
            xdate.append(begin_time + datetime.timedelta(days=-1))
            print(str(begin_time) + " " + str(level))
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            birds = None
            temp_position, birds, lbestfit, basefit = train_data(
                queue, birds, level)
            if temp_position is not None:
                lbestFitList.append(lbestfit)
                baseFitList.append(basefit)
                pred_position = temp_position
                travel = np.insert(travel, travel.shape[0],
                                   values=temp_position[9], axis=0)
            allposition = allposition + pred_position
            print(begin_time + datetime.timedelta(days=-1))
            # drawPosition(pred_position, begin_time+datetime.timedelta(days = -1), level, 3)
            true_predict = predict_data(pred_position, today_day, level,
                                        true_predict)
            # Slide the 3-day window forward by one day.
            queue.pop(0)
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        # plot_3d(xdate, travel)
        # evaluation
        print(allposition / len(xdate))
        np.savetxt("allposition" + str(level) + ".csv",
                   allposition / len(xdate), delimiter=', ')
        drawPosition(allposition / len(xdate),
                     begin_time + datetime.timedelta(days=-1), level, 0)
        mape = 0.0
        rmse = 0.0
        for i in true_predict:
            # mape += targetIn(i[0], i[2])
            mape += abs(i[0] - i[2]) / i[2]
            rmse += (i[0] - i[2])**2
        mape /= len(true_predict)
        rmse /= len(true_predict)
        rmse = np.sqrt(rmse)
        rmses.append(rmse)
        mapes.append(mape)
    # Baseline: uses the final level's true_predict only.
    basemape = 0.0
    basermse = 0.0
    for i in true_predict:
        # basemape += targetIn(i[1], i[2])
        basemape += abs(i[1] - i[2]) / i[2]
        # BUGFIX: the squared error was accumulated into `rmse` instead of
        # `basermse`, so the reported baseline RMSE was always 0.
        basermse += (i[1] - i[2])**2
    basemape /= len(true_predict)
    basermse /= len(true_predict)
    basermse = np.sqrt(basermse)
    print("basemape:" + str(basemape))
    print("basermse:" + str(basermse))
    # BUGFIX: x-axis must match the levels actually evaluated
    # (range(500, 3500, 300)); the original plotted against range(1000, 4000, 300).
    plt.plot(range(500, 3500, 300), mapes, label="mape")
    plt.plot([500, 3200], [basemape, basemape], label="basemape")
    np.savetxt("rmse.csv", rmses, delimiter=', ')
    np.savetxt("mape.csv", mapes, delimiter=", ")
    plt.show()
# NOTE(review): this chunk begins mid-function — the code below is the tail
# of a dataword(temp_k, begin_time, last_time) definition whose header lies
# outside this view (compare the complete copies elsewhere in this file);
# indentation is reconstructed.
                x_time[tempk] += 1
        # Keep only the rising portion of the curve, up to the peak bucket.
        news_list.append(x_time[0:np.argmax(x_time)])
        tid.append((ti["_id"], comment_item.count()))
    compare_list = []
    # Compare every ordered pair of distinct titles' comment curves.
    for i in range(len(news_list)):
        for j in range(len(news_list)):
            if (i != j):
                uplist, orlist = compare_two(news_list[i], news_list[j])
                if len(uplist) or len(orlist):
                    compare_list.append([(tid[i], tid[j]), uplist, orlist])
    try:
        # NOTE(review): the file handle leaks if write() raises, and
        # "except Exception, e" is Python-2-only syntax.
        f = open('compare_list.txt', 'a')
        f.write(str(compare_list) + '\n')
        f.close()
    except Exception, e:
        logging.info(str(e))
    return compare_list


if __name__ == "__main__":
    import datetime
    mydraw = draw_data()
    begin_time = datetime.datetime.strptime('2015-09-25', '%Y-%m-%d')
    end_time = datetime.datetime.strptime('2015-09-26', '%Y-%m-%d')
    # Run the day-by-day comparison for 60 consecutive one-day windows.
    for i in range(60):
        print begin_time
        dataword(10, str(begin_time), str(end_time))
        begin_time = end_time
        end_time = end_time + datetime.timedelta(days=1)
    #dataword(5, str(begin_time),str(end_time))
# NOTE(review): this chunk begins mid-function — the code below is the tail
# of a dataword(temp_k, begin_time, last_time) definition whose header lies
# outside this view; indentation is reconstructed.
            if tempk < 350:
                x_time[tempk] += 1
        # Keep only the rising portion of the curve, up to the peak bucket.
        news_list.append(x_time[0:np.argmax(x_time)])
        tid.append((ti["_id"], comment_item.count()))
    compare_list = []
    # Compare every ordered pair of distinct titles' comment curves.
    for i in range(len(news_list)):
        for j in range(len(news_list)):
            if(i != j):
                uplist, orlist = compare_two(news_list[i], news_list[j])
                if len(uplist) or len(orlist):
                    compare_list.append([(tid[i], tid[j]), uplist, orlist])
    try:
        # NOTE(review): the file handle leaks if write() raises, and
        # "except Exception, e" is Python-2-only syntax.
        f = open('compare_list.txt', 'a')
        f.write(str(compare_list) + '\n')
        f.close()
    except Exception, e:
        logging.info(str(e))
    return compare_list


if __name__ == "__main__":
    import datetime
    mydraw = draw_data()
    begin_time = datetime.datetime.strptime('2015-09-25', '%Y-%m-%d')
    end_time = datetime.datetime.strptime('2015-09-26', '%Y-%m-%d')
    # Run the day-by-day comparison for 60 consecutive one-day windows.
    for i in range(60):
        print begin_time
        dataword(10, str(begin_time), str(end_time))
        begin_time = end_time
        end_time = end_time + datetime.timedelta(days=1)
    #dataword(5, str(begin_time),str(end_time))
def get_data():
    """Sweep `level` over range(1000, 4200, 200): warm a 3-day sliding window
    of title ids, run 9 days of train/predict, accumulate position matrices,
    and plot the fitness curves and averaged position matrix per level.

    Relies on module globals: draw_data, dict_id_pred, classify_dict,
    train_data, plot_3d, drawPosition, np, datetime.
    """
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    draw_mysql = draw_data()
    rmses = []  # NOTE(review): never appended to in this variant
    for level in range(1000, 4200, 200):
        allposition = np.zeros(shape=(15, 15))
        begin_time = datetime.datetime.strptime('2015-11-10', '%Y-%m-%d')
        end_time = datetime.datetime.strptime('2015-11-11', '%Y-%m-%d')
        queue = []
        # Warm-up: seed the sliding window with 3 days of titles.
        for i in range(3):
            print begin_time
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                # Keep only titles with both a prediction and a class.
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        birds = None
        pred_position = None
        true_predict = []
        lbestFitList = []
        baseFitList = []
        xdate = []
        travel = np.zeros(shape=(0, 15))
        for i in range(9):  # 87
            xdate.append(begin_time + datetime.timedelta(days=-1))
            print str(begin_time) + " " + str(level)
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            birds = None
            temp_position, birds, lbestfit, basefit = train_data(
                queue, birds, level)
            lbestFitList.append(lbestfit)
            baseFitList.append(basefit)
            if temp_position != None:
                # Only adopt and accumulate a position when training yields one.
                pred_position = temp_position
                travel = np.insert(travel, travel.shape[0],
                                   values=temp_position[8], axis=0)
                allposition = allposition + pred_position
            #drawPosition(pred_position, begin_time+datetime.timedelta(days = -1), level, 3)
            #true_predict = predict_data(pred_position, today_day, level, true_predict)
            # Slide the 3-day window forward by one day.
            queue.pop(0)
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        plot_3d(xdate, travel)
        # evaluation
        print allposition / len(xdate)
        with open("matrix.txt", 'w') as f:
            f.write(str(allposition / len(xdate)))
        # Plot the fitness curves (with vs. without the method) over time.
        yearsFmt = mdates.DateFormatter('%Y-%m-%d')
        fig, ax = plt.subplots()
        ax.set_title(u"腾讯新闻2个月竞争预测图 level=" + str(level), fontsize=18)
        ax.set_xlabel("per day", fontsize=18)
        ax.xaxis.set_major_formatter(yearsFmt)
        ax.plot(xdate, lbestFitList, label=u"加入后")
        ax.plot(xdate, baseFitList, label=u"原始")
        ax.grid(True)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)
        plt.legend()
        plt.show()
        drawPosition(allposition / len(xdate),
                     begin_time + datetime.timedelta(days=-1), level, 0)
#导入类 from get_data import get_data from draw_data import draw_data get_data01 = get_data() draw_data01 = draw_data() i = 1 get_name = True #界面: while True: #选择操作 str = '%s%s%s' % ('操作', i, ':') x = input(str) i+=1 #退出 if x=='quit' or i==102: print('what???') break #获取数据 elif x=='get': print('ok,get and') y = input('which:') #获取单一公司一段时间股票 if y=='stockdata': get_name = get_data01.get_stockData(get_name) print('stockdata done') #获取500强公司数据