Example #1
0
    def classifySpeCluLsa(self, class_num):
        """Cluster titles into ``class_num`` topics via LSA + spectral clustering.

        Loads a pre-trained gensim LSI model and its similarity index, builds a
        dense pairwise similarity matrix over 3000 loaded titles, runs spectral
        clustering on it, stores the grouped title ids in ``self.vector_table``,
        dumps them to file and draws the topic chart.

        :param class_num: number of clusters to produce.
        """
        from draw_data import draw_data 
        draw_title = draw_data()
        # mmap='r' maps the model file read-only instead of copying it into memory
        lsa = models.LsiModel.load('model.lsa', mmap='r')
        logging.info("load lsa model!!")
        index = similarities.MatrixSimilarity.load('model_lsa.index')
        self.get_data(num=3000)
        (tfidf, dictionary) = self.get_tfidf(True, num=3000)

        hash_id2list = dict() # maps title id -> row index used in similar_matrix
        for i in range(len(self.title_id)):
            hash_id2list[self.title_id[i]] = i

        logging.info('开始创建相似矩阵...')
        similar_matrix = np.zeros((len(tfidf),len(tfidf))) # pairwise similarity scores
        for i in range(len(tfidf)):
            sims = index[lsa[tfidf[i]]]
            for j,v in enumerate(sims): 
                # write (i, j) and (j, i) so the matrix stays symmetric, as the
                # affinity input to spectral_clustering requires
                similar_matrix[i][j] = v
                similar_matrix[j][i] = v
        logging.info('done,相似矩阵建立完成,使用普聚类进行分类...')
        labels = spectral_clustering(similar_matrix, n_clusters=class_num, eigen_solver='arpack')
        self.vector_table = [[] for i in range(class_num)]
        for i in range(len(labels)):
            self.vector_table[labels[i]].append(self.title_id[i])
        logging.info("print set... "+str(len(self.vector_table)))
        self.printTitleTOfile(hash_id2list)
        draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-12-25')
Example #2
0
    def classifyoptCluLsa(self):
        """Cluster titles with OPTICS over LSA-derived distances.

        Loads the pre-trained LSI model and similarity index for 500 titles,
        builds a sparse distance dict only for pairs whose similarity is
        >= 0.3 (distance = 1 - similarity), runs the project's OPTICS
        implementation, and writes the resulting groups to file and chart.
        """
        from draw_data import draw_data
        from optics_cluster import optics_cly
        draw_title = draw_data()
        lsa = models.LsiModel.load('model.lsa', mmap='r')
        logging.info("load lsa model!!")
        index = similarities.MatrixSimilarity.load('model_lsa.index')
        self.get_data(num=500)
        (tfidf, dictionary) = self.get_tfidf(True, num=500)

        hash_id2list = dict() # maps title id -> list index used with similar_matrix
        for i in range(len(self.title_id)):
            hash_id2list[self.title_id[i]] = i
        
        logging.info('开始创建相似距离...')
        similar_matrix = dict()  # (id_i, id_j) -> distance, kept sparse
        for i in range(len(tfidf)):
            sims = index[lsa[tfidf[i]]]
            for j,v in enumerate(sims):
                if v >= 0.3:
                    similar_matrix[(self.title_id[i], self.title_id[j])] = 1 - v
        logging.info('done,建立完成,使用optics进行分类...')

        # 0.2 / 0.3 / 7 are thresholds of the project's OPTICS impl —
        # see optics_cluster for their exact meaning
        opc = optics_cly(0.2, 0.3, 7, similar_matrix, self.title_id)
        (self.vector_table, n_list) = opc.runOptics()
        self.print_nlist(hash_id2list, n_list)
        self.printTitleTOfile(hash_id2list)
        draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-12-25')
Example #3
0
def dataword(temp_k, begin_time=u"2015-09-25", last_time=u"2015-09-27"):
    """Bucket each title's comments over time and record pairwise comparisons.

    For each title in the window, counts comments per ``temp_k``-wide time
    bucket (up to 350 buckets), truncates each series at its peak bucket, then
    compares every ordered pair of series with ``compare_two`` and appends all
    non-empty results to compare_list.txt.

    :param temp_k: bucket width passed to ``time_between``.
    :param begin_time: window start, ``YYYY-MM-DD`` string.
    :param last_time: window end, ``YYYY-MM-DD`` string.
    """
    import numpy as np
    getdate = draw_data()
    title_item = getdate.title_find_day(begin_time, last_time, 1000)
    news_list = list()
    tid = []
    for ti in title_item:
        x_time = [0 for i in range(350)]
        comment_item = getdate.db.CommentItem.find({"title_id": ti["_id"]})
        for ci in comment_item:
            tempk = time_between(begin_time, ci["time"], temp_k)
            if tempk < 350:  # drop comments that fall outside the 350 buckets
                x_time[tempk] += 1
        # keep only the rising part of each series (up to its peak bucket)
        news_list.append(x_time[0:np.argmax(x_time)])
        tid.append((ti["_id"], comment_item.count()))
    compare_list = []
    for i in range(len(news_list)):
        for j in range(len(news_list)):
            if i != j:
                uplist, orlist = compare_two(news_list[i], news_list[j])
                if len(uplist) or len(orlist):
                    compare_list.append([(tid[i], tid[j]), uplist, orlist])
    try:
        # 'with' guarantees the handle is closed even if the write raises
        with open('compare_list.txt', 'a') as f:
            f.write(str(compare_list) + '\n')
    except Exception as e:  # was Py2-only "except Exception, e"
        logging.info(str(e))
Example #4
0
def dataword(temp_k, begin_time=u"2015-09-25", last_time = u"2015-09-27"):
    """Bucket each title's comments over time and log pairwise trend comparisons.

    Same pipeline as the sibling variant: per-title comment counts in
    ``temp_k``-wide buckets (max 350), each series truncated at its peak,
    every ordered pair compared with ``compare_two``, and all non-empty
    results appended to compare_list.txt.

    :param temp_k: bucket width passed to ``time_between``.
    :param begin_time: window start, ``YYYY-MM-DD`` string.
    :param last_time: window end, ``YYYY-MM-DD`` string.
    """
    import numpy as np
    getdate = draw_data()
    title_item = getdate.title_find_day(begin_time, last_time, 1000)
    news_list = list()
    tid = []
    for ti in title_item:
        x_time = [0 for i in range(350)]
        comment_item = getdate.db.CommentItem.find(
            {"title_id": ti["_id"]}
        )
        for ci in comment_item:
            tempk = time_between(begin_time, ci["time"], temp_k)
            if tempk < 350:  # ignore comments outside the 350-bucket window
                x_time[tempk] += 1
        # keep only the rising part of the series (up to its peak bucket)
        news_list.append(x_time[0:np.argmax(x_time)])
        tid.append((ti["_id"], comment_item.count()))
    compare_list = []
    for i in range(len(news_list)):
        for j in range(len(news_list)):
            if i != j:
                uplist, orlist = compare_two(news_list[i], news_list[j])
                if len(uplist) or len(orlist):
                    compare_list.append([(tid[i], tid[j]), uplist, orlist])
    try:
        # context manager closes the file even when the write fails
        with open('compare_list.txt', 'a') as f:
            f.write(str(compare_list) + '\n')
    except Exception as e:  # was Py2-only "except Exception, e"
        logging.info(str(e))
Example #5
0
def dataword(temp_k,
             all_days,
             each_item,
             begin_time=u"2015-09-25",
             last_time=u"2015-09-27"):
    """For one day's titles, record publish times and top-3 competitor scores.

    For every title published in the window, looks 2.5 hours past its publish
    time and ranks all earlier titles by comment rate (count per elapsed
    minute); when a full top-3 exists it is stored in ``each_item[title_id]``.
    The day's id -> publish-time map is appended to ``all_days``.

    NOTE(review): ``temp_k`` and ``news_list`` are never used in this variant.

    :param all_days: list, mutated in place (one dict appended per call).
    :param each_item: dict, mutated in place.
    """
    import numpy as np
    getdate = draw_data()
    title_item = getdate.get_title_data(begin_time, last_time, 100)
    news_list = list()
    today_item = {}
    for i in title_item:
        today_item[i['_id']] = datetime.datetime.strptime(
            i['title_time'], "%Y-%m-%d %H:%M")
    for i in today_item.keys():
        # cutoff: 2.5 hours after this title's publish time
        onehours_time = today_item[i] + datetime.timedelta(hours=2.5)
        temp_befor = getdate.return_group(begin_time, str(onehours_time))
        temp_dict = {}
        for j in temp_befor:
            if j["_id"] in today_item:
                j_time = today_item[j["_id"]]
                # minutes elapsed from title j's publish time to the cutoff
                minus = 1.0*(onehours_time-j_time).days*24*60 + \
                        1.0*(onehours_time-j_time).seconds/60
                temp_dict[j["_id"]] = [j["count"] / minus, j["count"]]
        topk = TOPk(temp_dict, 3)
        if len(topk) == 3:  # keep only titles with a full top-3 competitor set
            each_item[i] = topk
    all_days.append(today_item)
    print(len(today_item))
    print(len(each_item))
Example #6
0
    def dbscan_lsa(self, begin='2015-09-25', end='2015-11-25', num=10000):
        """Grow DBSCAN-style topic clusters from LSA similarities.

        For each title not already assigned to a cluster, expands a connected
        set via ``find_deep_dbscan`` and keeps clusters with more than 7
        members in ``self.vector_table``, then dumps and draws the result.

        NOTE(review): ``begin``, ``end`` and ``num`` are accepted but unused —
        the data size is hard-coded to 3000 below.
        """
        from draw_data import draw_data
        draw_title = draw_data()
        lsa = models.LsiModel.load('model.lsa', mmap='r')
        logging.info("load lsa model!!")
        index = similarities.MatrixSimilarity.load('model_lsa.index')
        # reset state left over from previous runs before reloading
        self.title_name = []
        self.title_content = []
        self.title_id = []
        self.get_data(num=3000)
        (tfidf, dictionary) = self.get_tfidf(True, num=3000)
        #self.get_data(num=3000)
        #(tfidf, dictionary) = self.get_tfidf(False)

        hash_id2list = dict() # maps title id -> index used with the similarity index
        for i in range(len(self.title_id)):
            hash_id2list[self.title_id[i]] = i
        
        logging.info("开始创造关联树...")
        for i in range(len(tfidf)):
            t_set_id = -1  # -1: unassigned; -2: already in an existing cluster
            t_set = set()
            for t in range(len(self.vector_table)):
                if self.title_id[i] in self.vector_table[t]:
                    t_set_id = -2
                    break
            if t_set_id == -1:
                t_set.add(self.title_id[i])
                t_set = self.find_deep_dbscan(index, tfidf, lsa, tfidf[i], t_set, hash_id2list)
                if len(t_set) > 7:  # density threshold: discard tiny clusters
                    print self.title_name[i]
                    self.vector_table.append(t_set)
        logging.info("print set... "+str(len(self.vector_table)))
        self.printTitleTOfile(hash_id2list)
        draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-10-10')
Example #7
0
def get_data():
    """Sweep `level` from 1000 to 4000 (step 200), train/predict day by day,
    and plot per-level MAPE against a baseline.

    Per level: seed a 7-day sliding window of filtered title-id lists, then
    for ~80 days train the swarm model (`train_data`), predict, and slide the
    window forward; finally average `targetIn` over all predictions.

    NOTE(review): `basemape` after the level loop is computed from
    `true_predict` of the LAST level only — confirm that is intended.
    """
    draw_mysql = draw_data()
    rmses = []  # per-level MAPE values (name kept from original)
    for level in range(1000, 4200, 200):
        begin_time = datetime.datetime.strptime('2015-09-25', '%Y-%m-%d')
        end_time = datetime.datetime.strptime('2015-09-26', '%Y-%m-%d')
        # warm-up: seed a 7-day sliding window of filtered title-id lists
        queue = []
        for i in range(7):
            print begin_time
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                # keep only titles present in both prediction and classification dicts
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        birds = None
        pred_position = None
        true_predict = []
        lbestFitList = []
        baseFitList = []
        for i in range(80):
            print str(begin_time) + " " + str(level)
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            temp_position, birds, lbestfit, basefit = train_data(
                queue, birds, level)
            lbestFitList.append(lbestfit)
            baseFitList.append(basefit)
            if temp_position != None:
                # keep the last successful position for days where training fails
                pred_position = temp_position
                drawPosition(pred_position)
            true_predict = predict_data(pred_position, today_day, level,
                                        true_predict)
            # slide the window forward one day
            queue.pop(0)
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        # evaluation
        mape = 0.0
        for i in true_predict:
            mape += targetIn(i[0], i[2])
        mape /= len(true_predict)
        rmses.append(mape)
        print mape
    basemape = 0.0
    for i in true_predict:
        basemape += targetIn(i[1], i[2])
    basemape /= len(true_predict)
    plt.plot(range(1000, 4200, 200), rmses, 'b-')
    plt.plot([1000, 3900], [basemape, basemape], 'r-')
    plt.show()
Example #8
0
def process_data(begin='2015-09-25', end='2015-12-25', num=100):
    """Pull up to `num` titles from [begin, end] into the module-level
    title_name / title_content / title_id lists, then segment with jieba."""
    import re
    from draw_data import draw_data
    db = draw_data()
    rows = db.get_title_data(begin, end, num)
    logging.info("start to get data...")
    for row in rows:
        title_name.append(row['title_content'])
        title_content.append(row['title_text'])
        title_id.append(row['_id'])
    use_jieba()
Example #9
0
def process_data(begin='2015-09-25', end='2015-12-25', num=100):
    """Fetch titles for the window and extend the shared lists
    (title_name, title_content, title_id), then run segmentation."""
    import re
    from draw_data import draw_data
    loader = draw_data()
    fetched = loader.get_title_data(begin, end, num)
    logging.info("start to get data...")
    for record in fetched:
        for bucket, key in ((title_name, 'title_content'),
                            (title_content, 'title_text'),
                            (title_id, '_id')):
            bucket.append(record[key])
    use_jieba()
Example #10
0
def user_pro():
    """Scan every classified title and count, per title, how many of its
    commenting users appear in the pre-computed user-preference dictionary.

    Fixes two NameErrors in the original: ``k`` was incremented without being
    initialised (UnboundLocalError on the first iteration), and the dictionary
    was loaded into ``userlike`` but membership-tested as ``userLike``.
    """
    userlike = np.load("userlike.dict.npz")['arr_0'][()]
    db = draw_data()
    k = 0  # processed-title counter (was used uninitialised)
    for titleid in classify_dict.keys():
        k += 1
        # NOTE(review): this first cursor is immediately replaced by the
        # aggregate query below; kept to preserve the original call order.
        comment = db.title_comment_time(titleid, )
        comment = db.title_comment_aggregate(titleid)
        tempc = 0      # comments seen for this title
        user_num = 0   # of those, commenters present in userlike
        for ci in comment:
            tempc += 1
            if ci["_id"] in userlike:
                user_num += 1
def func(begin_time, end_time):
    """Return the Shannon entropy (bits) of the per-title comment-count
    distribution for titles published in [begin_time, end_time].

    :param begin_time: window start (converted with str() for draw_data).
    :param end_time: window end.
    :return: entropy in bits; 0.0 when there are no comments at all.
    """
    each_count = []
    mydraw = draw_data()
    title_item = mydraw.get_title_data(str(begin_time), str(end_time), 0)
    for ti in title_item:
        each_count.append(mydraw.title_comment_count(ti['_id']))
    entropy_i = 0.0
    # Hoisted out of the loop: the original recomputed np.sum(each_count)
    # once per title, making the loop O(n^2).
    total = np.sum(each_count)
    if total > 0:  # guard: an empty/zero window would divide by zero
        for count_i in each_count:
            pro_count = 1.0 * count_i / total
            if pro_count != 0:
                entropy_i += -pro_count * np.log2(pro_count)
    mydraw.close_db()
    logging.info(begin_time)
    return entropy_i
Example #12
0
 def get_data2(self, begin='2015-09-25', end='2015-11-25', num=500,
     update_top_num = True, contain_word = None):
     """Stream all comments in [begin, end] through `use_jieba2`, then dump results.

     Logs progress every 10000 comments.

     NOTE(review): `num`, `update_top_num` and `contain_word` are accepted
     but unused in this variant; 30 appears to be a bucket/segment count
     passed straight through to the jieba helpers — confirm.
     """
     from draw_data import draw_data
     self.load_manual2(begin, end)
     Comment = draw_data()
     comments_temp = Comment.get_comment_data(begin, end)
     logging.info("评论共:" + str(comments_temp.count()))
     cci = 0
     for ci in comments_temp:
         cci += 1
         if cci%10000 == 0:
             logging.info('第 '+str(cci)+'个')
         self.use_jieba2(ci['comments'], begin, ci['time'], 30)
     self.printTitleTOfile2(begin, end, 30)
Example #13
0
 def get_data(self, begin='2015-09-25', end='2015-11-25', num=500,
     update_top_num = True, contain_word = None):
     """Load titles and their concatenated comments, then segment each one.

     For every title in [begin, end] (limited to `num`), joins all of its
     comments into one '。'-separated document, records the title name in
     `self.title_name`, runs `use_jieba` on the document, dumps results and
     draws the topic chart.

     NOTE(review): `update_top_num` and `contain_word` are accepted but
     unused in this variant.
     """
     from draw_data import draw_data
     self.load_manual()
     title = draw_data()
     title_item = title.get_title_data(begin, end, num)
     logging.info('总数: ' + str(title_item.count()))
     cti = 0
     for ti in title_item:
         cti += 1
         logging.info('第' + str(cti))
         titleid = ti['_id']
         commentcontent = ''
         comment2title = title.title_comment(titleid)
         for ci in comment2title:
             commentcontent = commentcontent + ci['comments'] + '。' 
         self.title_name[titleid] = ti['title_content']
         self.use_jieba(commentcontent, titleid)
     self.printTitleTOfile()
     # draw the resulting topic chart
     draw_title = draw_data()
     draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-11-25') 
     print "文章(评论)总数:%d"%(len(self.title_name))
Example #14
0
def read_sourse(each_item, begin_time=u"2015-09-25", last_time=u"2015-09-27"):
    """Assemble per-title feature vectors into *each_item* and save them.

    Feature layout per title id: [0] publish hour; then the doc vector from
    sentence_vectors.txt; then four comment stats from 2.5hour.txt (rows with
    column 3 < 5 are skipped). The dict is saved to each_item.npz.

    All three input files are now opened with ``with`` so the handles are
    closed even when a parse error (float conversion, missing column) raises
    mid-file — the original leaked them in that case.

    :param each_item: dict filled in place, title id -> feature list.
    """
    # publish hour as the first feature
    getdata = draw_data()
    title_item = getdata.get_title_data(begin_time, last_time, 100)
    for i in title_item:
        each_item[i['_id']] = [
            float(i['title_time'].split()[1].split(':')[0])
        ]  # hour-of-day extracted from "YYYY-MM-DD HH:MM"
    logging.info(u"取出3个月的新闻,done")

    # title ids aligned line-by-line with the vectors file: "<name>\t<id>"
    id_x = []
    with open("paragraph_name.txt", 'r') as idfile:
        for i in idfile.readlines():
            id_x.append(i.strip().split('\t')[1])

    # document vectors, space separated; the first token on each line is a label
    vector_list = []
    with open("sentence_vectors.txt", 'r') as vectorfile:
        for lines in vectorfile.readlines():
            line = lines.strip().split(' ')
            tv = []
            for i in range(1, len(line)):
                tv.append(float(line[i]))
            vector_list.append(tv)
    logging.info(u"取出新闻的vector,done")

    for i in range(len(id_x)):
        if id_x[i] in each_item:
            each_item[id_x[i]] += vector_list[i]

    with open("2.5hour.txt", 'r') as hours_file:
        for i in hours_file.readlines():
            line = i.strip().split('\t')
            if float(line[3]) < 5:  # skip low-activity titles
                continue
            if line[0] in each_item:
                each_item[line[0]] += [
                    float(line[1]),
                    float(line[2]),
                    float(line[3]),
                    float(line[4])
                ]
    # each_item entry: 0 = hour; 1-15 = doc vector; 16-18 = 1/1.5/2h comment
    # counts; 19 = final heat (layout per the original author's note)
    np.savez("each_item", each_item)
    logging.info(u"写入文件each_item.npz,done")
Example #15
0
def user_like(num):
    """Build a per-user topic-preference vector (15 topics) via tf-idf weighting.

    Pass 1: for the top `num` users, record each user's comment count per
    title and in total, plus each title's dominant topic and sampled comment
    count. Pass 2: accumulate tf*idf per (user, title) into the user's topic
    slot, then normalise each user's vector to sum to 1.

    Saves topic_set / titleUserNum / userlike dicts via np.savez and returns
    the userLike dict.

    NOTE(review): if a user's accumulated weights sum to 0, the final
    normalisation divides by zero — confirm upstream guarantees activity.
    """
    userLike = dict()
    db = draw_data()
    user = db.return_user(num)
    titleUserNum = dict()
    for ti in user:
        userLike[ti["_id"]] = [0 for i in range(15)]  # one slot per topic
        titleUserNum[ti["_id"]] = dict()
        titleUserNum[ti["_id"]]["count"] = 0  # user's total comment count
    print len(userLike)
    topic_set = dict()
    user_all_num = 2551366  # total user population, numerator of the idf term
    titlenum = len(classify_dict)
    k = 0
    for titleid in classify_dict.keys():
        k += 1
        printUserNum = 0
        comment = db.title_comment_aggregate(titleid)
        tempc = 0
        for ci in comment:
            tempc += 1
            if ci["_id"] in userLike:
                printUserNum += 1
                titleUserNum[ci["_id"]][titleid] = ci["count"]
                titleUserNum[ci["_id"]]["count"] += ci["count"]
        topic_num = np.argmax(classify_dict[titleid])  # dominant topic index
        topic_set[titleid] = [topic_num, tempc]
        logging.info("process " + str(k) + "/" + str(titlenum) + ":" +
                     titleid + " " + str(topic_num) + " " + str(printUserNum) +
                     "/" + str(tempc))
    np.savez("topic_set.dict", topic_set)
    np.savez("titleUserNum.dict", titleUserNum)

    # reload what was just saved (keeps parity with running pass 2 standalone)
    topic_set = np.load("topic_set.dict.npz")['arr_0'][()]
    titleUserNum = np.load("titleUserNum.dict.npz")['arr_0'][()]
    for uid in userLike:
        for titleid in classify_dict.keys():
            if titleid in titleUserNum[uid]:
                tf = 1.0 * titleUserNum[uid][titleid] / titleUserNum[uid][
                    "count"]
                idf = np.log2(1.0 * user_all_num / topic_set[titleid][1])
                userLike[uid][topic_set[titleid][0]] += tf * idf
        npsum = np.sum(userLike[uid])
        for i in range(15):
            userLike[uid][i] /= npsum
    np.savez("userlike.dict", userLike)
    return userLike
Example #16
0
def count_day(begin='2015-09-25',days=60):
    """Bar-chart the number of comments per day for *days* days from *begin*."""
    import datetime
    from matplotlib import pyplot as plt
    import numpy as np
    from draw_data import draw_data
    comment_db = draw_data()
    one_day = datetime.timedelta(days=1)
    window_start = datetime.datetime.strptime(begin, '%Y-%m-%d')
    window_end = window_start + one_day
    daily_totals = []
    for _ in range(days):
        cursor = comment_db.get_comment_data(str(window_start), str(window_end))
        daily_totals.append(cursor.count())
        # slide the one-day window forward
        window_start, window_end = window_end, window_end + one_day
    plt.bar(range(len(daily_totals)), daily_totals, width=1, align='edge', color='green')
    plt.show()
Example #17
0
 def get_data(self, begin='2015-09-25', end='2015-11-25'):
     """Fetch comments in [begin, end] and merge them into ~1000-comment chunks.

     Returns a list of strings, each the space-joined text of up to 999
     comments.

     NOTE(review): every 1000th comment is dropped (the flush branch does not
     append `ci`) and the final partial chunk is discarded — confirm whether
     this sampling is intentional.
     """
     from draw_data import draw_data
     Comment = draw_data()
     comments_temp = Comment.get_comment_data(begin, end)
     logging.info("评论共:" + str(comments_temp.count()))
     cci = 0
     comments_t = []
     tc = ''
     for ci in comments_temp:
         cci += 1
         if cci%1000 == 0:
             comments_t.append(tc)
             tc = ''
         else:
             tc = tc + ' ' + ci['comments']
     logging.info('合并之后: ' + str(len(comments_t)))
     return comments_t
Example #18
0
 def get_data(self, begin='2015-09-25', end='2015-11-25', num=500,
     update_top_num = True, contain_word = None):
     """Load each title plus its concatenated comment text into parallel lists.

     Appends to self.title_name / self.title_content / self.title_id, one
     entry per title; the "content" is all of the title's comments joined
     with '。'.

     NOTE(review): `update_top_num` and `contain_word` are accepted but
     unused in this variant.
     """
     from draw_data import draw_data
     title = draw_data()
     title_item = title.get_title_data(begin, end, num)
     logging.info('总数: ' + str(title_item.count()))
     cti = 0
     for ti in title_item:
         cti += 1
         logging.info('第' + str(cti))
         self.title_name.append(ti['title_content'])
         titleid = ti['_id']
         comment2title = title.title_comment(titleid)
         comment_text = ""
         for ci in comment2title:
             comment_text  = comment_text + ci['comments'] + '。'
         self.title_content.append(comment_text)
         self.title_id.append(titleid)
     print "文章(评论)总数:%d"%(len(self.title_name))
Example #19
0
 def get_data(self, begin='2015-09-25', end='2015-11-25', num=10000,
     update_top_num = True, contain_word = None):
     import re
     from draw_data import draw_data
     title = draw_data()
     title_item = title.get_title_data(begin, end, num)
     if contain_word:
         for ti in title_item:
             title_text = ti['title_text']
             for word in contain_word:
                 if re.search(word, title_text):
                     self.title_name.append(ti['title_content'])
                     self.title_content.append(title_text)
                     break
     else:
         for ti in title_item:
             self.title_name.append(ti['title_content'])
             self.title_content.append(ti['title_text'])
     print "文章总数:%d"%(len(self.title_name))
     if update_top_num:
         self.topic_num = len(self.title_name)
Example #20
0
    def similarty_lsa(self, begin='2015-09-25', end='2015-11-25', num=10000):
        """Grow topic groups by following LSA-similarity links ("association tree").

        Loads a 10000-title seed set and a smaller 3000-title pool, then for
        each seed either skips it (already grouped) or starts a new group and
        expands it through `find_deep` against the smaller pool.

        NOTE(review): `begin`, `end` and `num` are accepted but unused — the
        sizes are hard-coded in the get_data/get_tfidf calls below.
        """
        from draw_data import draw_data
        draw_title = draw_data()
        lsa = models.LsiModel.load('model.lsa', mmap='r')
        logging.info("load lsa model!!")
        index = similarities.MatrixSimilarity.load('model_lsa.index')
        # first load: the full seed set (10000 titles), kept in *_old copies
        self.title_name = []
        self.title_content = []
        self.get_data(num=10000)
        title_old_id = self.title_id[:]
        title_old_name = self.title_name[:]
        (tfidf, dictionary) = self.get_tfidf()
        # second load: a smaller candidate pool (3000 titles) to expand into
        self.title_name = []
        self.title_content = []
        self.title_id = []
        self.get_data(num=3000)
        (tfidf_less, dictionary) = self.get_tfidf(num=3000)

        hash_id2list = dict()  # title id -> index within the smaller pool
        for i in range(len(self.title_id)):
            hash_id2list[self.title_id[i]] = i
        
        logging.info("开始创造关联树...")
        for i in range(len(tfidf)):
            print title_old_name[i]
            t_set_id = -1   # -1: start a new group; -2: seed already grouped
            t_set = set()
            for t in range(len(self.vector_table)):
                if title_old_id[i] in self.vector_table[t]:
                    t_set_id = -2
                    break
            if t_set_id == -1:
                self.vector_table.append(t_set)
                t_set_id = len(self.vector_table)-1
                t_set.add(title_old_id[i])
            if t_set_id >= 0:
                t_set = self.find_deep(index, tfidf_less, lsa, tfidf[i], t_set, hash_id2list)
        logging.info("print set... "+str(len(self.vector_table)))
        self.printTitleTOfile(hash_id2list)
        draw_title.draw_topic(self.vector_table, 30, '2015-09-25', '2015-10-10')
def entropy():
    """Plot daily comment-distribution entropy over 60 days from 2015-09-25.

    Fans ``func(begin, end)`` out over an 8-process pool (one task per day);
    days whose entropy comes back below 1 are replaced by 4.0 before plotting.

    NOTE(review): the 4.0 substitution value looks like a smoothing hack for
    near-empty days — confirm its intent.
    """
    import datetime
    entropy_day = []
    result_func = []
    mydraw = draw_data()
    begin_time = datetime.datetime.strptime('2015-09-25', '%Y-%m-%d')
    end_time = datetime.datetime.strptime('2015-09-26', '%Y-%m-%d')
    pool = multiprocessing.Pool(processes=8)
    for i in range(60): 
        result_func.append( pool.apply_async(func, (begin_time, end_time, )) )
        begin_time = end_time
        end_time = end_time + datetime.timedelta(days = 1)
    pool.close()
    pool.join()
    for res in result_func:
        if res.get()<1:
            entropy_day.append(4.0)
        else:
            entropy_day.append(res.get())
    print entropy_day
    plt.plot(entropy_day)
    plt.show()
Example #22
0
def dataword(temp_k, all_days, each_item, begin_time=u"2015-09-25", last_time = u"2015-09-27"):
    """For one day's titles, record publish times and top-3 competitor scores.

    For every title published in the window, looks 2.5 hours past its publish
    time and ranks all earlier titles by comment rate (count per elapsed
    minute); when a full top-3 exists it is stored in ``each_item[title_id]``.
    The day's id -> publish-time map is appended to ``all_days``.

    NOTE(review): ``temp_k`` and ``news_list`` are never used in this variant.

    :param all_days: list, mutated in place (one dict appended per call).
    :param each_item: dict, mutated in place.
    """
    import numpy as np
    getdate = draw_data()
    title_item = getdate.get_title_data(begin_time, last_time, 100)
    news_list = list()
    today_item = {}
    for i in title_item:
        today_item[i['_id']] = datetime.datetime.strptime(i['title_time'], "%Y-%m-%d %H:%M")
    for i in today_item.keys():
        # cutoff: 2.5 hours after this title's publish time
        onehours_time = today_item[i] + datetime.timedelta(hours=2.5)
        temp_befor = getdate.return_group(begin_time, str(onehours_time))
        temp_dict = {}
        for j in temp_befor:
            if j["_id"] in today_item:
                j_time = today_item[j["_id"]]
                # minutes elapsed from title j's publish time to the cutoff
                minus = 1.0*(onehours_time-j_time).days*24*60 + \
                        1.0*(onehours_time-j_time).seconds/60
                temp_dict[j["_id"]] = [j["count"]/minus, j["count"]]
        topk = TOPk(temp_dict, 3)
        if len(topk) == 3:  # keep only titles with a full top-3 competitor set
            each_item[i] = topk
    all_days.append(today_item)
    print(len(today_item))
    print(len(each_item))
Example #23
0
#!/usr/bin/python
#-*- coding:utf-8 -*-
############################
#File Name: basic_onehours.py
#Author: yuxuan
#Created Time: 2016-04-24 19:00:28
############################
from draw_data import draw_data
import logging
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
        level=logging.INFO)

# For every title in the window, write "<id>\t<first-hour count>\t<total count>"
# to all200.txt and a libsvm-style "<total> 1:<first-hour>" line to basic_train.txt.
load_data = draw_data()
title_list = load_data.get_title_data('2015-09-25', '2015-12-25', 100)
logging.info("总共: " + str(title_list.count()))
# 'with' closes both output files even if a DB call raises mid-loop
# (the original leaked both handles in that case)
with open('all200.txt', 'w') as all_file, \
        open('basic_train.txt', 'w') as svm_train_file:
    k = 0
    for i in title_list:
        logging.info("process: " + str(k))
        k += 1
        titleid = i['_id']
        # presumably the comment count within the first hour(s) after
        # publication — confirm against draw_data.one_hours_count
        hours_count = load_data.one_hours_count(titleid, i['title_time']).count()
        all_count = load_data.title_comment(titleid).count()
        all_file.write(titleid + '\t' + str(hours_count) + '\t' +
                str(all_count) + '\n')
        svm_train_file.write(str(all_count) + ' ' + '1:' + str(hours_count) + ' \n')
Example #24
0
def get_data():
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    draw_mysql = draw_data()
    rmses = []
    mapes = []
    for level in range(500, 3500, 300):
        allposition = np.zeros(shape=(15, 15))
        begin_time = datetime.datetime.strptime('2015-09-27', '%Y-%m-%d')
        end_time = datetime.datetime.strptime('2015-09-28', '%Y-%m-%d')
        queue = []
        for i in range(3):
            print begin_time
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        birds = None
        pred_position = None
        true_predict = []
        lbestFitList = []
        baseFitList = []
        xdate = []
        travel = np.zeros(shape=(0, 15))
        for i in range(15):  #87
            xdate.append(begin_time + datetime.timedelta(days=-1))
            print str(begin_time) + " " + str(level)
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            birds = None
            temp_position, birds, lbestfit, basefit = train_data(
                queue, birds, level)
            if temp_position != None:
                lbestFitList.append(lbestfit)
                baseFitList.append(basefit)
                pred_position = temp_position
                travel = np.insert(travel,
                                   travel.shape[0],
                                   values=temp_position[9],
                                   axis=0)
                allposition = allposition + pred_position
                print begin_time + datetime.timedelta(days=-1)
                # drawPosition(pred_position, begin_time+datetime.timedelta(days = -1), level, 3)
            true_predict = predict_data(pred_position, today_day, level,
                                        true_predict)
            queue.pop(0)
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        # plot_3d(xdate, travel)
        # evaluation
        print allposition / len(xdate)
        np.savetxt("allposition" + str(level) + ".csv",
                   allposition / len(xdate),
                   delimiter=', ')
        drawPosition(allposition / len(xdate),
                     begin_time + datetime.timedelta(days=-1), level, 0)
        mape = 0.0
        rmse = 0.0
        for i in true_predict:
            # mape += targetIn(i[0], i[2])
            mape += abs(i[0] - i[2]) / i[2]
            rmse += (i[0] - i[2])**2
        mape /= len(true_predict)
        rmse /= len(true_predict)
        rmse = np.sqrt(rmse)
        rmses.append(rmse)
        mapes.append(mape)
    basemape = 0.0
    basermse = 0.0
    for i in true_predict:
        # basemape += targetIn(i[1], i[2])
        basemape += abs(i[1] - i[2]) / i[2]
        rmse += (i[1] - i[2])**2
    basemape /= len(true_predict)
    basermse /= len(true_predict)
    basermse = np.sqrt(basermse)
    print "basemape:" + str(basemape)
    print "basermse:" + str(basermse)
    plt.plot(range(1000, 4000, 300), mapes, label="mape")
    plt.plot([1000, 4200], [basemape, basemape], label="basemape")
    np.savetxt("rmse.csv", rmses, delimiter=', ')
    np.savetxt("mape.csv", mapes, delimiter=", ")
    plt.show()
Example #25
0
                x_time[tempk] += 1
        news_list.append(x_time[0:np.argmax(x_time)])
        tid.append((ti["_id"], comment_item.count()))
    compare_list = []
    for i in range(len(news_list)):
        for j in range(len(news_list)):
            if (i != j):
                uplist, orlist = compare_two(news_list[i], news_list[j])
                if len(uplist) or len(orlist):
                    compare_list.append([(tid[i], tid[j]), uplist, orlist])
    try:
        f = open('compare_list.txt', 'a')
        f.write(str(compare_list) + '\n')
        f.close()
    except Exception, e:
        logging.info(str(e))
    return compare_list


if __name__ == "__main__":
    # Drive dataword over 60 consecutive one-day windows from 2015-09-25.
    import datetime
    mydraw = draw_data()
    one_day = datetime.timedelta(days=1)
    window_start = datetime.datetime.strptime('2015-09-25', '%Y-%m-%d')
    window_end = datetime.datetime.strptime('2015-09-26', '%Y-%m-%d')
    for _ in range(60):
        print(window_start)
        dataword(10, str(window_start), str(window_end))
        window_start, window_end = window_end, window_end + one_day
    # dataword(5, str(window_start), str(window_end))
Example #26
0
            if tempk < 350:
                x_time[tempk] += 1
        news_list.append(x_time[0:np.argmax(x_time)])
        tid.append((ti["_id"], comment_item.count()))
    compare_list = []
    for i in range(len(news_list)):
        for j in range(len(news_list)):
            if(i != j):
                uplist,orlist = compare_two(news_list[i],news_list[j])
                if len(uplist) or len(orlist):
                     compare_list.append([(tid[i],tid[j]),uplist,orlist])
    try:
        f = open('compare_list.txt', 'a')
        f.write(str(compare_list)+'\n')
        f.close()
    except Exception,e:
        logging.info(str(e))
    return compare_list

if __name__ == "__main__":
    # Run dataword over sixty back-to-back one-day windows.
    import datetime
    mydraw = draw_data()
    day = datetime.timedelta(days=1)
    start = datetime.datetime.strptime('2015-09-25', '%Y-%m-%d')
    stop = datetime.datetime.strptime('2015-09-26', '%Y-%m-%d')
    for _ in range(60):
        print(start)
        dataword(10, str(start), str(stop))
        start, stop = stop, stop + day
    # dataword(5, str(start), str(stop))
Example #27
0
def get_data():
    """Sweep `level` from 1000 to 4000 (step 200): train the swarm model day
    by day over 9 days, accumulate the averaged position matrix, and plot the
    fitness curves (model vs. baseline) for each level.

    NOTE(review): prediction/evaluation is commented out in this variant —
    it only visualises position matrices and fitness; `rmses` stays empty.
    """
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
    draw_mysql = draw_data()
    rmses = []
    for level in range(1000, 4200, 200):
        allposition = np.zeros(shape=(15, 15))
        begin_time = datetime.datetime.strptime('2015-11-10', '%Y-%m-%d')
        end_time = datetime.datetime.strptime('2015-11-11', '%Y-%m-%d')
        # warm-up: seed a 3-day sliding window of filtered title-id lists
        queue = []
        for i in range(3):
            print begin_time
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                # keep only titles known to both prediction and classification dicts
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        birds = None
        pred_position = None
        true_predict = []
        lbestFitList = []
        baseFitList = []
        xdate = []
        travel = np.zeros(shape=(0, 15))
        for i in range(9):  #87
            xdate.append(begin_time + datetime.timedelta(days=-1))
            print str(begin_time) + " " + str(level)
            temp = draw_mysql.get_title_data(str(begin_time), str(end_time), 0)
            today_day = []
            for ti in temp:
                if ti['_id'] in dict_id_pred and ti['_id'] in classify_dict:
                    today_day.append(ti['_id'])
            birds = None  # retrain from scratch each day
            temp_position, birds, lbestfit, basefit = train_data(
                queue, birds, level)
            lbestFitList.append(lbestfit)
            baseFitList.append(basefit)
            if temp_position != None:
                pred_position = temp_position
                # log row 8 of the position matrix to visualise its drift
                travel = np.insert(travel,
                                   travel.shape[0],
                                   values=temp_position[8],
                                   axis=0)
                allposition = allposition + pred_position
                #drawPosition(pred_position, begin_time+datetime.timedelta(days = -1), level, 3)
            #true_predict = predict_data(pred_position, today_day, level, true_predict)
            # slide the window forward one day
            queue.pop(0)
            queue.append(today_day)
            begin_time = end_time
            end_time = end_time + datetime.timedelta(days=1)
        plot_3d(xdate, travel)
        # evaluation: dump and draw the day-averaged position matrix
        print allposition / len(xdate)
        with open("matrix.txt", 'w') as f:
            f.write(str(allposition / len(xdate)))
        yearsFmt = mdates.DateFormatter('%Y-%m-%d')
        fig, ax = plt.subplots()
        ax.set_title(u"腾讯新闻2个月竞争预测图 level=" + str(level), fontsize=18)
        ax.set_xlabel("per day", fontsize=18)
        ax.xaxis.set_major_formatter(yearsFmt)
        ax.plot(xdate, lbestFitList, label=u"加入后")
        ax.plot(xdate, baseFitList, label=u"原始")
        ax.grid(True)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)
        plt.legend()
        plt.show()
        drawPosition(allposition / len(xdate),
                     begin_time + datetime.timedelta(days=-1), level, 0)
#导入类
from get_data import get_data
from draw_data import draw_data
get_data01 = get_data()
draw_data01 = draw_data()

i = 1
get_name = True

#界面:
while True:
    #选择操作
    str = '%s%s%s' % ('操作', i, ':')
    x = input(str)
    i+=1

    #退出
    if x=='quit' or i==102:
        print('what???')
        break


    #获取数据
    elif x=='get':
        print('ok,get and')
        y = input('which:')
        #获取单一公司一段时间股票
        if y=='stockdata':
            get_name = get_data01.get_stockData(get_name)
            print('stockdata done')
        #获取500强公司数据