def Export_Model(self, model_path):
     data = dict(word_dict=self.word_dict,
                 huffman=self.huffman,
                 vec_len=self.vec_len,
                 learn_rate=self.learn_rate,
                 win_len=self.win_len,
                 model=self.model)
     FI.save_pickle(data, model_path)
Example #2
def Export_Model(self, model_path):
    data = dict(
        word_dict=self.word_dict,
        huffman=self.huffman,
        vec_len=self.vec_len,
        learn_rate=self.learn_rate,
        win_len=self.win_len,
        model=self.model
    )
    FI.save_pickle(data, model_path)
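# A hypothetical loading counterpart to Export_Model above. The method name
# Import_Model and the restored attribute set are assumptions; only
# FI.load_pickle (used elsewhere in this file) and the exported keys are
# taken from the original code.
def Import_Model(self, model_path):
    data = FI.load_pickle(model_path)
    self.word_dict = data['word_dict']
    self.huffman = data['huffman']
    self.vec_len = data['vec_len']
    self.learn_rate = data['learn_rate']
    self.win_len = data['win_len']
    self.model = data['model']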
    def post(self):

        try:
            user_basic_info=self.get_argument('user_basic_info')
            attends=self.get_argument('user_attends')
            user_basic_info=eval(user_basic_info)
            attends=eval(attends)
            self.write('success to return user info')
            self.finish()
        except Exception:
            self.write('fail to return user info')
            self.finish()
            return

        try:
            dbi=MySQL_Interface()
        except Exception:
            print('unable to connect to the MySQL DB')

        try:
            if len(attends) > 0:  # store attends info
                table_name='cache_attends'
                attends_col_info=dbi.get_col_name(table_name)
                keys=attends[0].keys()
                attends= [[line[i] if i in keys else '' for i in attends_col_info] for line in attends]
                fans_col_pos=attends_col_info.index('fans_num')
                insert_attends=[]
                for line in attends:
                    if line[fans_col_pos]>1000:
                        insert_attends.append(line)
                dbi.insert_asList(table_name,insert_attends,unique=True)
                print('Success : attends of {uid} is stored in {tname}'
                      .format(uid=user_basic_info['uid'],tname=table_name))
            else:
                pass
        except Exception as e:
            print(e)
            path="temp\\{uid}_attends.pkl".format(uid=user_basic_info['uid'])
            print('unable to store attends of {uid}, it will be pickled locally instead'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(attends,path)

        try:
            atten_num_real = user_basic_info['attends_num']
            atten_num_get = len(attends)
            user_basic_info['accuracy'] = atten_num_get  # number of followees actually retrieved
            col_info=dbi.get_col_name('cache_user_info')    # store user basic info
            keys=user_basic_info.keys()
            data=[user_basic_info[i] if i in keys else '' for i in col_info]
            dbi.insert_asList('cache_user_info',[data],unique=True)
            print('Success : basic info of {uid} is stored in cache_user_info'
                  .format(uid=user_basic_info['uid']))
        except Exception as e:
            print(e)
            path='temp\\{uid}_basic_info.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store basic info of {uid}, it will be pickled locally instead'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(user_basic_info,path)

        try:
            if len(attends) > 0:  # store the attention (followee) connection web
                from_uid=user_basic_info['uid']
                from_fans_num=user_basic_info['fans_num']
                from_blog_num=user_basic_info['blog_num']
                data=[[from_uid,from_fans_num,from_blog_num,str(x[attends_col_info.index('uid')]),str(x[attends_col_info.index('fans_num')]),str(x[attends_col_info.index('blog_num')])]for x in attends]
                dbi.insert_asList('cache_atten_web',data)
                print('Success : conn web of {uid} is stored in cache_atten_web'
                      .format(uid=user_basic_info['uid']))
            else:
                pass
        except Exception as e:
            print(e)
            path='{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store atten web of {uid}, it will be pickled locally instead'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(data,path)
        if ret == 'dict':
            ret_data = {}
            for ele, count in temp[:high]:
                ret_data[ele] = count
            return ret_data
        else:
            return temp[:high]
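# A minimal, self-contained sketch of a top-N helper with the same return
# shape as the fragment above. The function name top_n and the use of
# collections.Counter are assumptions; only the (element, count) output
# format comes from the original fragment.
from collections import Counter

def top_n(items, high, ret='list'):
    temp = Counter(items).most_common()  # (element, count) pairs, descending by count
    if not temp:
        return []
    if ret == 'dict':
        return {ele: count for ele, count in temp[:high]}
    return temp[:high]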


if __name__ == '__main__':
    text = FI.load_pickle('./static/demo.pkl')
    text = [x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    # x = FI.load_pickle('./static/normal_wv.pkl')
    # def cal_simi(data,key1,key2):
    #     return data[key1].dot(data[key2].T)[0][0]
    # keys=list(x.keys())
    # for key in keys:
    #     print(key,'\t',cal_simi(x,'姚明',key))
Example #5
# Download email contents from wikileaks and build structured data
# ===============   ATTENTION   =============== :
# Email formats vary a lot, so roughly 5% of the mails cannot be parsed
base_url = 'https://wikileaks.org/dnc-emails/get/{page}'
base_path = '.\static\{page}.pkl'
gotten_id = os.listdir('.\static')
gotten_id = [int((x.split('.'))[0]) for x in gotten_id]
task_pool = list(range(1,2000))  # email IDs on wikileaks
while True:
    if len(task_pool) == 0:
        break
    task_id = task_pool.pop(0)
    if task_id in gotten_id:
        print('{id} skip'.format(id=task_id))
        continue
    url = base_url.format(page = task_id)
    path = base_path.format(page = task_id)
    try:
        info = getStructedData(url)
        FI.save_pickle(info,path)
        print('{t} succeed'.format(t=task_id))
    except Exception as e:
        # task_pool.append(task_id)
        print('{t} failed <--<--<--<--'.format(t=task_id))
        print(e)
print('all files have been downloaded')
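# A rough sketch of what getStructedData (called in the loop above) might do;
# the original parser is not shown in this file. It assumes the page can be
# fetched with requests and its body parsed with the standard email package;
# the exact layout of the wikileaks page is an assumption, which is also why a
# share of the mails fails to parse.
import requests
from email import message_from_string
from email.utils import getaddresses, parseaddr

def getStructedData_sketch(url):
    raw = requests.get(url, timeout=30).text
    msg = message_from_string(raw)
    name, mail = parseaddr(msg.get('From', ''))
    info = {'From': {'mail': mail}, 'name': name}
    info['To'] = [{'name': n, 'mail': m}
                  for n, m in getaddresses(msg.get_all('To', []))]
    info['CC'] = [{'name': n, 'mail': m}
                  for n, m in getaddresses(msg.get_all('Cc', []))]
    return info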

# generate the social-network data (requires the networkx package)
generate_data_for_gelphi()
print('gexf file generated, stored at: {path}\\temp_res'.format(path=os.getcwd()))
Example #6
        else:
            return temp[:high]


if __name__ == '__main__':
    # text = FI.load_pickle('./static/demo.pkl')
    # text =[ x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    # input text
    data = [
        "小说”一词最早出现于《庄子·外物》:「饰小说以干县令,其于大达亦远矣。」庄子所谓的「小说」,是指琐碎的言论,与今日小说观念相差甚远。直至东汉桓谭《新论》:「小说家合残丛小语,近取譬喻,以作短书,治身理家,有可观之辞。」班固《汉书.艺文志》将「小说家」列为十家之后,其下的定义为:「小说家者流,盖出于稗官,街谈巷语,道听途说[4]之所造也。」才稍与今日小说的意义相近。而中国小说最大的特色,便自宋代开始具有文言小说与白话小说两种不同的小说系统。文言小说起源于先秦的街谈巷语,是一种小知小道的纪录。在历经魏晋南北朝及隋唐长期的发展,无论是题材或人物的描写,文言小说都有明显的进步,形成笔记与传奇两种小说类型。而白话小说则起源于唐宋时期说话人的话本,故事的取材来自民间,主要表现了百姓的生活及思想意识。但不管文言小说或白话小说都源远流长,呈现各自不同的艺术特色。"
    ]
    wv = Word2Vec(vec_len=500)  # vector length is 500
    wv.Train_Model(data)

    FI.save_pickle(wv.word_dict, './static/wv.pkl')

    # normalize the vectors after saving
    data = FI.load_pickle('./static/wv.pkl')
    x = {}
    for key in data:
        temp = data[key]['vector']
        temp = preprocessing.normalize(temp)
        x[key] = temp
    FI.save_pickle(x, './static/normal_wv.pkl')

    x = FI.load_pickle('./static/normal_wv.pkl')

    # compute the similarity between two words
    def cal_simi(data, key1, key2):
        return data[key1].dot(data[key2].T)[0][0]
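    # Example use of cal_simi: rank every word in the normalized dictionary
    # against one reference key. keys[0] is used only as a stand-in reference
    # word; any key present in x works.
    keys = list(x.keys())
    ref = keys[0]
    for key in keys:
        print(key, '\t', cal_simi(x, ref, key))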
    def post(self):

        try:
            user_basic_info = self.get_argument('user_basic_info')
            attends = self.get_argument('user_attends')
            user_basic_info = eval(user_basic_info)
            attends = eval(attends)
            self.write('success to return user info')
            self.finish()
        except Exception:
            self.write('fail to return user info')
            self.finish()
            return

        try:
            dbi = MySQL_Interface()
        except Exception:
            print('unable to connect to the MySQL DB')

        try:
            if len(attends) > 0:  # store attends info
                table_name = 'cache_attends'
                attends_col_info = dbi.get_col_name(table_name)
                keys = attends[0].keys()
                attends = [[
                    line[i] if i in keys else '' for i in attends_col_info
                ] for line in attends]
                fans_col_pos = attends_col_info.index('fans_num')
                insert_attends = []
                for line in attends:
                    if line[fans_col_pos] > 1000:
                        insert_attends.append(line)
                dbi.insert_asList(table_name, insert_attends, unique=True)
                print('Success : attends of {uid} is stored in {tname}'.format(
                    uid=user_basic_info['uid'], tname=table_name))
            else:
                pass
        except Exception as e:
            print(e)
            path = "temp" + os.sep + "{uid}_attends.pkl".format(
                uid=user_basic_info['uid'])
            print(
                'unable to store attends of {uid}, it will be pickled locally instead'.format(
                    uid=user_basic_info['uid']))
            FI.save_pickle(attends, path)

        try:
            atten_num_real = user_basic_info['attends_num']
            atten_num_get = len(attends)
            user_basic_info['accuracy'] = atten_num_get  # number of followees actually retrieved
            col_info = dbi.get_col_name(
                'cache_user_info')  # store user basic info
            keys = user_basic_info.keys()
            data = [user_basic_info[i] if i in keys else '' for i in col_info]
            dbi.insert_asList('cache_user_info', [data], unique=True)
            print('Success : basic info of {uid} is stored in cache_user_info'.
                  format(uid=user_basic_info['uid']))
        except Exception as e:
            print(e)
            path = 'temp' + os.sep + '{uid}_basic_info.pkl'.format(
                uid=user_basic_info['uid'])
            print('unable to store basic info of {uid}, it will be pickled locally instead'.
                  format(uid=user_basic_info['uid']))
            FI.save_pickle(user_basic_info, path)

        try:
            if len(attends) > 0:  # store the attention (followee) connection web
                from_uid = user_basic_info['uid']
                from_fans_num = user_basic_info['fans_num']
                from_blog_num = user_basic_info['blog_num']
                data = [[
                    from_uid, from_fans_num, from_blog_num,
                    str(x[attends_col_info.index('uid')]),
                    str(x[attends_col_info.index('fans_num')]),
                    str(x[attends_col_info.index('blog_num')])
                ] for x in attends]
                dbi.insert_asList('cache_atten_web', data)
                print(
                    'Success : conn web of {uid} is stored in cache_atten_web'.
                    format(uid=user_basic_info['uid']))
            else:
                pass
        except Exception as e:
            print(e)
            path = '{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store atten web of {uid}, it will be pickled locally instead'.
                  format(uid=user_basic_info['uid']))
            FI.save_pickle(data, path)
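# Note: eval() on the request arguments above executes arbitrary client-supplied
# Python. A safer parsing helper is sketched below, assuming the client can send
# JSON (or plain Python literals); the helper name parse_argument is an
# assumption and it is not part of the original handler.
import ast
import json

def parse_argument(raw):
    # try JSON first, fall back to a literal-only evaluation
    try:
        return json.loads(raw)
    except (ValueError, TypeError):
        return ast.literal_eval(raw)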
                    res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.ASCENDING)]
                else:
                    res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.DESCENDING)]
        else:
            f={}
            for item in field:
                f[item]=1

            if sort=='':
                res=[pop_id(x) for x in collection.find(select,f).limit(limit)]
            else:
                if sort_type=='up':
                    res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.ASCENDING)]
                else:
                    res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.DESCENDING)]
    return res

def pop_id(data):
    data.pop('_id')
    return data

res=read_content_in_mongo('latest_history',{'user_id':'1681029540'},['dealed_text.left_content','created_at','user_name'],-1,'id','down')

data = FI.load_pickle('demo.pkl')
data = data + res
FI.save_pickle(data,'demo.pkl')
for line in res:
    print(line)
print(len(res))

Example #9
                if sort_type == 'up':
                    res = [
                        pop_id(x)
                        for x in collection.find(select, f).limit(limit).sort(
                            sort, pymongo.ASCENDING)
                    ]
                else:
                    res = [
                        pop_id(x)
                        for x in collection.find(select, f).limit(limit).sort(
                            sort, pymongo.DESCENDING)
                    ]
    return res


def pop_id(data):
    data.pop('_id')
    return data


res = read_content_in_mongo(
    'latest_history', {'user_id': '1681029540'},
    ['dealed_text.left_content', 'created_at', 'user_name'], -1, 'id', 'down')

data = FI.load_pickle('demo.pkl')
data = data + res
FI.save_pickle(data, 'demo.pkl')
for line in res:
    print(line)
print(len(res))
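# A minimal reconstruction of read_content_in_mongo, consistent with the
# fragment above and the call to it. The MongoDB connection details, the
# database name, and the "limit <= 0 means no limit" convention are
# assumptions; only the select/field/sort/sort_type handling mirrors the
# original fragment.
import pymongo

def read_content_in_mongo_sketch(col_name, select, field=(), limit=-1,
                                 sort='', sort_type='down'):
    client = pymongo.MongoClient('localhost', 27017)  # connection details assumed
    collection = client['weibo'][col_name]            # database name assumed
    projection = {item: 1 for item in field} if field else None
    cursor = collection.find(select, projection)
    if limit > 0:
        cursor = cursor.limit(limit)
    if sort:
        direction = pymongo.ASCENDING if sort_type == 'up' else pymongo.DESCENDING
        cursor = cursor.sort(sort, direction)
    return [pop_id(x) for x in cursor]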
                return []
        if ret == 'dict':
            ret_data = {}
            for ele, count in temp[:high]:
                ret_data[ele] = count
            return ret_data
        else:
            return temp[:high]

if __name__ == '__main__':
    text = FI.load_pickle('./static/demo.pkl')
    text = [x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict,'./static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    # x = FI.load_pickle('./static/normal_wv.pkl')
    # def cal_simi(data,key1,key2):
    #     return data[key1].dot(data[key2].T)[0][0]
    # keys=list(x.keys())
    # for key in keys:
    #     print(key,'\t',cal_simi(x,'姚明',key))
def generate_node_and_edge():
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}
    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # print(info)

        # handle the From field
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        except Exception:
            continue  # skip this mail if it has no From field
        sender_node = node_list.get(sender_mail)
        if sender_node:
            sender_node['send'] += 1
        else:
            node_list[sender_mail] = dict(mail=sender_mail,
                                          send=1,
                                          receive=0,
                                          cc=0,
                                          ori_mail=ori_mail)
            if info.get('name'):
                node_list[sender_mail]['name'] = info['name']

        # handle the To field
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            ori_mail = mail
            mail = mail.lower()
            node = node_list.get(mail)
            if node:
                node['receive'] += 1
            else:
                node_list[mail] = dict(mail=mail,
                                       send=0,
                                       receive=1,
                                       cc=0,
                                       ori_mail=ori_mail)
                if receiver.get('name'):
                    node_list[mail]['name'] = receiver['name']
            edge_key = sender_mail + '|----->' + mail
            edge = edge_list.get(edge_key)
            if edge:
                edge[2] += 1  # the edge already exists; bump its weight
            else:
                edge_list[edge_key] = [sender_mail, mail, 1]  # the edge does not exist yet; add it

        # handle the CC field
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                ori_mail = mail
                mail = mail.lower()
                node = node_list.get(mail)
                if node:
                    node['cc'] += 1
                else:
                    node_list[mail] = dict(mail=mail,
                                           send=0,
                                           receive=0,
                                           cc=1,
                                           ori_mail=ori_mail)
                    if cc.get('name'):
                        node_list[mail]['name'] = cc['name']
                edge_key = sender_mail + '|----->' + mail
                edge = edge_list.get(edge_key)
                if edge:
                    edge[2] += 1  # the edge already exists; bump its weight
                else:
                    edge_list[edge_key] = [sender_mail, mail, 1]  # the edge does not exist yet; add it
        print('{id} is processed'.format(id=path))
    FI.save_pickle(node_list, '.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list, '.\\temp_res\\edge.pkl')
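# A sketch of how the node/edge pickles written above could be turned into a
# .gexf file with networkx (the package the wikileaks example in this file says
# it requires). The function name and graph attributes here are assumptions,
# not the original generate_data_for_gelphi implementation.
import networkx as nx

def export_gexf_sketch(node_path='.\\temp_res\\node.pkl',
                       edge_path='.\\temp_res\\edge.pkl',
                       out_path='.\\temp_res\\mail_graph.gexf'):
    node_list = FI.load_pickle(node_path)
    edge_list = FI.load_pickle(edge_path)
    g = nx.DiGraph()
    for mail, node in node_list.items():
        g.add_node(mail, send=node['send'], receive=node['receive'], cc=node['cc'])
    for sender, receiver, weight in edge_list.values():
        g.add_edge(sender, receiver, weight=weight)
    nx.write_gexf(g, out_path)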
Example #12
        count += 1
        text = [x['dealed_text']['left_content'][0] for x in data]
        date = [x['created_timestamp'] for x in data]
        cutted_text = []
        for i in range(len(text)):
            s = list(jieba.cut(text[i]))
            # iterate backwards so pop() does not shift upcoming indexes, and use
            # a separate index j so the outer loop variable i is not clobbered
            for j in reversed(range(len(s))):
                if s[j] in stop_words:
                    s.pop(j)
            wfts.Add_Sentence_With_Timestamp(s, date[i])
        t3 = time.time()
        print('{x} is completed\t{t1}\t{t2}'.format(x=count,
                                                    t1=t2 - t1,
                                                    t2=t3 - t2))
    FI.save_pickle(wfts, './static/wfts_continue.pkl')

    wfts = FI.load_pickle('./static/wfts_continue.pkl')
    # for item in word_item_list:
    #     print('{a}\t{b}'.format(a=item.word,b=item.total_freq))
    # plt.plot([math.log(x.total_freq) for x in word_item_list])
    # plt.show()
    top_asDay = wfts.top_asDay
    for item in top_asDay:
        print('------------------------------')
        print(time.strftime("%Y-%m-%d", time.localtime(item['time'])))
        print(item['obj'].topN(20))

    # wfts = FI.load_pickle('./static/wfts_1w.pkl')
    # word_item_list = wfts.word_statistic.values()
    # word_item_list = sorted(word_item_list, key=lambda x:x.total_freq,reverse=True)
def generate_node_and_edge():
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}
    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # print(info)

        # handle the From field
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        except Exception:
            continue  # skip this mail if it has no From field
        sender_node = node_list.get(sender_mail)
        if sender_node:
            sender_node['send'] += 1
        else:
            node_list[sender_mail] = dict(
                mail = sender_mail,
                send = 1,
                receive = 0,
                cc = 0,
                ori_mail = ori_mail
            )
            if info.get('name'):
                node_list[sender_mail]['name'] = info['name']

        # handle the To field
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            ori_mail = mail
            mail = mail.lower()
            node = node_list.get(mail)
            if node:
                node['receive'] += 1
            else:
                node_list[mail] = dict(
                    mail = mail,
                    send = 0 ,
                    receive = 1,
                    cc = 0,
                    ori_mail = ori_mail
                )
                if receiver.get('name'):
                    node_list[mail]['name'] = receiver['name']
            edge_key = sender_mail + '|----->' + mail
            edge = edge_list.get(edge_key)
            if edge:
                edge[2] += 1  # the edge already exists; bump its weight
            else:
                edge_list[edge_key] = [sender_mail, mail, 1]  # the edge does not exist yet; add it

        # handle the CC field
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                ori_mail = mail
                mail = mail.lower()
                node = node_list.get(mail)
                if node:
                    node['cc'] += 1
                else:
                    node_list[mail] = dict(
                        mail = mail,
                        send = 0 ,
                        receive = 0 ,
                        cc = 1,
                        ori_mail = ori_mail
                    )
                    if cc.get('name'):
                        node_list[mail]['name'] = cc['name']
                edge_key = sender_mail + '|----->' + mail
                edge = edge_list.get(edge_key)
                if edge:
                    edge[2] += 1  # the edge already exists; bump its weight
                else:
                    edge_list[edge_key] = [sender_mail, mail, 1]  # the edge does not exist yet; add it
        print('{id} is processed'.format(id=path))
    FI.save_pickle(node_list,'.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list,'.\\temp_res\\edge.pkl')
        data = [x for x in data]
        t2 = time.time()
        count += 1
        text = [x['dealed_text']['left_content'][0] for x in data]
        date = [x['created_timestamp'] for x in data]
        cutted_text = []
        for i in range(len(text)):
            s = list(jieba.cut(text[i]))
            # iterate backwards so pop() does not shift upcoming indexes, and use
            # a separate index j so the outer loop variable i is not clobbered
            for j in reversed(range(len(s))):
                if s[j] in stop_words:
                    s.pop(j)
            wfts.Add_Sentence_With_Timestamp(s, date[i])
        t3 = time.time()
        print('{x} is completed\t{t1}\t{t2}'.format(x = count,t1=t2-t1,t2=t3-t2))
    FI.save_pickle(wfts,'./static/wfts_continue.pkl')

    wfts = FI.load_pickle('./static/wfts_continue.pkl')
    # for item in word_item_list:
    #     print('{a}\t{b}'.format(a=item.word,b=item.total_freq))
    # plt.plot([math.log(x.total_freq) for x in word_item_list])
    # plt.show()
    top_asDay = wfts.top_asDay
    for item in top_asDay:
        print('------------------------------')
        print(time.strftime("%Y-%m-%d",time.localtime(item['time'])))
        print(item['obj'].topN(20))

    # wfts = FI.load_pickle('./static/wfts_1w.pkl')
    # word_item_list = wfts.word_statistic.values()
    # word_item_list = sorted(word_item_list, key=lambda x:x.total_freq,reverse=True)
Example #15
# Download email contents from wikileaks and build structured data
# ===============   ATTENTION   =============== :
# Email formats vary a lot, so roughly 5% of the mails cannot be parsed
base_url = 'https://wikileaks.org/dnc-emails/get/{page}'
base_path = '.\static\{page}.pkl'
gotten_id = os.listdir('.\static')
gotten_id = [int((x.split('.'))[0]) for x in gotten_id]
task_pool = list(range(1, 2000))  # email IDs on wikileaks
while True:
    if len(task_pool) == 0:
        break
    task_id = task_pool.pop(0)
    if task_id in gotten_id:
        print('{id} skip'.format(id=task_id))
        continue
    url = base_url.format(page=task_id)
    path = base_path.format(page=task_id)
    try:
        info = getStructedData(url)
        FI.save_pickle(info, path)
        print('{t} succeed'.format(t=task_id))
    except Exception as e:
        # task_pool.append(task_id)
        print('{t} failed <--<--<--<--'.format(t=task_id))
        print(e)
print('all files have been downloaded')

# generate the social-network data (requires the networkx package)
generate_data_for_gelphi()
print('gexf file generated, stored at: {path}\\temp_res'.format(path=os.getcwd()))