Example #1
 def Export_Model(self, model_path):
     data = dict(word_dict=self.word_dict,
                 huffman=self.huffman,
                 vec_len=self.vec_len,
                 learn_rate=self.learn_rate,
                 win_len=self.win_len,
                 model=self.model)
     FI.save_pickle(data, model_path)
Example #2
 def Export_Model(self,model_path):
     data=dict(
         word_dict = self.word_dict,
         huffman = self.huffman,
         vec_len = self.vec_len,
         learn_rate = self.learn_rate,
         win_len = self.win_len,
         model = self.model
     )
     FI.save_pickle(data,model_path)
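All of these snippets persist data through a small File_Interface module (imported as FI) whose source is not shown here. As a rough sketch, and only an assumption about its contents, save_pickle/load_pickle are presumably thin wrappers around the standard pickle module:

import pickle

def save_pickle(data, path):
    # serialize any picklable object to the given file
    with open(path, 'wb') as f:
        pickle.dump(data, f)

def load_pickle(path):
    # read the object back from disk
    with open(path, 'rb') as f:
        return pickle.load(f)

With helpers like these, Export_Model above just bundles the trained state into one dict and writes it to a single file.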
Example #3
def generate_data_for_gelphi():
    generate_node_and_edge()
    nodes = FI.load_pickle('.\\temp_res\\node.pkl')
    edges = FI.load_pickle('.\\temp_res\\edge.pkl')
    nodes = list(nodes.values())
    edges = list(edges.values())
    nodes_mail = [x['mail'] for x in nodes]
    # edges = [tuple(x) for x in edges]
    G = nx.Graph()
    G.add_nodes_from(nodes_mail)
    G.add_weighted_edges_from(edges)
    nx.write_gexf(G,'.\\temp_res\\data.gexf')
Example #4
def generate_data_for_gelphi():
    generate_node_and_edge()
    nodes = FI.load_pickle('.\\temp_res\\node.pkl')
    edges = FI.load_pickle('.\\temp_res\\edge.pkl')
    nodes = list(nodes.values())
    edges = list(edges.values())
    nodes_mail = [x['mail'] for x in nodes]
    # edges = [tuple(x) for x in edges]
    G = nx.Graph()
    G.add_nodes_from(nodes_mail)
    G.add_weighted_edges_from(edges)
    nx.write_gexf(G, '.\\temp_res\\data.gexf')
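For orientation, add_weighted_edges_from expects (u, v, weight) triples, which is why the edge records are stored as [sender, receiver, count] lists, and the resulting .gexf file can be opened directly in Gephi (the name "gelphi" is presumably a misspelling of it). A self-contained toy version with made-up nodes and edges in place of the pickled files:

import networkx as nx

# toy data standing in for the pickled node/edge dicts
nodes = ['alice@example.com', 'bob@example.com', 'carol@example.com']
edges = [('alice@example.com', 'bob@example.com', 3),    # 3 mails from alice to bob
         ('alice@example.com', 'carol@example.com', 1)]

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_weighted_edges_from(edges)     # (u, v, weight) triples
nx.write_gexf(G, 'toy.gexf')         # openable in Gephi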
Example #5
 def Load_Word_Freq(self,word_freq_path):
     # load the word frequency info
     # will generate a word dict
     if self.word_dict is not None:
         raise RuntimeError('the word dict is not empty')
     word_freq = FI.load_pickle(word_freq_path)
     self.__Gnerate_Word_Dict(word_freq)
Example #6
 def Load_Word_Freq(self, word_freq_path):
     # load the word frequency data
     # and then generate a word dict from it
     if self.word_dict is not None:
         raise RuntimeError('the word dict is not empty')
     word_freq = FI.load_pickle(word_freq_path)
     self.__Gnerate_Word_Dict(word_freq)
Example #7
 def Load_Word_Freq(self, word_freq_path):
     # load the word frequency info
     # will generate a word dict
     if self.word_dict is not None:
         raise RuntimeError('the word dict is not empty')
     word_freq = FI.load_pickle(word_freq_path)
     self.__Gnerate_Word_Dict(word_freq)
Example #8
 def Import_Model(self,model_path):
     model = FI.load_pickle(model_path)  # a dict, {'word_dict','huffman','vec_len'}
     self.word_dict = model['word_dict']
     self.huffman = model['huffman']
     self.vec_len = model['vec_len']
     self.learn_rate = model['learn_rate']
     self.win_len = model['win_len']
     self.model = model['model']
Example #9
 def Import_Model(self,model_path):
     model = FI.load_pickle(model_path)  # a dict, {'word_dict','huffman','vec_len'}
     self.word_dict = model['word_dict']
     self.huffman = model['huffman']
     self.vec_len = model['vec_len']
     self.learn_rate = model['learn_rate']
     self.win_len = model['win_len']
     self.model = model['model']
Example #10
 def Import_Model(self,model_path):    # load the word2vec model directly
     model = FI.load_pickle(model_path)  # stored as a dict, {'word_dict','huffman','vec_len'}
     self.word_dict = model['word_dict']
     self.huffman = model['huffman']
     self.vec_len = model['vec_len']
     self.learn_rate = model['learn_rate']
     self.win_len = model['win_len']
     self.model = model['model']
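Since Export_Model saves a plain dict, load_pickle hands the same dict back, so the fields have to be read with subscript access as above. A minimal round trip, independent of the Word2Vec class (the field values below are placeholders):

import pickle

state = dict(word_dict={}, huffman=None, vec_len=500,
             learn_rate=0.025, win_len=5, model='cbow')

with open('model.pkl', 'wb') as f:
    pickle.dump(state, f)

with open('model.pkl', 'rb') as f:
    restored = pickle.load(f)

print(restored['vec_len'])    # 500 -- dict keys, not attributes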
Example #11
        return e

    def __Sigmoid(self, value):
        return 1 / (1 + math.exp(-value))


if __name__ == '__main__':
    # text = FI.load_pickle('./static/demo.pkl')
    # text =[ x['dealed_text']['left_content'][0] for x in text]
    # # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    # wv = Word2Vec(vec_len=500)
    # wv.Train_Model(text)
    # FI.save_pickle(wv.word_dict,'./static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    x = FI.load_pickle('./static/normal_wv.pkl')

    def cal_simi(data, key1, key2):
        return data[key1].dot(data[key2].T)[0][0]

    keys = list(x.keys())
    for key in keys:
        print(key, '\t', cal_simi(x, '小说', key))
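Every word vector was passed through preprocessing.normalize (unit L2 norm) before being saved, so the dot product in cal_simi is exactly the cosine similarity. A small self-contained check with made-up vectors:

import numpy as np
from sklearn import preprocessing

a = preprocessing.normalize(np.array([[1.0, 2.0, 3.0]]))    # shape (1, 3), unit length
b = preprocessing.normalize(np.array([[2.0, 1.0, 0.0]]))

cos = a.dot(b.T)[0][0]    # same expression as cal_simi
ref = np.dot(a[0], b[0]) / (np.linalg.norm(a[0]) * np.linalg.norm(b[0]))
print(abs(cos - ref) < 1e-9)    # True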
Example #12
        else:
            return temp[:high]


if __name__ == '__main__':
    # text = FI.load_pickle('./static/demo.pkl')
    # text =[ x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    # input text
    data = [
        "小说”一词最早出现于《庄子·外物》:「饰小说以干县令,其于大达亦远矣。」庄子所谓的「小说」,是指琐碎的言论,与今日小说观念相差甚远。直至东汉桓谭《新论》:「小说家合残丛小语,近取譬喻,以作短书,治身理家,有可观之辞。」班固《汉书.艺文志》将「小说家」列为十家之后,其下的定义为:「小说家者流,盖出于稗官,街谈巷语,道听途说[4]之所造也。」才稍与今日小说的意义相近。而中国小说最大的特色,便自宋代开始具有文言小说与白话小说两种不同的小说系统。文言小说起源于先秦的街谈巷语,是一种小知小道的纪录。在历经魏晋南北朝及隋唐长期的发展,无论是题材或人物的描写,文言小说都有明显的进步,形成笔记与传奇两种小说类型。而白话小说则起源于唐宋时期说话人的话本,故事的取材来自民间,主要表现了百姓的生活及思想意识。但不管文言小说或白话小说都源远流长,呈现各自不同的艺术特色。"
    ]
    wv = Word2Vec(vec_len=500)  # vector length 500
    wv.Train_Model(data)

    FI.save_pickle(wv.word_dict, './static/wv.pkl')

    # normalize after saving
    data = FI.load_pickle('./static/wv.pkl')
    x = {}
    for key in data:
        temp = data[key]['vector']
        temp = preprocessing.normalize(temp)
        x[key] = temp
    FI.save_pickle(x, './static/normal_wv.pkl')

    x = FI.load_pickle('./static/normal_wv.pkl')

    # compute the similarity between two words
    def cal_simi(data, key1, key2):
        return data[key1].dot(data[key2].T)[0][0]
Example #13
# download email contents from wikileaks and build structured data
# ===============   ATTENTION   =============== :
# the mail formats vary widely, so roughly 5% of them cannot be parsed
base_url = 'https://wikileaks.org/dnc-emails/get/{page}'
base_path = '.\static\{page}.pkl'
gotten_id = os.listdir('.\static')
gotten_id = [int((x.split('.'))[0]) for x in gotten_id]
task_pool = list(range(1,2000))  # mail ids on wikileaks
while True :
    if task_pool.__len__()==0:
        break
    task_id = task_pool.pop(0)
    if task_id in gotten_id:
        print('{id} skip'.format(id=task_id))
        continue
    url = base_url.format(page = task_id)
    path = base_path.format(page = task_id)
    try:
        info = getStructedData(url)
        FI.save_pickle(info,path)
        print('{t} succeed'.format(t=task_id))
    except Exception as e:
        # task_pool.append(task_id)
        print('{t} failed <--<--<--<--'.format(t=task_id))
        print(e)
print('all files downloaded')

# build the social network data (requires the networkx package)
generate_data_for_gelphi()
print('gexf file generated, saved under: {path}\\temp_res'.format(path=os.getcwd()))
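The getStructedData parser itself is not shown; the resume logic around it is independent of it, though: any id whose pickle is already present in ./static is skipped, so the script can simply be re-run after a crash. A stripped-down sketch of the same pattern, with a hypothetical fetch() standing in for getStructedData and assuming the ./static directory exists:

import os
import pickle

def fetch(task_id):
    # hypothetical stand-in for getStructedData(url); returns structured mail data
    return {'id': task_id}

static_dir = './static'
done = {int(name.split('.')[0]) for name in os.listdir(static_dir) if name.endswith('.pkl')}
for task_id in range(1, 2000):
    if task_id in done:
        continue                              # fetched on an earlier run
    try:
        info = fetch(task_id)
        with open('{}/{}.pkl'.format(static_dir, task_id), 'wb') as f:
            pickle.dump(info, f)
    except Exception as err:
        print('{} failed: {}'.format(task_id, err))    # leave it for the next run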
Example #14
        if ret=='dict':
            ret_data = {}
            for ele,count in temp[:high]:
                ret_data[ele]=count
            return ret_data
        else:
            return temp[:high]

if __name__ == '__main__':
    """
    stop_words = open(u"./static/中文停用词表(比较全面,有1208个停用词).txt")
    list1 = []
    for line in stop_words:
        list1.append(line.strip())
    print len(list1)
    pickle.dump(list1,open("./static/stop_words.pkl","wb"),protocol = 2)
    """
    text = FI.load_pickle("./static/stop_words.pkl") #list
    for x in text:
        print(x.decode("gbk"))
        break

    data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wc = WordCounter(data)
    print(wc.count_res.larger_than(16))
    """
    c=MulCounter('abcdeabcdaffbcabag')
    print(sorted(c.items(),key=_itemgetter(1),reverse=True))#operator.itemgetter(1)
    print(c.larger_than(3))
    """
    pass
Example #15
__author__ = 'multiangle'

import File_Interface as FI
import json
import os
import time

paths = os.listdir('.\static')
date_counter = {}
for path in paths:
    data = FI.load_pickle('.\static\{x}'.format(x=path))
    date_ori = data.get('Date')
    if date_ori:
        tag = time.strftime('%Y/%m/%d', date_ori)
        c = date_counter.get(tag)
        print(tag)
        if c:
            date_counter[tag]['count'] += 1
        else:
            date_counter[tag] = dict(date=tag, count=1)
# date_counter = list(date_counter.values())
# date_counter = sorted(date_counter,key=lambda x:x['date'])
# for date in date_counter:
#     print('{a}\t{b}'.format(a=date['date'],b=date['count']))
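The per-day tally above is plain counting, so collections.Counter expresses it more compactly; a sketch under the assumption that each pickled mail stores its Date as a time.struct_time:

import time
from collections import Counter

# toy records standing in for the pickled mails
records = [{'Date': time.strptime('2016-07-22', '%Y-%m-%d')},
           {'Date': time.strptime('2016-07-22', '%Y-%m-%d')},
           {'Date': None}]

counter = Counter(time.strftime('%Y/%m/%d', r['Date'])
                  for r in records if r.get('Date'))
for day, count in sorted(counter.items()):
    print('{}\t{}'.format(day, count))     # 2016/07/22    2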
Example #16
    def __GoAlong_Huffman(self, word_huffman, input_vector, root):
        node = root
        e = np.zeros([1, self.vec_len])
        for level in range(word_huffman.__len__()):
            huffman_charat = word_huffman[level]
            q = self.__Sigmoid(input_vector.dot(node.value.T))
            grad = self.learn_rate * (1 - int(huffman_charat) - q)
            e += grad * node.value
            node.value += grad * input_vector
            node.value = preprocessing.normalize(node.value)
            if huffman_charat == '0':
                node = node.right
            else:
                node = node.left
        return e

    def __Sigmoid(self, value):
        return 1 / (1 + math.exp(-value))


if __name__ == '__main__':
    text = FI.load_pickle('./static/stop_words.pkl')
    text = [x for x in text]
    data = [
        'Merge multiple sorted inputs into a single sorted output',
        'The API below differs from textbook heap algorithms in two aspects'
    ]
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')
    pass
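Each level of the Huffman walk in __GoAlong_Huffman is a single logistic-regression update on the inner node's weight vector. A self-contained sketch of one such step with toy numpy arrays (vector length and learning rate are placeholders):

import numpy as np
from sklearn import preprocessing

vec_len, learn_rate = 4, 0.025
input_vector = preprocessing.normalize(np.random.rand(1, vec_len))
node_value = preprocessing.normalize(np.random.rand(1, vec_len))   # inner-node weights
huffman_char = '0'                                  # branch taken by the word at this node

q = 1 / (1 + np.exp(-input_vector.dot(node_value.T)))   # sigmoid, shape (1, 1)
grad = learn_rate * (1 - int(huffman_char) - q)          # gradient factor
e = grad * node_value                                    # accumulated error for the word vector
node_value = preprocessing.normalize(node_value + grad * input_vector)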
Example #17
#                 pre_node = SNA_node.build_Head_Node(item.data['retweeted_status'])
#                 pre_node.add_Next(item)
#                 info_dict[pre_node.id] = pre_node
#                 info_id_list.append(pre_node.id)
#                 head_id_list.append(pre_node.id)
#                 head_id_list.remove(id)
#     print(ite_times)
#     ite_times += 1
#     print(head_id_list.__len__())
# FI.save_pickle(info_dict,'./static/dealed_info_list.pkl')

# # statistics
# data = FI.load_pickle('./static/dealed_info_list.pkl')
# id_list = list(data.keys())
# for id in id_list:
#     data[id].analyse()
# FI.save_pickle(data,'./static/analyse_info_list.pkl')

data = FI.load_pickle('./static/analyse_info_list.pkl')
id_list = list(data.keys())
for id in id_list:
    if data[id].result['retweet_num']>1 :
        print(data[id])







Example #18
            break
        if list[mid]['time'] < value :
            low = mid + 1
        else:
            high = mid - 1
    if i<0:
        i = low
    if i==list.__len__() :
        list.append({'time':value,'freq':1})
    elif list[i]['time']==value :
        list[i]['freq'] += 1
    else:
        list.insert(i,{'time':value,'freq':1})

if __name__=='__main__':
    stop_words = FI.load_pickle('./static/stop_words.pkl')
    wfts = FI.load_pickle('./static/wfts_continue.pkl')
    client = MongoClient('localhost',27017)
    db = client['microblog_spider']
    latest_history = db.latest_history
    count = 0
    data = []
    batch_size = 100
    gone_size = 40000

    while count<100 :
        t1 = time.time()
        data = latest_history.find().skip(gone_size+count*batch_size).limit(batch_size)
        data = [x for x in data]
        t2 = time.time()
        count += 1
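The truncated loop above is a hand-rolled binary search that keeps a list of {'time': ..., 'freq': ...} records sorted by time; the standard bisect module does the same search. A sketch assuming the same record layout:

import bisect

def add_time(records, value):
    # records stays sorted by 'time'; bump the count or insert a new entry
    times = [r['time'] for r in records]
    i = bisect.bisect_left(times, value)
    if i < len(records) and records[i]['time'] == value:
        records[i]['freq'] += 1
    else:
        records.insert(i, {'time': value, 'freq': 1})

records = []
for t in [3, 1, 3, 2]:
    add_time(records, t)
print(records)   # [{'time': 1, 'freq': 1}, {'time': 2, 'freq': 1}, {'time': 3, 'freq': 2}]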
Example #19
def generate_node_and_edge():
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}
    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # print(info)

        # handle the From field
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        except:
            continue  # discard the mail if it has no From field
        sender_node = node_list.get(sender_mail)
        if sender_node:
            sender_node['send'] += 1
        else:
            node_list[sender_mail] = dict(
                mail = sender_mail,
                send = 1,
                receive = 0,
                cc = 0,
                ori_mail = ori_mail
            )
            if info.get('name'):
                node_list[sender_mail]['name'] = info['name']

        # handle the To field
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            ori_mail = mail
            mail = mail.lower()
            node = node_list.get(mail)
            if node:
                node['receive'] += 1
            else:
                node_list[mail] = dict(
                    mail = mail,
                    send = 0 ,
                    receive = 1,
                    cc = 0,
                    ori_mail = ori_mail
                )
                if receiver.get('name'):
                    node_list[mail]['name'] = receiver['name']
            edge_key = sender_mail + '|----->' + mail
            edge = edge_list.get(edge_key)
            if edge:
                edge[2] += 1  # the edge already exists; increase its weight
            else:
                edge_list[edge_key] = [sender_mail,mail,1]  # otherwise create a new edge

        # handle the CC field
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                ori_mail = mail
                mail = mail.lower()
                node = node_list.get(mail)
                if node:
                    node['cc'] += 1
                else:
                    node_list[mail] = dict(
                        mail = mail,
                        send = 0 ,
                        receive = 0 ,
                        cc = 1,
                        ori_mail = ori_mail
                    )
                    if cc.get('name'):
                        node_list[mail]['name'] = cc['name']
                edge_key = sender_mail + '|----->' + mail
                edge = edge_list.get(edge_key)
                if edge:
                    edge[2] += 1  # the edge already exists; increase its weight
                else:
                    edge_list[edge_key] = [sender_mail,mail,1]  # otherwise create a new edge
        print('{id} processed'.format(id=path))
    FI.save_pickle(node_list,'.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list,'.\\temp_res\\edge.pkl')
Example #20
__author__ = 'multiangle'

import jieba
import File_Interface as FI

data = FI.load_pickle('demo.pkl')
user_list = [x['user_name'] for x in data]
text_list = [x['dealed_text']['left_content'] for x in data]

for line in text_list:
    print(line)
    res = jieba.cut(line[0], cut_all=False)
    # print(list(seg_list))
    res = list(res)
    print(res)
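jieba.cut returns a generator, which is why the loop wraps it in list() before printing; jieba.lcut returns the list directly. A minimal check:

import jieba

text = '今天天气不错'
print(list(jieba.cut(text, cut_all=False)))   # precise mode, materialized with list()
print(jieba.lcut(text))                       # same segmentation, already a list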
Example #21
        if list[mid]['time'] < value:
            low = mid + 1
        else:
            high = mid - 1
    if i < 0:
        i = low
    if i == list.__len__():
        list.append({'time': value, 'freq': 1})
    elif list[i]['time'] == value:
        list[i]['freq'] += 1
    else:
        list.insert(i, {'time': value, 'freq': 1})


if __name__ == '__main__':
    stop_words = FI.load_pickle('./static/stop_words.pkl')
    wfts = FI.load_pickle('./static/wfts_continue.pkl')
    client = MongoClient('localhost', 27017)
    db = client['microblog_spider']
    latest_history = db.latest_history
    count = 0
    data = []
    batch_size = 100
    gone_size = 40000

    while count < 100:
        t1 = time.time()
        data = latest_history.find().skip(gone_size +
                                          count * batch_size).limit(batch_size)
        data = [x for x in data]
        t2 = time.time()
Example #22
                high = mid
        if temp[low][1]>maxvalue:
            if ret=='dict':
                return {}
            else:
                return []
        if ret=='dict':
            ret_data = {}
            for ele,count in temp[:high]:
                ret_data[ele]=count
            return ret_data
        else:
            return temp[:high]

if __name__ == '__main__':
    text = FI.load_pickle('./static/demo.pkl')
    text =[ x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict,'./static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    # x = FI.load_pickle('./static/normal_wv.pkl')
Example #23
                if sort_type == 'up':
                    res = [
                        pop_id(x)
                        for x in collection.find(select, f).limit(limit).sort(
                            sort, pymongo.ASCENDING)
                    ]
                else:
                    res = [
                        pop_id(x)
                        for x in collection.find(select, f).limit(limit).sort(
                            sort, pymongo.DESCENDING)
                    ]
    return res


def pop_id(data):
    data.pop('_id')
    return data


res = read_content_in_mongo(
    'latest_history', {'user_id': '1681029540'},
    ['dealed_text.left_content', 'created_at', 'user_name'], -1, 'id', 'down')

data = FI.load_pickle('demo.pkl')
data = data + res
FI.save_pickle(data, 'demo.pkl')
for line in res:
    print(line)
print(res.__len__())
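The branches above only vary the projection dict, the limit and the sort direction; the pymongo call chain itself is the same. A compact sketch of that query shape, assuming a local MongoDB instance and reusing the collection and field names from the call above:

import pymongo
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
collection = client['microblog_spider']['latest_history']

projection = {'dealed_text.left_content': 1, 'created_at': 1, 'user_name': 1}
cursor = (collection.find({'user_id': '1681029540'}, projection)
          .sort('id', pymongo.DESCENDING)
          .limit(10))
for doc in cursor:
    doc.pop('_id', None)     # same role as pop_id() above
    print(doc)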
Example #24
        if temp[low][1] > maxvalue:
            if ret == 'dict':
                return {}
            else:
                return []
        if ret == 'dict':
            ret_data = {}
            for ele, count in temp[:high]:
                ret_data[ele] = count
            return ret_data
        else:
            return temp[:high]


if __name__ == '__main__':
    text = FI.load_pickle('./static/demo.pkl')
    text = [x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    # x = FI.load_pickle('./static/normal_wv.pkl')
Example #25
        for level in range(word_huffman.__len__()):
            huffman_charat = word_huffman[level]
            q = self.__Sigmoid(input_vector.dot(node.value.T))
            grad = self.learn_rate * (1 - int(huffman_charat) - q)
            e += grad * node.value
            node.value += grad * input_vector
            node.value = preprocessing.normalize(node.value)
            if huffman_charat == '0':
                node = node.right
            else:
                node = node.left
        return e

    def __Sigmoid(self, value):
        return 1 / (1 + math.exp(-value))


if __name__ == '__main__':
    import WordCount

    # text = WordCount.readfile('./static/text8_mini')
    # text = WordCount.readfile('./static/text8')
    # mv = Word2Vec(vec_len=500)
    # mv.Train_Model(text)
    # FI.save_pickle(mv.word_dict, './model/model_h.pkl')

    model_h = FI.load_pickle('./model/model_h.pkl')
    keys = list(model_h.keys())
    print(keys.__len__())
    print(keys[0])
Example #26
 def Get_Stop_Words(self):
     ret = []
     ret = FI.load_pickle('./static/stop_words.pkl')
     return ret
Example #27
def generate_node_and_edge():
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}
    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # print(info)

        # handle the From field
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        except:
            continue  # discard the mail if it has no From field
        sender_node = node_list.get(sender_mail)
        if sender_node:
            sender_node['send'] += 1
        else:
            node_list[sender_mail] = dict(mail=sender_mail,
                                          send=1,
                                          receive=0,
                                          cc=0,
                                          ori_mail=ori_mail)
            if info.get('name'):
                node_list[sender_mail]['name'] = info['name']

        # handle the To field
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            ori_mail = mail
            mail = mail.lower()
            node = node_list.get(mail)
            if node:
                node['receive'] += 1
            else:
                node_list[mail] = dict(mail=mail,
                                       send=0,
                                       receive=1,
                                       cc=0,
                                       ori_mail=ori_mail)
                if receiver.get('name'):
                    node_list[mail]['name'] = receiver['name']
            edge_key = sender_mail + '|----->' + mail
            edge = edge_list.get(edge_key)
            if edge:
                edge[2] += 1  # the edge already exists; increase its weight
            else:
                edge_list[edge_key] = [sender_mail, mail, 1]  # otherwise create a new edge

        # handle the CC field
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                ori_mail = mail
                mail = mail.lower()
                node = node_list.get(mail)
                if node:
                    node['cc'] += 1
                else:
                    node_list[mail] = dict(mail=mail,
                                           send=0,
                                           receive=0,
                                           cc=1,
                                           ori_mail=ori_mail)
                    if cc.get('name'):
                        node_list[mail]['name'] = cc['name']
                edge_key = sender_mail + '|----->' + mail
                edge = edge_list.get(edge_key)
                if edge:
                    edge[2] += 1  # the edge already exists; increase its weight
                else:
                    edge_list[edge_key] = [sender_mail, mail,
                                           1]  # otherwise create a new edge
        print('{id} processed'.format(id=path))
    FI.save_pickle(node_list, '.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list, '.\\temp_res\\edge.pkl')
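The node/edge bookkeeping above is essentially counting, so collections.defaultdict can keep the same structure with less branching; a sketch over toy parsed mails, using the same field names as the examples (the addresses are made up):

from collections import defaultdict

mails = [
    {'From': {'mail': 'Alice@example.com'}, 'To': [{'mail': 'bob@example.com'}]},
    {'From': {'mail': 'alice@example.com'}, 'To': [{'mail': 'bob@example.com'}]},
]

nodes = defaultdict(lambda: {'send': 0, 'receive': 0, 'cc': 0})
edges = defaultdict(int)

for info in mails:
    sender = info.get('From', {}).get('mail', '').lower()
    if not sender:
        continue                           # skip mails without a From field
    nodes[sender]['send'] += 1
    for receiver in info.get('To') or []:
        mail = receiver['mail'].lower()
        nodes[mail]['receive'] += 1
        edges[(sender, mail)] += 1         # weight = number of mails on this edge

print(dict(edges))   # {('alice@example.com', 'bob@example.com'): 2}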
Example #28
            else:
                node = node.left
        return e

    def __Sigmoid(self,value):
        return 1/(1+math.exp(-value))

if __name__ == '__main__':
    # text = FI.load_pickle('./static/demo.pkl')
    # text =[ x['dealed_text']['left_content'][0] for x in text]
    # # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    # wv = Word2Vec(vec_len=500)
    # wv.Train_Model(text)
    # FI.save_pickle(wv.word_dict,'./static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    x = FI.load_pickle('./static/normal_wv.pkl')
    def cal_simi(data,key1,key2):
        return data[key1].dot(data[key2].T)[0][0]
    keys=list(x.keys())
    for key in keys:
        print(key,'\t',cal_simi(x,'姚明',key))

Example #29
__author__ = 'multiangle'

import jieba
import File_Interface as FI

data = FI.load_pickle('demo.pkl')
user_list = [x['user_name'] for x in data]
text_list = [x['dealed_text']['left_content'] for x in data]

for line in text_list:
    print(line)
    res = jieba.cut(line[0],cut_all=False)
    # print(list(seg_list))
    res = list(res)
    print(res)


Example #30
#         item = info_dict[id]
#         if item.hasPre:
#             changed = True
#             if item.data['retweeted_status']['id'] in info_id_list:
#                 info_dict[item.data['retweeted_status']['id']].add_Next(item)
#                 head_id_list.remove(id)
#             else:
#                 pre_node = SNA_node.build_Head_Node(item.data['retweeted_status'])
#                 pre_node.add_Next(item)
#                 info_dict[pre_node.id] = pre_node
#                 info_id_list.append(pre_node.id)
#                 head_id_list.append(pre_node.id)
#                 head_id_list.remove(id)
#     print(ite_times)
#     ite_times += 1
#     print(head_id_list.__len__())
# FI.save_pickle(info_dict,'./static/dealed_info_list.pkl')

# # statistics
# data = FI.load_pickle('./static/dealed_info_list.pkl')
# id_list = list(data.keys())
# for id in id_list:
#     data[id].analyse()
# FI.save_pickle(data,'./static/analyse_info_list.pkl')

data = FI.load_pickle('./static/analyse_info_list.pkl')
id_list = list(data.keys())
for id in id_list:
    if data[id].result['retweet_num'] > 1:
        print(data[id])
Example #31
    def post(self):

        try:
            user_basic_info = self.get_argument('user_basic_info')
            attends = self.get_argument('user_attends')
            user_basic_info = eval(user_basic_info)
            attends = eval(attends)
            self.write('success to return user info')
            self.finish()
        except:
            self.write('fail to return user info')
            self.finish()
            return

        try:
            dbi = MySQL_Interface()
        except:
            print('unable to connect to MySql DB')

        try:
            if attends.__len__() > 0:  #store attends info
                table_name = 'cache_attends'
                attends_col_info = dbi.get_col_name(table_name)
                keys = attends[0].keys()
                attends = [[
                    line[i] if i in keys else '' for i in attends_col_info
                ] for line in attends]
                fans_col_pos = attends_col_info.index('fans_num')
                insert_attends = []
                for line in attends:
                    if line[fans_col_pos] > 1000:
                        insert_attends.append(line)
                dbi.insert_asList(table_name, insert_attends, unique=True)
                print('Success : attends of {uid} is stored in {tname}'.format(
                    uid=user_basic_info['uid'], tname=table_name))
            else:
                pass
        except Exception as e:
            print(e)
            path = "temp" + os.sep + "{uid}_attends.pkl".format(
                uid=user_basic_info['uid'])
            print(
                'unable to store attends of {uid}, it will be stored '.format(
                    uid=user_basic_info['uid']))
            FI.save_pickle(attends, path)

        try:
            atten_num_real = user_basic_info['attends_num']
            atten_num_get = attends.__len__()
            user_basic_info['accuracy'] = atten_num_get  # the number of attends actually fetched
            col_info = dbi.get_col_name(
                'cache_user_info')  # store user basic info
            keys = user_basic_info.keys()
            data = [user_basic_info[i] if i in keys else '' for i in col_info]
            dbi.insert_asList('cache_user_info', [data], unique=True)
            print('Success : basic info of {uid} is stored in cache_user_info'.
                  format(uid=user_basic_info['uid']))
        except Exception as e:
            print(e)
            path = 'temp' + os.sep + '{uid}_basic_info.pkl'.format(
                uid=user_basic_info['uid'])
            print('unable to store basic info of {uid} , it will be stored'.
                  format(uid=user_basic_info['uid']))
            FI.save_pickle(user_basic_info, path)

        try:
            if attends.__len__() > 0:  # store atten connection web
                from_uid = user_basic_info['uid']
                from_fans_num = user_basic_info['fans_num']
                from_blog_num = user_basic_info['blog_num']
                data = [[
                    from_uid, from_fans_num, from_blog_num,
                    str(x[attends_col_info.index('uid')]),
                    str(x[attends_col_info.index('fans_num')]),
                    str(x[attends_col_info.index('blog_num')])
                ] for x in attends]
                dbi.insert_asList('cache_atten_web', data)
                print(
                    'Success : conn web of {uid} is stored in cache_atten_web'.
                    format(uid=user_basic_info['uid']))
            else:
                pass
        except Exception as e:
            print(e)
            path = '{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store atten web of {uid} , it will be stored'.
                  format(uid=user_basic_info['uid']))
            FI.save_pickle(data, path)
Example #32
    def post(self):

        try:
            user_basic_info=self.get_argument('user_basic_info')
            attends=self.get_argument('user_attends')
            user_basic_info=eval(user_basic_info)
            attends=eval(attends)
            self.write('success to return user info')
            self.finish()
        except:
            self.write('fail to return user info')
            self.finish()
            return

        try:
            dbi=MySQL_Interface()
        except:
            print('unable to connect to MySql DB')

        try:
            if attends.__len__()>0:           #store attends info
                table_name='cache_attends'
                attends_col_info=dbi.get_col_name(table_name)
                keys=attends[0].keys()
                attends= [[line[i] if i in keys else '' for i in attends_col_info] for line in attends]
                fans_col_pos=attends_col_info.index('fans_num')
                insert_attends=[]
                for line in attends:
                    if line[fans_col_pos]>1000:
                        insert_attends.append(line)
                dbi.insert_asList(table_name,insert_attends,unique=True)
                print('Success : attends of {uid} is stored in {tname}'
                      .format(uid=user_basic_info['uid'],tname=table_name))
            else:
                pass
        except Exception as e:
            print(e)
            path="temp\\{uid}_attends.pkl".format(uid=user_basic_info['uid'])
            print('unable to store attends of {uid}, it will be stored '
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(attends,path)

        try:
            atten_num_real=user_basic_info['attends_num']
            atten_num_get=attends.__len__()
            user_basic_info['accuracy']=atten_num_get       # the number of attends actually fetched
            col_info=dbi.get_col_name('cache_user_info')    # store user basic info
            keys=user_basic_info.keys()
            data=[user_basic_info[i] if i in keys else '' for i in col_info]
            dbi.insert_asList('cache_user_info',[data],unique=True)
            print('Success : basic info of {uid} is stored in cache_user_info'
                  .format(uid=user_basic_info['uid']))
        except Exception as e:
            print(e)
            path='temp\\{uid}_basic_info.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store basic info of {uid} , it will be stored'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(user_basic_info,path)

        try:
            if attends.__len__()>0:            # store atten connection web
                from_uid=user_basic_info['uid']
                from_fans_num=user_basic_info['fans_num']
                from_blog_num=user_basic_info['blog_num']
                data=[[from_uid, from_fans_num, from_blog_num,
                       str(x[attends_col_info.index('uid')]),
                       str(x[attends_col_info.index('fans_num')]),
                       str(x[attends_col_info.index('blog_num')])] for x in attends]
                dbi.insert_asList('cache_atten_web',data)
                print('Success : conn web of {uid} is stored in cache_atten_web'
                      .format(uid=user_basic_info['uid']))
            else:
                pass
        except Exception as e:
            print(e)
            path='{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store atten web of {uid} , it will be stored'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(data,path)
Example #33
                    res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.ASCENDING)]
                else:
                    res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.DESCENDING)]
        else:
            f={}
            for item in field:
                f[item]=1

            if sort=='':
                res=[pop_id(x) for x in collection.find(select,f).limit(limit)]
            else:
                if sort_type=='up':
                    res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.ASCENDING)]
                else:
                    res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.DESCENDING)]
    return res

def pop_id(data):
    data.pop('_id')
    return data

res=read_content_in_mongo('latest_history', {'user_id': '1681029540'},
                          ['dealed_text.left_content', 'created_at', 'user_name'],
                          -1, 'id', 'down')

data = FI.load_pickle('demo.pkl')
data = data + res
FI.save_pickle(data,'demo.pkl')
for line in res:
    print(line)
print(res.__len__())

Example #34
 def Get_Stop_Words(self):
     ret = []
     ret = FI.load_pickle('./static/stop_words.pkl')
     return ret
Example #35
# download email contents from wikileaks and build structured data
# ===============   ATTENTION   =============== :
# the mail formats vary widely, so roughly 5% of them cannot be parsed
base_url = 'https://wikileaks.org/dnc-emails/get/{page}'
base_path = '.\static\{page}.pkl'
gotten_id = os.listdir('.\static')
gotten_id = [int((x.split('.'))[0]) for x in gotten_id]
task_pool = list(range(1, 2000))  # mail ids on wikileaks
while True:
    if task_pool.__len__() == 0:
        break
    task_id = task_pool.pop(0)
    if task_id in gotten_id:
        print('{id} skip'.format(id=task_id))
        continue
    url = base_url.format(page=task_id)
    path = base_path.format(page=task_id)
    try:
        info = getStructedData(url)
        FI.save_pickle(info, path)
        print('{t} succeed'.format(t=task_id))
    except Exception as e:
        # task_pool.append(task_id)
        print('{t} failed <--<--<--<--'.format(t=task_id))
        print(e)
print('all files downloaded')

# build the social network data (requires the networkx package)
generate_data_for_gelphi()
print('gexf file generated, saved under: {path}\\temp_res'.format(path=os.getcwd()))