def generate_data_for_gelphi():
    generate_node_and_edge()
    nodes = FI.load_pickle('.\\temp_res\\node.pkl')
    edges = FI.load_pickle('.\\temp_res\\edge.pkl')
    nodes = list(nodes.values())
    edges = list(edges.values())
    nodes_mail = [x['mail'] for x in nodes]
    # edges = [tuple(x) for x in edges]
    G = nx.Graph()
    G.add_nodes_from(nodes_mail)
    G.add_weighted_edges_from(edges)
    nx.write_gexf(G,'.\\temp_res\\data.gexf')
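
# A minimal standalone sketch of the same GEXF export with hand-built data
# (the addresses and the weight below are illustrative): networkx unpacks each
# [sender, receiver, count] entry as a (u, v, weight) triple.
import networkx as nx

demo = nx.Graph()
demo.add_nodes_from(['a@example.com', 'b@example.com'])
demo.add_weighted_edges_from([('a@example.com', 'b@example.com', 3)])
nx.write_gexf(demo, 'demo.gexf')  # the resulting GEXF file opens directly in Gephi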
 def Load_Word_Freq(self, word_freq_path):
     # load the word frequency info
     # this will generate the word dict
     if self.word_dict is not None:
         raise RuntimeError('the word dict is not empty')
     word_freq = FI.load_pickle(word_freq_path)
     self.__Gnerate_Word_Dict(word_freq)
Example #5
 def Load_Word_Freq(self, word_freq_path):
     # load the word-frequency data
     # the word dict is generated after loading
     if self.word_dict is not None:
         raise RuntimeError('the word dict is not empty')
     word_freq = FI.load_pickle(word_freq_path)
     self.__Gnerate_Word_Dict(word_freq)
Example #6
 def Import_Model(self, model_path):
     model = FI.load_pickle(model_path)  # a dict: {'word_dict','huffman','vec_len', ...}
     self.word_dict = model['word_dict']
     self.huffman = model['huffman']
     self.vec_len = model['vec_len']
     self.learn_rate = model['learn_rate']
     self.win_len = model['win_len']
     self.model = model['model']
Example #8
 def Import_Model(self, model_path):    # load a saved word2vec model directly
     model = FI.load_pickle(model_path)  # stored as a dict: {'word_dict','huffman','vec_len', ...}
     self.word_dict = model['word_dict']
     self.huffman = model['huffman']
     self.vec_len = model['vec_len']
     self.learn_rate = model['learn_rate']
     self.win_len = model['win_len']
     self.model = model['model']
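
 # A hypothetical counterpart for saving (no Export_Model appears in the source);
 # it would simply pickle the same dict layout that Import_Model reads back:
 def Export_Model(self, model_path):
     model = {
         'word_dict': self.word_dict,
         'huffman': self.huffman,
         'vec_len': self.vec_len,
         'learn_rate': self.learn_rate,
         'win_len': self.win_len,
         'model': self.model,
     }
     FI.save_pickle(model, model_path)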
Example #9
            grad = self.learn_rate * (1 - int(huffman_charat) - q)
            e += grad * node.value
            node.value += grad * input_vector
            node.value = preprocessing.normalize(node.value)
            if huffman_charat == '0':
                node = node.right
            else:
                node = node.left
        return e

    def __Sigmoid(self, value):
        return 1 / (1 + math.exp(-value))


if __name__ == '__main__':
    text = FI.load_pickle('./static/demo.pkl')
    text = [x['dealed_text']['left_content'][0] for x in text]
    #data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')

    data = FI.load_pickle('./static/wv.pkl')
    x = {}
    for key in data:
        temp = data[key]['vector']
        temp = preprocessing.normalize(temp)
        x[key] = temp
    FI.save_pickle(x, './static/normal_wv.pkl')

    # x = FI.load_pickle('./static/normal_wv.pkl')
    def __GoAlong_Huffman(self, word_huffman, input_vector, root):
        node = root
        e = np.zeros([1, self.vec_len])
        for level in range(word_huffman.__len__()):
            huffman_charat = word_huffman[level]
            q = self.__Sigmoid(input_vector.dot(node.value.T))
            grad = self.learn_rate * (1 - int(huffman_charat) - q)
            e += grad * node.value
            node.value += grad * input_vector
            node.value = preprocessing.normalize(node.value)
            if huffman_charat == '0':
                node = node.right
            else:
                node = node.left
        return e

    def __Sigmoid(self, value):
        return 1 / (1 + math.exp(-value))
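
    # Note: math.exp(-value) overflows for large negative inputs. A minimal,
    # numerically safer variant is sketched below (illustrative, not part of
    # the original class):
    def __Safe_Sigmoid(self, value):
        if value >= 0:
            return 1 / (1 + math.exp(-value))
        z = math.exp(value)  # value < 0, so exp(value) cannot overflow
        return z / (1 + z)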


if __name__ == '__main__':
    text = FI.load_pickle('./static/stop_words.pkl')
    text = [x for x in text]
    data = [
        'Merge multiple sorted inputs into a single sorted output',
        'The API below differs from textbook heap algorithms in two aspects'
    ]
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')
    pass
                    res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.ASCENDING)]
                else:
                    res=[pop_id(x) for x in collection.find(select).limit(limit).sort(sort,pymongo.DESCENDING)]
        else:
            f={}
            for item in field:
                f[item]=1

            if sort=='':
                res=[pop_id(x) for x in collection.find(select,f).limit(limit)]
            else:
                if sort_type=='up':
                    res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.ASCENDING)]
                else:
                    res=[pop_id(x) for x in collection.find(select,f).limit(limit).sort(sort,pymongo.DESCENDING)]
    return res

def pop_id(data):
    data.pop('_id')
    return data

res=read_content_in_mongo('latest_history',{'user_id':'1681029540'},['dealed_text.left_content','created_at','user_name'],-1,'id','down')

data = FI.load_pickle('demo.pkl')
data = data + res
FI.save_pickle(data,'demo.pkl')
for line in res:
    print(line)
print(res.__len__())

Example #12
        for level in range(word_huffman.__len__()):
            huffman_charat = word_huffman[level]
            q = self.__Sigmoid(input_vector.dot(node.value.T))
            grad = self.learn_rate * (1 - int(huffman_charat) - q)
            e += grad * node.value
            node.value += grad * input_vector
            node.value = preprocessing.normalize(node.value)
            if huffman_charat == '0':
                node = node.right
            else:
                node = node.left
        return e

    def __Sigmoid(self, value):
        return 1 / (1 + math.exp(-value))


if __name__ == '__main__':
    import WordCount

    # text = WordCount.readfile('./static/text8_mini')
    # text = WordCount.readfile('./static/text8')
    # mv = Word2Vec(vec_len=500)
    # mv.Train_Model(text)
    # FI.save_pickle(mv.word_dict, './model/model_h.pkl')

    model_h = FI.load_pickle('./model/model_h.pkl')
    keys = list(model_h.keys())
    print(keys.__len__())
    print(keys[0])
Example #13
#                 pre_node = SNA_node.build_Head_Node(item.data['retweeted_status'])
#                 pre_node.add_Next(item)
#                 info_dict[pre_node.id] = pre_node
#                 info_id_list.append(pre_node.id)
#                 head_id_list.append(pre_node.id)
#                 head_id_list.remove(id)
#     print(ite_times)
#     ite_times += 1
#     print(head_id_list.__len__())
# FI.save_pickle(info_dict,'./static/dealed_info_list.pkl')

# # statistics
# data = FI.load_pickle('./static/dealed_info_list.pkl')
# id_list = list(data.keys())
# for id in id_list:
#     data[id].analyse()
# FI.save_pickle(data,'./static/analyse_info_list.pkl')

data = FI.load_pickle('./static/analyse_info_list.pkl')
id_list = list(data.keys())
for id in id_list:
    if data[id].result['retweet_num']>1 :
        print(data[id])
Example #14
        if ret=='dict':
            ret_data = {}
            for ele,count in temp[:high]:
                ret_data[ele]=count
            return ret_data
        else:
            return temp[:high]

if __name__ == '__main__':
    """
    stop_words = open(u"./static/中文停用词表(比较全面,有1208个停用词).txt")
    list1 = []
    for line in stop_words:
        list1.append(line.strip())
    print len(list1)
    pickle.dump(list1,open("./static/stop_words.pkl","wb"),protocol = 2)
    """
    text = FI.load_pickle("./static/stop_words.pkl") #list
    for x in text:
        print(x.decode("gbk"))
        break

    data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wc = WordCounter(data)
    print(wc.count_res.larger_than(16))
    """
    c=MulCounter('abcdeabcdaffbcabag')
    print(sorted(c.items(),key=_itemgetter(1),reverse=True))#operator.itemgetter(1)
    print(c.larger_than(3))
    """
    pass
def generate_node_and_edge():
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}
    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # print(info)

        # handle the From field
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        except:
            continue  # discard this mail if it has no From field
        sender_node = node_list.get(sender_mail)
        if sender_node:
            sender_node['send'] += 1
        else:
            node_list[sender_mail] = dict(
                mail = sender_mail,
                send = 1,
                receive = 0,
                cc = 0,
                ori_mail = ori_mail
            )
            if info.get('name'):
                node_list[sender_mail]['name'] = info['name']

        # handle the To field
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            ori_mail = mail
            mail = mail.lower()
            node = node_list.get(mail)
            if node:
                node['receive'] += 1
            else:
                node_list[mail] = dict(
                    mail = mail,
                    send = 0 ,
                    receive = 1,
                    cc = 0,
                    ori_mail = ori_mail
                )
                if receiver.get('name'):
                    node_list[mail]['name'] = receiver['name']
            edge_key = sender_mail + '|----->' + mail
            edge = edge_list.get(edge_key)
            if edge:
                edge[2] += 1  # the edge already exists, bump its weight
            else:
                edge_list[edge_key] = [sender_mail, mail, 1]  # otherwise add a new edge

        # handle the CC field
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                ori_mail = mail
                mail = mail.lower()
                node = node_list.get(mail)
                if node:
                    node['cc'] += 1
                else:
                    node_list[mail] = dict(
                        mail = mail,
                        send = 0 ,
                        receive = 0 ,
                        cc = 1,
                        ori_mail = ori_mail
                    )
                    if cc.get('name'):
                        node_list[mail]['name'] = cc['name']
                edge_key = sender_mail + '|----->' + mail
                edge = edge_list.get(edge_key)
                if edge:
                    edge[2] += 1  # the edge already exists, bump its weight
                else:
                    edge_list[edge_key] = [sender_mail, mail, 1]  # otherwise add a new edge
        print('{id} processed'.format(id=path))
    FI.save_pickle(node_list,'.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list,'.\\temp_res\\edge.pkl')
Example #16
        if list[mid]['time'] < value:
            low = mid + 1
        else:
            high = mid - 1
    if i < 0:
        i = low
    if i == list.__len__():
        list.append({'time': value, 'freq': 1})
    elif list[i]['time'] == value:
        list[i]['freq'] += 1
    else:
        list.insert(i, {'time': value, 'freq': 1})
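
# The code above does a manual binary insert into a list kept sorted by 'time'.
# A minimal equivalent sketch using the standard bisect module (the helper name
# insert_time is illustrative, not from the source):
import bisect

def insert_time(freq_list, value):
    times = [item['time'] for item in freq_list]         # parallel key list for bisect
    i = bisect.bisect_left(times, value)
    if i < len(freq_list) and freq_list[i]['time'] == value:
        freq_list[i]['freq'] += 1                        # existing timestamp: bump its count
    else:
        freq_list.insert(i, {'time': value, 'freq': 1})  # new timestamp: insert in order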


if __name__ == '__main__':
    stop_words = FI.load_pickle('./static/stop_words.pkl')
    wfts = FI.load_pickle('./static/wfts_continue.pkl')
    client = MongoClient('localhost', 27017)
    db = client['microblog_spider']
    latest_history = db.latest_history
    count = 0
    data = []
    batch_size = 100
    gone_size = 40000

    while count < 100:
        t1 = time.time()
        data = latest_history.find().skip(gone_size +
                                          count * batch_size).limit(batch_size)
        data = [x for x in data]
        t2 = time.time()
Example #17
        return e

    def __Sigmoid(self, value):
        return 1 / (1 + math.exp(-value))


if __name__ == '__main__':
    # text = FI.load_pickle('./static/demo.pkl')
    # text =[ x['dealed_text']['left_content'][0] for x in text]
    # # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    # wv = Word2Vec(vec_len=500)
    # wv.Train_Model(text)
    # FI.save_pickle(wv.word_dict,'./static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    x = FI.load_pickle('./static/normal_wv.pkl')

    def cal_simi(data, key1, key2):
        return data[key1].dot(data[key2].T)[0][0]
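
    # Because the stored vectors were L2-normalized with preprocessing.normalize,
    # the dot product in cal_simi is exactly cosine similarity. A tiny sanity
    # check with toy vectors (values are illustrative; np and preprocessing are
    # the same imports used elsewhere in this module):
    a = preprocessing.normalize(np.array([[1.0, 2.0, 3.0]]))
    b = preprocessing.normalize(np.array([[2.0, 1.0, 0.5]]))
    print(a.dot(b.T)[0][0])  # lies in [-1, 1]; same expression as cal_simi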

    keys = list(x.keys())
    for key in keys:
        print(key, '\t', cal_simi(x, '小说', key))
Example #18
__author__ = 'multiangle'

import File_Interface as FI
import json
import os
import time

paths = os.listdir('.\static')
date_counter = {}
for path in paths:
    data = FI.load_pickle('.\static\{x}'.format(x=path))
    date_ori = data.get('Date')
    if date_ori:
        tag = time.strftime('%Y/%m/%d', date_ori)
        c = date_counter.get(tag)
        print(tag)
        if c:
            date_counter[tag]['count'] += 1
        else:
            date_counter[tag] = dict(date=tag, count=1)
# date_counter = list(date_counter.values())
# date_counter = sorted(date_counter,key=lambda x:x['date'])
# for date in date_counter:
#     print('{a}\t{b}'.format(a=date['date'],b=date['count']))
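
# The same per-day tally can be kept with collections.Counter; a minimal sketch,
# assuming the same File_Interface helper and .\static layout as above:
from collections import Counter

date_counts = Counter()
for path in paths:
    date_ori = FI.load_pickle('.\\static\\{x}'.format(x=path)).get('Date')
    if date_ori:
        date_counts[time.strftime('%Y/%m/%d', date_ori)] += 1
for tag, count in sorted(date_counts.items()):
    print('{a}\t{b}'.format(a=tag, b=count))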
def generate_node_and_edge():
    pkl_list = os.listdir('.\static')
    node_list = {}
    edge_list = {}
    for path in pkl_list:
        info = FI.load_pickle('.\static\{x}'.format(x=path))
        # print(info)

        # handle the From field
        try:
            sender_mail = info['From']['mail']
            ori_mail = sender_mail
            sender_mail = sender_mail.lower()
        except:
            continue  # discard this mail if it has no From field
        sender_node = node_list.get(sender_mail)
        if sender_node:
            sender_node['send'] += 1
        else:
            node_list[sender_mail] = dict(mail=sender_mail,
                                          send=1,
                                          receive=0,
                                          cc=0,
                                          ori_mail=ori_mail)
            if info.get('name'):
                node_list[sender_mail]['name'] = info['name']

        # handle the To field
        receiver_list = info.get('To')
        if not receiver_list:
            continue
        for receiver in receiver_list:
            mail = receiver['mail']
            ori_mail = mail
            mail = mail.lower()
            node = node_list.get(mail)
            if node:
                node['receive'] += 1
            else:
                node_list[mail] = dict(mail=mail,
                                       send=0,
                                       receive=1,
                                       cc=0,
                                       ori_mail=ori_mail)
                if receiver.get('name'):
                    node_list[mail]['name'] = receiver['name']
            edge_key = sender_mail + '|----->' + mail
            edge = edge_list.get(edge_key)
            if edge:
                edge[2] += 1  # the edge already exists, bump its weight
            else:
                edge_list[edge_key] = [sender_mail, mail, 1]  # otherwise add a new edge

        # handle the CC field
        cc_list = info.get('CC')
        if cc_list:
            for cc in cc_list:
                mail = cc['mail']
                ori_mail = mail
                mail = mail.lower()
                node = node_list.get(mail)
                if node:
                    node['cc'] += 1
                else:
                    node_list[mail] = dict(mail=mail,
                                           send=0,
                                           receive=0,
                                           cc=1,
                                           ori_mail=ori_mail)
                    if cc.get('name'):
                        node_list[mail]['name'] = cc['name']
                edge_key = sender_mail + '|----->' + mail
                edge = edge_list.get(edge_key)
                if edge:
                    edge[2] += 1  # the edge already exists, bump its weight
                else:
                    edge_list[edge_key] = [sender_mail, mail, 1]  # otherwise add a new edge
        print('{id} processed'.format(id=path))
    FI.save_pickle(node_list, '.\\temp_res\\node.pkl')
    FI.save_pickle(edge_list, '.\\temp_res\\edge.pkl')
        if temp[low][1] > maxvalue:
            if ret == 'dict':
                return {}
            else:
                return []
        if ret == 'dict':
            ret_data = {}
            for ele, count in temp[:high]:
                ret_data[ele] = count
            return ret_data
        else:
            return temp[:high]


if __name__ == '__main__':
    text = FI.load_pickle('./static/demo.pkl')
    text = [x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict, './static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    # x = FI.load_pickle('./static/normal_wv.pkl')
Example #21
                if sort_type == 'up':
                    res = [
                        pop_id(x)
                        for x in collection.find(select, f).limit(limit).sort(
                            sort, pymongo.ASCENDING)
                    ]
                else:
                    res = [
                        pop_id(x)
                        for x in collection.find(select, f).limit(limit).sort(
                            sort, pymongo.DESCENDING)
                    ]
    return res


def pop_id(data):
    data.pop('_id')
    return data


res = read_content_in_mongo(
    'latest_history', {'user_id': '1681029540'},
    ['dealed_text.left_content', 'created_at', 'user_name'], -1, 'id', 'down')

data = FI.load_pickle('demo.pkl')
data = data + res
FI.save_pickle(data, 'demo.pkl')
for line in res:
    print(line)
print(res.__len__())
Example #22
            else:
                node = node.left
        return e

    def __Sigmoid(self,value):
        return 1/(1+math.exp(-value))

if __name__ == '__main__':
    # text = FI.load_pickle('./static/demo.pkl')
    # text =[ x['dealed_text']['left_content'][0] for x in text]
    # # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    # wv = Word2Vec(vec_len=500)
    # wv.Train_Model(text)
    # FI.save_pickle(wv.word_dict,'./static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    x = FI.load_pickle('./static/normal_wv.pkl')
    def cal_simi(data,key1,key2):
        return data[key1].dot(data[key2].T)[0][0]
    keys=list(x.keys())
    for key in keys:
        print(key,'\t',cal_simi(x,'姚明',key))

                high = mid
        if temp[low][1]>maxvalue:
            if ret=='dict':
                return {}
            else:
                return []
        if ret=='dict':
            ret_data = {}
            for ele,count in temp[:high]:
                ret_data[ele]=count
            return ret_data
        else:
            return temp[:high]

if __name__ == '__main__':
    text = FI.load_pickle('./static/demo.pkl')
    text =[ x['dealed_text']['left_content'][0] for x in text]
    # data = ['Merge multiple sorted inputs into a single sorted output','The API below differs from textbook heap algorithms in two aspects']
    wv = Word2Vec(vec_len=500)
    wv.Train_Model(text)
    FI.save_pickle(wv.word_dict,'./static/wv.pkl')
    #
    # data = FI.load_pickle('./static/wv.pkl')
    # x = {}
    # for key in data:
    #     temp = data[key]['vector']
    #     temp = preprocessing.normalize(temp)
    #     x[key] = temp
    # FI.save_pickle(x,'./static/normal_wv.pkl')

    # x = FI.load_pickle('./static/normal_wv.pkl')
 def Get_Stop_Words(self):
     ret = FI.load_pickle('./static/stop_words.pkl')
     return ret
Example #25
#         item = info_dict[id]
#         if item.hasPre:
#             changed = True
#             if item.data['retweeted_status']['id'] in info_id_list:
#                 info_dict[item.data['retweeted_status']['id']].add_Next(item)
#                 head_id_list.remove(id)
#             else:
#                 pre_node = SNA_node.build_Head_Node(item.data['retweeted_status'])
#                 pre_node.add_Next(item)
#                 info_dict[pre_node.id] = pre_node
#                 info_id_list.append(pre_node.id)
#                 head_id_list.append(pre_node.id)
#                 head_id_list.remove(id)
#     print(ite_times)
#     ite_times += 1
#     print(head_id_list.__len__())
# FI.save_pickle(info_dict,'./static/dealed_info_list.pkl')

# # statistics
# data = FI.load_pickle('./static/dealed_info_list.pkl')
# id_list = list(data.keys())
# for id in id_list:
#     data[id].analyse()
# FI.save_pickle(data,'./static/analyse_info_list.pkl')

data = FI.load_pickle('./static/analyse_info_list.pkl')
id_list = list(data.keys())
for id in id_list:
    if data[id].result['retweet_num'] > 1:
        print(data[id])
            break
        if list[mid]['time'] < value :
            low = mid + 1
        else:
            high = mid - 1
    if i<0:
        i = low
    if i==list.__len__() :
        list.append({'time':value,'freq':1})
    elif list[i]['time']==value :
        list[i]['freq'] += 1
    else:
        list.insert(i,{'time':value,'freq':1})

if __name__=='__main__':
    stop_words = FI.load_pickle('./static/stop_words.pkl')
    wfts = FI.load_pickle('./static/wfts_continue.pkl')
    client = MongoClient('localhost',27017)
    db = client['microblog_spider']
    latest_history = db.latest_history
    count = 0
    data = []
    batch_size = 100
    gone_size = 40000

    while count<100 :
        t1 = time.time()
        data = latest_history.find().skip(gone_size+count*batch_size).limit(batch_size)
        data = [x for x in data]
        t2 = time.time()
        count += 1