def post(self):

        # retrieve the fields posted by the client
        try:
            latest_time=self.get_argument('latest_time')
            latest_timestamp=self.get_argument('latest_timestamp')
            container_id=self.get_argument('container_id')
            self.write('success')
            self.finish()
            print('Success: got data from web')
        except Exception as e:
            self.write('fail to return user history')
            self.finish()
            print('Error:server-HistoryReturn:'
                  'Unable to get value from HTTP request, reason:')
            print(e)
            return

        dbi = MySQL_Interface()
        checkin_timestamp = int(time.time())
        col_info = dbi.get_col_name('cache_history')
        data = dict(
            latest_time=latest_time,
            latest_timestamp=latest_timestamp,
            container_id=container_id,
            checkin_timestamp=checkin_timestamp
        )
        keys = data.keys()
        insert_data = [[data[item] if item in keys else None for item in col_info]]
        dbi.insert_asList('cache_history', insert_data)
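
A note on the pattern above: the handler projects a dict onto the table's column order, padding columns it does not know with None. A minimal standalone sketch of that projection (the column names here are hypothetical):

col_info = ['container_id', 'latest_time', 'checkin_timestamp', 'extra_col']   # hypothetical columns
data = {'container_id': '1005051234', 'latest_time': '2016-01-01 00:00:00', 'checkin_timestamp': 1451577600}
row = [data.get(col) for col in col_info]   # dict.get returns None for missing columns
# row == ['1005051234', '2016-01-01 00:00:00', 1451577600, None]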
Example #2
    def run(self):
        while True:
            self.dbi = MySQL_Interface()
            t = time.time()
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(t - 12 * 60 * 60))

            # delete the stale rows from cache_history
            query='delete from cache_history where container_id in (select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null)'\
                .format(time=time_stick)
            self.dbi.update_asQuery(query)

            # delete the related documents from the MongoDB assemble factory
            select_query = 'select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null'.format(
                time=time_stick)
            res = [x[0] for x in self.dbi.select_asQuery(select_query)]
            client = MongoClient('localhost', 27017)
            db = client['microblog_spider']
            assemble_table = db.assemble_factory
            assemble_table.remove({'container_id': {'$in': res}})

            # clear isGettingBlog on the timed-out rows in user_info_table
            query = "update user_info_table set isGettingBlog=null where isGettingBlog<\'{time}\' and update_time is null".format(
                time=time_stick)
            self.dbi.update_asQuery(query)

            # remove leftover entries from cache_history
            query = "delete from cache_history where is_dealing<\'{time}\' ;".format(
                time=time_stick)
            self.dbi.update_asQuery(query)

            time.sleep(60)
class deal_isGettingBLog_user(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi=MySQL_Interface()

    def run(self):
        while True:
            self.dbi=MySQL_Interface()
            t=time.time()
            time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t-12*60*60))

            # delete the stale rows from cache_history
            query='delete from cache_history where container_id in (select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null)'\
                .format(time=time_stick)
            self.dbi.update_asQuery(query)

            # delete the related documents from the MongoDB assemble factory
            select_query='select container_id from user_info_table where isGettingBlog<\'{time}\' and update_time is null'.format(time=time_stick)
            res=[x[0] for x in self.dbi.select_asQuery(select_query)]
            client=MongoClient('localhost',27017)
            db=client['microblog_spider']
            assemble_table=db.assemble_factory
            assemble_table.remove({'container_id':{'$in':res}})

            # clear isGettingBlog on the timed-out rows in user_info_table
            query="update user_info_table set isGettingBlog=null where isGettingBlog<\'{time}\' and update_time is null".format(time=time_stick)
            self.dbi.update_asQuery(query)

            time.sleep(60)
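
The cleanup queries above splice the timestamp into SQL with str.format. Since MySQL_Interface exposes a raw cursor elsewhere (dbi.cur / dbi.conn), the same statement could be parameterized instead; a sketch, assuming a PyMySQL-style DB-API cursor:

cutoff = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() - 12 * 60 * 60))
dbi.cur.execute(
    'delete from cache_history where container_id in '
    '(select container_id from user_info_table'
    ' where isGettingBlog < %s and update_time is null)',
    (cutoff,))   # the driver quotes and escapes the value
dbi.conn.commit()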
Example #4
    def run(self):
        while True:
            client=MongoClient('localhost',27017)
            db=client['microblog_spider']
            mission_mongo=db.update_mission
            assemble_mongo=db.assemble_factory
            current_time=int(time.time())
            target_time = current_time - 60 * 60 * 6   # purge missions still unfinished after 6 hours
            expired_mission = mission_mongo.find({'mission_start': {'$lt': target_time}}).limit(1)
            expired_mission = list(expired_mission)
            if len(expired_mission) == 0:
                # if there is no matching expired mission, sleep
                time.sleep(60)
            else:
                # otherwise handle the expired mission
                expired_mission = expired_mission[0]
                mission_id=expired_mission['mission_id']
                user_content=expired_mission['user_list']
                user_list=[x['container_id'] for x in user_content]

                # clear isGettingBlog for the affected users in MySQL
                user_list_str=''
                for item in user_list:
                    user_list_str+='\''+str(item)+'\','
                user_list_str=user_list_str[:-1]
                dbi=MySQL_Interface()
                query='update user_info_table set isGettingBlog=null where container_id in ({user_list});' \
                    .format(user_list=user_list_str)
                dbi.update_asQuery(query)

                # remove this mission's data from assemble_factory
                assemble_mongo.remove({'container_id':mission_id})

                # remove the mission from the mission collection in MongoDB
                mission_mongo.remove({'mission_id':mission_id})
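
Collection.remove() is the PyMongo 2.x API; on PyMongo 3+ the equivalent calls are delete_one / delete_many. A sketch of the same cleanup against the newer driver:

assemble_mongo.delete_many({'container_id': mission_id})   # replaces remove()
mission_mongo.delete_many({'mission_id': mission_id})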
Example #5
    def run(self):
        bag = []
        uid_bag = []            # like bag, but stores the corresponding uids
        bag_size = 1000         # flush to MySQL once the bag holds 1000 rows
        ready_to_get_col = self.dbi.get_col_name('ready_to_get')
        cache_attends_col = self.dbi.get_col_name('cache_attends')
        while True:
            query = 'select * from cache_attends limit 5000'
            res = self.dbi.select_asQuery(query)
            if len(res) == 0:
                if len(bag) > 0:
                    self.dbi.insert_asList('ready_to_get', bag, unique=True)
                    bag = []
                    # self.bf.insert_asList(uid_bag,'ready_to_get')
                    uid_bag = []
                time.sleep(1)
                self.dbi = MySQL_Interface()  # refresh the DB connection
                continue

            print('thread cache attends is working')

            for line in res:
                raw_id = line[cache_attends_col.index('uid')]
                in_user_info = self.bf.isContains(raw_id, 'user_info_table')   # could be optimized
                if not in_user_info:
                    data = [line[cache_attends_col.index(col)] if col in cache_attends_col else None for col in ready_to_get_col]
                    bag.append(data)
                    uid_bag.append(raw_id)
                    if len(bag) > bag_size:
                        self.dbi.insert_asList('ready_to_get', bag, unique=True)
                        # self.bf.insert_asList(uid_bag,'ready_to_get')
                        print('insert once')
                        bag = []
                        uid_bag = []
                self.dbi.delete_line('cache_attends', 'uid', raw_id)  # could be optimized
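
The loop above is a simple write-behind buffer: rows accumulate in bag and are flushed to ready_to_get in batches of bag_size. The same pattern in isolation, with a hypothetical flush callback standing in for the bulk INSERT:

def buffered_writer(rows, flush, bag_size=1000):
    # accumulate rows and flush them in batches; flush is any callable
    bag = []
    for row in rows:
        bag.append(row)
        if len(bag) >= bag_size:
            flush(bag)
            bag = []
    if bag:   # flush the remainder
        flush(bag)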
Example #7
    def run(self):
        while True:
            self.dbi = MySQL_Interface()
            t = time.time()
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t - 3600))
            query = "update ready_to_get set is_fetching=null where is_fetching < '{time}' ;".format(time=time_stick)
            # print(query)
            # query='select * from ready_to_get where is_fetching < {time}'.format(time=time_stick)
            self.dbi.update_asQuery(query)
            time.sleep(1)
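
Comparing is_fetching against a '%Y-%m-%d %H:%M:%S' string works because that format sorts lexicographically in time order. The cutoff can be computed more explicitly with datetime; an equivalent sketch:

from datetime import datetime, timedelta

cutoff = (datetime.now() - timedelta(hours=1)).strftime('%Y-%m-%d %H:%M:%S')
query = "update ready_to_get set is_fetching=null where is_fetching < '{time}' ;".format(time=cutoff)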
Example #8
class deal_cache_user_info(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()
        self.bf = BloomFilter()

    def run(self):
        while True:
            if self.dbi.is_empty('cache_user_info'):
                time.sleep(2)
                self.dbi = MySQL_Interface()
                continue
            [res, cache_user_info_col] = self.dbi.select_all('cache_user_info')

            time_stick = time.strftime(
                '%Y-%m-%d %H:%M:%S',
                time.localtime(time.time()))  # insert into user info table
            user_info_table_col = self.dbi.get_col_name('user_info_table')
            data = [[
                line[cache_user_info_col.index(col)] if col in cache_user_info_col
                else time_stick if col == 'insert_time'
                else None if col in ('update_time', 'latest_blog', 'isGettingBlog')
                else ''
                for col in user_info_table_col
            ] for line in res]
            uid_list = [
                line[user_info_table_col.index('uid')] for line in data
            ]
            self.dbi.insert_asList('user_info_table', data,
                                   unique=True)  # insert into user_info_table
            self.bf.insert_asList(uid_list, 'user_info_table')
            print('insert {num} users into user info table'.format(
                num=len(data)))

            uid_list = [line[cache_user_info_col.index('uid')] for line in res]
            q1 = "delete from {table_name} where uid in ( {id_str_list} ) ;"  # 从cache user info 中删除
            id_str_list = ''
            for i in uid_list:
                id_str_list = id_str_list + '\'' + str(i) + '\'' + ','
            id_str_list = id_str_list[:-1]

            query = q1.format(id_str_list=id_str_list,
                              table_name='cache_user_info')
            self.dbi.cur.execute(query)
            self.dbi.conn.commit()

            query = q1.format(id_str_list=id_str_list,
                              table_name='ready_to_get')
            self.dbi.cur.execute(query)
            self.dbi.conn.commit()
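
Building the IN (...) list by hand-quoting each uid works, but DB-API placeholders avoid quoting and escaping bugs; a sketch of the same delete, assuming the cursor is PyMySQL-style:

placeholders = ','.join(['%s'] * len(uid_list))
query = 'delete from cache_user_info where uid in ({ph})'.format(ph=placeholders)
self.dbi.cur.execute(query, uid_list)   # values are passed separately and escaped by the driver
self.dbi.conn.commit()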
Example #9
    def run(self):
        while True:
            self.dbi = MySQL_Interface()
            num = self.dbi.get_line_num('ready_to_get')
            if num > 150 * 1000:
                query = 'select m.fans_num from (' \
                        'select fans_num from ready_to_get ' \
                        'ORDER BY fans_num limit 50000' \
                        ') as m order by fans_num desc limit 1'
                res = self.dbi.select_asQuery(query)[0][0]
                query = 'delete from ready_to_get where fans_num<{num}' \
                    .format(num=res)
                self.dbi.update_asQuery(query)
            else:
                time.sleep(600)
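
The trim above runs two statements: it finds the 50,000th-smallest fans_num and then deletes everything below it. MySQL also accepts ORDER BY and LIMIT directly on DELETE, which drops exactly 50,000 rows in one statement (ties at the threshold are handled slightly differently); a sketch:

query = 'delete from ready_to_get order by fans_num asc limit 50000'
self.dbi.update_asQuery(query)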
class deal_fetching_user(threading.Thread):
    # periodically reset entries that have been stuck in "fetching" for too long

    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()

    def run(self):
        while True:
            self.dbi = MySQL_Interface()
            t = time.time()
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t - 3600))
            query = "update ready_to_get set is_fetching=null where is_fetching < '{time}' ;".format(time=time_stick)
            # print(query)
            # query='select * from ready_to_get where is_fetching < {time}'.format(time=time_stick)
            self.dbi.update_asQuery(query)
            time.sleep(1)
class state_persistance(threading.Thread):
    """
    Monitors and records the state of the proxy pool, including its current
    size, the input speed of new proxies, and the output speed, and manages
    the average-size bookkeeping of the proxy_pool instance.
    """
    def __init__(self, proxy_pool):
        threading.Thread.__init__(self)
        self.proxy_pool = proxy_pool
        self.dbi = MySQL_Interface()

    def run(self):
        while True:
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            current_size = self.proxy_pool.size()
            [in_speed, out_speed] = self.proxy_pool.update_proxy_state()  # new-proxy inflow and outflow rates
            insert_value = [[current_size, time_stick, in_speed, out_speed]]
            self.dbi.insert_asList('proxy_table', insert_value, unique=True)
            time.sleep(server_config.PROXY_MONITOR_GAP)
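
A usage sketch for the monitor thread above, assuming a proxy_pool object that provides size() and update_proxy_state():

pool = proxy_pool()            # hypothetical proxy pool instance
monitor = state_persistance(pool)
monitor.daemon = True          # don't block interpreter shutdown
monitor.start()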
class control_ready_table(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()

    def run(self):
        while True:
            self.dbi = MySQL_Interface()
            num = self.dbi.get_line_num('ready_to_get')
            if num > 150 * 1000:
                query = 'select m.fans_num from (' \
                        'select fans_num from ready_to_get ' \
                        'ORDER BY fans_num limit 50000' \
                        ') as m order by fans_num desc limit 1'
                res = self.dbi.select_asQuery(query)[0][0]
                query = 'delete from ready_to_get where fans_num<{num}' \
                    .format(num=res)
                self.dbi.update_asQuery(query)
            else:
                time.sleep(600)
def start_selfcheck():  # run the startup self-check
    print('\n\n********* start to selfcheck *********\n')
    mi = MySQL_Interface()
    if mi.cur:
        print('mysql is connected')
    client = MongoClient('localhost', 27017)
    print('mongodb is connected')
    client.close()
    auto_index()
    print('\n********* selfcheck success  *********\n')
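
One caveat: MongoClient connects lazily, so merely constructing it proves little. Forcing a round trip makes the self-check meaningful; a sketch using the standard ping command:

client = MongoClient('localhost', 27017, serverSelectionTimeoutMS=2000)
client.admin.command('ping')   # raises ServerSelectionTimeoutError if mongod is unreachable
print('mongodb is connected')
client.close()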
Example #19
class deal_cache_attends(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        dbi = MySQL_Interface()
        self.dbi = dbi
        self.bf = BloomFilter()

    def run(self):
        bag = []
        uid_bag = []  # like bag, but stores the corresponding uids
        bag_size = 1000  # flush to MySQL once the bag holds 1000 rows
        ready_to_get_col = self.dbi.get_col_name('ready_to_get')
        cache_attends_col = self.dbi.get_col_name('cache_attends')
        while True:
            query = 'select * from cache_attends limit 5000'
            res = self.dbi.select_asQuery(query)
            if len(res) == 0:
                if len(bag) > 0:
                    self.dbi.insert_asList('ready_to_get', bag, unique=True)
                    bag = []
                    # self.bf.insert_asList(uid_bag,'ready_to_get')
                    uid_bag = []
                time.sleep(1)
                self.dbi = MySQL_Interface()  # refresh the DB connection
                continue

            print('thread cache attends is working')

            for line in res:
                raw_id = line[cache_attends_col.index('uid')]
                in_user_info = self.bf.isContains(raw_id,
                                                  'user_info_table')  # could be optimized
                if not in_user_info:
                    data = [
                        line[cache_attends_col.index(col)]
                        if col in cache_attends_col else None
                        for col in ready_to_get_col
                    ]
                    bag.append(data)
                    uid_bag.append(raw_id)
                    if len(bag) > bag_size:
                        self.dbi.insert_asList('ready_to_get',
                                               bag,
                                               unique=True)
                        # self.bf.insert_asList(uid_bag,'ready_to_get')
                        print('insert once')
                        bag = []
                        uid_bag = []
                self.dbi.delete_line('cache_attends', 'uid', raw_id)  # could be optimized

    def isInUserInfo(self, in_uid):
        query = 'select * from user_info_table where uid={uid}'.format(
            uid=in_uid)
        res = self.dbi.select_asQuery(query)
        return len(res) > 0
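
isInUserInfo above pulls whole rows just to test existence; selecting a constant with LIMIT 1 performs the same check with less transfer. A sketch against the same assumed MySQL_Interface API:

def isInUserInfo(self, in_uid):
    query = 'select 1 from user_info_table where uid={uid} limit 1'.format(uid=in_uid)
    return len(self.dbi.select_asQuery(query)) > 0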
    def post(self):

        try:
            user_basic_info = self.get_argument('user_basic_info')
            attends = self.get_argument('user_attends')
            user_basic_info = eval(user_basic_info)
            attends = eval(attends)
            self.write('success to return user info')
            self.finish()
        except:
            self.write('fail to return user info')
            self.finish()
            return

        try:
            dbi = MySQL_Interface()
        except:
            print('unable to connect to MySql DB')

        try:
            if len(attends) > 0:  # store attends info
                table_name = 'cache_attends'
                attends_col_info = dbi.get_col_name(table_name)
                keys = attends[0].keys()
                attends = [[
                    line[i] if i in keys else '' for i in attends_col_info
                ] for line in attends]
                fans_col_pos = attends_col_info.index('fans_num')
                insert_attends = []
                for line in attends:
                    if line[fans_col_pos] > 1000:
                        insert_attends.append(line)
                dbi.insert_asList(table_name, insert_attends, unique=True)
                print('Success : attends of {uid} is stored in {tname}'.format(
                    uid=user_basic_info['uid'], tname=table_name))
            else:
                pass
        except Exception as e:
            print(e)
            path = "temp" + os.sep + "{uid}_attends.pkl".format(
                uid=user_basic_info['uid'])
            print(
                'unable to store attends of {uid}, it will be stored on disk'.format(
                    uid=user_basic_info['uid']))
            FI.save_pickle(attends, path)

        try:
            atten_num_real = user_basic_info['attends_num']
            atten_num_get = len(attends)
            user_basic_info['accuracy'] = atten_num_get  # the number of attends actually fetched
            col_info = dbi.get_col_name(
                'cache_user_info')  # store user basic info
            keys = user_basic_info.keys()
            data = [user_basic_info[i] if i in keys else '' for i in col_info]
            dbi.insert_asList('cache_user_info', [data], unique=True)
            print('Success : basic info of {uid} is stored in cache_user_info'.
                  format(uid=user_basic_info['uid']))
        except Exception as e:
            print(e)
            path = 'temp' + os.sep + '{uid}_basic_info.pkl'.format(
                uid=user_basic_info['uid'])
            print('unable to store basic info of {uid}, it will be stored on disk'.
                  format(uid=user_basic_info['uid']))
            FI.save_pickle(user_basic_info, path)

        try:
            if len(attends) > 0:  # store the atten connection web
                from_uid = user_basic_info['uid']
                from_fans_num = user_basic_info['fans_num']
                from_blog_num = user_basic_info['blog_num']
                data = [[
                    from_uid, from_fans_num, from_blog_num,
                    str(x[attends_col_info.index('uid')]),
                    str(x[attends_col_info.index('fans_num')]),
                    str(x[attends_col_info.index('blog_num')])
                ] for x in attends]
                dbi.insert_asList('cache_atten_web', data)
                print(
                    'Success : conn web of {uid} is stored in cache_atten_web'.
                    format(uid=user_basic_info['uid']))
            else:
                pass
        except Exception as e:
            print(e)
            path = '{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store atten web of {uid}, it will be stored on disk'.
                  format(uid=user_basic_info['uid']))
            FI.save_pickle(data, path)
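
eval() on request bodies executes whatever the client sends, so it is only safe on a fully trusted network. If the clients can send JSON instead of Python literals, json.loads (or ast.literal_eval for literal payloads) does the same job without code execution; a hedged sketch:

import ast
import json

payload = self.get_argument('user_basic_info')
try:
    user_basic_info = json.loads(payload)        # if the client sends JSON
except ValueError:
    user_basic_info = ast.literal_eval(payload)  # Python-literal fallback, still no code execution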
        self.bit_size=1<<15
        self.seeds=[5,7,11,13,31,37,61]
        self.r=redis.StrictRedis(host='127.0.0.1',port=6379,db=0)
        self.hashFunc=[]
        for i in range(self.seeds.__len__()):
            self.hashFunc.append(SimpleHash(self.bit_size,self.seeds[i]))

    def isContains(self,str_input,name):
        if str_input==None:
            return False
        if str_input.__len__()==0:
            return False
        ret=True
        for f in self.hashFunc:
            loc=f.hash(str_input)
            ret=ret & self.r.getbit(name,loc)
        return ret

    def insert(self,str_input,name):
        for f in self.hashFunc:
            loc=f.hash(str_input)
            self.r.setbit(name,loc,1)

dbi = MySQL_Interface(dbname='microblog_spider')
r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)
query = 'select uid from user_info_table ;'
uid = dbi.select_asQuery(query)
uid = [x[0] for x in uid]
bf = BloomFilter()
for id in uid:
    bf.insert(id, 'user_info_table')
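
With bit_size = 1 << 15 (32,768 bits) and 7 hash seeds, this filter saturates quickly: by the standard approximation p ≈ (1 − e^(−kn/m))^k the false-positive rate crosses 1% at roughly 3,500 stored keys. A quick check:

import math

m, k = 1 << 15, 7                 # bits, hash functions
for n in (1000, 3000, 10000):     # number of stored uids
    p = (1 - math.exp(-k * n / m)) ** k
    print(n, p)                   # ~1e-05, ~0.005, ~0.41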
Example #28
__author__ = 'multiangle'

from DB_Interface import MySQL_Interface
import json
import networkx as nx

dbi = MySQL_Interface()

# create table (select * from user_info_table order by fans_num limit 1000)

[web_info, col_info] = dbi.select_all('temp_table2')
select_web = []
select_user = {}
for atte in web_info:
    if (atte[1], atte[0]) in web_info:
        select_web.append(list(atte))
        select_user[atte[1]] = 1
        select_user[atte[0]] = 1
select_user = select_user.keys()

G = nx.Graph()
G.add_nodes_from(select_user)
G.add_edges_from(select_web)
nx.write_gexf(G, 'weibo_node1000.gexf')
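
The reciprocal-edge test above scans the web_info list once per edge, which is O(n^2) overall; hashing the edges first makes it linear. An equivalent sketch:

edge_set = set(map(tuple, web_info))
select_web = [list(e) for e in edge_set if (e[1], e[0]) in edge_set]
select_user = list({u for edge in select_web for u in edge})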
    def get(self):
        global proxy
        uuid = str(self.get_argument('uuid'))
        task_id = self.task_assign(uuid)

        if proxy.get_ave_proxy_size() < 30:  # check the size of the current proxy pool
            self.write('no task')
            self.finish()
            return

        if task_id == -1:  # check if this uuid is valid
            self.write('no task')
            self.finish()
            return

        if task_id == 1:  # get the social web of certain user
            dbi = MySQL_Interface()
            query = 'select * from ready_to_get where is_fetching is null order by fans_num desc limit 1;'
            res = dbi.select_asQuery(query)
            if len(res) == 0:
                self.write('no task')
                self.finish()
                return
            res = res[0]
            col_info = dbi.get_col_name('ready_to_get')
            uid = res[col_info.index('uid')]

            self.write('{uid},connect'.format(uid=uid))
            self.finish()

            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            query="update ready_to_get set is_fetching=\'{t_time}\' where uid={uid} ;"\
                .format(t_time=time_stick,uid=uid)
            dbi.update_asQuery(query)

        if task_id == 2:  # get the history microblog of a certain user
            dbi = MySQL_Interface()
            query='select container_id,blog_num from user_info_table ' \
                  'where (isGettingBlog is null and update_time is null and blog_num<{valve} and blog_num>100)' \
                  'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE)
            # query='select container_id,blog_num from user_info_table ' \
            #       'order by rand() limit 1 ;'
            res = dbi.select_asQuery(query)
            if len(res) == 0:
                self.write('no task')
                self.finish()
                return
            [container_id, blog_num] = res[0]
            self.write('{c_id};{blog},history'.format(c_id=container_id,
                                                      blog=blog_num))
            self.finish()
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            query="update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;"\
                .format(t_time=time_stick,cid=container_id)
            dbi.update_asQuery(query)

        if task_id == 3:  # get the history microblog of a certain user (high blog count)
            dbi = MySQL_Interface()
            query='select container_id,blog_num from user_info_table ' \
                  'where (isGettingBlog is null and update_time is null and blog_num>={valve}  and blog_num>100)' \
                  'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE)
            # query='select container_id,blog_num from user_info_table ' \
            #       'order by rand() limit 1 ;'
            res = dbi.select_asQuery(query)
            if len(res) == 0:
                self.write('no task')
                self.finish()
                return
            [container_id, blog_num] = res[0]
            self.write('{c_id};{blog},history'.format(c_id=container_id,
                                                      blog=blog_num))
            self.finish()
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            query="update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;" \
                .format(t_time=time_stick,cid=container_id)
            dbi.update_asQuery(query)

        if task_id == 4 or task_id == 5 or task_id == 100:  # this part is under test
            dbi = MySQL_Interface()
            current_time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time()))
            target_time_stick = time.strftime(
                '%Y-%m-%d %H:%M:%S',
                time.localtime(time.time() - 60 * 60 * 24 * 1))  # go back 1 day
            if task_id == 4:
                batch_size = 100
            elif task_id == 5:
                batch_size = 200
            else:
                batch_size = 10
            query='select container_id,update_time,latest_blog from user_info_table ' \
                  'where update_time<\'{target_time}\' and isGettingBlog is null and blog_num>10 order by fans_num desc limit {batch}' \
                .format(target_time=target_time_stick,batch=batch_size)
            print(query)
            res = dbi.select_asQuery(query)

            # attach the required fields to the user list fetched from MySQL before sending it to the client
            res = [[
                line[0],
                int(time.mktime(line[1].timetuple())),
                int(time.mktime(line[2].timetuple()))
            ] for line in res]
            res_cp = res

            if len(res_cp) == 0:  # if no task, then return "no task"
                print('*** warning: no available update mission ***')
                self.write('no task')
                self.finish()
                return

            # print('debug from task handler')
            # pprint(res_cp)
            res = [
                line[0] + '-' + str(line[1]) + '-' + str(line[2])
                for line in res
            ]
            inn = ''
            for item in res:
                inn += item + ';'
            inn = inn[0:-1]
            # uid-stamp;uid-timestamp;...;,update  (the format of the order)
            mission_id = random_str(15)
            commend = '{list};{task_id},update'.format(list=inn,
                                                       task_id=mission_id)
            # command format sent to the client: ContainerId-UpdateTime-LatestBlog;...;...;...,update
            self.write(commend)
            self.finish()

            # store the user list, mission id and mission start time in MongoDB
            u_list = [
                dict(container_id=x[0], update_time=x[1], latest_blog=x[2])
                for x in res_cp
            ]
            data_toMongo = dict(mission_id=mission_id,
                                user_list=u_list,
                                mission_start=int(time.time()))
            client = MongoClient('localhost', 27017)
            db = client['microblog_spider']
            collec = db.update_mission
            collec.insert(data_toMongo)

            # set isGettingBlog for these users in MySQL
            user_list_str = ''
            for line in res_cp:
                user_list_str += '\'{cid}\','.format(cid=line[0])
            user_list_str = user_list_str[:-1]
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            query='update user_info_table set isGettingBlog=\'{time}\' where container_id in ({ulist})'\
                .format(time=time_stick,ulist=user_list_str)
            dbi.update_asQuery(query)
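
For reference, a sketch of how a client might decode the update command assembled above ('cid-updatetime-latestblog;...;mission_id,update'); the helper name is hypothetical, inferred from this handler rather than a documented protocol:

def parse_update_command(commend):
    body, kind = commend.rsplit(',', 1)      # kind == 'update'
    *user_parts, mission_id = body.split(';')
    users = []
    for part in user_parts:
        cid, update_time, latest_blog = part.split('-')
        users.append((cid, int(update_time), int(latest_blog)))
    return kind, mission_id, users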
Example #30
__author__ = 'multiangle'

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

from DB_Interface import MySQL_Interface

dbi = MySQL_Interface()
[select_user, select_user_col] = dbi.select_all('select_user')
user_list = [line[select_user_col.index('name')] for line in select_user]
user_id = [line[select_user_col.index('uid')] for line in select_user]
[atten_web, atten_web_col] = dbi.select_all('select_atten')
atten_list = [[
    line[atten_web_col.index('from_uid')], line[atten_web_col.index('to_uid')]
] for line in atten_web]
# temp_atten_list=[]
# for line in atten_list:
#     try:
#         temp=[user_list[user_id.index(line[0])],user_list[user_id.index(line[1])]]
#         temp_atten_list.append(temp)
#     except:
#         pass
# atten_list=temp_atten_list

print(len(atten_list))
sig_list = [line[0] + line[1] for line in atten_list]
select_atten_list = []
for line in atten_list:
    temp_sig_a = line[0] + line[1]
    temp_sig_b = line[1] + line[0]
    def run(self):
        while True:
            start_time = time.time()
            dbi=MySQL_Interface()
            col_info=dbi.get_col_name('cache_history')
            query='select * from cache_history where is_dealing is null order by checkin_timestamp limit 1'

            mysql_res = dbi.select_asQuery(query)
            if len(mysql_res) == 0:       # if cache_history is empty, sleep 1 second and skip this round
                time.sleep(1)
                continue

            mysql_res=mysql_res[0]

            # todo: debug output, remove later -----
            print('debug->start to deal with a new task')
            print('debug->mysql_res: ')
            print(mysql_res)
            #------------------------

            container_id=mysql_res[col_info.index('container_id')]
            print('debug->container_id: {cid}'.format(cid=container_id))
            latest_time=mysql_res[col_info.index('latest_time')]
            latest_timestamp=mysql_res[col_info.index('latest_timestamp')]
            time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            query = 'update cache_history set is_dealing=\'{time}\' where container_id={cid}'.format(time=time_stick, cid = container_id)
            # todo: debug output, remove later -----
            print('debug->query1 : {q}'.format(q=query))
            # ------------------------
            dbi.update_asQuery(query)

            client = MongoClient('localhost', 27017)
            db = client['microblog_spider']
            assemble_table = db.assemble_factory
            res = assemble_table.find({'container_id': container_id}, {'current_id': 1, 'total_num': 1})
            id_list = [x['current_id'] for x in res]
            num = int([x['total_num'] for x in assemble_table.find({'container_id': container_id}).limit(1)][0])
            # todo: debug output, remove later -----
            print('debug->id_list_len: {len}'.format(len=len(id_list)))
            print('debug->num: {n}'.format(n=num))
            # ------------------------
            # check whether all packages have arrived
            check_state = True
            if len(id_list) < num:
                print('server->HistoryReport: the package is not complete, will retry fetching the data')
                check_state = False

            if check_state:
                # if all sub-packages have been collected, move the data into the formal MongoDB store,
                # delete the related data from the assemble factory,
                # and update update_time and latest_blog in MySQL while clearing isGettingBlog

                # fetch this user's info from MySQL
                try:
                    query = 'select * from user_info_table where container_id=\'{cid}\'' \
                        .format(cid=container_id)
                    user_info = dbi.select_asQuery(query)[0]
                    # todo for debug-------------
                    print('task {cid} :debug->query2: {q}'.format(q=query,cid=container_id))
                    print('task {cid} debug->user_info:'.format(cid = container_id))
                    print(user_info)
                    # --------------------------------
                    col_name = dbi.get_col_name('user_info_table')
                except Exception as e:
                    print('task {cid} :Error:server-HistoryReturn:'
                          'No such user in MySQL.user_info_table, reason:'.format(cid=container_id))
                    print(e)

                # pull the data out of the assemble factory
                try:
                    data_list = assemble_table.find({'container_id':container_id}, {'data': 1 , 'current_id': 1})
                    data_list_ori = [x for x in data_list]
                    data_list = [x['data'] for x in data_list_ori]
                    id_list = [x['current_id'] for x in data_list_ori]
                    data_list_ori = None
                    # todo for debug-------------
                    print('task {cid} debug->datalist: {len}'.format(len=len(data_list), cid=container_id))
                    # --------------------------------
                except Exception as e:
                    print('Error:server-HistoryReturn:'
                          'Unable to get data from the MongoDB assemble factory, reason:')
                    print(e)

                # more items than expected means duplicates; deduplicate
                if len(id_list) > num:
                    unique_data_list = []
                    check_dict = {}
                    for i in range(len(id_list)):
                        try:
                            # use a dict as a hash set to deduplicate
                            check_dict[str(id_list[i])]
                            continue
                        except:
                            check_dict[str(id_list[i])] = True
                            unique_data_list.append(data_list[i])
                            # print('data_list.len :{len}'.format(len=data_list.__len__()))
                            # print('id_list.len :{len}'.format(len=id_list.__len__()))
                            # print(i)
                    data_list = unique_data_list

                # stitch the pieces together
                try:
                    data_final = []
                    for i in data_list:
                        data_final = data_final+i
                    # todo for debug-------------
                    print('task {cid} debug->pieces assembled, len {len}'.format(len=len(data_final), cid=container_id))
                    # --------------------------------
                except Exception as e:
                    print('Error:server-HistoryReport:'
                          'Unable to concatenate the pieces of information, reason:')
                    print(e)

                # record this run in accuracy_table for further analysis
                blog_len = len(data_final)
                wanted_blog_len = user_info[col_name.index('blog_num')]
                blog_accuracy = blog_len/wanted_blog_len
                time_stick = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                query = 'insert into accuracy_table values ({acc},\'{t_s}\',{num}) ;' \
                    .format(acc=blog_accuracy, t_s=time_stick, num=wanted_blog_len)
                dbi.insert_asQuery(query)

                # write the data to MongoDB, update MySQL, and delete the related assemble data
                try:
                    if not user_info[col_name.index('update_time')]:
                        # save the data into the formal MongoDB collection
                        save_data_seperately(data_final)
                        print('task {cid} Success: Data has saved in Mongodb, size is {size}'
                              .format(size=sys.getsizeof(data_final),cid=container_id))

                        # update the key fields in MySQL
                        query = 'update user_info_table set ' \
                              'update_time=\'{up_time}\',' \
                              'latest_blog=\'{latest_blog}\',' \
                              'isGettingBlog=null ' \
                              'where container_id=\'{cid}\';'\
                            .format(up_time=time_stick,latest_blog=latest_time,cid=container_id)
                        # query='update user_info_table set ' \
                        #       'update_time=\'{up_time}\',' \
                        #       'latest_blog=\'{latest_blog}\'' \
                        #       'where container_id=\'{cid}\';' \
                        #     .format(up_time=time_stick,latest_blog=latest_time,cid=container_id)
                        # TODO: the commented-out variant skips clearing isGettingBlog (handy for stats), but production must clear it
                        dbi.update_asQuery(query)
                        print('task {cid} Success: insert user into MongoDB, the num of data is {len}'
                              .format(len=blog_len,cid=container_id))
                    else:
                        query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \
                            .format(cid=container_id)
                        dbi.update_asQuery(query)

                except Exception as e:
                    print('task {cid} Error:server->HistoryReport:'
                          'Reason:'.format(cid=container_id))
                    print(e)
            else:
                # if the packages are incomplete, clear isGettingBlog and delete the assembled data
                print('task {cid} :Error: the package is not complete ,{a} of {b}'
                      .format(a=id_list.__len__(),b=num,cid=container_id))
                query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \
                    .format(cid=container_id)
                dbi.update_asQuery(query)

            # remove the data from the assemble factory
            assemble_table.remove({'container_id':container_id})
            print('task {cid} Success: Data has been removed from assemble factory'
                    .format(cid=container_id))

            # delete the corresponding row from cache_history to mark this task as done
            query='delete from cache_history where container_id=\'{cid}\'' \
                .format(cid=container_id)
            dbi.update_asQuery(query)

            end_time = time.time()
            deal_time = end_time - start_time
            print('task {cid} :Success : the user {cid} is completed, length is {len}, use {t} seconds'
                  .format(cid=container_id, len=len(data_final), t=deal_time))
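
The dict-plus-KeyError dedup used above can be written more directly with a set; a minimal sketch of the same logic:

def dedup_by_id(id_list, data_list):
    # keep the first payload seen for each current_id
    seen = set()
    unique = []
    for cid, payload in zip(id_list, data_list):
        if cid not in seen:
            seen.add(cid)
            unique.append(payload)
    return unique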
Example #34
    def run(self):
        client = MongoClient('localhost', 27017)
        while True:
            db = client['microblog_spider']
            mission_mongo = db.update_mission
            # missions that have been reported but are not yet being handled
            res = mission_mongo.find({
                'isReported': {
                    '$ne': None
                },
                'isDealing': None
            }).limit(1)
            res = [x for x in res]

            # if there is no pending mission, sleep 1 second and continue
            if len(res) == 0:
                time.sleep(1)
                continue

            # pull out the mission to handle
            task = res[0]
            task.pop('_id')
            mission_id = task['mission_id']
            user_content = task['user_list']

            # set isDealing to the current time to mark this mission as being processed
            mission_mongo.update({'mission_id': mission_id},
                                 {'$set': {
                                     'isDealing': int(time.time())
                                 }})
            print('Update Mission :{mi} set isDealing as {t}'.format(
                mi=mission_id, t=int(time.time())))

            # get the package ids and the total package count
            assemble_table = db.assemble_factory
            res = assemble_table.find({'container_id': mission_id}, {
                'current_id': 1,
                'total_num': 1
            })
            id_list = [x['current_id'] for x in res]
            check_state = True
            try:
                num = int([
                    x['total_num']
                    for x in assemble_table.find({
                        'container_id': mission_id
                    }).limit(1)
                ][0])
            except:
                print(
                    'deal_update_mission :{mi} cannot get num info from mongo'
                    .format(mi=mission_id))
                num = 100000000
                check_state = False

            # check whether all packages have arrived
            if len(id_list) < num:
                print(
                    'Update Mission :{mi} The package is not complete, retry to catch data'
                    .format(mi=mission_id))
                check_state = False

            if check_state:
                # attach the current repost, like and comment counts for tracking;
                # if all sub-packages are present, write the data into the monthly
                # collections and the recent-half-month collection in MongoDB

                # pull the data out of the assemble factory
                try:
                    data_list = assemble_table.find(
                        {'container_id': mission_id}, {
                            'data': 1,
                            'current_id': 1
                        })
                    data_list_ori = [x for x in data_list]
                    data_list = [x['data'] for x in data_list_ori]
                    id_list = [x['current_id'] for x in data_list_ori]
                    data_list_ori = None
                    print(
                        'Update Mission :{mi} success->datalist: {len}'.format(
                            len=len(data_list), mi=mission_id))
                except Exception as e:
                    print(
                        'Update Mission :{mi} Error:server_database-deal_update_mission:'
                        'Unable to get data from MongoDB, assemble factory,Reason:'
                        .format(mi=mission_id))
                    print(e)

                # more items than expected means duplicates; deduplicate
                if len(id_list) > num:
                    unique_data_list = []
                    check_dict = {}
                    for i in range(len(id_list)):
                        try:
                            # use a dict as a hash set to deduplicate
                            check_dict[str(id_list[i])]
                            continue
                        except:
                            check_dict[str(id_list[i])] = True
                            unique_data_list.append(data_list[i])
                    data_list = unique_data_list

                # stitch the pieces together
                try:
                    data_final = []
                    for i in data_list:
                        data_final = data_final + i
                    print('Update Mission :{mi} success->pieces assembled, len {len}'.
                          format(len=len(data_final), mi=mission_id))
                except Exception as e:
                    print(
                        'Update Mission :{mi} Error:server-HistoryReport:'
                        'Unable to contact the pieces of information,Reason:'.
                        format(mi=mission_id))
                    print(e)

                # attach the current repost, like and comment counts for tracking, packaged as UpdateMany objects
                user_list = [x['container_id'] for x in user_content]
                user_list_str = ''
                for item in user_list:
                    user_list_str += '\'' + str(item) + '\','
                user_list_str = user_list_str[:-1]

                def temp_add_trace(line):
                    msg_id = line['id']
                    current_status = dict(
                        comments_count=line['comments_count'],
                        attitudes_count=line['attitudes_count'],
                        reposts_count=line['reposts_count'])
                    t = int(time.time())
                    t_str = str(t)
                    line['status_trace.{date}'.format(
                        date=t_str)] = current_status
                    update_item = UpdateMany({'id': msg_id}, {'$set': line},
                                             upsert=True)
                    return update_item

                requests = [temp_add_trace(x) for x in data_final]
                latest_mongo = db.latest_history
                latest_mongo.bulk_write(requests)
                print(
                    'Update Mission :{mi} Success: server_database: UpdateMany list built, '
                    'written to latest_history, {len}'.format(len=len(requests),
                                                              mi=mission_id))

                # write the fetched data into the per-month collections
                table_list = []
                request_updateMonth = []
                for i in range(len(data_final)):
                    temp_time = data_final[i]['created_at']
                    temp_table_name = 'user_{year}_{month}'.format(
                        year=temp_time[0:4], month=temp_time[5:7])
                    if temp_table_name in table_list:
                        request_updateMonth[table_list.index(
                            temp_table_name)].append(requests[i])
                    else:
                        table_list.append(temp_table_name)
                        request_updateMonth.append([requests[i]])
                print('the number of original tables is {len}'.format(
                    len=len(request_updateMonth)))
                print(table_list)
                selected_num = 5
                if len(table_list) > selected_num:
                    packed = [[table_list[i], request_updateMonth[i]]
                              for i in range(len(table_list))]
                    packed = sorted(packed, key=lambda x: x[0], reverse=True)
                    packed = packed[:selected_num]
                    table_list = [x[0] for x in packed]
                    request_updateMonth = [x[1] for x in packed]
                    print('the number of trimmed tables is {len}'.format(
                        len=len(request_updateMonth)))
                    print(table_list)
                if len(request_updateMonth) >= 3:
                    print('{a}-{b}-{c}'.format(
                        a=len(request_updateMonth[0]),
                        b=len(request_updateMonth[1]),
                        c=len(request_updateMonth[2])))

                for i in range(table_list.__len__()):
                    collection = eval('db.{name}'.format(name=table_list[i]))
                    # todo for debug----------------------------
                    print('table {x} is started'.format(x=table_list[i]))
                    #---------------------------------------------------
                    if request_updateMonth[i].__len__() > 0:
                        try:
                            collection.bulk_write(request_updateMonth[i])
                        except Exception as e:
                            print(
                                'Update Mission :{mi} fail to update table {t}'
                                .format(mi=mission_id, t=table_list[i]))

                print(
                    'Update Mission :{mi} Success:server_database:所获的数组已经写入按月分类聚合中'
                    .format(mi=mission_id))

                # 清理Mydql,更新相关行数中的update_time和latest_blog
                time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime(time.time()))
                # 找出各用户的最近更新时间
                latest_list = [0] * user_list.__len__()
                for line in data_final:
                    this_timestick = int(line['created_timestamp'])
                    this_container = '100505' + str(line['user_id'])
                    try:
                        index = user_list.index(this_container)
                        if latest_list[index] < this_timestick:
                            latest_list[index] = this_timestick
                    except:
                        print('error:server_database->deal_update_mission:'
                              'container {id} is not in user_list'.format(
                                  id=this_container))

                # 将各用户最近更新时间固化为mysql更新语句。
                case_list = ''
                updated_user_list = ''
                for i in range(latest_list.__len__()):
                    if latest_list[i] > user_content[i]['latest_blog']:
                        time_stick_inner = time.strftime(
                            '%Y-%m-%d %H:%M:%S',
                            time.localtime(latest_list[i]))
                        case_list += ' when \'{cid}\' then \'{tstick}\' '.format(
                            cid=user_list[i], tstick=time_stick_inner)
                        updated_user_list += '\'{cid}\','.format(
                            cid=user_list[i])
                updated_user_list = updated_user_list[:-1]
                # 构建mysql更新语句
                query1='update user_info_table set update_time=\'{time}\' where container_id in ( {user_list} ) ;'\
                    .format(time=time_stick,user_list=user_list_str)
                query2='update user_info_table set latest_blog= case container_id {case_list} end where container_id in ( {ulist2} ) ;'\
                    .format(case_list=case_list,ulist2=updated_user_list)
                dbi = MySQL_Interface()
                dbi.update_asQuery(query2)
                dbi.update_asQuery(query1)
                print(
                    'Update Mission :{mi} Success:server_database: UpdateTime和LatestBlog选项已更新'
                    .format(mi=mission_id))
                if user_list_str.__len__() > 0:
                    query='update user_info_table set isGettingBlog=null where container_id in ({user_list});' \
                        .format(user_list=user_list_str)
                    dbi.update_asQuery(query)
                print(
                    'Update Mission :{mi} Success:erver_database: isGettingBlog选项已清除'
                    .format(mi=mission_id))

            else:
                if user_list_str.__len__() > 0:
                    query='update user_info_table set isGettingBlog=null where container_id in ({user_list});'\
                        .format(user_list=user_list_str)
                    dbi = MySQL_Interface()
                    dbi.update_asQuery(query)

            # 将assemble_factory中与当前任务有关数据清空
            assemble_table.remove({'container_id': mission_id})
            print(
                'Update Mission :{mi} Success:server_database: assemble_factory in Mongo is cleared'
                .format(mi=mission_id))

            # 将mongodb,任务列表中当前任务项清空
            mission_mongo.remove({'mission_id': mission_id})
            print(
                'Update Mission :{mi} Success:server_database: this mission is cleared'
                .format(mi=mission_id))
    def run(self):
        client=MongoClient('localhost',27017)
        while True:
            db=client['microblog_spider']
            mission_mongo=db.update_mission
            # Missions that have been reported but are not yet being handled
            res=mission_mongo.find({'isReported':{'$ne':None},'isDealing':None}).limit(1)
            res=[x for x in res]

            # If there is no pending mission, sleep for 1 second and continue
            if res.__len__()==0:
                time.sleep(1)
                continue

            # Extract the mission to be processed
            task=res[0]
            task.pop('_id')
            mission_id=task['mission_id']
            user_content=task['user_list']

            # Set isDealing in the mission list to the current time to mark this mission as being processed
            mission_mongo.update({'mission_id':mission_id},{'$set':{'isDealing':int(time.time())}})
            print('Update Mission :{mi} set isDealing as {t}'.format(mi=mission_id,t=int(time.time())))

            # Get the package ids and the total number of packages
            assemble_table=db.assemble_factory
            res=assemble_table.find({'container_id':mission_id},{'current_id':1,'total_num':1})
            id_list=[x['current_id'] for x in res]
            check_state=True
            try:
                num=int([x['total_num'] for x in assemble_table.find({'container_id':mission_id}).limit(1)][0])
            except:
                print('deal_update_mission :{mi} can not get num info from mongo'
                      .format(mi=mission_id))
                num = 100000000
                check_state = False

            # Check whether all packages have arrived
            if id_list.__len__()<num:
                print('Update Mission :{mi} The package is not complete, retry to catch data'
                      .format(mi=mission_id))
                check_state=False

            if check_state:
                # Add the current repost, like and comment counts so they can be tracked over time
                # Once all sub-packages have arrived, write the data into the formal MongoDB
                # monthly collections and the latest half-month collection

                # Extract the data from the assemble factory
                try:
                    data_list = assemble_table.find({'container_id':mission_id}, {'data': 1 , 'current_id': 1})
                    data_list_ori = [x for x in data_list]
                    data_list = [x['data'] for x in data_list_ori]
                    id_list = [x['current_id'] for x in data_list_ori]
                    data_list_ori = None
                    print('Update Mission :{mi} success->datalist: {len}'.format(len=data_list.__len__(),mi=mission_id))
                except Exception as e:
                    print('Update Mission :{mi} Error:server_database-deal_update_mission:'
                          'Unable to get data from MongoDB, assemble factory,Reason:'.format(mi=mission_id))
                    print(e)

                # If the length exceeds the expected count, there are duplicates that must be removed
                if id_list.__len__() > num :
                    unique_data_list = []
                    check_dict = {}
                    for i in range(id_list.__len__()) :
                        try:
                            # Use a dict lookup as a hash-set style deduplication check
                            check_dict[str(id_list[i])]
                            continue
                        except:
                            check_dict[str(id_list[i])] = True
                            unique_data_list.append(data_list[i])
                    data_list = unique_data_list
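                # (This dict-keyed check is equivalent to tracking the seen current_id values in a set.)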

                # Splice the fragments together
                try:
                    data_final=[]
                    for i in data_list:
                        data_final=data_final+i
                    print('Update Mission :{mi} success->data spliced, len {len}'
                          .format(len=data_final.__len__(),mi=mission_id))
                except Exception as e:
                    print('Update Mission :{mi} Error:server-HistoryReport:'
                          'Unable to concatenate the pieces of information,Reason:'.format(mi=mission_id))
                    print(e)

                # Add the current repost, like and comment counts for tracking, and build UpdateMany objects
                user_list=[x['container_id'] for x in user_content]
                user_list_str=''
                for item in user_list:
                    user_list_str+='\''+str(item)+'\','
                user_list_str=user_list_str[:-1]

                def temp_add_trace(line):
                    msg_id=line['id']
                    current_status=dict(
                        comments_count=line['comments_count'],
                        attitudes_count=line['attitudes_count'],
                        reposts_count=line['reposts_count']
                    )
                    t=int(time.time())
                    t_str=str(t)
                    line['status_trace.{date}'.format(date=t_str)]=current_status
                    update_item=UpdateMany({'id':msg_id},{'$set':line},upsert=True)
                    return update_item
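                # Each blog row becomes an UpdateMany with upsert=True keyed on the blog 'id':
                # new blogs are inserted, existing ones are refreshed, and a time-stamped entry is
                # added under status_trace.<unix_timestamp>, so repost/like/comment counts
                # accumulate into a history over repeated update missions.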

                requests=[temp_add_trace(x) for x in data_final]
                latest_mongo=db.latest_history
                latest_mongo.bulk_write(requests)
                print('Update Mission :{mi} Success: server_database: UpdateMany list generated, '
                      'written to latest_history successfully, {len}'.format(len=requests.__len__(),mi=mission_id))

                # Write the retrieved data into the per-month collections
                table_list=[]
                request_updateMonth=[]
                for i in range(data_final.__len__()):
                    temp_time=data_final[i]['created_at']
                    temp_table_name='user_{year}_{month}'.format(year=temp_time[0:4],month=temp_time[5:7])
                    if temp_table_name in table_list:
                        request_updateMonth[table_list.index(temp_table_name)].append(requests[i])
                    else:
                        table_list.append(temp_table_name)
                        request_updateMonth.append([requests[i]])
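                # Blogs are sharded by the year and month of created_at into collections named
                # user_<YYYY>_<MM>; requests for the same month are grouped so each collection
                # gets a single bulk_write below.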
                print('the number of original tables is {len}'.format(len=request_updateMonth.__len__()))
                print(table_list)
                selected_num = 5
                if table_list.__len__()>selected_num:
                    packed = [[table_list[i],request_updateMonth[i]]
                              for i in range(table_list.__len__())]
                    packed = sorted(packed, key=lambda x:x[0], reverse=True)
                    packed = packed[:selected_num]
                    table_list = [x[0] for x in packed]
                    request_updateMonth = [x[1] for x in packed]
                    print('the number of selected tables is {len}'.format(len=request_updateMonth.__len__()))
                    print(table_list)
                if request_updateMonth.__len__()>=3:
                    print('{a}-{b}-{c}'.format( a=request_updateMonth[0].__len__(),
                                                b=request_updateMonth[1].__len__(),
                                                c=request_updateMonth[2].__len__()
                                                ))

                for i in range(table_list.__len__()):
                    collection = db[table_list[i]]   # dict-style collection access by name
                    # todo for debug----------------------------
                    print('table {x} is started'.format(x=table_list[i]))
                    #---------------------------------------------------
                    if request_updateMonth[i].__len__()>0:
                        try:
                            collection.bulk_write(request_updateMonth[i])
                        except Exception as e:
                            print('Update Mission :{mi} fail to update table {t}'
                                  .format(mi=mission_id,t=table_list[i]))

                print('Update Mission :{mi} Success:server_database: the retrieved data has been written into the monthly collections'
                      .format(mi=mission_id))

                # Clean up MySQL: update update_time and latest_blog in the affected rows
                time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                # Find each user's most recent blog timestamp
                latest_list=[0]*user_list.__len__()
                for line in data_final:
                    this_timestick=int(line['created_timestamp'])
                    this_container='100505'+str(line['user_id'])
                    try:
                        index=user_list.index(this_container)
                        if latest_list[index]<this_timestick:
                            latest_list[index]=this_timestick
                    except:
                        print('error:server_database->deal_update_mission:'
                              'container {id} is not in user_list'.format(id=this_container))

                # Turn each user's latest blog timestamp into a MySQL CASE expression
                case_list=''
                updated_user_list=''
                for i in range(latest_list.__len__()):
                    if latest_list[i]>user_content[i]['latest_blog'] :
                        time_stick_inner=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(latest_list[i]))
                        case_list+=' when \'{cid}\' then \'{tstick}\' '.format(cid=user_list[i],tstick=time_stick_inner)
                        updated_user_list+='\'{cid}\','.format(cid=user_list[i])
                updated_user_list=updated_user_list[:-1]
                # Build the MySQL update statements
                query1='update user_info_table set update_time=\'{time}\' where container_id in ( {user_list} ) ;'\
                    .format(time=time_stick,user_list=user_list_str)
                query2='update user_info_table set latest_blog= case container_id {case_list} end where container_id in ( {ulist2} ) ;'\
                    .format(case_list=case_list,ulist2=updated_user_list)
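                # query2 roughly expands to (values here are illustrative only):
                #   update user_info_table set latest_blog= case container_id
                #     when '100505123' then '2016-01-01 12:00:00' ... end
                #     where container_id in ( '100505123', ... ) ;
                # Note that if no user qualifies, case_list stays empty and the statement is malformed.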
                dbi=MySQL_Interface()
                dbi.update_asQuery(query2)
                dbi.update_asQuery(query1)
                print('Update Mission :{mi} Success:server_database: UpdateTime and LatestBlog fields updated'
                      .format(mi=mission_id))
                if user_list_str.__len__()>0:
                    query='update user_info_table set isGettingBlog=null where container_id in ({user_list});' \
                        .format(user_list=user_list_str)
                    dbi.update_asQuery(query)
                print('Update Mission :{mi} Success:server_database: isGettingBlog field cleared'.format(mi=mission_id))

            else:
                # check_state is False, so user_list_str was never built in this iteration; rebuild it here
                user_list_str=''
                for item in [x['container_id'] for x in user_content]:
                    user_list_str+='\''+str(item)+'\','
                user_list_str=user_list_str[:-1]
                if user_list_str.__len__()>0:
                    query='update user_info_table set isGettingBlog=null where container_id in ({user_list});'\
                        .format(user_list=user_list_str)
                    dbi=MySQL_Interface()
                    dbi.update_asQuery(query)

            # Remove data for the current mission from assemble_factory
            assemble_table.remove({'container_id':mission_id})
            print('Update Mission :{mi} Success:server_database: assemble_factory in Mongo is cleared'
                  .format(mi=mission_id))

            # Remove the current mission from the MongoDB mission list
            mission_mongo.remove({'mission_id':mission_id})
            print('Update Mission :{mi} Success:server_database: this mission is cleared'
                  .format(mi=mission_id))
Ejemplo n.º 36
0
def test2():
    from DB_Interface import MySQL_Interface

    dbi = MySQL_Interface()
    [x, s] = dbi.select_all("ready_to_get")
    print(x)
    def get(self):
        global proxy
        uuid=str(self.get_argument('uuid'))
        task_id=self.task_assign(uuid)

        if proxy.get_ave_proxy_size()<30:   # check the size of the current proxy pool
            self.write('no task')
            self.finish()
            return

        if task_id==-1:       # check whether this uuid is valid
            self.write('no task')
            self.finish()
            return

        if task_id==1:         # get the social web of certain user
            dbi=MySQL_Interface()
            query='select * from ready_to_get where is_fetching is null order by fans_num desc limit 1;'
            res=dbi.select_asQuery(query)
            if res.__len__()==0:
                self.write('no task')
                self.finish()
                return
            res=res[0]
            col_info=dbi.get_col_name('ready_to_get')
            uid=res[col_info.index('uid')]

            self.write('{uid},connect'.format(uid=uid))
            self.finish()

            time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            query="update ready_to_get set is_fetching=\'{t_time}\' where uid={uid} ;"\
                .format(t_time=time_stick,uid=uid)
            dbi.update_asQuery(query)


        if task_id==2:      # get the history microblog of a certain user
            dbi=MySQL_Interface()
            query='select container_id,blog_num from user_info_table ' \
                  'where (isGettingBlog is null and update_time is null and blog_num<{valve} and blog_num>100)' \
                  'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE)
            # query='select container_id,blog_num from user_info_table ' \
            #       'order by rand() limit 1 ;'
            res=dbi.select_asQuery(query)
            if res.__len__()==0:
                self.write('no task')
                self.finish()
                return
            [container_id,blog_num]=res[0]
            self.write('{c_id};{blog},history'
                       .format(c_id=container_id,blog=blog_num))
            self.finish()
            time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            query="update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;"\
                .format(t_time=time_stick,cid=container_id)
            dbi.update_asQuery(query)

        if task_id==3:      # get the history microblog of a certain user
            dbi=MySQL_Interface()
            query='select container_id,blog_num from user_info_table ' \
                  'where (isGettingBlog is null and update_time is null and blog_num>={valve}  and blog_num>100)' \
                  'order by fans_num desc limit 1 ;'.format(valve=config.HISTORY_TASK_VALVE)
            # query='select container_id,blog_num from user_info_table ' \
            #       'order by rand() limit 1 ;'
            [container_id,blog_num]=dbi.select_asQuery(query)[0]
            self.write('{c_id};{blog},history'
                       .format(c_id=container_id,blog=blog_num))
            self.finish()
            time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            query="update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;" \
                .format(t_time=time_stick,cid=container_id)
            dbi.update_asQuery(query)

        if task_id==4 or task_id==5 or task_id==100:   # this part is in test
            dbi=MySQL_Interface()
            current_time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            target_time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()-60*60*24*1)) # 1 day earlier
            if task_id==4:
                batch_size = 100
            elif task_id==5:
                batch_size = 200
            else:
                batch_size = 10
            query='select container_id,update_time,latest_blog from user_info_table ' \
                  'where update_time<\'{target_time}\' and isGettingBlog is null and blog_num>10 order by fans_num desc limit {batch}' \
                .format(target_time=target_time_stick,batch=batch_size)
            print(query)
            res=dbi.select_asQuery(query)

            # Attach the needed fields to the user rows fetched from MySQL before sending them to the client
            res=[[line[0],int(time.mktime(line[1].timetuple())),int(time.mktime(line[2].timetuple()))] for line in res]
            res_cp=res

            if res_cp.__len__()==0:  # if there is no task, return "no task"
                print('*** warning: no available update mission ***')
                self.write('no task')
                self.finish()
                return

            # print('debug from task handler')
            # pprint(res_cp)
            res=[line[0]+'-'+str(line[1])+'-'+str(line[2]) for line in res]
            inn=''
            for item in res:
                inn+=item+';'
            inn=inn[0:-1]
            # uid-timestamp;uid-timestamp;...;mission_id,update  (the format of the order)
            mission_id=random_str(15)
            commend='{list};{task_id},update'.format(list=inn,task_id=mission_id)
            # Command format sent to the client: ContainerId-UpdateTime-LatestBlog;...;...;mission_id,update
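            # e.g. '1005051234567890-1462000000-1461000000;...;a1B2c3D4e5F6g7h,update' (values are illustrative only)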
            self.write(commend)
            self.finish()

            # Save the user list, mission id and mission start time into MongoDB
            u_list=[dict(container_id=x[0],update_time=x[1],latest_blog=x[2]) for x in res_cp]
            data_toMongo=dict(
                mission_id  =   mission_id,
                user_list   =   u_list,
                mission_start=  int(time.time())
            )
            client=MongoClient('localhost',27017)
            db=client['microblog_spider']
            collec=db.update_mission
            collec.insert(data_toMongo)

            # Set isGettingBlog in MySQL for these users
            user_list_str=''
            for line in res_cp:
                user_list_str+='\'{cid}\','.format(cid=line[0])
            user_list_str=user_list_str[:-1]
            time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            query='update user_info_table set isGettingBlog=\'{time}\' where container_id in ({ulist})'\
                .format(time=time_stick,ulist=user_list_str)
            dbi.update_asQuery(query)
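
The update command assembled above packs each user as ContainerId-UpdateTime-LatestBlog, joins the entries with semicolons, appends the random mission id, and terminates with ',update'. A minimal sketch of how a client might unpack such a command (parse_update_command is a hypothetical helper, not part of the project code):

def parse_update_command(command):
    # split off the trailing order type, e.g. 'update'
    body, order_type = command.rsplit(',', 1)
    parts = body.split(';')
    mission_id = parts[-1]              # the random 15-character mission id appended by the server
    users = []
    for item in parts[:-1]:
        container_id, update_time, latest_blog = item.split('-')
        users.append(dict(container_id=container_id,
                          update_time=int(update_time),
                          latest_blog=int(latest_blog)))
    return order_type, mission_id, users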
Ejemplo n.º 38
0
    def __init__(self):
        threading.Thread.__init__(self)
        self.dbi = MySQL_Interface()
Ejemplo n.º 39
0
    def post(self):

        # Get the data from the client
        try:
            user_history=self.get_argument('user_history')
            latest_time=self.get_argument('latest_time')
            latest_timestamp=self.get_argument('latest_timestamp')
            container_id=self.get_argument('container_id')
            isDivided=self.get_argument('isDivided')
            user_history=eval(user_history)
            if isDivided==1 or isDivided=='1' :
                block_num=self.get_argument('block_num')
                current_block=self.get_argument('current_block')
            self.write('success to return user history')
            self.finish()
            print('Success: got data from web')
        except Exception as e:
            self.write('fail to return user history')
            self.finish()
            print('Error:server-HistoryReturn:'
                  'Unable to get value from http package,Reason:')
            print(e)
            return


        # Connect to MySQL
        try:
            dbi=MySQL_Interface()
        except:
            print('Error:server-HistoryReturn:'
                  'Unable to connect to MySQL')

        # Fetch this user's info from MySQL
        try:
            query='select * from user_info_table where container_id=\'{cid}\''\
                .format(cid=container_id)
            user_info=dbi.select_asQuery(query)[0]
            col_name=dbi.get_col_name('user_info_table')
        except Exception as e:
            print('Error:server-HistoryReturn:'
                  'No such user in MySQL.user_info_table,Reason:')
            print(e)

        # After saving the data into MongoDB, write the related info into MySQL and clear the isGettingBlog field
        try:
            blog_len=user_history.__len__()
            wanted_blog_len=user_info[col_name.index('blog_num')]
            blog_accuracy=blog_len/wanted_blog_len
            time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            if not user_info[col_name.index('update_time')]:
                save_data_inMongo(user_history)
                # query='update user_info_table set ' \
                #       'update_time=\'{up_time}\',' \
                #       'latest_blog=\'{latest_blog}\',' \
                #       'isGettingBlog=null ' \
                #       'where container_id=\'{cid}\';'\
                #     .format(up_time=time_stick,latest_blog=latest_time,cid=container_id)
                query='update user_info_table set ' \
                      'update_time=\'{up_time}\',' \
                      'latest_blog=\'{latest_blog}\'' \
                      'where container_id=\'{cid}\';' \
                    .format(up_time=time_stick,latest_blog=latest_time,cid=container_id)
                dbi.update_asQuery(query)
            else:
                query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\''\
                    .format(cid=container_id)
                dbi.update_asQuery(query)

            query='insert into accuracy_table values ({acc},\'{t_s}\') ;'\
                .format(acc=blog_accuracy,t_s=time_stick)
            dbi.insert_asQuery(query)

            print('Success: insert user into MongoDB, the num of data is {len}'
                  .format(len=blog_len))
        except Exception as e:
            print('Error:server-HistoryReturn:'
                  'Unable to update data in MySQL.user_info_table,Reason:')
            print(e)
Ejemplo n.º 40
0
    def post(self):

        try:
            user_basic_info=self.get_argument('user_basic_info')
            attends=self.get_argument('user_attends')
            user_basic_info=eval(user_basic_info)
            attends=eval(attends)
            self.write('success to return user info')
            self.finish()
        except:
            self.write('fail to return user info')
            self.finish()
            return

        try:
            dbi=MySQL_Interface()
        except:
            print('unable to connect to MySql DB')

        try:
            if attends.__len__()>0:           #store attends info
                table_name='cache_attends'
                attends_col_info=dbi.get_col_name(table_name)
                keys=attends[0].keys()
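                # Re-order each attend dict into the column order of cache_attends, filling missing columns with ''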
                attends= [[line[i] if i in keys else '' for i in attends_col_info] for line in attends]
                fans_col_pos=attends_col_info.index('fans_num')
                insert_attends=[]
                for line in attends:
                    if line[fans_col_pos]>1000:
                        insert_attends.append(line)
                dbi.insert_asList(table_name,insert_attends,unique=True)
                print('Success : attends of {uid} is stored in {tname}'
                      .format(uid=user_basic_info['uid'],tname=table_name))
            else:
                pass
        except Exception as e:
            print(e)
            path="temp\\{uid}_attends.pkl".format(uid=user_basic_info['uid'])
            print('unable to store attends of {uid}, it will be saved to a local pickle instead'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(attends,path)

        try:
            atten_num_real=user_basic_info['attends_num']
            atten_num_get=attends.__len__()
            user_basic_info['accuracy']=atten_num_get       # the number of attends actually fetched
            col_info=dbi.get_col_name('cache_user_info')    # store user basic info
            keys=user_basic_info.keys()
            data=[user_basic_info[i] if i in keys else '' for i in col_info]
            dbi.insert_asList('cache_user_info',[data],unique=True)
            print('Success : basic info of {uid} is stored in cache_user_info'
                  .format(uid=user_basic_info['uid']))
        except Exception as e:
            print(e)
            path='temp\\{uid}_basic_info.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store basic info of {uid}, it will be saved to a local pickle instead'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(user_basic_info,path)

        try:
            if attends.__len__()>0:            # store atten connection web
                from_uid=user_basic_info['uid']
                from_fans_num=user_basic_info['fans_num']
                from_blog_num=user_basic_info['blog_num']
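                # Each row of the attention web: [from_uid, from_fans_num, from_blog_num, to_uid, to_fans_num, to_blog_num]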
                data=[[from_uid,from_fans_num,from_blog_num,str(x[attends_col_info.index('uid')]),str(x[attends_col_info.index('fans_num')]),str(x[attends_col_info.index('blog_num')])]for x in attends]
                dbi.insert_asList('cache_atten_web',data)
                print('Success : conn web of {uid} is stored in cache_atten_web'
                      .format(uid=user_basic_info['uid']))
            else:
                pass
        except Exception as e:
            print(e)
            path='{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store atten web of {uid}, it will be saved to a local pickle instead'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(data,path)
Ejemplo n.º 41
0
    def run(self):
        while True:
            start_time = time.time()
            dbi = MySQL_Interface()
            col_info = dbi.get_col_name('cache_history')
            query = 'select * from cache_history where is_dealing is null order by checkin_timestamp limit 1'

            mysql_res = dbi.select_asQuery(query)
            if mysql_res.__len__() == 0:  # if cache_history is empty, sleep for 1 second and skip this iteration
                time.sleep(1)
                continue

            mysql_res = mysql_res[0]

            # todo for delete-----
            print('debug->start to deal with a new task')
            print('debug->mysql_res: ')
            print(mysql_res)
            #------------------------

            container_id = mysql_res[col_info.index('container_id')]
            print('debug->container_id: {cid}'.format(cid=container_id))
            latest_time = mysql_res[col_info.index('latest_time')]
            latest_timestamp = mysql_res[col_info.index('latest_timestamp')]
            time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(time.time()))
            query = 'update cache_history set is_dealing=\'{time}\' where container_id={cid}'.format(
                time=time_stick, cid=container_id)
            # todo for delete-----
            print('debug->query1 : {q}'.format(q=query))
            # ------------------------
            dbi.update_asQuery(query)

            client = MongoClient('localhost', 27017)
            db = client['microblog_spider']
            assemble_table = db.assemble_factory
            res = assemble_table.find({'container_id': container_id}, {
                'current_id': 1,
                'total_num': 1
            })
            id_list = [x['current_id'] for x in res]
            num = int([
                x['total_num']
                for x in assemble_table.find({
                    'container_id': container_id
                }).limit(1)
            ][0])
            ## todo for delete-----
            print('debug->id_list_len: {len}'.format(len=id_list.__len__()))
            print('debug->num: {n}'.format(n=num))
            # ------------------------
            # Check whether all packages have arrived
            check_state = True
            if id_list.__len__() < num:
                print(
                    'server->HistoryReport:The package is not complete, retry to catch data'
                )
                check_state = False

            data_final = []  # make sure data_final exists for the final log line even when the package is incomplete
            if check_state:
                # If all sub-packages have been collected, write the data into the formal MongoDB database,
                # delete the related data from the assemble factory,
                # and update update_time and latest_blog in MySQL while clearing isGettingBlog

                # Fetch this user's info from MySQL
                try:
                    query = 'select * from user_info_table where container_id=\'{cid}\'' \
                        .format(cid=container_id)
                    user_info = dbi.select_asQuery(query)[0]
                    # todo for debug-------------
                    print('task {cid} :debug->query2: {q}'.format(
                        q=query, cid=container_id))
                    print('task {cid} debug->user_info:'.format(
                        cid=container_id))
                    print(user_info)
                    # --------------------------------
                    col_name = dbi.get_col_name('user_info_table')
                except Exception as e:
                    print(
                        'task {cid} :Error:server-HistoryReturn:'
                        'No such user in MySQL.user_info_table,Reason:'.format(
                            cid=container_id))
                    print(e)

                # Extract the data from the assemble factory
                try:
                    data_list = assemble_table.find(
                        {'container_id': container_id}, {
                            'data': 1,
                            'current_id': 1
                        })
                    data_list_ori = [x for x in data_list]
                    data_list = [x['data'] for x in data_list_ori]
                    id_list = [x['current_id'] for x in data_list_ori]
                    data_list_ori = None
                    # todo for debug-------------
                    print('task {cid} debug->datalist: {len}'.format(
                        len=data_list.__len__(), cid=container_id))
                    # --------------------------------
                except Exception as e:
                    print(
                        'Error:server-HistoryReturn:'
                        'Unable to get data from MongoDB, assemble factory,Reason:'
                    )
                    print(e)

                # If the length exceeds the expected count, there are duplicates that must be removed
                if id_list.__len__() > num:
                    unique_data_list = []
                    check_dict = {}
                    for i in range(id_list.__len__()):
                        try:
                            # Use a dict lookup as a hash-set style deduplication check
                            check_dict[str(id_list[i])]
                            continue
                        except:
                            check_dict[str(id_list[i])] = True
                            unique_data_list.append(data_list[i])
                            # print('data_list.len :{len}'.format(len=data_list.__len__()))
                            # print('id_list.len :{len}'.format(len=id_list.__len__()))
                            # print(i)
                    data_list = unique_data_list

                # Splice the fragments together
                try:
                    data_final = []
                    for i in data_list:
                        data_final = data_final + i
                    # todo for debug-------------
                    print('task {cid} debug->data spliced, len {len}'.format(
                        len=data_final.__len__(), cid=container_id))
                    # --------------------------------
                except Exception as e:
                    print(
                        'Error:server-HistoryReport:'
                        'Unable to concatenate the pieces of information,Reason:')
                    print(e)

                # Record the stats of this run into accuracy_table for further analysis
                blog_len = data_final.__len__()
                wanted_blog_len = user_info[col_name.index('blog_num')]
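                # Note: wanted_blog_len comes straight from blog_num in MySQL; if it is 0 the division below raises ZeroDivisionError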
                blog_accuracy = blog_len / wanted_blog_len
                time_stick = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime(time.time()))
                query = 'insert into accuracy_table values ({acc},\'{t_s}\',{num}) ;' \
                    .format(acc=blog_accuracy, t_s=time_stick, num=wanted_blog_len)
                dbi.insert_asQuery(query)

                # Write the data into MongoDB, update MySQL, and delete the related content from the assemble factory
                try:
                    if not user_info[col_name.index('update_time')]:
                        # Save the data into the formal MongoDB collections
                        save_data_seperately(data_final)
                        print(
                            'task {cid} Success: Data has been saved in MongoDB, size is {size}'
                            .format(size=sys.getsizeof(data_final),
                                    cid=container_id))

                        # Record the key fields into MySQL
                        query = 'update user_info_table set ' \
                              'update_time=\'{up_time}\',' \
                              'latest_blog=\'{latest_blog}\',' \
                              'isGettingBlog=null ' \
                              'where container_id=\'{cid}\';'\
                            .format(up_time=time_stick,latest_blog=latest_time,cid=container_id)
                        # query='update user_info_table set ' \
                        #       'update_time=\'{up_time}\',' \
                        #       'latest_blog=\'{latest_blog}\'' \
                        #       'where container_id=\'{cid}\';' \
                        #     .format(up_time=time_stick,latest_blog=latest_time,cid=container_id)
                        # TODO: clearing isGettingBlog was dropped here to simplify statistics, but it must be restored for production runs
                        dbi.update_asQuery(query)
                        print(
                            'task {cid} Success: insert user into MongoDB, the num of data is {len}'
                            .format(len=blog_len, cid=container_id))
                    else:
                        query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \
                            .format(cid=container_id)
                        dbi.update_asQuery(query)

                except Exception as e:
                    print('task {cid} Error:server->HistoryReport:'
                          'Reason:'.format(cid=container_id))
                    print(e)
            else:
                # If the sub-packages are incomplete, clear isGettingBlog and delete the assemble-factory data
                print(
                    'task {cid} :Error: the package is not complete ,{a} of {b}'
                    .format(a=id_list.__len__(), b=num, cid=container_id))
                query='update user_info_table set isGettingBlog=null where container_id=\'{cid}\'' \
                    .format(cid=container_id)
                dbi.update_asQuery(query)

            # Remove the data from the assemble factory
            assemble_table.remove({'container_id': container_id})
            print(
                'task {cid} Success: Data has been removed from assemble factory'
                .format(cid=container_id))

            # Delete the corresponding row from cache_history to mark this task as finished
            query='delete from cache_history where container_id=\'{cid}\'' \
                .format(cid=container_id)
            dbi.update_asQuery(query)

            end_time = time.time()
            deal_time = end_time - start_time
            print(
                'task {cid} :Success : the user {cid} is completed, length is {len}, use {t} seconds'
                .format(cid=container_id,
                        len=data_final.__len__(),
                        t=deal_time))
Ejemplo n.º 42
0
    def get(self):
        global proxy
        uuid=str(self.get_argument('uuid'))
        task_id=self.task_assign(uuid)

        if proxy.get_ave_proxy_size()<30:   # check the size of the current proxy pool
            self.write('no task')
            self.finish()
            return

        if task_id==-1:       # check whether this uuid is valid
            self.write('no task')
            self.finish()
            return

        if task_id==1:         # get the social web of certain user
            dbi=MySQL_Interface()
            query='select * from ready_to_get where is_fetching is null order by fans_num desc limit 1;'
            res=dbi.select_asQuery(query)
            if res.__len__()==0:
                self.write('no task')
                self.finish()
                return
            res=res[0]
            col_info=dbi.get_col_name('ready_to_get')
            uid=res[col_info.index('uid')]

            self.write('{uid},connect'.format(uid=uid))
            self.finish()

            time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            query="update ready_to_get set is_fetching=\'{t_time}\' where uid={uid} ;"\
                .format(t_time=time_stick,uid=uid)
            dbi.update_asQuery(query)

        if task_id==2:      # get the history microblog of a certain user
            dbi=MySQL_Interface()
            query='select container_id,blog_num from user_info_table ' \
                  'where (isGettingBlog is null and update_time is null) ' \
                  'order by fans_num desc limit 1 ;'
            # query='select container_id,blog_num from user_info_table ' \
            #       'order by rand() limit 1 ;'
            [container_id,blog_num]=dbi.select_asQuery(query)[0]
            self.write('{c_id};{blog},history'
                       .format(c_id=container_id,blog=blog_num))
            self.finish()
            time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            query="update user_info_table set isGettingBlog=\'{t_time}\' where container_id={cid} ;"\
                .format(t_time=time_stick,cid=container_id)
            dbi.update_asQuery(query)

        if task_id==3:   # this part is in test
            dbi=MySQL_Interface()
            current_time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            target_time_stick=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()-60*60*24)) # one day earlier
            query='select uid,update_time from user_info_table ' \
                  'where update_time<\'{target_time}\' and isGettingBlog is null limit {batch}'\
                .format(target_time=target_time_stick,batch=10)
            res=dbi.select_asQuery(query)
            res=[[line[0],int(time.mktime(line[1].timetuple()))] for line in res]
            # res=[[line[0],int(time.mktime(time.strptime(str(line[1]),'%Y-%m-%d %H:%M:%S')))] for line in res]
            res=[line[0]+'-'+str(line[1]) for line in res]
            inn=''
            for item in res:
                inn+=item+';'
            inn=inn[0:-1]
            # uid-timestamp;uid-timestamp;...,update  (the format of the order)
            commend='{list},update'.format(list=inn)
            self.write(commend)
            self.finish()
Ejemplo n.º 43
0
__author__ = 'multiangle'

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

from DB_Interface import MySQL_Interface

dbi=MySQL_Interface()
[select_user,select_user_col]=dbi.select_all('select_user')
user_list= [line[select_user_col.index('name')] for line in select_user]
user_id=[line[select_user_col.index('uid')] for line in select_user]
[atten_web,atten_web_col]=dbi.select_all('select_atten')
atten_list=[[line[atten_web_col.index('from_uid')],line[atten_web_col.index('to_uid')]] for line in atten_web]
# temp_atten_list=[]
# for line in atten_list:
#     try:
#         temp=[user_list[user_id.index(line[0])],user_list[user_id.index(line[1])]]
#         temp_atten_list.append(temp)
#     except:
#         pass
# atten_list=temp_atten_list

print(atten_list.__len__())
sig_list= [line[0]+line[1] for line in atten_list]
select_atten_list=[]
for line in atten_list:
    temp_sig_a=line[0]+line[1]
    temp_sig_b=line[1]+line[0]
    if temp_sig_a in sig_list and temp_sig_b in sig_list :
        select_atten_list.append(line)
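# Note: assuming the uid fields are strings, temp_sig_a is always present in sig_list (it was built
# from atten_list), so the filter effectively keeps only mutual-follow pairs; building sig_list as a
# set would make the two membership tests O(1) instead of scanning the whole list.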