Python Database Examples

Programming Language: Python

Namespace/Package Name: Dao

Class/Type: Database

Examples at hotexamples.com: 12

Python Database - 12 examples found. These are the top-rated real-world Python examples of Dao.Database extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Database(5)

close(5)

connect(5)

Example #1

Show file

File: Spider.py Project: Catfishly/SinaSpider

    def __init__(self, user_list, cookie_list=None):
        """
        Prepare fetchers, HTML parser, database handle and crawl buffers.

        :param user_list: a list of users; each one is logged in to obtain a
            fetcher when no cookies are supplied
        :param cookie_list: a list of cookies, default to be None; when it is
            given the fetchers are requested from the cookies instead
        :return:
        """
        self.fetchers = []
        if cookie_list is not None:
            self.cookies = cookie_list
            self.get_fetchers_by_cookie()
        else:  # need login by users
            self.users = user_list
            self.get_fetchers_by_user()

        # Created on both paths: the database handle used to exist only after
        # a login by users, although it is needed whenever results are stored.
        self.parser = HtmlParser()
        self.database = Database()

        self.main_fetcher = 0  # current fetcher index
        self.follower_list = []  # store followers
        self.followee_list = []  # store followees
        self.timeline_list = []  # store timelines
        self.profile_list = []  # store profiles
        self.start_time = datetime.now()
        self.end_time = None

Example #2

Show file

File: Spider.py Project: ChenghaoZHU/SinaSpider

    def set_user_deleted_by_uid(self, uid):
        """
        Flag the Dao.Task row of a user as deleted (is_deleted = '1').

        :param uid: uid of the task row to flag
        :return:
        :raises: whatever query(...).one() raises when there is not exactly
            one matching row (presumably SQLAlchemy's NoResultFound /
            MultipleResultsFound - confirm against Dao.Database)
        """
        db = Database()
        db.connect()
        try:
            task = db.session.query(Dao.Task).filter(Dao.Task.uid == uid).one()
            task.is_deleted = '1'
        finally:
            # one() raises for an unknown uid; close the connection in that
            # case too instead of leaking it.
            db.close()

Example #3

Show file

File: Spider.py Project: ChenghaoZHU/SinaSpider

    def __init__(self, user_list, cookie_list=None):
        """
        Prepare fetchers, HTML parser, database handle and crawl buffers.

        :param user_list: a list of users; each one is logged in to obtain a
            fetcher when no cookies are supplied
        :param cookie_list: a list of cookies, default to be None; when it is
            given the fetchers are requested from the cookies instead
        :return:
        """
        self.fetchers = []
        if cookie_list is not None:
            self.cookies = cookie_list
            self.get_fetchers_by_cookie()
        else: # need login by users
            self.users = user_list
            self.get_fetchers_by_user()

        # Created on both paths: the database handle used to exist only after
        # a login by users, although it is needed whenever results are stored.
        self.parser = HtmlParser()
        self.database = Database()

        self.main_fetcher = 0 # current fetcher index
        self.follower_list = [] # store followers
        self.followee_list = [] # store followees
        self.timeline_list = [] # store timelines
        self.profile_list = [] # store profiles
        self.start_time = datetime.now()
        self.end_time = None

Example #4

Show file

File: CompleteCrawl.py Project: ChenghaoZHU/SinaSpider

def get_accounts(limit):
    '''
    Check out at most `limit` Sina accounts from the database.

    Only rows with is_available == '1' and is_deleted == '0' are queried, and
    every row that is handed out gets its is_available flag set to '0'.

    :param limit: maximum number of account rows to query
    :return: user_list: a list of User objects denoting for Sina accounts
    '''
    db = Database()
    db.connect()

    usable = (Account.is_available == '1', Account.is_deleted == '0')
    query = db.session.query(Account).filter(*usable).limit(limit)

    user_list = []
    for row in query:
        row.is_available = '0'
        user_list.append(User(row.account, row.passwd))

    db.close()
    return user_list

Example #5

Show file

File: CompleteCrawl.py Project: ChenghaoZHU/SinaSpider

def get_tasks(limit):
    '''
    Check out at most `limit` crawl tasks from the database.

    Only rows with is_available == '1' and is_deleted == '0' are queried, and
    every row that is handed out gets its is_available flag set to '0'.

    :param limit: specify how many tasks we are going to fetch from db
    :return: task_list: a list with uids
    '''
    db = Database()
    db.connect()

    open_tasks = (Task.is_available == '1', Task.is_deleted == '0')
    query = db.session.query(Task).filter(*open_tasks).limit(limit)

    task_list = []
    for row in query:
        row.is_available = '0'
        task_list.append(row.uid)

    db.close()

    return task_list

Example #6

Show file

File: Spider.py Project: Catfishly/SinaSpider

    def set_user_deleted_by_uid(self, uid):
        """
        Flag the Dao.Task row of a user as deleted (is_deleted = '1').

        :param uid: uid of the task row to flag
        :return:
        :raises: whatever query(...).one() raises when there is not exactly
            one matching row (presumably SQLAlchemy's NoResultFound /
            MultipleResultsFound - confirm against Dao.Database)
        """
        db = Database()
        db.connect()
        try:
            task = db.session.query(Dao.Task).filter(Dao.Task.uid == uid).one()
            task.is_deleted = '1'
        finally:
            # one() raises for an unknown uid; close the connection in that
            # case too instead of leaking it.
            db.close()

Example #7

Show file

def get_accounts(limit):
    '''
    Check out at most `limit` Sina accounts from the database.

    Only rows with is_available == '1' and is_deleted == '0' are queried, and
    every row that is handed out gets its is_available flag set to '0'.

    :param limit: maximum number of account rows to query
    :return: user_list: a list of User objects denoting for Sina accounts
    '''
    db = Database()
    db.connect()

    usable = (Account.is_available == '1', Account.is_deleted == '0')
    query = db.session.query(Account).filter(*usable).limit(limit)

    user_list = []
    for row in query:
        row.is_available = '0'
        user_list.append(User(row.account, row.passwd))

    db.close()
    return user_list

Example #8

Show file

def get_tasks(limit):
    '''
    Check out at most `limit` crawl tasks from the database.

    Only rows with is_available == '1' and is_deleted == '0' are queried, and
    every row that is handed out gets its is_available flag set to '0'.

    :param limit: specify how many tasks we are going to fetch from db
    :return: task_list: a list with uids
    '''
    db = Database()
    db.connect()

    open_tasks = (Task.is_available == '1', Task.is_deleted == '0')
    query = db.session.query(Task).filter(*open_tasks).limit(limit)

    task_list = []
    for row in query:
        row.is_available = '0'
        task_list.append(row.uid)

    db.close()

    return task_list

Example #9

Show file

File: AddAccounts.py Project: DustinLi/SinaSpider-3

import os.path
from Dao import Account
from Dao import Database

if __name__ == '__main__':

    file_name = 'weibo_accounts.txt'
    if not os.path.isfile(file_name):
        print 'The file containing accounts must be named as "%s" !' % (
            file_name, )
        exit(-1)

    account_list = []

    with open(file_name, 'r') as reader:
        for line in reader:
            if line.strip('\r\n\t') == '':
                continue
            act = line.split('----')[0]
            pwd = line.split('----')[1].strip('\n')
            account = Account({'account': act, 'passwd': pwd})
            account_list.append(account)

    db = Database()
    db.connect()
    for act in account_list:
        db.session.merge(act)
    db.close()

    print 'Adding accounts finished!'

Example #10

Show file

File: AddAccounts.py Project: ChenghaoZHU/SinaSpider

import os.path
from Dao import Account
from Dao import Database

if __name__ == '__main__':

    file_name = 'weibo_accounts.txt'
    if not os.path.isfile(file_name):
        print 'The file containing accounts must be named as "%s" !' % (file_name, )
        exit(-1)

    account_list = []

    with open(file_name, 'r') as reader:
        for line in reader:
            if line.strip('\r\n\t') == '':
                continue
            act = line.split('----')[0]
            pwd = line.split('----')[1].strip('\n')
            account = Account({'account':act, 'passwd':pwd})
            account_list.append(account)

    db = Database()
    db.connect()
    for act in account_list:
        db.session.merge(act)
    db.close()

    print 'Adding accounts finished!'

Example #11

Show file

File: Spider.py Project: ChenghaoZHU/SinaSpider

class Spider(object):
    def __init__(self, user_list, cookie_list=None):
        """
        Prepare fetchers, HTML parser, database handle and crawl buffers.

        :param user_list: a list of users; each one is logged in to obtain a
            fetcher when no cookies are supplied
        :param cookie_list: a list of cookies, default to be None; when it is
            given the fetchers are requested from the cookies instead
        :return:
        """
        self.fetchers = []
        if cookie_list is not None:
            self.cookies = cookie_list
            self.get_fetchers_by_cookie()
        else: # need login by users
            self.users = user_list
            self.get_fetchers_by_user()

        # Created on both paths: save() and save_only_profile() use
        # self.database however the fetchers were obtained, but it used to
        # exist only after a login by users.
        self.parser = HtmlParser()
        self.database = Database()

        self.main_fetcher = 0 # current fetcher index
        self.follower_list = [] # store followers
        self.followee_list = [] # store followees
        self.timeline_list = [] # store timelines
        self.profile_list = [] # store profiles
        self.start_time = datetime.now()
        self.end_time = None

    def delete_enterprise_user(self, uid):
        '''
        Pass the uid of an enterprise user on to Dao.Task.delete_user.

        :param uid: to delete
        :return:
        '''
        task_model = Dao.Task
        task_model.delete_user(uid)


    def ban_account(self):
        """
        Check the working account and retire it if it is in trouble.

        The unfreeze page is fetched with the current fetcher. Unless the
        parser reports both "not exceptional" and "not frozen", the account is
        passed to Dao.Account.ban and removed, together with its fetcher, from
        self.users / self.fetchers.

        :return:
        :raises Exception: 'No valid account!' when no fetcher is left
        """
        current = self.main_fetcher
        html = open_url(self.fetchers[current], 'http://sass.weibo.com/unfreeze')
        is_exceptional = self.parser.is_exceptional(html)
        is_frozen = self.parser.is_frozen(html)
        if is_exceptional is False and is_frozen is False:
            return  # nothing wrong with this account, keep it

        Dao.Account.ban(self.users[current].acct)
        emphasis_print('One account has been banned!!!')

        self.users.pop(current)
        self.fetchers.pop(current)
        # the removed slot was the last one: start again from the first account
        if len(self.fetchers) == current:
            self.main_fetcher = 0

        if not self.fetchers:
            raise Exception('No valid account!')

    def reset_account(self):
        """
        Give up the working account because its login has expired.

        The current user is passed (as a one-element list) to
        Dao.Account.reset and removed, together with its fetcher, from
        self.users / self.fetchers.

        :return:
        :raises Exception: 'No valid account!' when no fetcher is left
        """
        current = self.main_fetcher
        Dao.Account.reset([self.users[current]])
        emphasis_print('One account expires!!!')

        self.users.pop(current)
        self.fetchers.pop(current)
        # the removed slot was the last one: start again from the first account
        if len(self.fetchers) == current:
            self.main_fetcher = 0

        if not self.fetchers:
            raise Exception('No valid account!')

    def collect_user_information(self, uid):
        print 'Collecting information for User %s...' % (uid,)
        pid = self.get_pid(uid)
        if pid == '':
            print 'User does not exist!'
            self.set_user_deleted_by_uid(uid)
            return 404
        elif pid == -1:
            print 'This user is an enterprise user!'
            self.delete_enterprise_user(uid)
            return -1

        self.get_followers(pid)
        print 'Followers crawled.'
        self.get_followees(pid)
        print 'Followees crawled.'
        self.get_timelines(uid)
        print 'Timelines crawled.'
        self.get_profile(pid)
        print 'Pofile crawled.'

    def collect_user_profiles_only(self, uid):
        print 'Collecting profile for User %s...' % (uid,)
        pid = self.get_pid(uid)
        if pid == '':
            print 'User does not exist!'
            self.set_user_deleted_by_uid(uid)
            return 404
        self.get_profile(pid)
        print 'Profile crawled.'

    def get_fetchers_by_user(self):
        """
        initialize self.fetchers by user
        :return:
        """
        wb = Weibo()
        for user in self.users:
            fetcher = wb.login(user)
            if fetcher is not None:
                emphasis_print('User: %s login success!' % (user.acct,))
                self.fetchers.append(fetcher)
            else:
                emphasis_print('User: %s login failure!' % (user.acct,))
        print 'Get all fetchers by users!\n'
    def get_fetchers_by_cookie(self):
        """
        initialize self.fetchers by cookie
        :return:
        """
        pass

    def get_pid(self, uid):
        """
        Resolve the page id of a user, retrying until a page can be parsed.

        :param uid:
        :return: corresponding pid as returned by the parser, or -1 when the
            user is an enterprise user
        :raises Exception: 'No valid account!' propagated from ban_account()
            / reset_account() when no working account is left
        """
        url = 'http://www.weibo.com/u/%s' % (uid,)
        while True:
            # Look the fetcher up on every attempt: ban_account() and
            # reset_account() remove the current account and may move
            # self.main_fetcher, so a fetcher cached before the loop would
            # keep requesting with the account that was just dropped.
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, url)

            parsed_uid = self.parser.parse_uid(html)
            if parsed_uid == -1:
                self.ban_account()
                continue
            elif self.parser.is_visitor(html) is True:
                self.reset_account()
                continue # make sure that the html is correct.

            is_enterprise = self.parser.parse_is_enterprise(html)
            if is_enterprise is True:
                return -1 # -1 denotes this user is an enterprise

            pid = self.parser.parse_pid(html)
            if pid is not None:
                return pid
            else:
                # page could not be parsed: wait a while, then try again
                log.error('Cannot get pid for uid:%s' % (uid,))
                time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))


    def get_followers(self, pid):

        url = 'http://www.weibo.com/p/' + pid + '/follow?relate=fans&from=' + pid[:6] + '&wvr=6&mod=headfans#place'
        while True:
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, url)

            uid = self.parser.parse_uid(html)
            if uid == -1:
                self.ban_account()
                continue
            elif self.parser.is_visitor(html) is True:
                self.reset_account()
                continue

            fer_page_num = self.get_follower_page_num(html)
            if fer_page_num is not None:
                break
            else:
                log.warning('Cannot get total follower page number for pid:%s' % (pid,))
                time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))

        if fer_page_num == 0:
            print 'He/She does not have any followers.'
            return
        else:
            print 'Getting follower page 1 of %d...' % (fer_page_num,)
            followers = self.parser.parse_followers(html, pid, datetime.now())
            self.follower_list.extend(followers) # followers cannot be None since it's been tested in self.get_follower_page_num(html)-> self.parser.parse_follower_page_num(html)
            if fer_page_num == 1:
                return
            for i in xrange(2, fer_page_num+1):
                while True:
                    url = 'http://www.weibo.com/p/%s/follow?relate=fans&from=%s&wvr=6&mod=headfans&page=%d#place' % (pid, pid[:6], i)
                    print 'Getting follower page %d of %d...' % (i, fer_page_num)
                    html = open_url(fetcher, url)
                    time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))
                    followers = self.parser.parse_followers(html, pid, datetime.now())
                    if followers is None: # dirty html
                        log.warning('Cannot parse follower page - pid:%s, page num:%d' % (pid, i))
                        time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))
                        continue
                    self.follower_list.extend(followers)
                    break
    def get_follower_page_num(self, html):
        """
        get the number of follower pages, up to 5.
        :param html:
        :return:
        """
        fer_pnum = self.parser.parse_follower_page_num(html)
        if fer_pnum is None:
            return None
        if fer_pnum > 6:
            fer_pnum = 5
        return fer_pnum
    def get_followees(self, pid):

        url = 'http://www.weibo.com/p/' + pid + '/follow?from=page_' + pid[:6] + '&wvr=6&mod=headfollow#place'

        while True:
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, url)

            uid = self.parser.parse_uid(html)
            if uid == -1:
                self.ban_account()
                continue
            elif self.parser.is_visitor(html) is True:
                self.reset_account()
                continue

            fee_page_num = self.get_followee_page_num(html)
            if fee_page_num is not None:
                break
            else:
                log.warning('Cannot get followee page total number - pid:%s' % (pid,))
                time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))

        if fee_page_num == 0:
            print 'He/She does not follow any one.'
            return
        else:
            print 'Getting followee page 1 of %d...' % (fee_page_num,)
            followees = self.parser.parse_followees(html, pid, datetime.now())
            self.followee_list.extend(followees) # followees cannot be None since it's been tested in self.get_followee_page_num(html)-> self.parser.parse_followee_page_num(html)
            if fee_page_num == 1:
                return
            for i in xrange(2, fee_page_num+1):
                while True:
                    url = 'http://www.weibo.com/p/%s/follow?from=page_%s&wvr=6&mod=headfollow&page=%d#place' % (pid, pid[:6], i)
                    print 'Getting followee page %d of %d...' % (i, fee_page_num)
                    html = open_url(fetcher, url)
                    time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))
                    followees = self.parser.parse_followees(html, pid, datetime.now())
                    if followees is None: # dirty html
                        log.warning('Cannot parse followee page correctly - pid:%s' % (pid,))
                        time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))
                        continue
                    self.followee_list.extend(followees)
                    break
    def get_followee_page_num(self, html):
        """
        get the number of followee pates, no value more than five
        :param html:
        :return:
        """
        fee_pnum = self.parser.parse_followee_page_num(html)
        if fee_pnum is None:
            return None
        if fee_pnum > 6:
            fee_pnum = 5
        return fee_pnum
    def get_timelines(self, uid):
        """
        get all timelines of user with this uid
        :param uid:
        :return:
        """
        fetcher = self.fetchers[self.main_fetcher]

        timeline_page_num, first_page = self.get_timeline_page_num(uid)
        if timeline_page_num == 0:
            print 'No any posts.'
            return
        else:
            for pt in first_page:
                self.timeline_list.extend(self.parser.parse_timelines(pt, uid, datetime.now()))
            if timeline_page_num == 1:
                print 'He/She just has one page timeline.'
                return

        timelines = []
        for pnum in xrange(2, timeline_page_num+1):
            print 'There are totally %d timeline pages.' % (timeline_page_num,)
            for bnum in xrange(3):
                html = self.fetch_timelines_by_page_bar(uid, pnum, bnum)
                time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))
                if html is not None:
                    timelines = self.parser.parse_timelines(html, uid, datetime.now())
                    self.timeline_list.extend(timelines)
            self.end_time = datetime.now()
            duration = self.end_time - self.start_time
            if duration.seconds > Config.ACCOUNT_CHANGE_TIME:
                self.main_fetcher = loop_increase(self.main_fetcher, len(self.fetchers))
                self.start_time = datetime.now()
                emphasis_print('Account changed!!!')
                emphasis_print('Now %d of %d accounts are working!' % (self.main_fetcher+1, len(self.fetchers)))
            time.sleep(random.randint(Config.SLEEP_BETWEEN_TIMELINE_PAGES, 2*Config.SLEEP_BETWEEN_TIMELINE_PAGES))

    def fetch_timelines_by_page_bar(self, uid, pnum, bnum):
        """
        fetch timelines by specifying page number and bar number

        The request is repeated until a response can be decoded; accounts
        that turn out to be frozen or logged out are dropped on the way.

        :param uid:
        :param pnum: page number
        :param bnum: bar number (0, 1 or 2 - selects which part of the page
            is requested)
        :return: html containing timelines or None if there are no timelines
        :raises Exception: 'No valid account!' when the last working account
            had to be dropped
        """
        body = { # these parameters were obtained by capturing real traffic: Sina Weibo loads a page dynamically in several (waterfall) steps, so one request cannot return everything on a page
            '__rnd':1343647638078,
            '_k':1343647471134109,
            '_t':0,
            'count':15,
            'end_id':3473519214542343,
            'max_id':3473279479126179,
            'page':1,
            'pagebar':1,
            'pre_page':1,
            'uid':uid
       }

        body['page'] = pnum

        # count / pagebar / pre_page depend on which part of the page is asked for
        if bnum == 0:
            body['count'] = '50'
            body['pagebar'] = ''
            body['pre_page'] = pnum-1
        elif bnum == 1:
            body['count'] = '15'
            body['pagebar'] = '0'
            body['pre_page'] = pnum
        elif bnum == 2:
            body['count'] = '15'
            body['pagebar'] = '1'
            body['pre_page'] = pnum

        url = 'http://weibo.com/aj/mblog/mbloglist?' + urllib.urlencode(body)
        while True:
            try:
                print 'Getting timeline page %d part %d...' % (pnum, bnum+1) # bnum starts with zero up to two
                jsn_data = open_url(self.fetchers[self.main_fetcher], url)
                if self.parser.is_frozen(jsn_data):
                    self.ban_account()
                    continue

                data = json.loads(jsn_data)
                html = data['data']
                # this marker string is taken as the sign that the fragment contains posts
                if u'WB_feed_type SW_fun S_line2' in html:
                    return html
                else:
                    return None
            except Exception as e:
                # running out of accounts is fatal: hand it on to the caller
                if 'No valid account!' in e.message:
                    raise e
                # a response that is not JSON: the account is either in
                # visitor status (reset it) or treated as banned
                if 'No JSON object could be decoded' in e.message:
                    if self.parser.is_visitor(jsn_data) is True:
                        self.reset_account()
                    else:
                        self.ban_account()
                log.warning(e.message)
                # wait a while before the request is repeated
                time.sleep(random.randint(Config.SLEEP_WHEN_EXCEPTION, 2*Config.SLEEP_WHEN_EXCEPTION))
                continue
    def get_timeline_page_num(self, uid):
        """
        Work out how many timeline pages a user has.

        :param uid:
        :return: page number and one or two pages, which will decrease accesses to Sina server
            (the second element is the list of first-page parts fetched on
            the way; the page number is 0 when there are no posts at all and
            otherwise at least 1)
        """
        htmls = [] # keep the pages to decrease accesses to Sina

        # The three parts of page 1 are fetched in turn. The enclosing
        # "while True" of the original was dropped: every path through it
        # returned during the first pass.
        first_page_head = self.fetch_timelines_by_page_bar(uid, 1, 0)
        if first_page_head is None: # no any posts
            return 0, htmls
        htmls.append(first_page_head)

        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))

        first_page_body = self.fetch_timelines_by_page_bar(uid, 1, 1)
        if first_page_body is None:
            return 1, htmls
        htmls.append(first_page_body)

        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))

        first_page_tail = self.fetch_timelines_by_page_bar(uid, 1, 2)
        if first_page_tail is None: # just one page of timelines
            return 1, htmls
        htmls.append(first_page_tail)

        time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))

        pnum = self.parser.parse_timeline_page_num(first_page_tail) # this page number is not accurate, so we will recount it in the next step
        if pnum is None or pnum == 1:
            return 1, htmls

        no_post = 'W_icon icon_warnB'
        # Walk back from the reported last page until a page with posts is
        # found. Page 1 is known to exist, so stop there at the latest: the
        # unbounded loop could count down to 0 and below, and a result of 0
        # makes get_timelines() discard the first page collected above.
        while pnum > 1:
            url = 'http://www.weibo.com/%s?page=%d&pids=Pl_Content_HomeFeed' % (uid, pnum)
            test_html = open_url(self.fetchers[self.main_fetcher], url)
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))
            if no_post not in test_html:
                break
            pnum -= 1 # fixing page number
        return max(pnum, 1), htmls
    def get_profile(self, pid):
        '''
        get profile information for User marked with pid
        :param pid: page id
        :return:
        '''

        url = 'http://www.weibo.com/p/%s/info?mod=pedit_more' % (pid,)

        uid = pid[6:]
        is_taobao = None
        while is_taobao is None:
            try:

                is_taobao = self.is_taobao(uid) # get taobao information in advance
                if is_taobao == -1:
                    self.ban_account()
                    is_taobao = None
                elif is_taobao == -2:
                    self.reset_account()
                    is_taobao = None

            except Exception as e:
                print e.message
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))

        profile = None
        print 'Getting profile page...'
        while profile is None:
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, url)

            if self.parser.parse_uid(html) == -1: # if -1 that means this working account maybe be banned
                self.ban_account()
                continue
            elif self.parser.is_visitor(html) is True: # judge whether working account falls into visitor status
                self.reset_account()
                continue

            profile = self.parser.parse_profile(html, pid, is_taobao, datetime.now())
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES, 2*Config.SLEEP_BETWEEN_2FPAGES))

        self.profile_list.append(profile)

    def is_taobao(self, uid):
        '''
        Fetch the home page of a user and ask the parser for the taobao flag.

        The fetched page is also written to debug_taobao.txt in the working
        directory, overwriting the file on every call.

        :param uid: user ID
        :return: -1 when the working account seems to be banned, -2 when it
            is in visitor status, otherwise the result of
            parser.parse_is_taobao (described by the original author as '1'
            or '0' - confirm against the parser)
        '''
        html = open_url(self.fetchers[self.main_fetcher], 'http://www.weibo.com/u/' + uid)

        # keep a copy of the page for debugging
        with open('debug_taobao.txt', 'w') as dump:
            dump.write(html)

        if self.parser.parse_uid(html) == -1:
            return -1 # account is banned
        if self.parser.is_visitor(html) is True:
            return -2 # account is in visitor status
        return self.parser.parse_is_taobao(html)

    def save(self):
        '''
        save crawled information to DB
        :return:
        '''

        self.transformation()
        self.clear_null_data() # this function must be called after self.transformation

        self.database.connect()

        for fee in self.followee_list:
            self.database.session.merge(Dao.Followee(fee))
        for fer in self.follower_list:
            self.database.session.merge(Dao.Follower(fer))
        for tl in self.timeline_list:
            try:
                tl['text'] = tl['text'].replace('', ' ') #  is /001, so it's necessary to eliminate it
                tl['text'] = tl['text'].replace('\r', ' ').replace('\n', ' ') # remove all the linefeed
            except Exception as e:
                print e.message
            self.database.session.merge(Dao.Timeline(tl))
        for pf in self.profile_list:
            for jb in pf['Job']:
                self.database.session.merge(Dao.Job(jb))
            for edu in pf['Education']:
                self.database.session.merge(Dao.Education(edu))

            del pf['Job']
            del pf['Education']

            self.database.session.merge(Dao.User(pf))

        self.clear_buffer()


        self.database.close()

    def save_only_profile(self):
        '''
        save only the crawled profiles (with their jobs and educations) to DB
        and empty the in-memory buffers afterwards
        :return:
        '''
        self.transformation()
        self.clear_null_data() # this function must be called after self.transformation

        self.database.connect()
        merge = self.database.session.merge

        for profile in self.profile_list:
            for job in profile['Job']:
                merge(Dao.Job(job))
            for education in profile['Education']:
                merge(Dao.Education(education))

            # the nested lists are stored as rows of their own, not with the user
            for nested in ('Job', 'Education'):
                del profile[nested]

            merge(Dao.User(profile))

        self.clear_buffer()
        self.database.close()


    def transformation(self):
        '''
        transfer keys of self.followee_list, self.follower_list, self.timeline_list and self.profile_list to fit field names in database
        :return:
        '''

        for fee in self.followee_list:
            fee['uid'] = fee.pop('uid')
            fee['fee_uid'] = fee.pop('fee_uid')
            fee['fee_name'] = fee.pop('name')
            fee['fee_profile_img_url'] = fee.pop('profile_img')
            fee['fee_description'] = fee.pop('description')
            fee['fee_gender'] = fee.pop('gender')
            fee['fee_location'] = fee.pop('location')
            fee['fee_by'] = fee.pop('app_source')
            fee['fee_followee_num'] = fee.pop('followee_num')
            fee['fee_follower_num'] = fee.pop('follower_num')
            fee['fee_weibo_num'] = fee.pop('weibo_num')
            fee['fee_verified_type'] = fee.pop('verified_type')
            fee['fee_is_vip'] = fee.pop('is_vip')
            fee['fee_vip_level'] = fee.pop('vip_level')
            fee['fee_is_daren'] = fee.pop('is_daren')
            fee['fee_is_taobao'] = fee.pop('is_taobao')
            fee['fee_is_suishoupai'] = fee.pop('is_suishoupai')
            fee['fee_is_vlady'] = fee.pop('is_vlady')
            fee['fee_timestamp'] = fee.pop('timestamp')

        for fer in self.follower_list:
            fer['uid'] = fer.pop('uid')
            fer['fer_uid'] = fer.pop('fer_uid')
            fer['fer_name'] = fer.pop('name')
            fer['fer_profile_img_url'] = fer.pop('profile_img')
            fer['fer_description'] = fer.pop('description')
            fer['fer_gender'] = fer.pop('gender')
            fer['fer_location'] = fer.pop('location')
            fer['fer_by'] = fer.pop('app_source')
            fer['fer_followee_num'] = fer.pop('followee_num')
            fer['fer_follower_num'] = fer.pop('follower_num')
            fer['fer_weibo_num'] = fer.pop('weibo_num')
            fer['fer_verified_type'] = fer.pop('verified_type')
            fer['fer_is_vip'] = fer.pop('is_vip')
            fer['fer_vip_level'] = fer.pop('vip_level')
            fer['fer_is_daren'] = fer.pop('is_daren')
            fer['fer_is_taobao'] = fer.pop('is_taobao')
            fer['fer_is_suishoupai'] = fer.pop('is_suishoupai')
            fer['fer_is_vlady'] = fer.pop('is_vlady')
            fer['fer_timestamp'] = fer.pop('timestamp')

        for tl in self.timeline_list:
            tl['mid'] = tl.pop('mid')
            tl['encrypt_mid'] = tl.pop('encrypted_mid')
            tl['uid'] = tl.pop('uid')
            tl['retweet_num'] = tl.pop('retweet')
            tl['comment_num'] = tl.pop('comment')
            tl['favourite_num'] = tl.pop('favourite')
            tl['created_at'] = tl.pop('created_at')
            tl['from'] = tl.pop('app_source')
            tl['text'] = tl.pop('text')
            tl['entity'] = tl.pop('entity')
            tl['source_mid'] = tl.pop('source_mid')
            tl['source_uid'] = tl.pop('source_uid')
            tl['mentions'] = tl.pop('mentions')
            tl['check_in'] = tl.pop('check_in')
            tl['check_in_url'] = tl.pop('check_in_url')
            tl['is_deleted'] = tl.pop('is_deleted')
            tl['timestamp'] = tl.pop('timestamp')

        for pf in self.profile_list:
            for jb in pf['Job']:
                jb['uid'] = pf['uid']
                jb['timestamp'] = pf['timestamp']

                jb['company'] = jb.pop('company')
                jb['location'] = jb.pop('location')
                jb['occupation'] = jb.pop('occupation')
                jb['time_period'] = jb.pop('period')

            for edu in pf['Education']:
                edu['uid'] = pf['uid']
                edu['timestamp'] = pf['timestamp']

                edu['school_name'] = edu.pop('university')
                edu['time_period'] = edu.pop('period')
                edu['department'] = edu.pop('department')
                edu['type'] = edu.pop('type')


            pf['uid'] = pf.pop('uid')
            pf['screen_name'] = pf.pop('nickname')
            pf['real_name'] = pf.pop('name')
            pf['location'] = pf.pop('location')
            pf['gender'] = pf.pop('gender')
            pf['sexual_orientation'] = pf.pop('sexual_orientation')
            pf['relationship_status'] = pf.pop('relationship_status')
            pf['birthday'] = pf.pop('birthday')
            pf['blood_type'] = pf.pop('blood_type')
            pf['blog'] = pf.pop('blog')
            pf['description'] = pf.pop('description')
            pf['email'] = pf.pop('email')
            pf['QQ'] = pf.pop('QQ')
            pf['MSN'] = pf.pop('MSN')
            pf['tag'] = pf.pop('tag')
            pf['followee_num'] = pf.pop('followee_num')
            pf['follower_num'] = pf.pop('follower_num')
            pf['weibo_num'] = pf.pop('weibo_num')
            pf['created_at'] = pf.pop('created_at')
            pf['profile_img_url'] = pf.pop('profile_img')
            pf['domain_id'] = pf.pop('domain_id')
            pf['domain_name'] = pf.pop('domain_name')
            pf['level'] = pf.pop('level')
            pf['experience'] = pf.pop('experience')
            pf['credit_level'] = pf.pop('credit_level')
            pf['credit_point'] = pf.pop('credit_point')
            pf['credit_history'] = pf.pop('credit_history')
            pf['is_vip'] = pf.pop('is_vip')
            pf['vip_level'] = pf.pop('vip_level')
            pf['is_yearly_pay'] = pf.pop('is_yearly_paid')
            pf['is_verified'] = pf.pop('is_verified')
            pf['verified_reason'] = pf.pop('verified_reason')
            pf['is_daren'] = pf.pop('is_daren')
            pf['daren_type'] = pf.pop('daren_type')
            pf['daren_point'] = pf.pop('daren_point')
            pf['daren_interest'] = pf.pop('daren_interest')
            pf['is_taobao'] = pf.pop('is_taobao')
            pf['not_exist'] = pf.pop('not_exist')
            pf['timestamp'] = pf.pop('timestamp')
    def clear_buffer(self):
        '''
        clear memory buffer after storing the information
        :return:
        '''

        self.followee_list = []
        self.follower_list = []
        self.timeline_list = []
        self.profile_list = []
    def clear_null_data(self):
        '''
        clear empty or None data for all information
        :return:
        '''
        followee_list = []
        follower_list = []
        timeline_list = []
        profile_list = []

        for fee in self.followee_list:
            dict = {}
            for key in fee:
                if fee[key] is None or fee[key] == '':
                    continue
                else:
                    dict[key] = fee[key]
            followee_list.append(dict)
        self.followee_list = followee_list

        for fer in self.follower_list:
            dict = {}
            for key in fer:
                if fer[key] is None or fer[key] == '':
                    continue
                else:
                    dict[key] = fer[key]
            follower_list.append(dict)
        self.follower_list = follower_list

        for tl in self.timeline_list:
            dict = {}
            for key in tl:
                if tl[key] is None or tl[key] == '':
                    continue
                else:
                    dict[key] = tl[key]
            timeline_list.append(dict)
        self.timeline_list = timeline_list


        for pf in self.profile_list:
            dict = {}
            for key in pf:
               if pf[key] is None or pf[key] == '':
                   continue
               else:
                   dict[key] = pf[key]
            profile_list.append(dict)
        self.profile_list = profile_list


        for pf in self.profile_list:
            job_list = []
            edu_list = []
            for job in pf['Job']:
                dict = {}
                for key in job:
                    if job[key] is None or job[key] == '':
                        continue
                    else:
                        dict[key] = job[key]
                job_list.append(dict)
            pf['Job'] = job_list

            for edu in pf['Education']:
                dict = {}
                for key in edu:
                    if edu[key] is None or edu[key] == '':
                        continue
                    else:
                        dict[key] = edu[key]
                edu_list.append(dict)
            pf['Education'] = edu_list



    def set_user_deleted_by_uid(self, uid):
        """
        Flag the Dao.Task row of a user as deleted (is_deleted = '1').

        :param uid: uid of the task row to flag
        :return:
        :raises: whatever query(...).one() raises when there is not exactly
            one matching row (presumably SQLAlchemy's NoResultFound /
            MultipleResultsFound - confirm against Dao.Database)
        """
        db = Database()
        db.connect()
        try:
            task = db.session.query(Dao.Task).filter(Dao.Task.uid == uid).one()
            task.is_deleted = '1'
        finally:
            # one() raises for an unknown uid; close the connection in that
            # case too instead of leaking it.
            db.close()

Example #12

Show file

File: Spider.py Project: Catfishly/SinaSpider

class Spider(object):
    def __init__(self, user_list, cookie_list=None):
        """
        build a spider either from ready-made cookies or by logging users in

        :param user_list: a list of users
        :param cookie_list: a list of cookies, default to be None
        :return:
        """
        # These attributes are read by other methods (save() uses
        # self.database, ban_account()/reset_account() use self.users), so
        # they are created whichever way the spider is constructed.
        self.users = user_list
        self.cookies = cookie_list
        self.fetchers = []
        self.parser = HtmlParser()
        self.database = Database()

        if cookie_list is not None:
            self.get_fetchers_by_cookie()
        else:  # need login by users
            self.get_fetchers_by_user()

        self.main_fetcher = 0  # current fetcher index
        self.follower_list = []  # store followers
        self.followee_list = []  # store followees
        self.timeline_list = []  # store timelines
        self.profile_list = []  # store profiles
        self.start_time = datetime.now()
        self.end_time = None

    def delete_enterprise_user(self, uid):
        """
        hand the given uid over to Dao.Task.delete_user

        :param uid: uid of the user to delete
        :return:
        """
        Dao.Task.delete_user(uid)

    def ban_account(self):
        """
        probe the unfreeze page with the working account and retire the
        account when the parser reports it as exceptional or frozen

        :return:
        :raise Exception: 'No valid account!' when no fetcher is left
        """
        probe_html = open_url(self.fetchers[self.main_fetcher],
                              'http://sass.weibo.com/unfreeze')
        exceptional = self.parser.is_exceptional(probe_html)
        frozen = self.parser.is_frozen(probe_html)
        if exceptional is False and frozen is False:
            return  # the account looks healthy, nothing to do

        Dao.Account.ban(self.users[self.main_fetcher].acct)
        emphasis_print('One account has been banned!!!')

        # users and fetchers are parallel lists: drop the same slot in both
        self.users.pop(self.main_fetcher)
        self.fetchers.pop(self.main_fetcher)

        remaining = len(self.fetchers)
        if self.main_fetcher == remaining:
            # the removed slot was the last one, wrap around to the first
            self.main_fetcher = 0
        if remaining == 0:
            raise Exception('No valid account!')

    def reset_account(self):
        """
        report the working account to Dao.Account.reset and remove it,
        together with its fetcher, from the working set

        :return:
        :raise Exception: 'No valid account!' when no fetcher is left
        """
        Dao.Account.reset([self.users[self.main_fetcher]])
        emphasis_print('One account expires!!!')

        # users and fetchers are parallel lists: drop the same slot in both
        self.users.pop(self.main_fetcher)
        self.fetchers.pop(self.main_fetcher)

        remaining = len(self.fetchers)
        if self.main_fetcher == remaining:
            # the removed slot was the last one, wrap around to the first
            self.main_fetcher = 0
        if remaining == 0:
            raise Exception('No valid account!')

    def collect_user_information(self, uid):
        """
        crawl followers, followees, timelines and profile of one user

        :param uid: uid of the user to crawl
        :return: 404 when get_pid() yields '' (user does not exist),
                 -1 when get_pid() yields -1 (enterprise user),
                 None after a complete crawl
        """
        print('Collecting information for User %s...' % (uid, ))
        pid = self.get_pid(uid)
        if pid == '':
            print('User does not exist!')
            self.set_user_deleted_by_uid(uid)
            return 404
        elif pid == -1:
            print('This user is an enterprise user!')
            self.delete_enterprise_user(uid)
            return -1

        self.get_followers(pid)
        print('Followers crawled.')
        self.get_followees(pid)
        print('Followees crawled.')
        self.get_timelines(uid)
        print('Timelines crawled.')
        self.get_profile(pid)
        print('Profile crawled.')

    def collect_user_profiles_only(self, uid):
        """
        crawl only the profile of one user

        :param uid: uid of the user to crawl
        :return: 404 when get_pid() yields '' (user does not exist),
                 -1 when get_pid() yields -1 (enterprise user),
                 None after the profile has been crawled
        """
        print('Collecting profile for User %s...' % (uid, ))
        pid = self.get_pid(uid)
        if pid == '':
            print('User does not exist!')
            self.set_user_deleted_by_uid(uid)
            return 404
        elif pid == -1:
            # get_pid() signals an enterprise user with -1; get_profile()
            # slices pid as a string and cannot work with that value, so the
            # case is handled the same way collect_user_information does.
            print('This user is an enterprise user!')
            self.delete_enterprise_user(uid)
            return -1
        self.get_profile(pid)
        print('Profile crawled.')

    def get_fetchers_by_user(self):
        """
        initialize self.fetchers by user

        Every user in self.users is logged in through Weibo.login(). Users
        whose login fails are dropped from self.users so that self.users[i]
        always belongs to self.fetchers[i]; ban_account() and reset_account()
        rely on that pairing when they remove slot self.main_fetcher from
        both lists.
        :return:
        """
        wb = Weibo()
        logged_in_users = []
        for user in self.users:
            fetcher = wb.login(user)
            if fetcher is not None:
                emphasis_print('User: %s login success!' % (user.acct, ))
                self.fetchers.append(fetcher)
                logged_in_users.append(user)
            else:
                emphasis_print('User: %s login failure!' % (user.acct, ))
        # rebind instead of mutating, the caller's list is left untouched
        self.users = logged_in_users
        print('Get all fetchers by users!\n')

    def get_fetchers_by_cookie(self):
        """
        initialize self.fetchers by cookie
        :return:
        """
        pass

    def get_pid(self, uid):
        """
        fetch the home page of a user and extract the page id (pid) from it

        :param uid:
        :return: corresponding pid, or -1 when the parser reports that the
                 user is an enterprise
        """
        url = 'http://www.weibo.com/u/%s' % (uid, )
        while True:
            # Look the fetcher up on every attempt: ban_account() and
            # reset_account() remove the current fetcher from self.fetchers,
            # so a fetcher cached outside the loop would still be used after
            # its account has been retired.
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, url)

            parsed_uid = self.parser.parse_uid(html)
            if parsed_uid == -1:
                self.ban_account()
                continue
            elif self.parser.is_visitor(html) is True:
                self.reset_account()
                continue  # make sure that the html is correct.

            is_enterprise = self.parser.parse_is_enterprise(html)
            if is_enterprise is True:
                return -1  # -1 denotes this user is an enterprise

            pid = self.parser.parse_pid(html)
            if pid is not None:
                return pid

            # no pid could be parsed: log, back off for a while and retry
            log.error('Cannot get pid for uid:%s' % (uid, ))
            time.sleep(
                random.randint(Config.SLEEP_WHEN_EXCEPTION,
                               2 * Config.SLEEP_WHEN_EXCEPTION))

    def get_followers(self, pid):
        """
        crawl the follower pages of the user behind pid and append the parsed
        followers to self.follower_list

        :param pid: page id of the user
        :return:
        """
        first_url = ('http://www.weibo.com/p/' + pid +
                     '/follow?relate=fans&from=' + pid[:6] +
                     '&wvr=6&mod=headfans#place')

        # keep fetching the first page until a page count can be read from it
        fer_page_num = None
        while fer_page_num is None:
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, first_url)

            if self.parser.parse_uid(html) == -1:
                self.ban_account()
                continue
            if self.parser.is_visitor(html) is True:
                self.reset_account()
                continue

            fer_page_num = self.get_follower_page_num(html)
            if fer_page_num is None:
                log.warning(
                    'Cannot get total follower page number for pid:%s' %
                    (pid, ))
                time.sleep(
                    random.randint(Config.SLEEP_WHEN_EXCEPTION,
                                   2 * Config.SLEEP_WHEN_EXCEPTION))

        if fer_page_num == 0:
            print('He/She does not have any followers.')
            return

        print('Getting follower page 1 of %d...' % (fer_page_num, ))
        # original author's note: the result cannot be None here because the
        # same html already went through self.get_follower_page_num(html)
        # -> self.parser.parse_follower_page_num(html)
        self.follower_list.extend(
            self.parser.parse_followers(html, pid, datetime.now()))

        # the range is empty when there is only one page
        for page in xrange(2, fer_page_num + 1):
            page_url = 'http://www.weibo.com/p/%s/follow?relate=fans&from=%s&wvr=6&mod=headfans&page=%d#place' % (
                pid, pid[:6], page)
            while True:  # retry the same page until it can be parsed
                print('Getting follower page %d of %d...' % (page,
                                                             fer_page_num))
                html = open_url(fetcher, page_url)
                time.sleep(
                    random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                                   2 * Config.SLEEP_BETWEEN_2FPAGES))
                page_followers = self.parser.parse_followers(
                    html, pid, datetime.now())
                if page_followers is not None:
                    self.follower_list.extend(page_followers)
                    break
                # dirty html
                log.warning(
                    'Cannot parse follower page - pid:%s, page num:%d'
                    % (pid, page))
                time.sleep(
                    random.randint(Config.SLEEP_WHEN_EXCEPTION,
                                   2 * Config.SLEEP_WHEN_EXCEPTION))

    def get_follower_page_num(self, html):
        """
        get the number of follower pages, up to 5.
        :param html:
        :return:
        """
        fer_pnum = self.parser.parse_follower_page_num(html)
        if fer_pnum is None:
            return None
        if fer_pnum > 6:
            fer_pnum = 5
        return fer_pnum

    def get_followees(self, pid):
        """
        crawl the followee pages of the user behind pid and append the parsed
        followees to self.followee_list

        :param pid: page id of the user
        :return:
        """
        first_url = ('http://www.weibo.com/p/' + pid +
                     '/follow?from=page_' + pid[:6] +
                     '&wvr=6&mod=headfollow#place')

        # keep fetching the first page until a page count can be read from it
        fee_page_num = None
        while fee_page_num is None:
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, first_url)

            if self.parser.parse_uid(html) == -1:
                self.ban_account()
                continue
            if self.parser.is_visitor(html) is True:
                self.reset_account()
                continue

            fee_page_num = self.get_followee_page_num(html)
            if fee_page_num is None:
                log.warning('Cannot get followee page total number - pid:%s' %
                            (pid, ))
                time.sleep(
                    random.randint(Config.SLEEP_WHEN_EXCEPTION,
                                   2 * Config.SLEEP_WHEN_EXCEPTION))

        if fee_page_num == 0:
            print('He/She does not follow any one.')
            return

        print('Getting followee page 1 of %d...' % (fee_page_num, ))
        # original author's note: the result cannot be None here because the
        # same html already went through self.get_followee_page_num(html)
        # -> self.parser.parse_followee_page_num(html)
        self.followee_list.extend(
            self.parser.parse_followees(html, pid, datetime.now()))

        # the range is empty when there is only one page
        for page in xrange(2, fee_page_num + 1):
            page_url = 'http://www.weibo.com/p/%s/follow?from=page_%s&wvr=6&mod=headfollow&page=%d#place' % (
                pid, pid[:6], page)
            while True:  # retry the same page until it can be parsed
                print('Getting followee page %d of %d...' % (page,
                                                             fee_page_num))
                html = open_url(fetcher, page_url)
                time.sleep(
                    random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                                   2 * Config.SLEEP_BETWEEN_2FPAGES))
                page_followees = self.parser.parse_followees(
                    html, pid, datetime.now())
                if page_followees is not None:
                    self.followee_list.extend(page_followees)
                    break
                # dirty html
                log.warning(
                    'Cannot parse followee page correctly - pid:%s' %
                    (pid, ))
                time.sleep(
                    random.randint(Config.SLEEP_WHEN_EXCEPTION,
                                   2 * Config.SLEEP_WHEN_EXCEPTION))

    def get_followee_page_num(self, html):
        """
        get the number of followee pates, no value more than five
        :param html:
        :return:
        """
        fee_pnum = self.parser.parse_followee_page_num(html)
        if fee_pnum is None:
            return None
        if fee_pnum > 6:
            fee_pnum = 5
        return fee_pnum

    def get_timelines(self, uid):
        """
        get all timelines of user with this uid

        The parts of the first page come from get_timeline_page_num(); the
        remaining pages are fetched part by part (three parts per page).
        After each page the working account is rotated once it has been in
        use for more than Config.ACCOUNT_CHANGE_TIME seconds.
        :param uid:
        :return:
        """
        timeline_page_num, first_page = self.get_timeline_page_num(uid)
        if timeline_page_num == 0:
            print('No any posts.')
            return

        for pt in first_page:
            self.timeline_list.extend(
                self.parser.parse_timelines(pt, uid, datetime.now()))
        if timeline_page_num == 1:
            print('He/She just has one page timeline.')
            return

        for pnum in xrange(2, timeline_page_num + 1):
            print('There are totally %d timeline pages.' % (
                timeline_page_num, ))
            for bnum in xrange(3):
                html = self.fetch_timelines_by_page_bar(uid, pnum, bnum)
                time.sleep(
                    random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                                   2 * Config.SLEEP_BETWEEN_2FPAGES))
                if html is not None:
                    self.timeline_list.extend(
                        self.parser.parse_timelines(html, uid,
                                                    datetime.now()))

            self.end_time = datetime.now()
            duration = self.end_time - self.start_time
            # total_seconds() covers the whole span; .seconds alone drops the
            # days component of the timedelta
            if duration.total_seconds() > Config.ACCOUNT_CHANGE_TIME:
                self.main_fetcher = loop_increase(self.main_fetcher,
                                                  len(self.fetchers))
                self.start_time = datetime.now()
                emphasis_print('Account changed!!!')
                emphasis_print('Now %d of %d accounts are working!' %
                               (self.main_fetcher + 1, len(self.fetchers)))
            time.sleep(
                random.randint(Config.SLEEP_BETWEEN_TIMELINE_PAGES,
                               2 * Config.SLEEP_BETWEEN_TIMELINE_PAGES))

    def fetch_timelines_by_page_bar(self, uid, pnum, bnum):
        """
        fetch timelines by specifying page number and bar number
        :param uid:
        :param pnum: page number
        :param bnum: bar number
        :return: html containing timelines or None if there are no timelines
        :raise Exception: an exception whose message contains
                          'No valid account!' is passed on to the caller
        """
        # These values were obtained by capturing network traffic: Sina Weibo
        # loads a page dynamically (waterfall flow), so the whole content of
        # a page cannot be fetched in one request.
        body = {
            '__rnd': 1343647638078,
            '_k': 1343647471134109,
            '_t': 0,
            'count': 15,
            'end_id': 3473519214542343,
            'max_id': 3473279479126179,
            'page': 1,
            'pagebar': 1,
            'pre_page': 1,
            'uid': uid
        }

        body['page'] = pnum

        if bnum == 0:
            body['count'] = '50'
            body['pagebar'] = ''
            body['pre_page'] = pnum - 1
        elif bnum == 1:
            body['count'] = '15'
            body['pagebar'] = '0'
            body['pre_page'] = pnum
        elif bnum == 2:
            body['count'] = '15'
            body['pagebar'] = '1'
            body['pre_page'] = pnum

        url = 'http://weibo.com/aj/mblog/mbloglist?' + urllib.urlencode(body)
        while True:
            jsn_data = None  # nothing fetched yet in this attempt
            try:
                print('Getting timeline page %d part %d...' % (
                    pnum, bnum + 1))  # bnum starts with zero up to two
                jsn_data = open_url(self.fetchers[self.main_fetcher], url)
                if self.parser.is_frozen(jsn_data):
                    self.ban_account()
                    continue

                data = json.loads(jsn_data)
                html = data['data']
                if u'WB_feed_type SW_fun S_line2' in html:
                    return html
                else:
                    return None
            except Exception as e:
                if 'No valid account!' in e.message:
                    # bare raise keeps the original traceback
                    raise
                # jsn_data is only inspected when a response was received
                if (jsn_data is not None
                        and 'No JSON object could be decoded' in e.message):
                    if self.parser.is_visitor(jsn_data) is True:
                        self.reset_account()
                    else:
                        self.ban_account()
                log.warning(e.message)
                time.sleep(
                    random.randint(Config.SLEEP_WHEN_EXCEPTION,
                                   2 * Config.SLEEP_WHEN_EXCEPTION))
                continue

    def get_timeline_page_num(self, uid):
        """
        work out how many timeline pages a user has

        :param uid:
        :return: page number and the parts of the first page fetched so far
                 (kept to decrease accesses to the Sina server); the page
                 number is 0 when the very first part holds no timelines
        """
        htmls = []  # keep the pages to decrease accesses to Sina
        for bnum in xrange(3):  # head, body and tail of the first page
            part = self.fetch_timelines_by_page_bar(uid, 1, bnum)
            if part is None:
                # no head means no posts at all; a missing body or tail means
                # there is just one page of timelines
                return (0 if bnum == 0 else 1), htmls
            htmls.append(part)
            time.sleep(
                random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                               2 * Config.SLEEP_BETWEEN_2FPAGES))

        # this page number is not accurate, so it is re-checked below
        pnum = self.parser.parse_timeline_page_num(htmls[-1])
        if pnum is None or pnum <= 1:
            return 1, htmls

        # Walk down from the parsed number until a page without the
        # "no post" marker is found. The walk stops at 1 because the first
        # page is already known to hold posts, so pnum never drops to 0.
        no_post = 'W_icon icon_warnB'
        while pnum > 1:
            url = 'http://www.weibo.com/%s?page=%d&pids=Pl_Content_HomeFeed' % (
                uid, pnum)
            test_html = open_url(self.fetchers[self.main_fetcher], url)
            time.sleep(
                random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                               2 * Config.SLEEP_BETWEEN_2FPAGES))
            if no_post not in test_html:
                break
            pnum -= 1  # fixing page number
        return pnum, htmls

    def get_profile(self, pid):
        """
        get profile information for User marked with pid and append it to
        self.profile_list

        :param pid: page id
        :return:
        :raise Exception: an exception whose message contains
                          'No valid account!' is passed on to the caller
        """
        url = 'http://www.weibo.com/p/%s/info?mod=pedit_more' % (pid, )

        uid = pid[6:]
        is_taobao = None
        while is_taobao is None:
            try:
                # get taobao information in advance; the -1/-2 sentinels are
                # kept in a separate name so that they can never leave the
                # loop as the final is_taobao value
                status = self.is_taobao(uid)
                if status == -1:
                    self.ban_account()
                elif status == -2:
                    self.reset_account()
                else:
                    is_taobao = status
            except Exception as e:
                if 'No valid account!' in e.message:
                    # retrying is pointless without any account left
                    raise
                print(e.message)
            time.sleep(
                random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                               2 * Config.SLEEP_BETWEEN_2FPAGES))

        profile = None
        print('Getting profile page...')
        while profile is None:
            fetcher = self.fetchers[self.main_fetcher]
            html = open_url(fetcher, url)

            if self.parser.parse_uid(html) == -1:
                # if -1 that means this working account maybe be banned
                self.ban_account()
                continue
            elif self.parser.is_visitor(html) is True:
                # the working account fell into visitor status
                self.reset_account()
                continue

            profile = self.parser.parse_profile(html, pid, is_taobao,
                                                datetime.now())
            time.sleep(
                random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                               2 * Config.SLEEP_BETWEEN_2FPAGES))

        self.profile_list.append(profile)

    def is_taobao(self, uid):
        """
        fetch the home page of a user and ask the parser whether the user is
        a taobao shopkeeper

        :param uid: user ID
        :return: -1 when parse_uid() reports -1 for the page (account is
                 banned), -2 when the page is a visitor page (account is in
                 visitor status), otherwise the result of
                 self.parser.parse_is_taobao (the original documentation
                 describes it as '1' or '0')
        """
        page = open_url(self.fetchers[self.main_fetcher],
                        'http://www.weibo.com/u/' + uid)

        # the fetched page is dumped to debug_taobao.txt on every call
        with open('debug_taobao.txt', 'w') as dump:
            dump.write(page)

        if self.parser.parse_uid(page) == -1:
            return -1
        if self.parser.is_visitor(page) is True:
            return -2
        return self.parser.parse_is_taobao(page)

    def save(self):
        """
        save crawled information to DB

        The buffered followees, followers, timelines and profiles (with
        their jobs and educations) are merged into the database session and
        the buffers are cleared afterwards.
        :return:
        """
        self.transformation()
        # this function must be called after self.transformation
        self.clear_null_data()

        self.database.connect()

        for fee in self.followee_list:
            self.database.session.merge(Dao.Followee(fee))
        for fer in self.follower_list:
            self.database.session.merge(Dao.Follower(fer))
        for tl in self.timeline_list:
            try:
                # '\x01' (octal 001) is a control character that has to be
                # eliminated; it is written as an escape so that the literal
                # stays visible and cannot degrade into an empty string.
                text = tl['text'].replace('\x01', ' ')
                # remove all the linefeed
                tl['text'] = text.replace('\r', ' ').replace('\n', ' ')
            except (KeyError, AttributeError) as e:
                # a timeline without a usable text is stored as it is
                print(e.message)
            self.database.session.merge(Dao.Timeline(tl))
        for pf in self.profile_list:
            for jb in pf['Job']:
                self.database.session.merge(Dao.Job(jb))
            for edu in pf['Education']:
                self.database.session.merge(Dao.Education(edu))

            # the nested lists are removed before the profile is handed to
            # Dao.User
            del pf['Job']
            del pf['Education']

            self.database.session.merge(Dao.User(pf))

        self.clear_buffer()

        self.database.close()

    def save_only_profile(self):
        """
        merge the buffered profiles, with their jobs and educations, into the
        database session and clear the buffers afterwards

        :return:
        """
        self.transformation()
        # clear_null_data must run after transformation
        self.clear_null_data()

        db = self.database
        db.connect()

        for profile in self.profile_list:
            for job in profile['Job']:
                db.session.merge(Dao.Job(job))
            for education in profile['Education']:
                db.session.merge(Dao.Education(education))

            # the nested lists are removed before the profile is handed to
            # Dao.User
            del profile['Job']
            del profile['Education']

            db.session.merge(Dao.User(profile))

        self.clear_buffer()

        db.close()

    def transformation(self):
        '''
        transfer keys of self.followee_list, self.follower_list, self.timeline_list and self.profile_list to fit field names in database
        :return:
        '''

        for fee in self.followee_list:
            fee['uid'] = fee.pop('uid')
            fee['fee_uid'] = fee.pop('fee_uid')
            fee['fee_name'] = fee.pop('name')
            fee['fee_profile_img_url'] = fee.pop('profile_img')
            fee['fee_description'] = fee.pop('description')
            fee['fee_gender'] = fee.pop('gender')
            fee['fee_location'] = fee.pop('location')
            fee['fee_by'] = fee.pop('app_source')
            fee['fee_followee_num'] = fee.pop('followee_num')
            fee['fee_follower_num'] = fee.pop('follower_num')
            fee['fee_weibo_num'] = fee.pop('weibo_num')
            fee['fee_verified_type'] = fee.pop('verified_type')
            fee['fee_is_vip'] = fee.pop('is_vip')
            fee['fee_vip_level'] = fee.pop('vip_level')
            fee['fee_is_daren'] = fee.pop('is_daren')
            fee['fee_is_taobao'] = fee.pop('is_taobao')
            fee['fee_is_suishoupai'] = fee.pop('is_suishoupai')
            fee['fee_is_vlady'] = fee.pop('is_vlady')
            fee['fee_timestamp'] = fee.pop('timestamp')

        for fer in self.follower_list:
            fer['uid'] = fer.pop('uid')
            fer['fer_uid'] = fer.pop('fer_uid')
            fer['fer_name'] = fer.pop('name')
            fer['fer_profile_img_url'] = fer.pop('profile_img')
            fer['fer_description'] = fer.pop('description')
            fer['fer_gender'] = fer.pop('gender')
            fer['fer_location'] = fer.pop('location')
            fer['fer_by'] = fer.pop('app_source')
            fer['fer_followee_num'] = fer.pop('followee_num')
            fer['fer_follower_num'] = fer.pop('follower_num')
            fer['fer_weibo_num'] = fer.pop('weibo_num')
            fer['fer_verified_type'] = fer.pop('verified_type')
            fer['fer_is_vip'] = fer.pop('is_vip')
            fer['fer_vip_level'] = fer.pop('vip_level')
            fer['fer_is_daren'] = fer.pop('is_daren')
            fer['fer_is_taobao'] = fer.pop('is_taobao')
            fer['fer_is_suishoupai'] = fer.pop('is_suishoupai')
            fer['fer_is_vlady'] = fer.pop('is_vlady')
            fer['fer_timestamp'] = fer.pop('timestamp')

        for tl in self.timeline_list:
            tl['mid'] = tl.pop('mid')
            tl['encrypt_mid'] = tl.pop('encrypted_mid')
            tl['uid'] = tl.pop('uid')
            tl['retweet_num'] = tl.pop('retweet')
            tl['comment_num'] = tl.pop('comment')
            tl['favourite_num'] = tl.pop('favourite')
            tl['created_at'] = tl.pop('created_at')
            tl['from'] = tl.pop('app_source')
            tl['text'] = tl.pop('text')
            tl['entity'] = tl.pop('entity')
            tl['source_mid'] = tl.pop('source_mid')
            tl['source_uid'] = tl.pop('source_uid')
            tl['mentions'] = tl.pop('mentions')
            tl['check_in'] = tl.pop('check_in')
            tl['check_in_url'] = tl.pop('check_in_url')
            tl['is_deleted'] = tl.pop('is_deleted')
            tl['timestamp'] = tl.pop('timestamp')

        for pf in self.profile_list:
            for jb in pf['Job']:
                jb['uid'] = pf['uid']
                jb['timestamp'] = pf['timestamp']

                jb['company'] = jb.pop('company')
                jb['location'] = jb.pop('location')
                jb['occupation'] = jb.pop('occupation')
                jb['time_period'] = jb.pop('period')

            for edu in pf['Education']:
                edu['uid'] = pf['uid']
                edu['timestamp'] = pf['timestamp']

                edu['school_name'] = edu.pop('university')
                edu['time_period'] = edu.pop('period')
                edu['department'] = edu.pop('department')
                edu['type'] = edu.pop('type')

            pf['uid'] = pf.pop('uid')
            pf['screen_name'] = pf.pop('nickname')
            pf['real_name'] = pf.pop('name')
            pf['location'] = pf.pop('location')
            pf['gender'] = pf.pop('gender')
            pf['sexual_orientation'] = pf.pop('sexual_orientation')
            pf['relationship_status'] = pf.pop('relationship_status')
            pf['birthday'] = pf.pop('birthday')
            pf['blood_type'] = pf.pop('blood_type')
            pf['blog'] = pf.pop('blog')
            pf['description'] = pf.pop('description')
            pf['email'] = pf.pop('email')
            pf['QQ'] = pf.pop('QQ')
            pf['MSN'] = pf.pop('MSN')
            pf['tag'] = pf.pop('tag')
            pf['followee_num'] = pf.pop('followee_num')
            pf['follower_num'] = pf.pop('follower_num')
            pf['weibo_num'] = pf.pop('weibo_num')
            pf['created_at'] = pf.pop('created_at')
            pf['profile_img_url'] = pf.pop('profile_img')
            pf['domain_id'] = pf.pop('domain_id')
            pf['domain_name'] = pf.pop('domain_name')
            pf['level'] = pf.pop('level')
            pf['experience'] = pf.pop('experience')
            pf['credit_level'] = pf.pop('credit_level')
            pf['credit_point'] = pf.pop('credit_point')
            pf['credit_history'] = pf.pop('credit_history')
            pf['is_vip'] = pf.pop('is_vip')
            pf['vip_level'] = pf.pop('vip_level')
            pf['is_yearly_pay'] = pf.pop('is_yearly_paid')
            pf['is_verified'] = pf.pop('is_verified')
            pf['verified_reason'] = pf.pop('verified_reason')
            pf['is_daren'] = pf.pop('is_daren')
            pf['daren_type'] = pf.pop('daren_type')
            pf['daren_point'] = pf.pop('daren_point')
            pf['daren_interest'] = pf.pop('daren_interest')
            pf['is_taobao'] = pf.pop('is_taobao')
            pf['not_exist'] = pf.pop('not_exist')
            pf['timestamp'] = pf.pop('timestamp')

    def clear_buffer(self):
        '''
        clear memory buffer after storing the information
        :return:
        '''

        self.followee_list = []
        self.follower_list = []
        self.timeline_list = []
        self.profile_list = []

    def clear_null_data(self):
        '''
        clear empty or None data for all information
        :return:
        '''
        followee_list = []
        follower_list = []
        timeline_list = []
        profile_list = []

        for fee in self.followee_list:
            dict = {}
            for key in fee:
                if fee[key] is None or fee[key] == '':
                    continue
                else:
                    dict[key] = fee[key]
            followee_list.append(dict)
        self.followee_list = followee_list

        for fer in self.follower_list:
            dict = {}
            for key in fer:
                if fer[key] is None or fer[key] == '':
                    continue
                else:
                    dict[key] = fer[key]
            follower_list.append(dict)
        self.follower_list = follower_list

        for tl in self.timeline_list:
            dict = {}
            for key in tl:
                if tl[key] is None or tl[key] == '':
                    continue
                else:
                    dict[key] = tl[key]
            timeline_list.append(dict)
        self.timeline_list = timeline_list

        for pf in self.profile_list:
            dict = {}
            for key in pf:
                if pf[key] is None or pf[key] == '':
                    continue
                else:
                    dict[key] = pf[key]
            profile_list.append(dict)
        self.profile_list = profile_list

        for pf in self.profile_list:
            job_list = []
            edu_list = []
            for job in pf['Job']:
                dict = {}
                for key in job:
                    if job[key] is None or job[key] == '':
                        continue
                    else:
                        dict[key] = job[key]
                job_list.append(dict)
            pf['Job'] = job_list

            for edu in pf['Education']:
                dict = {}
                for key in edu:
                    if edu[key] is None or edu[key] == '':
                        continue
                    else:
                        dict[key] = edu[key]
                edu_list.append(dict)
            pf['Education'] = edu_list

    def set_user_deleted_by_uid(self, uid):
        """
        flag the Dao.Task row of a user as deleted (is_deleted = '1')

        :param uid: uid whose task row is looked up
        :return:
        """
        db = Database()
        db.connect()
        try:
            # .one() raises when there is not exactly one matching row
            # (assuming db.session is a SQLAlchemy session -- TODO confirm).
            task = db.session.query(Dao.Task).filter(Dao.Task.uid == uid).one()
            task.is_deleted = '1'
        finally:
            # release the connection even when the lookup above fails
            db.close()