Code Example #1
File: xueqiu.py  Project: cbbing/wealth_spider
    def get_user_activity_info(self, user_id):
        url = 'http://xueqiu.com/{}'.format(user_id)
        print(url)
        #r = get_requests(url, self.df_ip)
        driver = get_web_driver(url, has_proxy=False)
        max_window(driver)

        # Total page count of the user's original posts
        soup = BeautifulSoup(driver.page_source, 'html5lib')
        page_count = self._get_page_count(soup, 'statusLists')

        # Latest article publish time already stored in the database
        publish_time_lastest = self._get_lastest_publish_time(mysql_table_xueqiu_article, user_id)

        # Walk the article list page by page
        current_page = 1
        while current_page <= page_count:
            print("Page:%d / %d" % (current_page, page_count))

            archiveList = self._get_archive_list_in_one_page(soup, user_id)

            # Insert into MySQL. No existence check is needed: inserting a
            # duplicate raises an exception and the row is simply skipped.
            for archive in archiveList:
                archive.to_mysql()


            if len(archiveList) > 0:

                archive = archiveList[-1]

                # Stop once we reach posts that are already in the database.
                # Plain string comparison works here because both timestamps
                # share the same string format.
                if archive.publish_time < str(publish_time_lastest):
                    print(encode_wrap('Xueqiu: already up to date'))
                    break

                # Only posts published within the last year are of interest;
                # stop once we scroll past that window
                nowDate = GetNowTime2()
                now_year = int(nowDate[:4])
                # The same date one year ago, built by swapping the year substring
                last_year = nowDate.replace(str(now_year), str(now_year - 1))
                if archive.publish_time < last_year:
                    break

            # Click through to the next page
            clickStatus = self._click_next_page(driver, '//ul[@class="status-list"]', current_page + 1)
            if clickStatus:
                soup = BeautifulSoup(driver.page_source, 'html5lib')
                current_page += 1
                wait_time = self._get_wait_time()
                time.sleep(wait_time)
                print('Page:{}   Wait time:{}'.format(current_page, wait_time))
            else:
                print(encode_wrap('Failed to click the next page, exiting...'))
                break

            # Safety cap: stop after five pages per run
            if current_page > 5:
                break

        driver.quit()
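The helper `self._get_page_count` called above is defined elsewhere in the project and is not part of this excerpt. A minimal sketch of a stand-in, assuming the profile page renders a pagination block inside the container named by the second argument (`'statusLists'` here) whose links carry page numbers as text:

    from bs4 import BeautifulSoup

    def get_page_count(soup, container_id):
        """Return the total page count advertised by a pagination block.

        Hypothetical stand-in for self._get_page_count: assumes the element
        with id `container_id` contains <a> tags whose text is a page number.
        """
        container = soup.find(id=container_id)
        if container is None:
            return 1  # no pagination rendered: a single page of results
        page_numbers = [int(a.get_text().strip()) for a in container.find_all('a')
                        if a.get_text().strip().isdigit()]
        return max(page_numbers) if page_numbers else 1

    # Usage mirroring the example above:
    # soup = BeautifulSoup(driver.page_source, 'html5lib')
    # page_count = get_page_count(soup, 'statusLists')

Returning 1 when no pagination markup is found keeps the `while current_page <= page_count` loop harmless on profiles with a single page of posts.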
Code Example #2
File: xueqiu.py  Project: cbbing/wealth_spider
    def get_follow_list(self, id, list_type='friends_content'):
        try:

            if list_type == 'friends_content':
                search_time_column = 'follow_search_time'
            else:
                search_time_column = 'fans_search_time'

            # Skip users whose follow list has already been crawled
            query = "select %s from %s where user_id='%s'" % (search_time_column, big_v_table_mysql, id)
            df_query = pd.read_sql_query(query, engine)
            if len(df_query) > 0:
                data = df_query.loc[0, search_time_column]
                if data is not None and len(data) > 0:
                    print('follow list already fetched for %s' % id)
                    return

            self._check_if_need_update_ip()


            url = 'http://xueqiu.com/%s' % str(id)
            driver = self.get_web_driver(url)
            #driver.get(url)

            max_window(driver)

            # Simulate a click on the "Following" (or "Fans") tab
            driver.find_element_by_xpath('//a[@href="#{0}"]'.format(list_type)).click()
            soup = BeautifulSoup(driver.page_source, 'html5lib')

            # Total page count of the follow list
            page_count = self._get_page_count(soup, list_type)

            follow_list = []
            current_page = 1
            while current_page <= page_count:

                print "Page:%d / %d" % (current_page, page_count)

                try:
                    # add friends where follows>1000
                    follow_list_one_page = self._get_fans_list_in_one_page(soup)
                    follow_list.extend(follow_list_one_page)

                    t0 = time.time()

                    # Fetch big-V info for every user on this page,
                    # fanned out across a 10-worker thread pool
                    pool = ThreadPool(processes=10)
                    pool.map(self.get_BigV_Info, follow_list_one_page)
                    pool.close()
                    pool.join()

                    current_page += 1

                    # Deduct the time already spent so the effective pacing
                    # matches the configured wait interval
                    t1 = time.time()
                    wait_time = max(self._get_wait_time() - (t1 - t0), 0)
                    time.sleep(wait_time)
                    print('Page:{}   Wait time:{}'.format(current_page, wait_time))



                    # Click through to the next page
                    clickStatus = self._click_next_page(driver, '//ul[@class="users-list"]', current_page)
                    if clickStatus:
                        soup = BeautifulSoup(driver.page_source, 'html5lib')
                    else:
                        print(encode_wrap('No next page ({0}), exiting...'.format(current_page)))
                        break

                except Exception as e:
                    print(e)
                    break

            print('fans count:', len(follow_list))
            driver.quit()
            if not follow_list:
                return []



            # for follow in followList:
            #     self.get_BigV_Info(follow)

            # Save to the database: big-V users found in the fan list
            self._big_v_in_fans_to_sql(follow_list, id)

            # Mark this user's follow list as crawled
            sql = 'update {0} set {1} = "{2}" where user_id = "{3}"'.format(
                big_v_table_mysql, search_time_column, GetNowTime(), id)
            engine.execute(sql)

            return follow_list

        except Exception as e:
            print(e)
            return []
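Both examples drive pagination through `self._click_next_page`, which is also defined outside this excerpt. A sketch under the assumption that a pager follows the list located by the given XPath and exposes each page as a numbered link; the boolean return matches how `clickStatus` is used above:

    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    def click_next_page(driver, list_xpath, target_page):
        """Click the link for `target_page` in the pager near `list_xpath`.

        Hypothetical stand-in for self._click_next_page: assumes the pager
        renders <a> elements whose visible text is the page number. Returns
        True on success, False if the link is missing or the click fails.
        """
        xpath = '{0}/following::a[text()="{1}"]'.format(list_xpath, target_page)
        try:
            driver.find_element_by_xpath(xpath).click()
            return True
        except (NoSuchElementException, WebDriverException):
            return False

After a successful click both callers simply re-parse `driver.page_source`, so the only contract that matters is that the next page has loaded and True is returned.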