Code Example #1
    def __init__(self, db_server, api_server, proxy_server, client_uid, similar_min=15):
        self.api_server = api_server
        self.client_uid = client_uid

        self.account_pool = AccountPool(db_server, api_server, proxy_server)
        self.user_pool = UserPool(db_server, api_server, proxy_server)
        self.proxy_pool = ProxyPool(proxy_server)
        self.record_pool = RecordPool(db_server, api_server)
        self.similar_min = similar_min
Code Example #2
File: modules.py Project: indestinee/proxy_pool
 def __init__(self, args):
     sess = requests.Session()
     sess.mount('https://', HTTPAdapter(max_retries=Retry(total=3)))
     database_client = DB_CLIENTS[args.db](config.DB_NAME)
     log_name = time.strftime('proxy_pool_%Y%m%d_%H%M%S')
     self.args = args
     self.logger = ColorfulLog(LOG_LEVELS[args.level],
                               log_dir=config.LOG_PATH,
                               log_name=log_name)
     self.proxy_pool_client = Client(caller='proxy_pool',
                                     host=args.host,
                                     port=args.port)
     self.proxy_pool = ProxyPool(database_client, sess, self.logger,
                                 self.proxy_pool_client)
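Note that the retry adapter above is mounted only for https:// URLs, so plain-HTTP requests through this session are never retried. If HTTP proxies are in play, a second mount is needed; a minimal sketch, not part of the original project:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

sess = requests.Session()
adapter = HTTPAdapter(max_retries=Retry(total=3))
sess.mount('https://', adapter)
sess.mount('http://', adapter)  # also retry plain-HTTP traffic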
Code Example #3
    def __init__(self, db_server, api_server, proxy_server):
        self.print('Pending: Start initializing the account pool')
        self.api_server = api_server
        self.db_server = db_server

        self.session = requests.session()

        self.db = pymongo.MongoClient(self.db_server, 27017).net_ease.account

        self.proxy_pool = ProxyPool(proxy_server)
        self.login_accounts()
        self.refill_thread = threading.Thread(target=self.refill_tasks)
        self.refill_thread.start()

        self.print('Success: Finish initializing the account pool')
Code Example #4
File: task.py Project: KomorebiSaw/proxy_pool
def verify_proxy_useful():
    """
    定时任务, 验证 useful 中的代理是否可用
    :return:
    """
    logger.info('开始验证 useful 代理')
    count = ProxyPool.count()
    if count == 0:
        logger.info('useful 代理数量为零, 验证 done')
        return
    with futures.ThreadPoolExecutor(max_workers=10) as executor:
        proxys = ProxyPool.all()
        for proxy, result in zip(proxys,
                                 executor.map(_verify_proxy_useful, proxys)):
            # 无需知道结果
            pass
    logger.info('验证 useful 代理 done')
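The checker `_verify_proxy_useful` is referenced but not shown in this excerpt. A hypothetical sketch of such a checker, assuming proxies are stored as `ip:port` strings and that a plain request through the proxy is enough to validate it (both assumptions, not the project's actual code):

import requests

def _verify_proxy_useful(proxy):
    # Hypothetical: send a test request through the proxy and report success.
    proxies = {'http': 'http://{}'.format(proxy),
               'https': 'http://{}'.format(proxy)}
    try:
        resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
        return resp.status_code == 200
    except requests.RequestException:
        return False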
Code Example #5
File: task.py Project: KomorebiSaw/proxy_pool
def crawl_proxy():
    """
    定时任务, 间隔一定时间运行一次爬虫
    :return:
    """
    logger.info('开始运行爬虫...')
    if ProxyPool.full():
        logger.info('代理数量大于{}, 停止爬取'.format(max_num))
        return

    for ip_ports in _crawl_proxy():
        _save_to_db(ip_ports)
    logger.info('爬取任务 done')
Code Example #6
class Client:

    client_uid = ''
    client_song_id_set = set()

    api_server = ''

    user_pool = ''
    proxy_pool = ''
    account_pool = ''
    record_pool = ''

    uid_queue = ''

    most_similar_uid = 0
    same_song_num = -1

    similar_user_list = []
    similar_min = 15

    fail_search = 0
    success_search = 0
    cheat_search = 0
    block_search = 0


    threads = []
    terminate = False

    def __init__(self, db_server, api_server, proxy_server, client_uid, similar_min=15):
        self.api_server = api_server
        self.client_uid = client_uid

        self.account_pool = AccountPool(db_server, api_server, proxy_server)
        self.user_pool = UserPool(db_server, api_server, proxy_server)
        self.proxy_pool = ProxyPool(proxy_server)
        self.record_pool = RecordPool(db_server, api_server)
        self.similar_min = similar_min
        
    def print(self, content):
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), end=': ')
        print(content)

    def set_terminate(self):
        self.proxy_pool.set_terminate()
        self.user_pool.set_terminate()
        self.account_pool.set_terminate()
        self.terminate = True

    def get_all_client_song_ids(self):
        play_list_ids = self.get_all_client_play_list_ids()
        threads = []
        for play_list_id in play_list_ids:
            thread = threading.Thread(target=self.get_all_song_ids_in_play_list_to_set, args=[play_list_id])
            thread.start()
            threads.append(thread)
            # self.get_all_song_ids_in_play_list_to_set(play_list_id)
        for thread in threads:
            thread.join()
        self.print('Success: Finish fetching all ' + str(len(self.client_song_id_set)) + ' client song ids')

    def get_all_client_play_list_ids(self):
        get_play_list_api = '/user/playlist'
        params = {'uid': self.client_uid}
        response = requests.get(self.api_server + get_play_list_api, params=params)
        play_lists = response.json()['playlist']
        play_list_ids = []
        for play_list in play_lists:
            creator_id = play_list['creator']['userId']
            play_list_id = play_list['id']
            if creator_id == self.client_uid:
                play_list_ids.append(play_list_id)
                # print(play_list_id)
        return play_list_ids

    def get_all_song_ids_in_play_list_to_set(self, play_list_id):
        get_all_song_ids_in_play_list_api = '/playlist/detail'
        params = {'id': play_list_id}
        response = requests.get(self.api_server + get_all_song_ids_in_play_list_api, params=params)
        songs = response.json()['playlist']['tracks']
        for song in songs:
            song_id = song['id']
            self.client_song_id_set.add(song_id)

    def get_favourite_id_set(self, uid):
        get_favourite_api = '/user/record'
        params = {'uid': uid, 'type': 0}

        cookie_unit = self.account_pool.get_cookie_unit()
        cookies = cookie_unit['cookies']

        if not self.account_pool.is_available() or not self.proxy_pool.is_available():
            # print('Fail: The account pool or proxy pool is not available')
            self.uid_queue.put(uid)
            return []
        response = requests.get(self.api_server + get_favourite_api, params=params, proxies=self.proxy_pool.get(), cookies=cookies).json()

        if response['code'] == -460:
            # self.print('Fail: Detect cheating')
            self.fail_search += 1
            self.cheat_search += 1
            self.account_pool.remove_cheat_source(cookie_unit['phone'])
            return []

        if response['code'] == -2:
            # self.print('Fail: The user ' + str(uid) + ' block the favourite playlist')
            self.user_pool.delete_one_user(uid)
            self.fail_search += 1
            self.block_search += 1
            return []
        
        songs = response['allData']
        song_ids = set()
        for song in songs:
            song_ids.add(song['song']['song']['id'])
        self.success_search += 1

        total = self.success_search + self.fail_search
        if total % 50 == 0:
            self.print('Success: Finish ' + str(total) + ' in total, ' + str(self.success_search) + ' success, ' + str(self.cheat_search) + ' cheat, ' + str(self.block_search) + ' block')
            self.print('The most similar users found so far are:')
            print(self.similar_user_list)
        return song_ids

    def compare_song_list_with_one_uid_thread(self):
        while not self.terminate:
            self.compare_song_list_with_one_uid()

    def compare_song_list_with_one_uid(self):
        if self.uid_queue.qsize() > 0:
            target_user = self.uid_queue.get()
            target_uid = target_user['uid']
            target_nickname = target_user['nickname']
            target_gender = target_user['gender']

            target_favourite_song_id_set = self.get_favourite_id_set(target_uid)
            count = 0
            for song_id in target_favourite_song_id_set:
                if song_id in self.client_song_id_set:
                    count += 1
            if count > self.similar_min:
                target_user = {'target_uid': target_uid, 'same_num': count, 'target_nickname': target_nickname, 'target_gender': target_gender}
                self.similar_user_list.append(target_user)
                self.record_pool.upload_one_record(self.client_uid, target_user)
        else: 
            self.set_terminate()


    def find_most_similar_user_in_samples(self, sample_num, special):
        self.print('Pending: Start looking for most similar user')
        start_time = datetime.datetime.now()

        # determine the sample
        if special:
            self.uid_queue = self.user_pool.get_girl_user_sample_queue(sample_num)
        else:
            self.uid_queue = self.user_pool.get_uid_sample_queue(sample_num)

        self.get_all_client_song_ids()
        for i in range(0, 100):
            thread = threading.Thread(target=self.compare_song_list_with_one_uid_thread)
            self.threads.append(thread)
            thread.start()
        
        for thread in self.threads:
            thread.join()
        
        end_time = datetime.datetime.now()
        run_time = end_time - start_time
        self.set_terminate()

        self.print('Success: ' + str(self.success_search) + ' success search in ' + str(run_time.total_seconds()) + ' seconds')
        
        # self.record_pool.upload_all_records(self.client_uid, self.similar_user_list)
        # self.print('The most similar user found is ' + str(self.most_similar_uid))
        # self.print('You have ' + str(self.same_song_num) + ' songs in common')
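A caveat on the class body above: `client_song_id_set`, `similar_user_list`, and `threads` are mutable class attributes, so every `Client` instance mutates the same set and list objects (the same pattern appears in `UserPool` and `AccountPool` below). A minimal demonstration of the pitfall:

class Demo:
    shared = []              # class attribute: one list shared by all instances

    def __init__(self):
        self.own = []        # instance attribute: a fresh list per instance

a, b = Demo(), Demo()
a.shared.append(1)
a.own.append(1)
print(b.shared)  # prints [1] -- b observes a's append
print(b.own)     # prints []  -- instance state stays separate

Assigning these containers inside `__init__`, as the scalar attributes already are, would give each instance independent state.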
Code Example #7
class UserPool:

    api_server = ''

    db = ''

    proxy_pool = ''

    upload_queue = queue.Queue()
    upload_queue_min_size = 0
    upload_queue_max_size = 1000

    waiting_for_search_queue = queue.Queue()
    waiting_for_search_queue_min_size = 10
    waiting_for_search_queue_max_size = 500

    upload_threads = []
    refill_threads = []
    search_threads = []

    success_upload = 0
    fail_upload = 0
    uploaded_num = 0

    terminate = False
    

    def __init__(self, db_server, api_server, proxy_server):
        self.api_server = api_server
        self.db = pymongo.MongoClient(db_server, 27017).net_ease.user
        self.proxy_pool = ProxyPool(proxy_server)

    def print(self, content):
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), end=': ')
        print(content)
        
    def set_terminate(self):
        self.terminate = True
        self.proxy_pool.set_terminate()

    def delete_all_users(self):
        self.db.delete_many({})
        self.print('Success: Finish deleting all users')

    def delete_one_user(self, uid):
        self.db.delete_one({'uid': uid})

    def delete_duplicates(self):
        self.print('Pending: Start deleting duplicates')
        cursor = self.db.aggregate(
            [
                {"$group": {"_id": "$uid", "unique_ids": {"$addToSet": "$_id"}, "count": {"$sum": 1}}},
                {"$match": {"count": { "$gte": 2 }}}
            ]
        )
        response = []
        for doc in cursor:
            del doc["unique_ids"][0]
            for id in doc["unique_ids"]:
                response.append(id)
        print(response)
        self.db.delete_many({"_id": {"$in": response}})  # Collection.remove() is deprecated
        self.print('Success: Finish deleting ' + str(len(response)) + ' duplicates')

    def search_neighbours_thread(self):
        while not self.terminate:
            if self.upload_queue.qsize() < self.upload_queue_max_size:
                self.search_neighbours()


    def search_neighbours(self):
        if self.waiting_for_search_queue.qsize() > 0:
            get_followers_api = '/user/followeds'
            uid = self.waiting_for_search_queue.get()
            params = {'uid': uid}
            response = requests.get(self.api_server + get_followers_api, params=params, proxies=self.proxy_pool.get()).json()
            if response['code'] != 200:
                self.print('Fail: Unable to search neighbours of ' + str(uid))
                self.print(response)
                return False
            neighbours = response['followeds']
            # set the uid searched
            myquery = { 'uid': uid }
            newvalues = { "$set": { "searched": True } }
            self.db.update_one(myquery, newvalues)
            # put the result in the upload queue
            for neighbour in neighbours:
                user = {
                    'uid': neighbour['userId'],
                    'gender': neighbour['gender'],
                    'nickname': neighbour['nickname'],
                    'searched': False,
                }
                self.upload_queue.put(user)

    def upload_one_user(self, user):
        try:
            self.db.insert_one(user)
        except pymongo.errors.PyMongoError:
            # only count the failure; the original also counted it as a success
            self.fail_upload += 1
            return
        self.success_upload += 1


    def upload_result(self):
        if self.upload_queue.qsize() > 0:
            user = self.upload_queue.get()
            self.upload_one_user(user)
            if (self.fail_upload+self.success_upload) % 200 == 0:
                self.print('Success: Finish uploading ' + str(self.fail_upload+self.success_upload) + ' results, ' + str(self.success_upload) + ' success, ' + str(self.fail_upload) + ' fail')
                self.print('Success: ' + str(self.upload_queue.qsize()) + ' to be uploaded, ' + str(self.waiting_for_search_queue.qsize()) + ' waiting for search')
        if self.upload_queue.qsize() < self.waiting_for_search_queue_max_size:
            self.search_neighbours()

    def upload_thread(self):
        while not self.terminate:
            self.upload_result()

    def refill_waiting_for_search_queue(self, size):
        users = list(self.db.find({ 'searched': False }).limit(size))
        for user in users:
            self.waiting_for_search_queue.put(user['uid'])
        self.print('Success: Finish refilling the task queue with ' + str(len(users)) + ' entries, ' + str(self.waiting_for_search_queue.qsize()) + ' waiting for search')

    def refill_waiting_for_search_queue_thread(self):
        while not self.terminate:
            if self.waiting_for_search_queue.qsize() < self.waiting_for_search_queue_min_size:
                self.refill_waiting_for_search_queue(1000)

    def start_searching_valid_users(self, upload_thread_num):
        self.print('Pending: Start searching valid users')
        
        thread = threading.Thread(target=self.refill_waiting_for_search_queue_thread)
        self.refill_threads.append(thread)
        thread.start()

        # spawn the requested number of upload workers
        for i in range(0, upload_thread_num):
            thread = threading.Thread(target=self.upload_thread)
            self.upload_threads.append(thread)
            thread.start()

    def get_uid_sample_queue(self, size):
        user_queue = queue.Queue()
        query = [
            { '$match': {'searched': False} },  # filter first, then sample
            { '$sample': { 'size': size } }
        ]
        for user in self.db.aggregate(query):
            user_queue.put(user)
        return user_queue

    def get_girl_user_sample_queue(self, size):
        user_queue = queue.Queue()
        query = [
            { '$match': {'gender': 2} },  # filter first, then sample
            { '$sample': { 'size': size } }
        ]
        for user in self.db.aggregate(query):
            user_queue.put(user)
        self.print(str(user_queue.qsize()) + ' girl users sampled')
        return user_queue
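On the two sampling pipelines above: aggregation stages run in order, so putting `$sample` before `$match` (as the original code did) draws `size` random users first and only then filters them, typically yielding far fewer users than requested. Running `$match` first samples the requested `size` from the already-filtered set.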
Code Example #8
 def __init__(self, db_server, api_server, proxy_server):
     self.api_server = api_server
     self.db = pymongo.MongoClient(db_server, 27017).net_ease.user
     self.proxy_pool = ProxyPool(proxy_server)
Code Example #9
import sys, os
import time  # used by time.sleep below
from pprint import pprint
import pymysql
import random

from utils import year_generator
from daily import *
from proxy_pool import ProxyPool
from settings import *

pp = ProxyPool()

if __name__ == '__main__':
    sid = int(sys.argv[1])
    use_proxy = int(sys.argv[2])
    start = int(sys.argv[3])
    for date in year_generator(start_year=start):
        print(date)
        dd = crawl_daily_data(sid, date, use_proxy)
        if dd:
            insert_daily_data(sid, dd)
        bd = crawl_daily_bwibbw(sid, date, use_proxy)
        if bd:
            insert_bwibbw_data(sid, bd)
        time.sleep(random.randint(5, 15))
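The positional `sys.argv` parsing above fails with an `IndexError` when an argument is missing. A sketch of the same command-line interface using `argparse` (names and help texts are illustrative, not part of the project):

import argparse

parser = argparse.ArgumentParser(description='Crawl daily data for one stock.')
parser.add_argument('sid', type=int, help='stock id to crawl')
parser.add_argument('use_proxy', type=int, choices=[0, 1],
                    help='1 to route requests through the proxy pool')
parser.add_argument('start', type=int, help='first year to crawl')
args = parser.parse_args()  # e.g. args.sid, args.use_proxy, args.start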
Code Example #10
    queue = MyPriorityQueue(maxsize=config.queue_num)
    await proxy.init_proxy_pool(config.local_num)
    producer = []
    for idx, url in config.urls[place].items():
        # collect the tasks so asyncio.wait below actually awaits the producers
        producer.append(loop.create_task(
            douban_producer(queue, proxy, place, idx, url, 1, end_page,
                            config.producer_time)))
    consumer = [
        loop.create_task(douban_consumer(queue, proxy, i, config.consumer_num))
        for i in range(config.consumer_num)
    ]
    await asyncio.wait(consumer + producer)


if __name__ == "__main__":
    proxy = ProxyPool()
    event_loop = asyncio.get_event_loop()
    print("请输入对应的数字选择初始化模式")
    print("1 全部抓取")
    print("2 选择地区进行抓取")
    flag = input()
    if flag == "1":
        print("你已选择 模式1 全部抓取 请输入抓取页数")
        end_page = input()
        event_loop.run_until_complete(
            model_one(event_loop, proxy, int(end_page)))
    else:
        print("你已选择 模式2 选择地区抓取 请输入对应数字选择抓取地区")
        place_map = {
            idx: place
            for idx, place in enumerate(config.urls.keys())
Code Example #11
File: daily.py Project: xero7689/xStockSystem
headers = {
    'User-Agent':
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/61.0.3163.100 Chrome/61.0.3163.100 Safari/537.36',
    'Referer': 'http://www.tse.com.tw/zh/page/trading/exchange/BWIBBU.html'
}

# Decodes to "很抱歉，沒有符合條件的資料!" ("Sorry, no data matches the criteria"),
# presumably TWSE's marker for an empty result.
no_match_data_byte_string = b'\xe5\xbe\x88\xe6\x8a\xb1\xe6\xad\x89\xef\xbc\x8c\xe6\xb2\x92\xe6\x9c\x89\xe7\xac\xa6\xe5\x90\x88\xe6\xa2\x9d\xe4\xbb\xb6\xe7\x9a\x84\xe8\xb3\x87\xe6\x96\x99!'.decode(
    'utf8')
# Get Stock_id
_get_sid = """
SELECT stock_id FROM stock_list
"""

# Global proxy queue
proxy_pool = ProxyPool()


def _get_twsec_data(twsec_url, headers=None, use_proxy=False):
    max_retry = 5
    data = None
    while True:
        try:
            if use_proxy:
                ip, port, delay, count = proxy_pool.get()
                if not ip:
                    print('no proxy available')
                    use_proxy = None
                    continue
                proxies = {
                    'http': 'http://{}:{}'.format(ip, port),
Code Example #12
File: app.py Project: KomorebiSaw/proxy_pool
def count():
    n = ProxyPool.count()
    return str(n)
Code Example #13
File: app.py Project: KomorebiSaw/proxy_pool
def get():
    p = ProxyPool.get()
    return p
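Examples #12 and #13 look like view functions from the project's app.py with their route decorators cropped out of the excerpt. Assuming the project serves them with Flask (an assumption; the file name and plain-string return values merely suggest it), the wiring would look roughly like:

from flask import Flask
from proxy_pool import ProxyPool  # import path assumed

app = Flask(__name__)

@app.route('/count')  # route paths assumed, not taken from the project
def count():
    return str(ProxyPool.count())

@app.route('/get')
def get():
    return ProxyPool.get()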
Code Example #14
File: test_proxy.py Project: lxyangfan/lianjia_fetch
from proxy_pool import ProxyPool

if __name__ == '__main__':
    pool = ProxyPool()
    proxy = pool.getproxy()
    print(proxy)
Code Example #15
class AccountPool:

    api_server = ''
    db_server = ''

    db = ''

    source_cookies = []

    cookie_queue = queue.Queue()
    cookie_queue_max_size = 1000
    cookie_queue_min_size = 200

    proxy_pool = ''

    refill_thread = ''
    terminate = False

    account_for_login_queue = queue.Queue()
    success_login = 0
    fail_login = 0
    login_threads = []

    error_accounts = set()

    session = ''

    lowest_account_num = 80

    def __init__(self, db_server, api_server, proxy_server):
        self.print('Pending: Start initializing the account pool')
        self.api_server = api_server
        self.db_server = db_server

        self.session = requests.session()

        self.db = pymongo.MongoClient(self.db_server, 27017).net_ease.account

        self.proxy_pool = ProxyPool(proxy_server)
        self.login_accounts()
        self.refill_thread = threading.Thread(target=self.refill_tasks)
        self.refill_thread.start()

        self.print('Success: Finish initializing the account pool')

    def print(self, content):
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), end=': ')
        print(content)

    def insert_one_phone(self, phone, password):
        sames = list(self.db.find({'phone': str(phone)}))
        if len(sames) > 0:
            self.print('Fail: Unable to insert repeated ' + str(phone))
            return False
        self.db.insert_one({'phone': str(phone), 'password': password})
        self.print('Success: Finish inserting phone ' + str(phone))
        return True

    def insert_all_phones(self, phones, password):
        success = 0
        fail = 0
        for phone in phones:
            if self.insert_one_phone(phone, password):
                success += 1
            else:
                fail += 1
        self.print('Success: Finish inserting all phones, ' + str(success) +
                   ' success, ' + str(fail) + ' fail')

    def delete_all_phones(self):
        self.db.delete_many({})  # delete_many requires a filter document
        self.print('Success: Finish deleting all phones')

    def login_accounts(self):
        accounts = list(self.db.find())
        for account in accounts:
            self.account_for_login_queue.put(account)
        for i in range(0, 100):
            thread = threading.Thread(target=self.login_thread)
            self.login_threads.append(thread)
            thread.start()
        for thread in self.login_threads:
            thread.join()
        self.print('Success: Finish login, ' + str(self.success_login) +
                   ' success, ' + str(self.fail_login) + ' fail')

    def login_one_account(self):
        if self.account_for_login_queue.qsize() > 0:
            account = self.account_for_login_queue.get()
            params = {
                'phone': account['phone'],
                'password': account['password']
            }
            response = self.session.get(self.api_server + '/login/cellphone',
                                        params=params,
                                        proxies=self.proxy_pool.get())
            if response.json()['code'] == 415:
                self.print('Fail: Unable to login for ' +
                           str(account['phone']) +
                           ', the proxy is invalid, try again later')
                self.account_for_login_queue.put(account)
                self.fail_login += 1
                return
            if response.json()['code'] == 406:
                if self.success_login > self.lowest_account_num:
                    return
                self.print('Fail: The account ' +
                           str(account['phone']) + ' cannot login')
                self.error_accounts.add(account['phone'])
                self.account_for_login_queue.put(account)
                self.fail_login += 1
                return
            if response.json()['code'] == 460:
                self.print('Fail: Cheating')
                self.fail_login += 1
                return
            if account['phone'] in self.error_accounts:
                print('miracle!!!!!!')
            self.success_login += 1
            self.source_cookies.append({
                'phone': account['phone'],
                'cookies': response.cookies
            })

    def login_thread(self):
        while self.account_for_login_queue.qsize() > 0:
            self.login_one_account()

    def refill(self):
        for cookie_unit in self.source_cookies:
            self.cookie_queue.put(cookie_unit)

    def refill_tasks(self):
        while not self.terminate:
            if self.cookie_queue.qsize() < self.cookie_queue_max_size:
                self.refill()
            else:
                time.sleep(1)

    def set_terminate(self):
        self.terminate = True
        self.proxy_pool.set_terminate()

    def is_available(self):
        return self.cookie_queue.qsize() > self.cookie_queue_min_size

    def get_cookie_unit(self):
        return self.cookie_queue.get()

    def load_accounts(self, filename):
        # each line holds a username (phone) followed by a password
        with open(filename, 'r') as account_file:
            for line in account_file:
                words = line.split()
                if len(words) >= 2:
                    self.insert_one_phone(words[0], words[-1])

    def remove_cheat_source(self, phone):
        # rebuild the list instead of removing items while iterating over it
        self.source_cookies = [cookie for cookie in self.source_cookies
                               if cookie['phone'] != phone]
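Finally, a hypothetical end-to-end use of the pieces above, wiring Example #6's Client to local services (all hostnames, ports, and the uid are illustrative, not taken from the project):

client = Client(db_server='localhost',
                api_server='http://localhost:3000',
                proxy_server='http://localhost:5010',
                client_uid=12345678,
                similar_min=15)
client.find_most_similar_user_in_samples(sample_num=10000, special=False)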