Example #1
    def process_request_origin(self, request, spider):
        # Reuse a proxy IP from the Redis queue if one is available,
        # otherwise fetch a fresh one.
        redis = RedisQueue('proxy_ip')
        if not redis.empty():
            proxy_ip = redis.get()
        else:
            proxy_ip = get_ip()

        proxy_para = {'ip_port': proxy_ip, 'user_pass': ''}
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        # Attach credentials only when they are non-empty.
        if proxy_para['user_pass']:
            # encodestring() appends a newline, which would corrupt the header.
            encoded_user_pass = base64.encodestring(proxy_para['user_pass']).strip()
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para['ip_port']
        # Return the IP to the queue so later requests can rotate through it.
        redis.put(proxy_ip)
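
Every example on this page leans on a RedisQueue helper class that the listing does not show. Its exact implementation varies from repository to repository, but the calls used above (empty, get, put) map naturally onto a Redis list. A minimal sketch of such a class, assuming redis-py and FIFO semantics (the class below is illustrative, not the original source):

import redis


class RedisQueue(object):
    """FIFO queue backed by a Redis list (illustrative sketch)."""

    def __init__(self, name, namespace='queue', **redis_kwargs):
        self.__db = redis.Redis(**redis_kwargs)
        self.key = '%s:%s' % (namespace, name)

    def qsize(self):
        # LLEN returns the number of items currently queued.
        return self.__db.llen(self.key)

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        # RPUSH appends to the tail of the list.
        self.__db.rpush(self.key, item)

    def get(self, block=True, timeout=None):
        # BLPOP blocks until an item arrives and returns a (key, value) pair.
        if block:
            item = self.__db.blpop(self.key, timeout=timeout)
            if item:
                item = item[1]
        else:
            item = self.__db.lpop(self.key)
        return item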
Example #2
    def process_request_origin(self, request, spider):
        # Take a proxy IP from the Redis queue, or fetch a new one if empty.
        redis = RedisQueue('proxy_ip')
        if not redis.empty():
            proxy_ip = redis.get()
        else:
            proxy_ip = get_ip()

        proxy_para = {
            'ip_port': proxy_ip,
            'user_pass': ''
        }
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        if proxy_para['user_pass']:
            encoded_user_pass = base64.encodestring(proxy_para['user_pass']).strip()
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para['ip_port']
        redis.put(proxy_ip)
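
A caveat on the authentication lines above: base64.encodestring() inserts newline characters into its output and was removed in Python 3.9, so on Python 3 the Proxy-Authorization value should be built with base64.b64encode instead. A small equivalent, hedged as a sketch (the function name is mine):

import base64


def basic_proxy_auth(user_pass):
    # b64encode takes bytes and adds no trailing newline, so no strip() is needed.
    token = base64.b64encode(user_pass.encode('utf-8')).decode('ascii')
    return 'Basic ' + token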
Example #3
    def process_exception(self, request, exception, spider):
        # The failing proxy is recorded in request.meta as 'http://ip:port'.
        request_ip = request.meta['proxy']
        invalid_ip = request_ip.split('//')[1]
        redis = RedisQueue('proxy_ip')
        redis_invalid_ip = RedisQueue('invalid_ip')
        if not redis.empty():
            # Move the dead proxy from the active queue to the invalid one.
            redis.remove(invalid_ip)
            redis_invalid_ip.put(invalid_ip)
            print '+++++++++++++++++++++++%s' % exception
            print '-----------------------removing ip from redis: %s' % invalid_ip

        new_ip = get_ip()
        proxy_para = {'ip_port': new_ip, 'user_pass': ''}
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        if proxy_para['user_pass']:
            encoded_user_pass = base64.encodestring(proxy_para['user_pass']).strip()
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print ">>>>>>>>>>>>>>>>>>>>>>>>>>> switch %s to ip: %s *****" % (
            invalid_ip, proxy_para['ip_port'])
        redis.put(new_ip)
Example #4
    def process_exception(self, request, exception, spider):
        request_ip = request.meta['proxy']
        invalid_ip = request_ip.split('//')[1]
        redis = RedisQueue('proxy_ip')
        redis_invalid_ip = RedisQueue('invalid_ip')
        if not redis.empty():
            redis.remove(invalid_ip)
            redis_invalid_ip.put(invalid_ip)
            print '+++++++++++++++++++++++%s' % exception
            print '-----------------------removing ip from redis: %s' % invalid_ip

        new_ip = get_ip()
        proxy_para = {
            'ip_port': new_ip,
            'user_pass': ''
        }
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        if proxy_para['user_pass']:
            encoded_user_pass = base64.encodestring(proxy_para['user_pass']).strip()
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print ">>>>>>>>>>>>>>>>>>>>>>>>>>> switch %s to ip: %s *****" % (invalid_ip, proxy_para['ip_port'])
        redis.put(new_ip)
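
Examples 3 and 4 also call redis.remove(invalid_ip), a method the classic list-backed RedisQueue recipe does not define. Assuming that recipe (see the sketch after Example #1), the method would plausibly wrap LREM; the argument order below follows redis-py 3.x, which is an assumption:

    def remove(self, item, count=0):
        # LREM with count=0 deletes every occurrence of the value from the
        # list and returns how many entries were removed.
        return self.__db.lrem(self.key, count, item)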
Example #5
            })
    except Exception as e:
        print_err(e)
        print_err(uname)
        print_err("Something wrong when try to get user's followed topics")
        time.sleep(random.uniform(0, 5))

    return user_questions


if __name__ == '__main__':
    q = RedisQueue('follow_question_queue')
    sleep_time = 0
    db = MongoClient().zhihu.zhihu_follow_questions
    while True:
        if q.empty():
            print('Finished at %s' % str(datetime.datetime.now()))
            print('Waiting ...')
        uname = q.get()
        uname = uname.decode()
        if db.find({'_id': uname}).count() > 0:
            continue

        try:
            with timeout(seconds=40):
                all_questions = get_user_questions(uname)
                if all_questions == {}:
                    continue
                elif all_questions is None:
                    sleep_time += random.uniform(1, 5)
                    print_err('Sleeping for %0.2f seconds' % sleep_time)
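
Example 5 uses a timeout context manager that the excerpt does not define. On Unix a common way to build one is signal.alarm; a minimal sketch matching the usage above (the implementation is an assumption, and it only works in the main thread):

import signal


class timeout(object):
    """Raise TimeoutError if the body runs longer than `seconds` (Unix only)."""

    def __init__(self, seconds=1):
        self.seconds = seconds

    def _handle(self, signum, frame):
        raise TimeoutError('timed out after %d seconds' % self.seconds)

    def __enter__(self):
        signal.signal(signal.SIGALRM, self._handle)
        signal.alarm(self.seconds)

    def __exit__(self, exc_type, exc_value, traceback):
        # Cancel the pending alarm whether or not the body timed out.
        signal.alarm(0)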
Example #6
import base64
import json
import threading
import time

import urllib3

from RedisQueue import RedisQueue

# redis_conn and author_login come from configuration elided in this excerpt.
q = RedisQueue('account_login', **redis_conn)
http = urllib3.PoolManager(num_pools=50)

def worker(value):
    params = {}
    params['account_login'] = base64.encodestring(value)
    r = http.request('POST', author_login, params)

    # Server error: push the value back onto the queue.
    if r.status != 200:
        q.put(value)
    # IP whitelist check failed: push it back as well.
    elif json.loads(r.data)['status'] == 10002:
        q.put(value)
    print r.data

while 1:
    # time.sleep(1);
    if q.empty():
        print 'empty queue'
        break

    s = q.qsize()
    for i in range(0, s):
        value = q.get()
        t = threading.Thread(target=worker, args=(value,))
        t.start()
        # Back off once too many worker threads are alive.
        if threading.active_count() >= 500:
            time.sleep(1)
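
Spawning one raw Thread per queue item and sleeping whenever threading.active_count() passes 500 does bound concurrency, but a worker pool expresses the same throttling more directly. A sketch of the same drain loop with concurrent.futures (Python 3; names are illustrative):

from concurrent.futures import ThreadPoolExecutor


def drain(q, worker, max_workers=50):
    # The executor caps the number of simultaneously running workers,
    # so no manual active_count() polling is needed.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        while not q.empty():
            pool.submit(worker, q.get())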
Example #7
import urllib2

from RedisQueue import RedisQueue

redis = RedisQueue('0', 'jandan')


def user_agent(url):
    # Route the request through a local proxy and send a browser User-Agent.
    proxy_handler = urllib2.ProxyHandler({'http': '127.0.0.1:8080'})
    opener = urllib2.build_opener(proxy_handler)
    urllib2.install_opener(opener)
    req_header = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'
    }
    req_timeout = 20
    req = urllib2.Request(url, None, req_header)
    return urllib2.urlopen(req, None, req_timeout)


while True:
    while not redis.empty():
        down_url = redis.get()
        print(down_url)
        try:
            data = user_agent(down_url).read()
            with open('./' + down_url[-11:], 'wb') as code:
                code.write(data)
            # Remove the URL only after the file was written successfully,
            # so a failed download is retried on the next pass.
            redis.pop()
        except Exception:
            pass
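
Note the get()/pop() pair in this example: this RedisQueue variant apparently lets the consumer peek at the head with get() and remove it with pop() only after the file has been written, so a crash mid-download leaves the URL queued for retry. Under the list-backed sketch from Example #1, those two operations could look like this (an assumption, not the original code):

    def get(self):
        # Peek at the head of the queue without removing it.
        return self.__db.lindex(self.key, 0)

    def pop(self):
        # Remove and return the head; call only after the item succeeded.
        return self.__db.lpop(self.key)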
Example #8
#!/usr/bin/env python

# UniPi Python Control Panel
# clean_redis_queue_2.py
# requires Python 3.5 or later
# Author: Johannes Untiedt
# Version 10.0, 2018-03-26

import time
from RedisQueue import RedisQueue

if __name__ == '__main__':
    print("clean_redis_queue_2 started")
    q = RedisQueue('ws_2')
    while not q.empty():
        message = q.get()
        print(message)
    print("redis_queue_2 cleaned")
Example #9
import urllib2
from RedisQueue import RedisQueue
redis = RedisQueue('jandan3')

def user_agent(url):
    req_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'}
    req_timeout = 20
    req = urllib2.Request(url, None, req_header)
    return urllib2.urlopen(req, None, req_timeout)

while not redis.empty():
    down_url = redis.get()
    data = user_agent(down_url).read()
    # The file name is the last 11 characters of the URL.
    with open('D:/Python/picture' + '/' + down_url[-11:], 'wb') as code:
        code.write(data)
    print down_url
Example #10
import multiprocessing
import sqlite3
from time import sleep

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

from RedisQueue import RedisQueue


class machine(multiprocessing.Process):
    def __init__(self, df):
        multiprocessing.Process.__init__(self)
        self.df = df

    def run(self):
        sleep(10)
        self.last_mean = .015
        self.q = RedisQueue('test')
        print('start')
        self.conn = sqlite3.connect("data.db")
        while not self.q.empty():
            # Queue items are byte strings of a list repr, e.g. b"['f1', 'f2']";
            # strip the delimiters and quotes to recover the feature names.
            features = str(self.q.get())[3:-2].replace("'", "").split(', ')
            self.features = list(features)
            for self.hold_time in ['_10']:
                df = self.df[self.features + ['stock_perc_change' + self.hold_time,
                                              'abnormal_perc_change' + self.hold_time]]
                positive_dfs = []
                negative_dfs = []
                for i in range(8):
                    a_train, a_test, b_train, b_test = train_test_split(
                        df.iloc[:, :-2], df.iloc[:, -2:], test_size=.4)

                    self.train(a_train, b_train)
                    test_result, negative_df, positive_df = self.test(a_test, b_test)
                    if test_result:
                        positive_dfs.append(positive_df)
                        negative_dfs.append(negative_df)
                    else:
                        break

                if test_result:
                    self.get_result(pd.concat(positive_dfs), pd.concat(negative_dfs))

    def train(self, a_train, b_train):
        self.clf = SVR(C=1.0, epsilon=0.2)
        self.clf.fit(a_train, b_train['abnormal_perc_change' + self.hold_time])

    def test(self, a_test, b_test):
        a_test['Predicted'] = self.clf.predict(a_test)
        a_test['Actual_stock_perc_change' + self.hold_time] = b_test['stock_perc_change' + self.hold_time]
        a_test['Actual_abnormal_perc_change' + self.hold_time] = b_test['abnormal_perc_change' + self.hold_time]

        # Reject models whose predictions are nearly constant.
        if len(a_test['Predicted'].unique()) < 40:
            return False, None, None

        a_test = a_test.sort_values(by='Predicted')

        # Lowest 20 predictions on one side, highest 20 on the other.
        return True, a_test.iloc[:, -3:].head(20), a_test.iloc[:, -3:].tail(20)

    def get_result(self, df_p, df_n):
        p_result = df_p.describe()
        n_result = df_n.describe()

        # Require the positive side to be positive and the negative side
        # negative, for both the mean and the median.
        if p_result.loc['mean', 'Actual_abnormal_perc_change_10'] < 0 or n_result.loc['mean', 'Actual_abnormal_perc_change_10'] > 0:
            return
        if p_result.loc['50%', 'Actual_abnormal_perc_change_10'] < 0 or n_result.loc['50%', 'Actual_abnormal_perc_change_10'] > 0:
            return

        store_me = False
        if p_result.loc['mean', 'Actual_abnormal_perc_change_10'] > self.last_mean:
            self.last_mean = p_result.loc['mean', 'Actual_abnormal_perc_change_10']
            store_me = True

        p_result.index = p_result.index + '_pos'
        n_result.index = n_result.index + '_neg'

        # Flatten the describe() tables into one row of 'column-stat' values.
        p_result = p_result.stack().reset_index()
        p_result.index = p_result['level_1'] + '-' + p_result['level_0']
        p_result = p_result[0]

        n_result = n_result.stack().reset_index()
        n_result.index = n_result['level_1'] + '-' + n_result['level_0']
        n_result = n_result[0]

        result = p_result.append(n_result)
        result = pd.DataFrame(result).T
        self.model_name = str(self.features)[1:-1] + '__' + self.hold_time[1:]
        result['features'] = self.model_name
        if store_me:
            result.to_sql('results', self.conn, index=False, if_exists='append')
            self.store_machine()

    def store_machine(self):
        # Retrain on the full dataset and persist the fitted model.
        df = self.df[self.features]
        target = self.df['abnormal_perc_change' + self.hold_time]

        self.clf = SVR(C=1.0, epsilon=0.2)
        self.clf.fit(df, target)
        # sklearn.externals.joblib was removed from scikit-learn; use the
        # standalone joblib package instead.
        import joblib
        joblib.dump(self.clf, 'machines/' + self.model_name)
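
The model persisted by store_machine() can later be reloaded with joblib and used for prediction. A minimal usage sketch (the helper name is mine; model_name is the string built in get_result() above):

import joblib


def load_machine(model_name):
    # Reload a fitted SVR saved under machines/<model_name>.
    return joblib.load('machines/' + model_name)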
Example #11
def check(host):
    try:
        ip, port = host.strip().split(":")
        proxies = dict(http='socks5://' + host, https='socks5://' + host)
        timeout = 1.0
        resp = requests.get('http://2017.ip138.com/ic.asp',
                            proxies=proxies,
                            timeout=timeout)
        r = resp.text.encode('UTF-8')
        # The page echoes the caller's IP inside square brackets.
        ip = re.findall(r'\[(\d*?\.\d*?\.\d*?\.\d*?)\]', r)[0]
        if ip:
            rWrite.put(host)
            print "############" + host + "############"
    except Exception:
        pass


while True:
    while not rRead.empty():
        sProxy = rRead.pop()
        lProxy.append(sProxy)
        nCounter += 1
        # Check proxies in batches of 100 through the thread pool.
        if nCounter % 100 == 0:
            lRequests = threadpool.makeRequests(check, lProxy)
            [pool.putRequest(req) for req in lRequests]
            pool.wait()
            lProxy = []
    # Flush any remaining proxies once the queue is drained.
    if len(lProxy) != 0:
        lRequests = threadpool.makeRequests(check, lProxy)
        [pool.putRequest(req) for req in lRequests]
        pool.wait()
        lProxy = []
Example #12
class Spider(object):

    def __init__(self, url='http://weixin.sogou.com/weixin', key_word=None):
        self.url = url
        self.key_word = key_word
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'weixin.sogou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        self.session = Session()
        self.queue = RedisQueue()
        self.mysql = MySQL()

    def start(self):
        '''
        Initial setup: seed the queue with the search-results page.
        '''
        self.session.headers.update(self.headers)
        start_url = self.url + '?' + urlencode({'query': self.key_word, 'type': 2})
        weixin_request = WeiXinRequest(url=start_url, 
                callback=self.parse_index, need_proxy=True)
        self.queue.push(weixin_request)

    def parse_index(self, response):
        '''
        Parse the index page: yield one request per article link.
        '''
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeiXinRequest(url=url, 
                    callback=self.parse_detail, headers=self.headers)
            yield weixin_request
        # Link to the next page
        next = doc('#sogou_next').attr('href')
        if next:
            url = self.url + str(next)
            weixin_request = WeiXinRequest(url=url, 
                    callback=self.parse_index, headers=self.headers)
            yield weixin_request

    def parse_detail(self, response):
        '''
        Parse the article detail page.
        '''
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
        }
        yield data
    
    def error(self, weixin_request: WeiXinRequest):
        weixin_request.fail_times += 1
        print ('Request failed {} times, URL is {}'.format(weixin_request.fail_times, 
            weixin_request.url))
        if weixin_request.fail_times < MAX_FAIL_TIME:
            self.queue.push(weixin_request)

    def get_proxy(self, url='http://localhost:5000/random'):
        '''
        Fetch a usable proxy IP from our own proxy pool.
        '''
        try:
            resp = requests.get(url)
            if resp.status_code == 200:
                return resp.text
            return None
        except Exception:
            return None

    def schedule(self):
        '''
        Scheduling loop: keep taking requests while the queue is non-empty.
        '''
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print ('Schedule: {}'.format(weixin_request.url))
            response = self.request(weixin_request)
            if response and response.status_code == 200:
                results = list(callback(response))    # callback is a generator
                if results:
                    for result in results:
                        if isinstance(result, WeiXinRequest):
                            self.queue.push(result)
                        if isinstance(result, dict):
                            self.mysql.insert(result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def request(self, weixin_request: WeiXinRequest):
        '''
        Execute the request, through a proxy when required.
        '''
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                            'http': proxy,
                            'https': proxy,
                    }
                    return self.session.send(weixin_request.prepare(),
                            timeout=weixin_request.timeout, allow_redirects=False,
                            proxies=proxies)
            return self.session.send(weixin_request.prepare(), 
                    timeout=weixin_request.timeout, allow_redirects=False)
        except (ConnectionError, ReadTimeout):
            return False

    def run(self):
        self.start()
        self.schedule()
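
WeiXinRequest is not defined anywhere in this listing. Given how the spider uses it (prepare(), callback, need_proxy, fail_times, timeout), it is plausibly a thin subclass of requests.Request; a sketch under that assumption, with illustrative defaults:

from requests import Request

TIMEOUT = 10


class WeiXinRequest(Request):
    def __init__(self, url, callback, method='GET', headers=None,
                 need_proxy=False, fail_times=0, timeout=TIMEOUT):
        super().__init__(method, url, headers=headers)
        self.callback = callback      # parser to invoke on the response
        self.need_proxy = need_proxy  # route through the proxy pool?
        self.fail_times = fail_times  # retry counter consumed by error()
        self.timeout = timeout        # per-request timeout for session.send()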
Example #13
class Spider(object):
    def __init__(self, url='http://weixin.sogou.com/weixin', key_word=None):
        self.url = url
        self.key_word = key_word
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'weixin.sogou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
        self.session = Session()
        self.queue = RedisQueue()
        self.mysql = MySQL()

    def start(self):
        '''
        Initial setup: seed the queue with the search-results page.
        '''
        self.session.headers.update(self.headers)
        start_url = self.url + '?' + urlencode({
            'query': self.key_word,
            'type': 2
        })
        weixin_request = WeiXinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=True)
        self.queue.push(weixin_request)

    def parse_index(self, response):
        '''
        Parse the index page: yield one request per article link.
        '''
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeiXinRequest(url=url,
                                           callback=self.parse_detail,
                                           headers=self.headers)
            yield weixin_request
        # Link to the next page
        next = doc('#sogou_next').attr('href')
        if next:
            url = self.url + str(next)
            weixin_request = WeiXinRequest(url=url,
                                           callback=self.parse_index,
                                           headers=self.headers)
            yield weixin_request

    def parse_detail(self, response):
        '''
        Parse the article detail page.
        '''
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
        }
        yield data

    def error(self, weixin_request: WeiXinRequest):
        weixin_request.fail_times += 1
        print('Request failed {} times, URL is {}'.format(weixin_request.fail_times,
                                                          weixin_request.url))
        if weixin_request.fail_times < MAX_FAIL_TIME:
            self.queue.push(weixin_request)

    def get_proxy(self, url='http://localhost:5000/random'):
        '''
        Fetch a usable proxy IP from our own proxy pool.
        '''
        try:
            resp = requests.get(url)
            if resp.status_code == 200:
                return resp.text
            return None
        except Exception:
            return None

    def schedule(self):
        '''
        Scheduling loop: keep taking requests while the queue is non-empty.
        '''
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule: {}'.format(weixin_request.url))
            response = self.request(weixin_request)
            if response and response.status_code == 200:
                results = list(callback(response))  # callback is a generator
                if results:
                    for result in results:
                        if isinstance(result, WeiXinRequest):
                            self.queue.push(result)
                        if isinstance(result, dict):
                            self.mysql.insert(result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def request(self, weixin_request: WeiXinRequest):
        '''
        Execute the request, through a proxy when required.
        '''
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': proxy,
                        'https': proxy,
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout):
            return False

    def run(self):
        self.start()
        self.schedule()
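
Because this spider pushes whole WeiXinRequest objects rather than strings, its RedisQueue must serialize them; pickle is the usual choice. A sketch of the push/pop pair under that assumption (the key name and connection defaults are illustrative):

import pickle

import redis


class RedisQueue(object):
    def __init__(self, key='weixin_request', **redis_kwargs):
        self.db = redis.Redis(**redis_kwargs)
        self.key = key

    def push(self, request):
        # Serialize the request object before appending it to the list.
        self.db.rpush(self.key, pickle.dumps(request))

    def pop(self):
        # FIFO: take from the head and deserialize.
        item = self.db.lpop(self.key)
        return pickle.loads(item) if item else None

    def empty(self):
        return self.db.llen(self.key) == 0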