Example #1
    def __init__(self):
        self.base_url = "https://weixin.sogou.com/weixin"
        self.keyword = KEY
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                      'image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': COOKIES,
            'Host': 'weixin.sogou.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }
        self.session = Session()
        self.queue = RedisQueue()
        self.mysql = Mysql()
Example #2
class RedisQueueWorker(object):
    def __init__(self,
                 redis_mgr,
                 service_name,
                 custom_key,
                 func_name,
                 callback_to_main_thread=False):
        self.service_name = service_name
        self.func_name = func_name
        self.redis_queue = RedisQueue(redis_mgr)
        self.custom_key = custom_key
        self.redis_queue.subscribe(self.service_name, custom_key)
        self.callback_to_main_thread = callback_to_main_thread

    def _real_start(self):
        while True:
            try:
                item = self.redis_queue.get(self.custom_key)
                if self.callback_to_main_thread:
                    IOLoop.instance().add_callback(self.func_name, item)
                else:
                    self.func_name(item)
            except Exception as e:
                logger.warning("start_work error: %s", e)
                time.sleep(1)
Example #3
def main():
    with RecordsDB() as records_db:
        records_parser = RecordsParser(records_db)

        with CountriesDB() as countries_db:
            queue = RedisQueue(name='jobs',
                               namespace='queue',
                               decode_responses=True)
            job_in_json = queue.wait_and_dequeue()

            while job_in_json is not None:

                job = json.loads(job_in_json)

                country_id = job['country_id']
                country_name = countries_db.get_country_from_id(country_id)
                num_records = job['num_records']

                if country_name is None:
                    raise Exception("Country name cannot be None!")

                records_parser.get_records(country=country_name,
                                           country_id=country_id,
                                           max_records=num_records)

                job_in_json = queue.wait_and_dequeue()
Example #4
class RedisPublisher(object):
    def __init__(self, topic, host='localhost', port=6379, db=0):
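        # the "1" passed below is presumably a trim length: put_and_trim then
        # keeps only the newest entries so subscribers always read fresh data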
        self.rqueue = RedisQueue(topic, 1, host=host, port=port, db=db)

    def redis_send_pyobj(self, obj):
        self.rqueue.put_and_trim(pkl.dumps(obj))

    def redis_send(self, msg):
        self.rqueue.put_and_trim(msg)
Example #5
def populate_job_queue():
    queue = RedisQueue('jobs')
    with CountriesDB() as countries_db:

        countries = countries_db.get_countries()
        for country in countries:
            job = {'country_id': country[0], 'num_records': 5000}
            job_in_json = json.dumps(job)
            queue.enqueue(job_in_json)
Example #6
    def __init__(self, status_queue, config, stop_event):
        super().__init__()

        self.energy_data_queue = RedisQueue('normal')
        self.status_queue = status_queue
        self.reader = self.init_reader()
        self.solar_ip = config['solar_ip']
        self.solar_url = self.solar_ip + config['solar_url']
        self.stop_event = stop_event
        self.console_mode = config["console_mode"] == "true"
Example #7
    def __init__(self):
        self.trigger = False
        self.result_queue = RedisQueue(Config.UP_QUEUE_NAME)
        self.command_queue = RedisQueue(Config.DOWN_QUEUE_NAME)
        self.port = serial.Serial("/dev/ttyS0",
                                  9600,
                                  parity=serial.PARITY_NONE,
                                  stopbits=serial.STOPBITS_ONE,
                                  bytesize=serial.EIGHTBITS,
                                  timeout=Config.SERIAL_WAIT)
        self.start()
Example #8
    def __init__(self, stop_event):
        super().__init__()

        self.energy_data_queue = RedisQueue('normal')
        self.stop_event = stop_event
        self.default_message = self.get_default_message()

        self.total_usage = random.randint(1000, 5000)
        self.total_redelivery = random.randint(1000, 5000)
        self.total_solar = random.randint(1000, 5000)
        self.total_gas = random.randint(1000, 5000)
Example #9
class RedisSubscriber(object):
    def __init__(self, topic, host='localhost', port=6379, db=0):
        self.rqueue = RedisQueue(topic, 1, host=host, port=port, db=db)

    def redis_recv_pyobj(self, blocking=True):
        item = self.rqueue.get(isBlocking=blocking)
        if item is None:
            return None
        return pkl.loads(item)

    def redis_recv(self, blocking=True):
        return self.rqueue.get(isBlocking=blocking)
Example #10
    def __init__(self,
                 redis_mgr,
                 service_name,
                 custom_key,
                 func_name,
                 callback_to_main_thread=False):
        self.service_name = service_name
        self.func_name = func_name
        self.redis_queue = RedisQueue(redis_mgr)
        self.custom_key = custom_key
        self.redis_queue.subscribe(self.service_name, custom_key)
        self.callback_to_main_thread = callback_to_main_thread
Example #11
    def __init__(self, thread_index, global_network):

        self.thread_index = thread_index
        self.local_network = global_network
        self.game_state = GameState()
        self.local_t = 0

        # for log
        self.episode_reward = 0.0
        self.episode_start_time = 0.0
        self.prev_local_t = 0

        self.rq = RedisQueue(REDIS_QUEUE_NAME)
        return
Example #12
    def __init__(self, time_execution_in_sec, chart_title, slave, *args,
                 **kwargs):
        super(MyTaskSet, self).__init__(time_execution_in_sec, chart_title,
                                        slave, *args, **kwargs)
        self.running = True
        self.slave = slave
        self.code = None

        self.queue_chart = RedisQueue(name="data_chart",
                                      namespace="data_chart")
        self.queue_tasks = RedisQueue(name="data_tasks",
                                      namespace="data_tasks")
        self.chart = ReportCharts(time_execution_in_sec, chart_title,
                                  self.slave)
        self.db = create_engine(self.config["database"]["db_string"])
Example #13
    def __init__(self, status_queue, stop_event, config):
        super(Sender, self).__init__()

        self.normal_data_queue = RedisQueue('normal')
        self.retry_data_queue = RedisQueue('retry')
        self.status_queue = status_queue
        self.stop_event = stop_event

        self.base_url = config["api_url"]
        self.key = config["key"]
        self.store_energy_url = self.base_url + "/v2/energy"
        self.backup_file = "backup"
        self.console_mode = config["console_mode"] == "true"

        self.connected = False
Example #14
class WebServerClass(BaseHTTPRequestHandler):
    def __init__(self, *args, **kwargs):
        self.singleton = Singleton()
        self.queue_chart = RedisQueue(name="data_chart",
                                      namespace="data_chart")
        super(WebServerClass, self).__init__(*args, **kwargs)

    def _set_headers(self):
        self.send_response(200)
        self.send_header('Content-type', 'text/html')
        self.end_headers()

    def do_GET(self):

        if self.path == "/":
            self.path = self.singleton.template_path

        self._set_headers()

        with open(self.singleton.template_path, "rb") as f:
            self.wfile.write(f.read())
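        # stream updates indefinitely: each queue item is pushed to the open
        # page as an inline <script> call that appends a point to the chart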
        while True:
            data = self.queue_chart.get().decode("utf-8")
            data = data.replace("\'", "\"")
            data = json.loads(data)
            if data:
                self.wfile.write(
                    "<script type=\"text/javascript\">AddDataChart(chart_id={}, data={});</script>"
                    .format(  # noqa
                        data["chart_id"],
                        data["data"]).encode(encoding='utf_8'))
        return
Example #15
def main():
    comment_queue = RedisQueue('reddit-book-stream',
                               host=REDIS_HOST,
                               port=REDIS_PORT)
    reddit = praw.Reddit(user_agent=USER_AGENT,
                         client_id=CLIENT_ID,
                         client_secret=CLIENT_SECRET,
                         username=USERNAME,
                         password=PASSWORD)

    # stream comments from r/all; pause_after < 0 allows faster streaming
    for comment in reddit.subreddit('all').stream.comments(pause_after=-1):
        if comment and mentions_book(comment.body):
            comment_queue.put(comment.id)
            print(comment.id)
            print(f'reddit.com/api/info?id=t1_{comment.id}')
Example #16
def main():
    rq = RedisQueue('reddit-book-stream', host=REDIS_HOST, port=REDIS_PORT)
    reddit = praw.Reddit(user_agent=USER_AGENT,
                         client_id=CLIENT_ID,
                         client_secret=CLIENT_SECRET,
                         username=USERNAME,
                         password=PASSWORD)
Example #17
    def __init__(self, url):
        self.queue = RedisQueue('zhihu', host='localhost', port=6379, db=0)
        self.url = url
        self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
                        , "Host":"www.zhihu.com"
                        , "Refer":"www.zhihu.com"
                        , "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4"
                        , "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
                        , "Accept-Encoding":"gzip, deflate, sdch, br"
                        , "Cache-Control":"max-age=0"
                        , "Connection":"keep-alive"}

        #cookie
        self.cookies={"_zap":"aaf2a75d-0a1b-4863-b8a0-23ff0f4a9002"
                    , "_za":"e73a8db5-0824-4c36-b6a2-7a5378a046f7"
                    , "udid":'"AFAAY31blAmPTta9QIqu7S6lUdEK97RWDgg=|1457941793"'
                    , "d_c0":'"AGBAzqyTowmPTpYh7UrYZSjcr43LFX006Tw=|1461248461"'
                    , "_zap":"267bc327-098d-4d7c-85cb-3cfd13cd2e8e"
                    , "q_c1":"3b3a3dccecf1499ea32a0b2da9be35ec|1470149980000|1445741536000"
                    , "_xsrf":"8a812fd7745e54a8e8ab4ed815fa9001"
                    , "l_cap_id":'"YzQ3YzNhNzUxZjBlNDAzNTgwM2FhNzdlODI5NjAxZjY=|1472298711|d67a5a1c7e5fb41cfe2715e389c74ebc6132007d"'
                    , "cap_id":'"ZGQwYTE0MTM3ODk0NDUzOGFkM2RiNGYxYTNmYTc1YTM=|1472298711|8fd9f406e4786a9ca56227b61e7c6a2a5c0f4b42"'
                    , "login":'******'
                    , "n_c":'1'
                    , "s-t":"autocomplete"
                    , "s-q":"volley%2Cretrofit%2Cokhttp"
                    , "s-i":"1"
                    , "sid":"6vahoruo"
                    , "a_t":'"2.0AEAAukjbcgoXAAAATjPpVwBAALpI23IKAGBAzqyTowkXAAAAYQJVTfYL6VcAoZ3PJyuvTIR4Yl3RS9B_tCnMwHxnX7iDfjl2Ve7xk-Nk6RdV68h4_A=="'
                    , "z_c0":"Mi4wQUVBQXVramJjZ29BWUVET3JKT2pDUmNBQUFCaEFsVk45Z3ZwVndDaG5jOG5LNjlNaEhoaVhkRkwwSC0wS2N6QWZB|1472308814|21bb41cc3844239f4582374fc850ced4a5e8c564"
                    , "__utma":"51854390.226515891.1472287250.1472298703.1472307196.4"
                    , "__utmc":"51854390"
                    , "__utmz":"51854390.1472296126.2.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)"
                    , "__utmv":"51854390.100--|2=registration_date=20160827=1^3=entry_date=20151025=1"}
Example #18
    def __init__(self):
        self.command_queue = RedisQueue(Config.DOWN_QUEUE_NAME)
        self.result_queue = RedisQueue(Config.UP_QUEUE_NAME)

        self.socket = websocket.WebSocketApp(HOST,
                                             on_open=self.on_open,
                                             on_message=self.on_message,
                                             on_error=self.on_error,
                                             on_close=self.on_close)

        while True:
            try:
                self.socket.run_forever(ping_interval=100)
            except Exception:
                pass  # swallow connection errors; retry after a short delay
            time.sleep(5)
Example #19
class SerialWorker:
    def __init__(self):
        self.trigger = False
        self.result_queue = RedisQueue(Config.UP_QUEUE_NAME)
        self.command_queue = RedisQueue(Config.DOWN_QUEUE_NAME)
        self.port = serial.Serial("/dev/ttyS0",
                                  9600,
                                  parity=serial.PARITY_NONE,
                                  stopbits=serial.STOPBITS_ONE,
                                  bytesize=serial.EIGHTBITS,
                                  timeout=Config.SERIAL_WAIT)
        self.start()

    def start(self):
        while True:
            self.executeTask()
            time.sleep(Config.SERIAL_CYC)

    def executeTask(self):
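        # drive the RS-485 enable pin high so the transceiver switches to transmit mode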
        GPIO.output(Config.EN_485, GPIO.HIGH)

        command = self.command_queue.get_nowait()
        if not command:
            self.trigger = not self.trigger
            if self.trigger:
                command = DEFAULT_COMMAND
            else:
                command = DEFAULT_COMMAND2

        print('write to 485 %s' % command)

        command = CommandHelper.toWriteable(command)
        self.port.write(command)

        while self.port.out_waiting > 0:
            time.sleep(0.01)

        GPIO.output(Config.EN_485, GPIO.LOW)
        result = self.port.readall()

        if result:
            result = CommandHelper.toReadable(result)
            print('receive from 485 %s' % result)
            self.result_queue.put(result)
Example #20
    def __init__(self):
        self.device = '/gpu:0' if USE_GPU else '/cpu:0'
        self.stop_requested = False
        self.global_t = 0
        if USE_LSTM:
            self.global_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device, -1)
        else:
            self.global_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device)
        self.global_network.create_loss(ENTROPY_BETA)

        self.initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE)
        print('initial_learning_rate:', self.initial_learning_rate)
        self.learning_rate_input = tf.placeholder('float')
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_input,
                                                   decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON)

        grads_and_vars = self.optimizer.compute_gradients(
            self.global_network.total_loss, self.global_network.get_vars())
        self.apply_gradients = self.optimizer.apply_gradients(grads_and_vars)

        self.actor_threads = []
        for i in range(PARALLEL_SIZE):
            actor_thread = A3CActorThread(i, self.global_network)
            self.actor_threads.append(actor_thread)

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.initialize_all_variables())

        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)

        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)

        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(LOG_FILE, self.sess.graph)

        self.saver = tf.train.Saver()
        self.restore()

        self.lock = threading.Lock()
        self.rq = RedisQueue(REDIS_QUEUE_NAME)
        self.train_count = 0
        return
Example #21
class RedisQueueSender(object):
    executor = ThreadPoolExecutor(2)

    def __init__(self, redis_mgr):
        self.redis_mgr = redis_mgr
        self.queue = RedisQueue(redis_mgr)

    @run_on_executor
    def send(self, service_name, data):
        return self.queue.send_msg(service_name, data)
Example #22
    def __init__(self, name, collector):
        if SCHEDULER_PERSIST:  # for distributed or persistent crawls, use the Redis-backed queue
            self.queue = RedisQueue(name=name)
            self._filter_container = RedisFilterContainer()  # use Redis as the dedupe container
        else:
            self.queue = Queue()
            self._filter_container = NoramlFilterContainer()  # use a plain Python set()
        # collector counts the duplicates
        self.collector = collector
Example #23
async def main():
    msg = "stockx crawler starting!"
    print(msg)
    logging.info(msg)

    q = RedisQueue('rq')

    # create the client requests, keeping every task so we can wait on all of them
    tasks = []
    async with aiohttp.ClientSession() as client:
        for k, v in URL.items():
            for page in range(1, 25):
                api_url = DOMAIN + v + str(page)
                tasks.append(asyncio.create_task(spiderList(client, api_url, q)))
                await asyncio.sleep(10)

        done, pending = await asyncio.wait(tasks)
        if not pending:
            print('[crawl finished] all crawl tasks have completed')
            logging.info("[crawl finished] all crawl tasks have completed")
Example #24
    def test_failure(self):
        qkey = 'test:failure:queue'
        tid = '12'
        rq = RedisQueue(self.rc, 1, 2)
        self.rc.lpush(qkey, tid)
        tid0 = rq.safe_pop(qkey)
        self.assertEqual(tid, tid0)
        # Popping another task too fast, before the task timeout has been
        # reached.
        tid1 = rq.safe_pop(qkey)
        self.assertIsNone(tid1)
        # Supposing the worker died before finishing the task, we can take
        # it again after the task timeout.
        time.sleep(2)
        tid2 = rq.safe_pop(qkey)
        self.assertEqual(tid, tid2)
        # Marking the task as done makes it impossible to retrieve the same
        # task again.
        rq.mark_done(qkey, tid2)
        time.sleep(2)
        tid3 = rq.safe_pop(qkey)
        self.assertIsNone(tid3)
        self.rc.delete(qkey, "%s:done" % qkey)
Example #25
async def main(loop):
    print("starting crawler")
    # wait for the MySQL connection pool to be ready
    pool = await aiomysql.create_pool(host=conf.database['host'],
                                      port=conf.database['port'],
                                      user=conf.database['user'],
                                      password=conf.database['passwd'],
                                      db=conf.database['db'],
                                      loop=loop)

    q = RedisQueue('rq')

    # keep every task so we can wait on all of them, not just the last one
    tasks = []
    for k, v in URL.items():
        for page in range(30):
            api_url = DOMAIN + v + str(page)
            tasks.append(asyncio.create_task(spiderList(pool, api_url, q)))
            await asyncio.sleep(1)

    done, pending = await asyncio.wait(tasks)
    if not pending:
        print('[crawl finished] all crawl tasks have completed')
        logging.info("[crawl finished] all crawl tasks have completed")
Example #26
class A3CActorThread(object):
    def __init__(self, thread_index, global_network):

        self.thread_index = thread_index
        self.local_network = global_network
        self.game_state = GameState()
        self.local_t = 0

        # for log
        self.episode_reward = 0.0
        self.episode_start_time = 0.0
        self.prev_local_t = 0

        self.rq = RedisQueue(REDIS_QUEUE_NAME)
        return

    def choose_action(self, policy_output):
        if random.random() < RANDOM_ACTION_PROBILITY:
            return random.randint(0, ACTION_DIM - 1)

        values = []
        total = 0.0
        for rate in policy_output:
            total += rate
            values.append(total)

        r = random.random() * total
        for i in range(len(values)):
            if values[i] >= r:
                return i
        return len(values) - 1

    def _record_log(self, sess, global_t, summary_writer, summary_op,
                    reward_input, reward, time_input, living_time):
        summary_str = sess.run(summary_op,
                               feed_dict={
                                   reward_input: reward,
                                   time_input: living_time
                               })
        summary_writer.add_summary(summary_str, global_t)
        return

    def process(self, sess, global_t, summary_writer, summary_op, reward_input,
                time_input):
        states = []
        actions = []
        rewards = []
        values = []

        terminal_end = False
        # reduce the influence of socket connecting time
        if self.episode_start_time == 0.0:
            self.episode_start_time = timestamp()

        start_local_t = self.local_t

        for i in range(LOCAL_T_MAX):
            policy_, value_ = self.local_network.run_policy_and_value(
                sess, self.game_state.s_t)
            if self.thread_index == 0 and self.local_t % 1000 == 0:
                print('policy=', policy_)
                print('value=', value_)

            action_id = self.choose_action(policy_)

            states.append(self.game_state.s_t)
            actions.append(action_id)
            values.append(value_)

            self.game_state.process(action_id)
            reward = self.game_state.reward
            terminal = self.game_state.terminal

            self.episode_reward += reward
            rewards.append(reward)

            self.local_t += 1

            # s_t1 -> s_t
            self.game_state.update()

            if terminal:
                terminal_end = True
                episode_end_time = timestamp()
                living_time = episode_end_time - self.episode_start_time

                self._record_log(sess, global_t, summary_writer, summary_op,
                                 reward_input, self.episode_reward, time_input,
                                 living_time)

                print("global_t=%d / reward=%.2f / living_time=%.4f") % (
                    global_t, self.episode_reward, living_time)

                # reset variables
                self.episode_reward = 0.0
                self.episode_start_time = episode_end_time
                self.game_state.reset()
                if USE_LSTM:
                    self.local_network.reset_lstm_state()
                break
            # log
            if self.local_t % 2000 == 0:
                living_time = timestamp() - self.episode_start_time
                self._record_log(sess, global_t, summary_writer, summary_op,
                                 reward_input, self.episode_reward, time_input,
                                 living_time)
        # -----------end of batch (LOCAL_T_MAX)--------------------

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_value(sess, self.game_state.s_t)
        # print('global_t: %d, R: %f' % (global_t, R))

        states.reverse()
        actions.reverse()
        rewards.reverse()
        values.reverse()

        batch_state = []
        batch_action = []
        batch_td = []
        batch_R = []

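        # walk the trajectory backwards, accumulating the discounted return R step by step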
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + GAMMA * R
            td = R - Vi
            action = np.zeros([ACTION_DIM])
            action[ai] = 1

            batch_state.append(si)
            batch_action.append(action)
            batch_td.append(td)
            batch_R.append(R)

            # push the sample onto the Redis queue for asynchronous training
            data = cPickle.dumps((si, action, td, R))
            self.rq.put(data)

        diff_local_t = self.local_t - start_local_t
        return diff_local_t
Example #27
# -*- coding:utf-8 -*-
__author__ = '张全亮'
import requests
import urllib3
import math
import time
import datetime
from multiprocessing.dummy import Pool

import hashlib

urllib3.disable_warnings()
from logger import Logger
from redis_queue import RedisQueue

yz = RedisQueue('yz')
yz_rec = RedisQueue('yz_rec')
logger = Logger()
"""校验订单规则,每页查找订单,找到符合条件的结束翻页查找"""


def check_pay(order_sn, pdduid, kdtsessionid):
    cookie = 'KDTSESSIONID={}'.format(kdtsessionid)
    first_url = 'https://h5.youzan.com/v2/trade/order/list.json?perpage=20&page=1&type=all'
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36",
        'Cookie': cookie
    }
    res = requests.get(first_url, headers=headers, verify=False)
    if '页面已被删除' in res.text:  # "page has been deleted": the session is no longer valid
        return False  # assumed handling; the source snippet ends here
Example #28
#!/usr/bin/env python
# coding:utf-8
# Copyright (C) dirlt

from redis_queue import RedisQueue

command_queue = RedisQueue('command')
command_queue.put('trigger')
Example #29
import datetime
from mysql_db import db_insert
from flask import Flask, jsonify, request, redirect, render_template
from redis_queue import RedisQueue

app = Flask(__name__)

# order-placing crawlers
from pdd_spider import pdd_main
from yz_spider import yz_main

# order-query crawlers
from pdd_query import pdd_pass_query
from yz_query import yz_pass_query

pdd = RedisQueue('pdd')
yz = RedisQueue('yz')

"""拼多多下单爬虫"""


def pdd_spider(pdduid, accesstoken, goods_url, amount, order_number):
    result = pdd_main(pdduid, accesstoken, goods_url, amount, order_number)
    return result


"""有赞下单爬虫"""


def yz_spider(pdduid, kdtsessionid, goods_url, amount, order_number):
    result = yz_main(pdduid, kdtsessionid, goods_url, amount, order_number)
    return result
Example #30
def add_job_to_queue(country_id, num_records):
    queue = RedisQueue('jobs')
    job = {'country_id': country_id, 'num_records': num_records}
    job_in_json = json.dumps(job)
    queue.enqueue(job_in_json)
Example #31
class A3C(object):

    def __init__(self):
        self.device = '/gpu:0' if USE_GPU else '/cpu:0'
        self.stop_requested = False
        self.global_t = 0
        if USE_LSTM:
            self.global_network = A3CLSTMNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device, -1)
        else:
            self.global_network = A3CFFNetwork(STATE_DIM, STATE_CHN, ACTION_DIM, self.device)
        self.global_network.create_loss(ENTROPY_BETA)

        self.initial_learning_rate = log_uniform(INITIAL_ALPHA_LOW, INITIAL_ALPHA_HIGH, INITIAL_ALPHA_LOG_RATE)
        print('initial_learning_rate:', self.initial_learning_rate)
        self.learning_rate_input = tf.placeholder('float')
        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate_input,
                                                   decay=RMSP_ALPHA, momentum=0.0, epsilon=RMSP_EPSILON)

        grads_and_vars = self.optimizer.compute_gradients(
            self.global_network.total_loss, self.global_network.get_vars())
        self.apply_gradients = self.optimizer.apply_gradients(grads_and_vars)

        self.actor_threads = []
        for i in range(PARALLEL_SIZE):
            actor_thread = A3CActorThread(i, self.global_network)
            self.actor_threads.append(actor_thread)

        self.sess = tf.InteractiveSession()
        self.sess.run(tf.initialize_all_variables())

        self.reward_input = tf.placeholder(tf.float32)
        tf.scalar_summary('reward', self.reward_input)

        self.time_input = tf.placeholder(tf.float32)
        tf.scalar_summary('living_time', self.time_input)

        self.summary_op = tf.merge_all_summaries()
        self.summary_writer = tf.train.SummaryWriter(LOG_FILE, self.sess.graph)

        self.saver = tf.train.Saver()
        self.restore()

        self.lock = threading.Lock()
        self.rq = RedisQueue(REDIS_QUEUE_NAME)
        self.train_count = 0
        return

    def restore(self):
        checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print("checkpoint loaded:", checkpoint.model_checkpoint_path)
            tokens = checkpoint.model_checkpoint_path.split("-")
            # set global step
            self.global_t = int(tokens[1])
            print(">>> global step set: ", self.global_t)
        else:
            print("Could not find old checkpoint")
        return

    def backup(self):
        if not os.path.exists(CHECKPOINT_DIR):
            os.mkdir(CHECKPOINT_DIR)

        self.saver.save(self.sess, CHECKPOINT_DIR + '/' + 'checkpoint', global_step=self.global_t)
        return

    def predict_function(self, parallel_index, lock):
        actor_thread = self.actor_threads[parallel_index]
        while True:
            if self.stop_requested or (self.global_t > MAX_TIME_STEP):
                break
            diff_global_t = actor_thread.process(
                self.sess, self.global_t,
                self.summary_writer, self.summary_op,
                self.reward_input, self.time_input
            )

            self.global_t += diff_global_t
            if self.global_t % 1000000 < LOCAL_T_MAX:
                self.backup()
            # print 'global_t:', self.global_t
        return

    def train_function(self, index, lock):
        batch_state = []
        batch_action = []
        batch_td = []
        batch_R = []

        while True:
            if self.stop_requested or (self.global_t > MAX_TIME_STEP):
                break
            data = self.rq.get()
            (state, action, td, R) = cPickle.loads(data)

            batch_state.append(state)
            batch_action.append(action)
            batch_td.append(td)
            batch_R.append(R)

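            # keep filling the batch; train only once BATCH_SIZE samples are queued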
            if len(batch_R) < BATCH_SIZE:
                continue

            lock.acquire()
            self.sess.run(self.apply_gradients, feed_dict={
                self.global_network.state_input: batch_state,
                self.global_network.action_input: batch_action,
                self.global_network.td: batch_td,
                self.global_network.R: batch_R,
                self.learning_rate_input: self.initial_learning_rate
            })
            self.train_count += 1
            lock.release()

            batch_state = []
            batch_action = []
            batch_td = []
            batch_R = []

            print('train_index:', index, 'train_count:', self.train_count)
        return

    def signal_handler(self, signal_, frame_):
        print('You pressed Ctrl+C!')
        self.stop_requested = True
        return

    def run(self):
        predict_threads = []
        for i in range(PARALLEL_SIZE):
            predict_threads.append(threading.Thread(target=self.predict_function, args=(i, self.lock)))

        signal.signal(signal.SIGINT, self.signal_handler)

        for t in predict_threads:
            t.start()

        train_threads = []
        for i in range(TRAIN_SIZE):
            train_threads.append(threading.Thread(target=self.train_function, args=(i, self.lock)))
            train_threads[i].start()

        print('Press Ctrl+C to stop')
        signal.pause()

        print('Now saving data....')
        for t in predict_threads:
            t.join()
        for t in train_threads:
            t.join()

        self.backup()
        return
Example #32
class Zhihu_crawler():

    def __init__(self, url):
        self.queue = RedisQueue('zhihu', host='localhost', port=6379, db=0)
        self.url = url
        self.headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
                        , "Host":"www.zhihu.com"
                        , "Refer":"www.zhihu.com"
                        , "Accept-Language":"zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4"
                        , "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
                        , "Accept-Encoding":"gzip, deflate, sdch, br"
                        , "Cache-Control":"max-age=0"
                        , "Connection":"keep-alive"}

        #cookie
        self.cookies={"_zap":"aaf2a75d-0a1b-4863-b8a0-23ff0f4a9002"
                    , "_za":"e73a8db5-0824-4c36-b6a2-7a5378a046f7"
                    , "udid":'"AFAAY31blAmPTta9QIqu7S6lUdEK97RWDgg=|1457941793"'
                    , "d_c0":'"AGBAzqyTowmPTpYh7UrYZSjcr43LFX006Tw=|1461248461"'
                    , "_zap":"267bc327-098d-4d7c-85cb-3cfd13cd2e8e"
                    , "q_c1":"3b3a3dccecf1499ea32a0b2da9be35ec|1470149980000|1445741536000"
                    , "_xsrf":"8a812fd7745e54a8e8ab4ed815fa9001"
                    , "l_cap_id":'"YzQ3YzNhNzUxZjBlNDAzNTgwM2FhNzdlODI5NjAxZjY=|1472298711|d67a5a1c7e5fb41cfe2715e389c74ebc6132007d"'
                    , "cap_id":'"ZGQwYTE0MTM3ODk0NDUzOGFkM2RiNGYxYTNmYTc1YTM=|1472298711|8fd9f406e4786a9ca56227b61e7c6a2a5c0f4b42"'
                    , "login":'******'
                    , "n_c":'1'
                    , "s-t":"autocomplete"
                    , "s-q":"volley%2Cretrofit%2Cokhttp"
                    , "s-i":"1"
                    , "sid":"6vahoruo"
                    , "a_t":'"2.0AEAAukjbcgoXAAAATjPpVwBAALpI23IKAGBAzqyTowkXAAAAYQJVTfYL6VcAoZ3PJyuvTIR4Yl3RS9B_tCnMwHxnX7iDfjl2Ve7xk-Nk6RdV68h4_A=="'
                    , "z_c0":"Mi4wQUVBQXVramJjZ29BWUVET3JKT2pDUmNBQUFCaEFsVk45Z3ZwVndDaG5jOG5LNjlNaEhoaVhkRkwwSC0wS2N6QWZB|1472308814|21bb41cc3844239f4582374fc850ced4a5e8c564"
                    , "__utma":"51854390.226515891.1472287250.1472298703.1472307196.4"
                    , "__utmc":"51854390"
                    , "__utmz":"51854390.1472296126.2.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)"
                    , "__utmv":"51854390.100--|2=registration_date=20160827=1^3=entry_date=20151025=1"}

    def send_request(self):
        # URL of the user's followees
        followees_url = self.url + '/followees'

        session = requests.session()
        session.proxies = {
            "http": "http://124.88.67.17.251:8685",
            "https": "http://223.67.136.218:8920",
        }

        # send the request (certificate verification is left enabled)
        r = requests.get(followees_url, cookies=self.cookies, headers=self.headers, verify=True)

        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            print(str(e) + ' HttpError')
        except requests.ConnectionError as e:
            print(str(e))

        content = r.text
        if r.status_code == requests.codes.ok:
            self.parse_users_content(content)
            print("requests success!")

    # return the first match if the data exists, otherwise an empty string
    def judge_data_have(self, name, datas):
        if datas:
            # print(datas[0])
            return datas[0]
        else:
            # print(name + " not exist!")
            return ''

    # parse the profile data
    def parse_users_content(self, html_source):
        # initialize the fields we want to extract
        self.user_name=''
        self.user_gender=''
        self.user_location=''
        self.user_followees=''
        self.user_followers=''
        self.user_be_agreed=''
        self.user_be_thanked=''
        self.user_education_school=''
        self.user_education_subject=''
        self.user_employment=''
        self.user_employment_extra=''
        self.user_intro=''
        self.followees_urls=''

        tree = etree.HTML(html_source)

        self.user_name = self.judge_data_have("name", tree.xpath('//a[@class = "name"]/text()'))
        self.user_location = self.judge_data_have("location", tree.xpath('//span[@class = "location item"]/@title'))
        self.user_gender = self.judge_data_have("gender", tree.xpath('//span[@class = "item gender"]/i/@class'))
        if self.user_gender:
            if 'female' in self.user_gender:
                self.user_gender = 'female'
            elif 'male' in self.user_gender:
                self.user_gender = 'male'

        followees = tree.xpath('//div[@class = "zu-main-sidebar"]//strong/text()')
        if followees:
            self.user_followees = followees[0]
            self.user_followers = followees[1]

        stats = tree.xpath('//div[@class = "zm-profile-header-info-list"]//strong/text()')
        if stats:
            self.user_be_agreed = stats[0]
            self.user_be_thanked = stats[1]

        self.user_education_school = self.judge_data_have("school", tree.xpath('//span[@class = "education item"]/a/@title'))
        self.user_education_subject = self.judge_data_have("subject", tree.xpath('//span[@class = "education-extra item"]/a/@title'))
        self.user_employment = self.judge_data_have("company", tree.xpath('//span[@class = "employment item"]/@title'))
        self.user_employment_extra = self.judge_data_have("position", tree.xpath('//span[@class = "position item"]/@title'))
        self.user_intro = self.judge_data_have("bio", tree.xpath('//div[@class = "bio ellipsis"]/@title'))

        # push the followee URLs onto the queue
        self.followees_urls = tree.xpath('//a[@class = "zg-link author-link"]/@href')
        for url in self.followees_urls:
            #url = url.replace("https", "http")
            self.queue.put(url)

        self.print_data_out()

    # print the collected profile
    def print_data_out(self):
        print("*" * 60)
        print("user name: %s" % self.user_name)
        print("gender: %s" % self.user_gender)
        print("location: %s" % self.user_location)
        print("upvotes received: %s" % self.user_be_agreed)
        print("thanks received: %s" % self.user_be_thanked)
        print("followers: %s" % self.user_followers)
        print("followees: %s" % self.user_followees)
        print("employment: %s/%s" % (self.user_employment, self.user_employment_extra))
        print("education: %s/%s" % (self.user_education_school, self.user_education_subject))
        print("bio: %s" % self.user_intro)
        print("*" * 60)

        self.save_in_mongodb()

    # save the record into MongoDB
    def save_in_mongodb(self):
        new_data = Zhihu_User_Data(
            user_name = self.user_name,
            user_gender = self.user_gender,
            user_location = self.user_location,
            user_followees = self.user_followees,
            user_followers = self.user_followers,
            user_be_agreed = self.user_be_agreed,
            user_be_thanked = self.user_be_thanked,
            user_education_school = self.user_education_school,
            user_education_subject = self.user_education_subject,
            user_employment = self.user_employment,
            user_employment_extra = self.user_employment_extra,
            user_intro = self.user_intro,
            followees_urls = self.followees_urls
        )
        new_data.save()


    # expose the queue so the caller can pull the next URLs
    def get_queue(self):
        return self.queue
Example #33
def push_to_queue(queue_name, items):
    queue = RedisQueue(queue_name)
    for item in items:
        queue.put(item)
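
A note on RedisQueue itself: every example above imports it from elsewhere (e.g. "from redis_queue import RedisQueue" in Examples #28 and #29), and its constructor signature clearly differs between projects. For orientation only, here is a minimal sketch of the name/namespace/host/port/db variant seen in Examples #3, #15, and #33, written against the redis-py package. The method bodies are assumptions reconstructed from the calls the examples make, not the implementation used by any of these projects.

import redis


class RedisQueue(object):
    """Minimal FIFO queue on a Redis list (sketch, not a reference implementation)."""

    def __init__(self, name, namespace='queue', host='localhost', port=6379,
                 db=0, **redis_kwargs):
        # extra kwargs such as decode_responses=True (Example #3) pass straight to redis-py
        self._db = redis.Redis(host=host, port=port, db=db, **redis_kwargs)
        self.key = '%s:%s' % (namespace, name)

    def put(self, item):
        # append the item to the tail of the list
        self._db.rpush(self.key, item)

    # alias matching Examples #3 and #5
    enqueue = put

    def get(self, block=True, timeout=None):
        # pop from the head; BLPOP parks the consumer inside Redis until an item
        # arrives (timeout=0 means wait forever), so no busy-polling is needed
        if block:
            item = self._db.blpop(self.key, timeout=timeout or 0)
            if item is not None:
                item = item[1]  # blpop returns a (key, value) pair
        else:
            item = self._db.lpop(self.key)
        return item

    # alias matching Example #3
    wait_and_dequeue = get

    def get_nowait(self):
        # non-blocking variant used by Example #19
        return self.get(block=False)

The blocking BLPOP path is presumably what lets loops like the one in Example #3 call wait_and_dequeue() repeatedly without spinning; richer variants such as put_and_trim (Examples #4 and #9) or safe_pop/mark_done with a task timeout (Example #24) would need extra commands (LTRIM, a per-task in-flight record) on top of this core.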