Example No. 1
    def create_book(self, command, counter):
        Path.reset_path()
        Debug.logger.info(u"开始制作第 {} 本电子书".format(counter))
        Debug.logger.info(u"对记录 {} 进行分析".format(command))
        task_list = CommandParser.get_task_list(command)  # parse the command

        if len(task_list) == 0:
            return

        for task in task_list:
            if not Config.debug_for_create_book:
                Worker.distribute(task)
        Debug.logger.info(u"网页信息抓取完毕")

        task_result_list = []
        for task in task_list:
            task_result = TaskResult(task)
            task_result.extract_data()
            task_result_list.append(task_result)
        Debug.logger.info(u"数据库信息获取完毕")

        #   download the images
        for task_result in task_result_list:
            task_result.download_img()
        Debug.logger.info(u"所有任务图片获取完毕")

        #   auto-split into volumes by size
        #   render HTML && compress into an e-book
        book = Book(task_result_list)
        book_list = book.auto_split(Config.max_book_size_mb * 1024)
        for chapter in book_list:
            chapter.create_book()
        return
Example No. 2
def test_initialization():
    config = Config.from_environ()
    config['Blockchain']['GasPrice'] = 'fast'

    worker = Worker(config)
    worker.web3.eth.setGasPriceStrategy.assert_called_once_with(
        fast_gas_price_strategy)
    assert worker.dynamic_gas_price_strategy
    assert worker.gas_price == worker.web3.eth.generateGasPrice()

    config['Blockchain']['GasPrice'] = 'medium'
    worker = Worker(config)
    worker.web3.eth.setGasPriceStrategy.assert_called_once_with(
        medium_gas_price_strategy)
    assert worker.dynamic_gas_price_strategy
    assert worker.gas_price == worker.web3.eth.generateGasPrice()

    config['Blockchain']['GasPrice'] = 6000000
    worker = Worker(config)
    assert not worker.dynamic_gas_price_strategy
    assert worker.gas_price == config['Blockchain']['GasPrice']
    worker.web3.eth.setGasPriceStrategy.assert_not_called()

    config['Blockchain']['GasPrice'] = 'slow'
    with pytest.raises(Exception) as einfo:
        worker = Worker(config)
    assert str(einfo.value) == 'Invalid gas price strategy:\'slow\''

    config['Blockchain']['GasPrice'] = None
    with pytest.raises(Exception) as einfo:
        worker = Worker(config)
    assert str(einfo.value) == 'Invalid gas price strategy:None'
    def catch(account_id):
        # the key logic is here
        u"""

        :param account_id: https://xueqiu.com/4065977305
        :return:
        """
        mock_sleep_time = 0.5

        article_url_index_list = []
        #   get the maximum page number
        # url = 'http://chuansong.me/account/{}'.format(account_id)
        # front_page_content = Http.get_content(url)
        # max_page = XueQiuWorker.parse_max_page(front_page_content)

        column_info = {}
        column_info[u'column_id'] = account_id
        column_info[u'title'] = ""
        column_info['article_count'] = 0
        column_info['follower_count'] = 0
        column_info['description'] = ''
        column_info['image_url'] = ''

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        strtT = '1558513651020'

        # https://api.wallstreetcn.com/apiv1/content/themes/stream/1005680?type=newest&cursor=1558066610478&limit=20

        max_page = 2
        index_work_set = OrderedDict()
        #   collect the article URLs from each page
        for raw_front_page_index in range(1, max_page):
            resuorcecatch(account_id, strtT)
    def test23hostPermissions(self):
        worker = Worker()
        self.source += 'test23hostPermissions'
        self.destination = self.source + '_delete'
        worker.work(self.source)

        expected = 3
        actual = worker.wrapper.getManifestVersion()
        self.assertEqual(actual, expected, 'manifest_version')

        manifest = worker.wrapper.manifest

        key = 'permissions'
        self.assertIn(key, manifest)
        self.assertEqual(len(manifest[key]), 2)

        key = 'optional_permissions'
        self.assertIn(key, manifest)
        self.assertEqual(len(manifest[key]), 1)

        key = 'host_permissions'
        self.assertIn(key, manifest)
        self.assertEqual(len(manifest[key]), 2)

        shutil.rmtree(self.destination)
Example No. 5
    def distribute(self, files: List[str]) -> None:
        with Manager() as manager:
            worker = Worker(self.sender_factory, len(files), self.queue,
                            manager.Value('i', 0), manager.Value('i', 0))
            with Pool(self.processes) as pool:
                pool.map(worker.upload, files)
            self.active.value = False
            worker.notify('')
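Note: the Worker consumed here is not shown; its constructor signature matches the one in Example No. 12's setUp. A minimal sketch of what it might look like, assuming the two shared 'i' Values count successful and failed uploads and the queue carries progress messages; the sender_factory interface is a guess rather than the project's real API.

class Worker:
    """Assumed counterpart of distribute() above; not the project's actual implementation."""

    def __init__(self, sender_factory, total, queue, done, failed):
        self.sender_factory = sender_factory  # builds whatever object performs the send
        self.total = total                    # number of files expected
        self.queue = queue                    # progress messages for a consumer/UI
        self.done = done                      # shared Value('i'): successful uploads
        self.failed = failed                  # shared Value('i'): failed uploads

    def upload(self, path):
        try:
            self.sender_factory().send(path)  # hypothetical sender API
            self.done.value += 1
        except Exception:
            self.failed.value += 1
        self.notify(path)

    def notify(self, message):
        # distribute() calls notify('') once the pool is done, as an end-of-work marker
        self.queue.put(message)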
Example No. 6
    def __init__(self):
        self.log = Logger.get_logger_instance()
        self.__config = Config.get_config_instance()
        self.__worker = Worker()
        self.queue_handler = TaskHandler(
            self.__config["queue"]["job_type"],
            self.__config["queue"]["task_type"],
            self.__config["queue"]["job_manager_url"],
            self.__config["queue"]["heartbeat_manager_url"],
            self.__config["queue"]["heartbeat_interval_seconds"], self.log)
    def test_mv3_to_mv2_B(self):
        worker = Worker()
        self.source += 'tabstourls_mv3'
        self.destination = self.source + '_delete'
        worker.work(self.source)

        expected = 2
        actual = worker.wrapper.getManifestVersion()
        self.assertEqual(actual, expected, 'manifest_version')

        shutil.rmtree(self.destination)
Example No. 8
def test(unprocessed_queue, unprocessed_bucket, issued_bucket, unwrap, wrap):
    config = Config.from_environ()
    document_store_address = config['DocumentStore']['Address']
    # overriding safe VisibilityTimeout
    config['Worker']['Polling']['VisibilityTimeout'] = 1
    queue_test_wait_time_seconds = config['Worker']['Polling'][
        'VisibilityTimeout'] * 2

    document_v2 = DOCUMENT_V2_TEMPLATE.substitute(
        DocumentStoreAddress=document_store_address)
    document_v2 = json.loads(document_v2)
    wrapped_document_v2 = wrap(document_v2, '2.0')

    document_v3 = DOCUMENT_V3_TEMPLATE.substitute(
        DocumentStoreAddress=document_store_address)
    document_v3 = json.loads(document_v3)
    wrapped_document_v3 = wrap(document_v3, '3.0')

    worker = Worker(config)
    index = 1
    # checking both schema versions to test auto version definition
    for document in [document_v2, document_v3]:
        key = f'document-{index}'
        unprocessed_bucket.Object(key).put(Body=json.dumps(document))
        worker.poll()
        issued_document = json.load(issued_bucket.Object(key).get()['Body'])
        assert unwrap(issued_document) == document
        index += 1
        time.sleep(queue_test_wait_time_seconds)
    index = 1
    # checking both schema versions to test auto version definition for wrapped documents
    for document in [wrapped_document_v2, wrapped_document_v3]:
        key = f'wrapped-document-{index}'
        unprocessed_bucket.Object(key).put(Body=json.dumps(document))
        worker.poll()
        issued_document = json.load(issued_bucket.Object(key).get()['Body'])
        assert unwrap(issued_document) == unwrap(document)
        index += 1
        time.sleep(queue_test_wait_time_seconds)
    # check that all messages were processed
    assert not unprocessed_queue.receive_messages(
        WaitTimeSeconds=queue_test_wait_time_seconds,
        MaxNumberOfMessages=1,
        VisibilityTimeout=0)

    # Checking issuing already issued wrapped document
    # it should be moved to issued bucket without calling contract.issue method
    # after signature and document store verifications passed
    key = 'issued-wrapped-document'
    assert worker.is_issued_document(wrapped_document_v2)
    unprocessed_bucket.Object(key).put(Body=json.dumps(wrapped_document_v2))
    worker.poll()
    issued_document = json.load(issued_bucket.Object(key).get()['Body'])
    assert issued_document == wrapped_document_v2
    def test_mv3_to_mv2_C(self):
        worker = Worker()
        self.source += 'timebadge_mv3'
        self.destination = self.source + '_delete'
        worker.work(self.source)

        expected = 2
        actual = worker.wrapper.getManifestVersion()
        self.assertEqual(actual, expected, 'manifest_version')

        manifest = worker.wrapper.manifest
        self.assertIn('background', manifest)
        self.assertIn('scripts', manifest['background'])

        shutil.rmtree(self.destination)
Example No. 10
    def add_all(self, subject):
        if subject == self.form.ID:
            # use cached unmapped
            unmapped = self._current['unmapped']
            adder = self.wc_adder
        else:
            # use cached wc_unmapped
            unmapped = self._current['wcunmapped']
            adder = self.moein_adder
        # get checked-unmapped ids
        checked = [int(row[0]) for row in self.options_list.getChecked()]
        # filter unmapped by checked
        unmapped = [um for um in unmapped if um['id'] in checked]
        # check for unmapped
        if unmapped:
            pd = Progress(self.options_list, self.messages[8], 0,
                          len(unmapped))
            pd.show()
            worker = Worker(adder, unmapped)
            worker.signals.progress.connect(pd.setValue)
            worker.signals.error.connect(pd.close)
            worker.signals.error.connect(self.add_all_error)
            worker.signals.done.connect(self.add_all_done)
            QThreadPool.globalInstance().start(worker)
            self.options_list.btnAddAll.setDisabled(True)
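Note: this snippet and the update() in Example No. 22 both hand a callable plus a list of items to Worker, connect progress/error/done signals, and start it on the global QThreadPool. A minimal sketch of that QRunnable pattern, assuming PyQt5; the real Worker in those projects may differ.

from PyQt5.QtCore import QObject, QRunnable, pyqtSignal, pyqtSlot


class WorkerSignals(QObject):
    progress = pyqtSignal(int)
    error = pyqtSignal(str)
    done = pyqtSignal()


class Worker(QRunnable):
    def __init__(self, fn, items):
        super().__init__()
        self.fn = fn              # callable applied to each item (e.g. an adder/updater)
        self.items = items
        self.signals = WorkerSignals()

    @pyqtSlot()
    def run(self):
        try:
            for count, item in enumerate(self.items, start=1):
                self.fn(item)
                self.signals.progress.emit(count)
        except Exception as e:
            self.signals.error.emit(str(e))
        else:
            self.signals.done.emit()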
Example No. 11
    def run(self):
        for item in self.target:
            logger.debug('start ' + item.__name__)
            worker = Worker(item)
            self.pool.start(worker)
        self.pool.waitForDone()
        logger.debug('thread finished')
Example No. 12
    def setUp(self) -> None:
        self.manager = Manager()
        self.queue = self.manager.Queue()
        self.done = self.manager.Value('i', 0)
        self.failed = self.manager.Value('i', 0)
        self.worker = Worker(MockSenderFactory(), len(self.files), self.queue,
                             self.done, self.failed)
    def test23contentSecurityPolicy(self):
        worker = Worker()
        self.source += 'test23contentSecurityPolicy'
        self.destination = self.source + '_delete'
        worker.work(self.source)

        expected = 3
        actual = worker.wrapper.getManifestVersion()
        self.assertEqual(actual, expected, 'manifest_version')

        manifest = worker.wrapper.manifest
        key = 'content_security_policy'
        self.assertIn(key, manifest)
        self.assertIn('extension_pages', manifest[key])
        self.assertIn('sandbox', manifest[key])

        shutil.rmtree(self.destination)
Example No. 14
    def create_book(self, command, counter):
        Path.reset_path()
        Debug.logger.info(u"开始制作第 {} 本电子书".format(counter))
        Debug.logger.info(u"对记录 {} 进行分析".format(command))
        task_list = CommandParser.get_task_list(command)  # parse the command

        if len(task_list) == 0:
            return

        for task in task_list:
            if not Config.debug_for_create_book:
                Worker.distribute(task)
        Debug.logger.info(u"网页信息抓取完毕")

        task_result_list = []
        toTo_list = [
            Type.wechat, Type.huxiu, Type.huawei, Type.xueqiu, Type.sina,
            Type.zhengshitang, Type.jinwankansa, Type.wuxia, Type.doc360,
            Type.todo, Type.todo1, Type.todo2, Type.fiel, Type.taoguba_article
        ]
        for task in task_list:
            if task.get_task_type() in toTo_list:
                task = ColumnTask(task.account_id)
            task_result = TaskResult(task)
            task_result.extract_data()
            task_result_list.append(task_result)
        Debug.logger.info(u"数据库信息获取完毕")

        #   download the images
        for task_result in task_result_list:
            task_result.download_img()
            # print '所有任务图片获取完毕'
        Debug.logger.info(u"所有任务图片获取完毕")

        #   auto-split into volumes by size
        #   render HTML && compress into an e-book
        book = Book(task_result_list)
        book_list = book.auto_split(Config.max_book_size_mb * 1024)
        for chapter in book_list:
            chapter.create_book()

        return
    def test23executeScript(self):
        worker = Worker()
        self.source += 'test23executeScript'
        self.destination = self.source + '_delete'
        worker.work(self.source)

        expected = 3
        actual = worker.wrapper.getManifestVersion()
        self.assertEqual(actual, expected, 'manifest_version')

        manifest = worker.wrapper.manifest
        self.assertIn('background', manifest)
        self.assertIn('service_worker', manifest['background'])
        self.assertEqual(manifest['background']['service_worker'],
                         'service_worker.js')

        self.assertIn('permissions', manifest)
        self.assertIn('scripting', manifest['permissions'])

        shutil.rmtree(self.destination)
    def test_mv2_C(self):
        worker = Worker()
        self.source += 'backgroundScripts_mv2'
        self.destination = self.source + '_delete'
        worker.work(self.source)

        expected = 3
        actual = worker.wrapper.getManifestVersion()
        self.assertEqual(actual, expected, 'manifest_version')

        manifest = worker.wrapper.manifest
        self.assertIn('background', manifest)
        self.assertIn('service_worker', manifest['background'])
        self.assertEqual(manifest['background']['service_worker'],
                         'service_worker.js')
        self.assertFalse(
            os.path.exists(worker.wrapper.destination + os.sep + 'script1.js'))
        self.assertFalse(
            os.path.exists(worker.wrapper.destination + os.sep + 'script2.js'))

        shutil.rmtree(self.destination)
Example No. 17
def main():
    # PBT_Quadratic Environment initialization
    convergenceTolerance = 10e-4  # convergence threshold (i.e. 1e-3)
    maxStep = 30
    # updateInterval = 4 # every 4 iteration, do an update
    init_theta = [0.9, 0.9]  # set initial weights
    #create the worker population
    numOfWorkers = 2
    init_hyperParam = [[0, 1], [1, 0]]
    worker_list = [
        Worker(init_theta, init_hyperParam[i]) for i in range(numOfWorkers)
    ]
    run1 = train(worker_list, step, eval, ready, exploit, explore, lossFunc,
                 convergenceTolerance, maxStep)

    # Visualization

    # def plot_value(run, i, steps, title):
    #     plt.subplot(2, 4, i)
    #     plt.plot(run[0].eval_history, color='b', lw=0.7)
    #     plt.plot(run[1].eval_history, color='r', lw=0.7)
    #     plt.axhline(y=1.2, linestyle='dotted', color='k')
    #     axes = plt.gca()
    #     axes.set_xlim([0, steps])
    #     axes.set_ylim([0.0, 1.21])
    #
    #     plt.title(title)
    #     plt.xlabel('Step')
    #     plt.ylabel('Q')
    #     return

    def plot_theta(run, i, steps, title):
        x_b = [_[0] for _ in run[0].theta_history]
        y_b = [_[1] for _ in run[0].theta_history]

        x_r = [_[0] for _ in run[1].theta_history]
        y_r = [_[1] for _ in run[1].theta_history]

        plt.subplot(2, 4, i)
        plt.scatter(x_b, y_b, color='b', s=2)
        plt.scatter(x_r, y_r, color='r', s=2)

        plt.title(title)
        plt.xlabel('theta0')
        plt.ylabel('theta1')
        return

    plot_theta(run1, 1, steps=maxStep, title='PBT')
    plt.show()
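Note: step, eval, ready, exploit, explore and lossFunc are defined elsewhere in that project. The dotted reference line at 1.2 and the Q axis suggest the standard quadratic PBT toy problem, so here is a hedged sketch of what eval and lossFunc could look like under that assumption; it is not necessarily what this repository uses.

def eval(theta):
    # true objective Q(theta) = 1.2 - theta0^2 - theta1^2, maximised at theta = [0, 0]
    # (the name mirrors the one passed to train() above, even though it shadows the builtin)
    return 1.2 - theta[0] ** 2 - theta[1] ** 2


def lossFunc(theta, hyperParam):
    # surrogate objective a worker actually trains on, weighted by its hyperparameters
    return 1.2 - hyperParam[0] * theta[0] ** 2 - hyperParam[1] * theta[1] ** 2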
Example No. 18
    def start(self):
        #   check for updates
        self.check_update()

        #   log in
        login = Login()
        zhihu_client = login.get_login_client()
        Worker.set_zhihu_client(zhihu_client)

        Debug.logger.info(u"开始读取ReadList.txt设置信息")

        if not Path.is_file(u'./ReadList.txt'):
            #  create ReadList.txt automatically when it does not exist
            with open(u'./ReadList.txt', u'w') as read_list:
                read_list.close()
            Debug.logger.info(u"ReadList.txt 内容为空,自动退出")
            return
        book_counter = self.read_list()

        Debug.logger.info(u"所有书籍制作完成。")
        Debug.logger.info(u"本次共制作书籍{0}本".format(book_counter))
        Debug.logger.info(u"感谢您的使用")
        Debug.logger.info(u"点按任意键退出")
        return
Example No. 19
    def run(self):
        self._create_socket()
        print(f'server started on {self.config.host}:{self.config.port}')

        workers = []
        for x in range(self.config.cpu_limit):
            w = Worker(self._sock, self.config)
            workers.append(w)
            w.start()
        try:
            for w in workers:
                w.join()
        except KeyboardInterrupt:
            for w in workers:
                w.terminate()
        finally:
            self._sock.close()
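Note: a minimal sketch of the pre-fork Worker this run() appears to start: each worker is a Process that accepts connections on the shared listening socket. The handle() body and the fork-based socket inheritance are assumptions, not the project's actual implementation.

from multiprocessing import Process


class Worker(Process):
    """Assumed shape of the pre-fork worker started above; not the project's actual class."""

    def __init__(self, sock, config):
        super().__init__()
        self.sock = sock      # shared listening socket, inherited on fork
        self.config = config

    def run(self):
        while True:
            conn, _addr = self.sock.accept()  # workers compete to accept connections
            try:
                self.handle(conn)             # hypothetical request handler
            finally:
                conn.close()

    def handle(self, conn):
        conn.sendall(b'HTTP/1.1 200 OK\r\nContent-Length: 0\r\n\r\n')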
    def test23webAccessibleResources(self):
        worker = Worker()
        self.source += 'test23webAccessibleResources'
        self.destination = self.source + '_delete'
        worker.work(self.source)

        expected = 3
        actual = worker.wrapper.getManifestVersion()
        self.assertEqual(actual, expected, 'manifest_version')

        manifest = worker.wrapper.manifest
        self.assertIn('background', manifest)
        self.assertIn('service_worker', manifest['background'])
        self.assertEqual(manifest['background']['service_worker'],
                         'service_worker.js')

        self.assertIn('permissions', manifest)
        self.assertIn('scripting', manifest['permissions'])

        key = 'web_accessible_resources'
        self.assertIn(key, manifest)
        self.assertEqual(len(manifest[key][0]['resources']), 2)

        shutil.rmtree(self.destination)
Example No. 21
def test_gas_price_update(load_unprocessed_document, generate_gas_price):

    gas_price = 111
    generate_gas_price.return_value = gas_price

    config = Config.from_environ()
    config['Blockchain']['GasPrice'] = 'fast'
    config['Blockchain']['GasPriceRefreshRate'] = 2

    key = 'document-key'
    document = DOCUMENT_V2_TEMPLATE.substitute(
        DocumentStoreAddress=config['DocumentStore']['Address'])
    document = json.loads(document)
    load_unprocessed_document.return_value = key, document

    message = mock.Mock()
    message.body = json.dumps({'Records': [{}]})

    worker = Worker(config)

    worker.web3.eth.sendRawTransaction.return_value = b'transaction-hash'
    worker.web3.eth.waitForTransactionReceipt().status = 1
    assert worker.gas_price == gas_price

    # testing transaction timeout causing gas price increase
    generate_gas_price.reset_mock()
    worker.web3.eth.waitForTransactionReceipt.side_effect = TimeExhausted
    assert not worker.process_message(message)
    generate_gas_price.assert_not_called()
    assert worker.gas_price == int(gas_price * 1.1)

    # testing gas price refresh
    generate_gas_price.reset_mock()
    worker.web3.eth.waitForTransactionReceipt.side_effect = None
    for i in range(config['Blockchain']['GasPriceRefreshRate']):
        assert worker.process_message(message)
    generate_gas_price.assert_called_once()

    # testing no refresh on static gas price
    config['Blockchain']['GasPrice'] = 20
    worker = Worker(config)

    worker.web3.eth.sendRawTransaction.return_value = b'transaction-hash'
    worker.web3.eth.waitForTransactionReceipt().status = 1

    worker.web3.reset_mock()
    for i in range(config['Blockchain']['GasPriceRefreshRate']):
        assert worker.process_message(message)
    worker.web3.eth.generateGasPrice.assert_not_called()
Example No. 22
    def update(self):
        conn = connection.get()
        self.model.set_connection(conn)
        try:
            mapped = self.model.mapped(update_required=True)
        except Exception as e:
            msg = Message(self.ui, Message.ERROR, self.messages[4], str(e))
            msg.show()
        else:
            if mapped:
                pd = Progress(self.ui, self.messages[9], 0, len(mapped))
                pd.show()
                worker = Worker(self.updater, mapped)
                worker.signals.progress.connect(pd.setValue)
                worker.signals.error.connect(pd.close)
                worker.signals.error.connect(self.update_error)
                worker.signals.done.connect(self.update_done)
                QThreadPool.globalInstance().start(worker)
                self.tab.btnUpdate.setDisabled(True)
        finally:
            conn.close()
Example No. 23
    def catch(account_id):
        # the key logic is here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   get the maximum page number

        column_info = FileColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "毛泽东军事文选"

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        star_page = 0
        max_page = 1

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # collect the article URLs from each page

        path = '/Users/ink/Desktop/ht'

        file_list = os.listdir(path)  # list all files and directories under the folder
        for i in file_list:
            # print i

            if str(i).endswith('htm') or str(i).endswith('html'):
                filename = u'/Users/ink/Desktop/ht/{}'.format(i)
                convert_encoding(filename, 'utf-8')
                f = open(filename)
                contents = f.read()
                # print(contents)
                # gb2312 conversion
                article_info = FileArticleParser(contents).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = i
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])

                f.close()
Example No. 24
    def catch(account_id):
        # the key logic is here
        u"""

        :param account_id: https://xueqiu.com/4065977305
        :return:
        """
        mock_sleep_time = 0.5

        article_url_index_list = []
        #   get the maximum page number
        # url = 'http://chuansong.me/account/{}'.format(account_id)
        # front_page_content = Http.get_content(url)
        # max_page = XueQiuWorker.parse_max_page(front_page_content)

        # _url = "http://xueqiu.com/v4/statuses/user_timeline.json?user_id={0}&page={1}&type=2"   ('' = all, 2 = original posts, 5 = replies)
        _url = "https://xueqiu.com/v4/statuses/user_timeline.json?user_id={0}&page={1}&type=0"
        first = _url.format(account_id, 1)
        r = Http.get_json_content(first)
        max_page = 1
        try:
            jdata = json.loads(r.text, encoding='utf-8')
            max_page = jdata['maxPage'] + 1
        except KeyError as e:
            print '打开失败 >>>>>>> Cookie'
        # max_page = 1
        #   parse the page content and store it in the database
        #   a captcha may be required

        content_profile = Http.get_content(u'https://xueqiu.com/u/{}/profile'.format(account_id))

        column_info = XueQiuColumnParser(content_profile).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = ""
        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:
                    column_info[u'title'] = line.split('#')[1]
                    column_info[u'image_url'] = str(line.split('#')[2]).strip('\n')

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))

        #

        index_work_set = OrderedDict()
        #   collect the article URLs from each page
        for raw_front_page_index in range(1, max_page):
            request_url = _url.format(account_id, raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"开始抓取{countert}号文章,剩余{article_count}篇".format(countert=article_url_index,
                                                                                 article_count=len(index_work_set)))

                content = Http.get_content(request_url)
                if not content:
                    return
                jdata = json.loads(content)
                articles = jdata['statuses']
                for article in articles:
                    # print article

                    article_info = XueQiuArticleParser(article).get_article_info()
                    if len(article_info) > 0:
                        article_info['column_id'] = account_id
                        Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]

                Debug.logger.debug(u' {} 的内容抓取完成'.format(request_url))

        return
Example No. 25
def start():
    '''Start the Application'''

    log = configure_log()
    log.info('Starting Cloud Worker Node Agent')
    log.info('--------------------------')

    args = parse_args()

    settings = {'base_url': args.server,
                'secret': args.secret,
                'client_id': C.CLIENT_ID,
                'client_secret': C.CLIENT_SECRET,
                'username': C.USERNAME,
                'password': C.PASSWORD}

    server = Server(settings)

    node = Node(server)

    #Send the hostname, ip etc to the server
    node.send_info()

    #Update the node status to ready
    node.update_node_status(C.STATUS_READY)

    #Get Config
    config = Config(server, node)

    actions = Action(server, node)
    processor = Processor()
    workers = Worker(server, node, processor)
    output = Output(server, node)

    finished = False

    #Loop forever (kind of)
    while not finished:
        log.info('Looping')
        log.info('--------------------------')

        #Update last seen date
        node.update_node_date()

        #Get config
        config.refresh()

        #Get actions
        num_pending = actions.get_pending()

        #Respond to actions
        if actions.has_pending():
            message = 'Responding to %d Actions ...' % num_pending
            output.send(message)

            actions.respond_to_pending()


        #Get workers/commands
        workers.refresh()
        workers.process_workers()

        #TODO
        #Respond to/run commands
        #Send output to server


        log.info('Sleeping for %d seconds ...', 
                 config.get(C.CONFIG_POLL_PERIOD))
        time.sleep(config.get(C.CONFIG_POLL_PERIOD))
    def catch(account_id):
        # the key logic is here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   get the maximum page number
        url = 'https://www.huxiu.com/{}'.format(account_id)
        front_page_content = Http.get_content(url)

        # Config.now_id_likeName = account_id
        # Config.save()

        column_info = HuXiuColumnParser(front_page_content).get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = account_id

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        u_result = urllib.quote(
            account_id.decode(sys.stdin.encoding).encode('utf8'))
        print account_id
        max_page = 2

        idds = ''
        #
        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                split_url = line.split('#')[0]
                if split_url.split('/')[-1] == account_id:
                    # Config.now_id_likeName = line.split('#')[1]
                    max_page = int(line.split('#')[-1]) + 1
                    idds = str(line.split('#')[1])
                    print max_page
        max_page = -1
        #   parse the page content and store it in the database

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        #   collect the article URLs from each page
        for raw_front_page_index in range(0, max_page + 1):
            #https://www.huxiu.com/search.html?s=%E5%B7%B4%E8%8F%B2%E7%89%B9&sort=dateline:desc
            request_url = u'https://www.huxiu.com/search.html?s={}&sort=dateline%3Adesc&per_page={}'.format(
                u_result, raw_front_page_index)
            #request_url = u'https://www.huxiu.com/member/{}/article/{}.html'.format(idds,raw_front_page_index)
            # request_url = 'https://www.huxiu.com/member/1872007.html'
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, "lxml")

                list_pcyc_l_ = soup.find_all('li')
                # list_pcyc_l_ = soup.find_all('div',class_='mob-ctt')
                for tgo_right in list_pcyc_l_:
                    for link in tgo_right.findAll('a'):
                        hre = str(link.get('href'))
                        if hre.startswith('/article/', 0, 10):
                            print u'https://www.huxiu.com{}'.format(
                                link.get('href'))
                            article_url_index_list.append(
                                'https://www.huxiu.com{}'.format(
                                    link.get('href')))

                del index_work_set[raw_front_page_index]

        article_url_index_list.append(
            'https://www.huxiu.com/article/299355.html')

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = HuXiuArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example No. 27
    def setUp(self):
        self.workers = [
            Worker(init_hyperParam=[0, 1], init_theta=[0.9, 0.9]),
            Worker(init_hyperParam=[1, 0], init_theta=[0.5, 0.5])
        ]
Example No. 28
    def catch(account_id):
        # the key logic is here

        mock_sleep_time = 0.5
        base_sleep_time = 1
        max_sleep_time = 1

        article_url_index_list = []
        #   get the maximum page number

        column_info = WeiXinColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = account_id
        column_info[u'image_url'] = 'https://wpimg.wallstcn.com/3598b719-ab0d-4be7-bc09-30c3ae29a3cc.jpg?imageView2/1/w/240/h/240'
        max_page = 1
        # with open('ReadList.txt', 'r') as read_list:
        #     read_list = read_list.readlines()
        #     for line in read_list:
        #         split_url = line.split('#')[0]
        #         if str(split_url).__contains__(account_id):
        #             # Config.now_id_likeName = line.split('#')[1]
        #             max_page = int(line.split('#')[-1]) + 1
        #             column_info[u'title'] = str(line.split('#')[1])
        #
        #             # max_page = 1
        #             print max_page



        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))


        # article_url_index_list.append('https://mp.weixin.qq.com/s?__biz=MjM5MjczNDc0Mw==&mid=2650847984&idx=2&sn=b7b111e5964d2f2fb568ba0d419e3edf&chksm=bd55d1888a22589e2f3bab0613b346427079efc6b82fac869d4f78244a500c3e5cc8cb8402ed&scene=21#wechat_redirect')
        # article_url_index_list.append('https://mp.weixin.qq.com/s/yj1BT3jWyxLjlEnzz0vEtQ')

        with open('/Users/0/Desktop/list.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                article_url_index_list.append(str(line).strip('\n'))

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                    'select count(*) as article_count from Article where article_id = "{}"'.format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(u"开始抓取  {countert} 号文章,剩余{article_count}篇".format(countert=article_url_index,
                                                                                    article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)
                time.sleep(mock_sleep_time)
                if len(request_url_content) == 0:
                    random_sleep_time = base_sleep_time + random.randint(0, max_sleep_time) / 100.0
                    Debug.logger.info(u"随机休眠{}秒".format(random_sleep_time))
                    time.sleep(random_sleep_time)
                    continue
                #article_info = Todo2ArticleParser(request_url_content).get_article_info()
                # article_info = HuXiuArticleParser(request_url_content).get_article_info()
                article_info = WeiXinArticleParser(request_url_content).get_article_info()
                # article_info = WallStreetArticleParser(request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
    def catch(account_id):
        # the key logic is here

        article_url_index_list = []
        #   get the maximum page number
        url = 'http://www.taoguba.com.cn/Article/' + account_id + '/1'
        front_page_content = Http.get_content(url)
        star_page = 1

        with open('ReadList.txt', 'r') as read_list:
            read_list = read_list.readlines()
            for line in read_list:
                if str(line).__contains__('#'):
                    split_url = line.split('#')[0]
                    trgId = split_url.split('/')[-2]
                    if trgId == account_id:
                        pg = (split_url.split('/')[-1])
                        print pg
                        star_page = int(pg)

                        if star_page == 0:
                            star_page = 1
                        else:
                            print star_page

        max_page = 2
        dom = BeautifulSoup(front_page_content, "lxml")
        list_pcyc_l_ = dom.find_all('div', class_="left t_page01")
        try:
            for tgo_tgo_ in list_pcyc_l_:
                linkl = tgo_tgo_.findAll('a')
                tarUrl = linkl[0].get('href')
                max_page = int(tarUrl.split('/')[3])

        except IndexError as e:
            max_page = 1
        column_info = TGBColumnParser(front_page_content).get_column_info()

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # collect the article URLs from each page
        # star_page = 100
        for raw_front_page_index in range(star_page, max_page + 1):
            request_url = 'http://www.taoguba.com.cn/Article/' + account_id + '/' + str(
                raw_front_page_index)
            article_url_index_list.append(request_url)

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = TGBArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example No. 30
if __name__ == '__main__':
    PARSER = argparse.ArgumentParser(
        description='Client tool for DoEnjoy mini-Spark')
    subparsers = PARSER.add_subparsers()
    init_wordcount_client_parser(subparsers)
    init_pagerank_client_parser(subparsers)
    init_wordcount_streaming_client_parser(subparsers)
    init_master_parser(subparsers)
    init_worker_parser(subparsers)

    ARGS = PARSER.parse_args()
    if ARGS.action == 'master':
        master = Master(ARGS.port, ARGS.debug)
        master.run()
    elif ARGS.action == 'worker':
        worker = Worker(ARGS.master_address, ARGS.self_address, ARGS.debug)
        worker.run()
    elif ARGS.action == 'page_rank':
        page_rank_client = PageRankClient(ARGS.file_path, ARGS.iterative)
        client = get_client(ARGS.master_address)
        execute_command(client, client.get_job,
                        pickle_object(page_rank_client), ARGS.self_address)
        page_rank_client.start_server("0.0.0.0")
    elif ARGS.action == 'word_count':
        word_count_client = WordCountClient(ARGS.file_path)
        client = get_client(ARGS.master_address)
        execute_command(client, client.get_job,
                        pickle_object(word_count_client), ARGS.self_address)
        word_count_client.start_server("0.0.0.0:" +
                                       ARGS.self_address.split(":")[1])
    elif ARGS.action == 'wordcount_streaming':
Example No. 31
    def catch(account_id):
        # the key logic is here

        mock_sleep_time = 0.5
        base_sleep_time = 10
        max_sleep_time = 10

        article_url_index_list = []
        #   get the maximum page number

        column_info = Todo3ColumnParser('').get_column_info()
        column_info[u'column_id'] = account_id
        column_info[u'title'] = "新能源汽车"
        column_info['article_count'] = 0
        column_info['follower_count'] = 0
        column_info['description'] = ''
        column_info['image_url'] = ''

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])
        star_page = 1
        max_page = 1

        from src.worker import Worker
        Worker.save_record_list(u'Column', [column_info])

        Debug.logger.info(u"最大页数抓取完毕,共{max_page}页".format(max_page=max_page))
        index_work_set = OrderedDict()
        # collect the article URLs from each page
        for raw_front_page_index in range(star_page, max_page):
            request_url = u'https://post.smzdm.com/fenlei/xinnengyuanche/p{}/'.format(
                raw_front_page_index)
            index_work_set[raw_front_page_index] = request_url

        re_catch_counter = 0
        catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for raw_front_page_index in index_work_set:
                catch_counter += 1
                Debug.logger.info(u'第『{}』遍抓取数据'.format(re_catch_counter))
                request_url = index_work_set[raw_front_page_index]
                Debug.logger.info(
                    u"开始抓取第{raw_front_page_index}页中的文章链接,剩余{max_page}页".format(
                        raw_front_page_index=raw_front_page_index,
                        max_page=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                soup = BeautifulSoup(request_url_content, 'lxml')
                list_p_list = soup.find_all('div',
                                            class_='list-border clearfix')
                for p in list_p_list:
                    # print p
                    list_pcyc_li = p.find_all('a')
                    li = list_pcyc_li[0]

                    tarUrl = li.get('href')
                    ttt = str(tarUrl).split("#")[-1]
                    print ttt
                    if ttt is not None:
                        article_url_index_list.append(ttt)

                del index_work_set[raw_front_page_index]

        article_count = len(article_url_index_list)
        Debug.logger.info(u"文章链接抓取完毕,共{article_count}篇文章待抓取".format(
            article_count=article_count))

        index_work_set = OrderedDict()
        for article_url_index in article_url_index_list:
            print 'query : ' + article_url_index
            article_db = DB.query_row(
                'select count(*) as article_count from Article where article_id = "{}"'
                .format(article_url_index))
            if article_db['article_count'] > 0:
                continue

            request_url = article_url_index

            index_work_set[article_url_index] = request_url

        re_catch_counter = 0
        while len(index_work_set) > 0 and re_catch_counter <= 20:
            re_catch_counter += 1
            for article_url_index in index_work_set:
                request_url = index_work_set[article_url_index]
                Debug.logger.info(
                    u"开始抓取{countert}号文章,剩余{article_count}篇".format(
                        countert=article_url_index,
                        article_count=len(index_work_set)))
                request_url_content = Http.get_content(request_url)

                article_info = Todo3ArticleParser(
                    request_url_content).get_article_info()
                if len(article_info) > 0:
                    article_info['article_id'] = article_url_index
                    article_info['column_id'] = account_id
                    Worker.save_record_list(u'Article', [article_info])
                del index_work_set[article_url_index]
        return
Example No. 32
if __name__ == '__main__':
    PARSER = argparse.ArgumentParser(
        description='Client tool for DoEnjoy mini-Spark')
    subparsers = PARSER.add_subparsers()
    init_wordcount_client_parser(subparsers)
    init_pagerank_client_parser(subparsers)
    init_wordcount_streaming_client_parser(subparsers)
    init_master_parser(subparsers)
    init_worker_parser(subparsers)

    ARGS = PARSER.parse_args()
    if ARGS.action == 'master':
        master = Master(ARGS.port, ARGS.debug)
        master.run()
    elif ARGS.action == 'worker':
        worker = Worker(ARGS.master_address, ARGS.self_address, ARGS.debug)
        worker.run()
    elif ARGS.action == 'page_rank':
        page_rank_client = PageRankClient(ARGS.file_path, ARGS.iterative)
        client = get_client(ARGS.master_address)
        execute_command(client,
                        client.get_job,
                        pickle_object(page_rank_client),
                        ARGS.self_address)
        page_rank_client.start_server("0.0.0.0")
    elif ARGS.action == 'word_count':
        word_count_client = WordCountClient(ARGS.file_path)
        client = get_client(ARGS.master_address)
        execute_command(client,
                        client.get_job,
                        pickle_object(word_count_client),