def main(HOST=MiningTCPServer.DefaultListenAddr,
         PORT=MiningTCPServer.DefaultListenPort):
    # The manager runs crawler processes in the background; max_procss is
    # the parameter spelling defined by SiteCheckProcessManager.
    additional = SiteCheckProcessManager(max_procss=2)
    additional.start()
    print("slave running at: " + HOST + " port: " + str(PORT))
    # Create the server, binding to HOST:PORT (the class defaults unless
    # overridden) and sharing the process manager with every handler.
    server = MiningTCPServer((HOST, PORT), MiningRequestHandler, arg=additional)
    server.serve_forever()
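
# The snippet above assumes MiningTCPServer exposes the extra `arg` to its
# request handlers as `self.server.addtional_obj` (the attribute name used
# by the handler below). A minimal sketch of how such a server could wrap
# the standard library; the body here is an assumption, not the project's
# actual implementation:
import socketserver

class MiningTCPServer(socketserver.ThreadingTCPServer):
    DefaultListenAddr = "0.0.0.0"  # assumed defaults; the original comment
    DefaultListenPort = 9999       # mentions port 9999
    allow_reuse_address = True

    def __init__(self, server_address, handler_cls, arg=None):
        super().__init__(server_address, handler_cls)
        # Stored under the attribute name the request handler reads.
        self.addtional_obj = arg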
 def testSlaveObj(self):
     site_list = [
         "http://www.bbc.co.uk/",
         "techcrunch.com",
         "mashable.com/category/tech/",
         "www.techradar.com/",
         "www.independent.co.uk/life-style/gadgets-and-tech/",
         "www.theguardian.com/uk/technology",
         "www.septitech.com/",
         "www.emosaustin.com/tech-n9ne-special-effects-tour",
     ]
     manager = SiteCheckProcessManager(job_name="Test",
                                       concurrent_page=20,
                                       max_page_per_site=100)
     manager.put_to_input_queue(site_list)
     manager.start()
     # Keep the test process alive so the manager's workers can run.
     while True:
         time.sleep(1)
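     # The unbounded loop above keeps the test alive forever. If the manager
     # shuts itself down once its input queue drains (an assumption here;
     # the handler code below uses the same is_alive()/set_stop()/join()
     # calls), a bounded variant would be:
     #
     #     manager.start()
     #     while manager.is_alive():
     #         time.sleep(1)
     #     manager.join()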
    def send_and_receive(self):
        in_buffer = self.rfile
        out_buffer = self.wfile
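        # The shared SiteCheckProcessManager created in main() (attribute
        # name spelled "addtional_obj" throughout this codebase).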
        s = self.server.addtional_obj
        command = CommandProcessor.receive_command(in_buffer)
        #print("process cmd: ", command.cmd)
        if command is not None:
            reply = CommandStruct(cmd=ServerCommand.Com_ReplyOK)
            if command.cmd == ServerCommand.Com_Start:
                #print("start conversation:")
                CommandProcessor.send_command(out_buffer, reply)
            elif command.cmd == ServerCommand.Com_Stop_Mining:
                if s is not None and isinstance(s, SiteCheckProcessManager):
                    s.set_stop()
                CommandProcessor.send_command(out_buffer, reply)
            elif command.cmd == ServerCommand.Com_Setup:  # test this
                data = command.data
                if isinstance(data, SetupData):
                    cap2 = data.cap2
                    if cap2 == 0:
                        cap2 = SlaveAutoScaler(
                            data.max_page_level,
                            data.max_page_limit).get_optimal_capacity()
                    if isinstance(s,
                                  SiteCheckProcessManager):  # need to fix this
                        if s.is_alive():
                            s.set_stop()
                            s.join()
                    # elif isinstance(s, threading.Thread):
                    #     s.join(0)
                    print("init new process manager with para: ",
                          data.get_serializable(False))
                    total_memory = MachineInfo.get_memory()[0]
                    mem_limit_per_crawler = int(total_memory * 0.8 / cap2)
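                    # Sizing heuristic: reserve 80% of physical memory and
                    # split it evenly across the cap2 crawler processes.
                    # Illustrative figures (units assumed to be MB, to match
                    # the 100 MB minimum below): total_memory = 8192 and
                    # cap2 = 4 give int(8192 * 0.8 / 4) = 1638 per crawler.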
                    if mem_limit_per_crawler >= SiteCheckProcessManager.MEM_MINIMUM_REQ:
                        self.server.addtional_obj = SiteCheckProcessManager(
                            data.ref,
                            max_procss=cap2,
                            concurrent_page=data.cap3,
                            page_max_level=data.max_page_level,
                            max_page_per_site=data.max_page_limit,
                            memory_limit_per_process=mem_limit_per_crawler)
                        self.server.addtional_obj.start()
                    else:
                        reply.cmd = ServerCommand.Com_ReplyError
                        reply.data = "Not enough memory allocation for each crawler, must at least 100 Mb."
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Clear_Cache:
                if isinstance(s, SiteCheckProcessManager):  # need to fix this
                    if s.is_alive():
                        s.set_stop()
                        s.join()
                    s.clear_cache()
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Data:
                print("incoming data....")
                data = command.data
                if s is not None and isinstance(s, SiteCheckProcessManager):
                    if data.data is not None and len(data.data) > 0:
                        s.put_to_input_queue(data.data)
                else:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "no active process manager to accept the data"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Get_Data:  # test this
                if s is not None and isinstance(s, SiteCheckProcessManager):
                    rawdata = []
                    if s.get_temp_result_count() > 0:
                        rawdata += [
                            ScrapeDomainData(x.link, x.response_code)
                            for x in s.get_temp_result_and_clear()
                            if isinstance(x, OnSiteLink)
                        ]
                    #print("sending back:")
                    #print(rawdata)
                    if s.get_site_info_list_count() > 0:
                        rawdata += [
                            info for info in s.get_site_info_list_and_clear()
                            if isinstance(info, SeedSiteFeedback)
                        ]
                    data = MiningList(s.name, rawdata)
                    reply.data = data
                else:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "no active process manager to collect results from"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Status:
                #print("send back status!")
                CPU = MachineInfo.get_cpu(1)
                MEM = MachineInfo.get_memory()
                NET = MachineInfo.get_network(1)
                if s is not None and isinstance(s, SiteCheckProcessManager):
                    manager_state = s.get_state()
                    filter_progress = s.get_filter_progress()
                    status = ServerStatus(
                        wait_job=manager_state.job_wait,
                        done_job=manager_state.job_done,
                        all_job=manager_state.job_all,
                        total_page_done=manager_state.total_page_done,
                        page_per_site=manager_state.average_page,
                        result=manager_state.result_count,
                        cpu_cores=CPU[0],
                        cpu_percent=CPU[1],
                        toal_memory=MEM[0],
                        memory_percent=MEM[1],
                        net_recieved=NET[0],
                        net_send=NET[1],
                        cap_slave=1,
                        cap_process=s.max_prcess,
                        cap_concurrent_page=s.concurrent_page,
                        filter_done=filter_progress[0],
                        filter_total=filter_progress[1])
                    reply.data = status
                else:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "something is wrong with the crawler."
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Stop:
                #print("end conversation:")
                return
            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "command is not valid, please try again"
                CommandProcessor.send_command(out_buffer, reply)
            #print("finished cmd", command.cmd)
            self.send_and_receive()
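
# For context, a client-side sketch of the conversation this handler
# expects: send Com_Start, issue commands, finish with Com_Stop.
# CommandProcessor, CommandStruct, and ServerCommand come from this
# codebase; the socket plumbing and the helper below are assumptions
# about usage, not the project's actual client.
import socket

def query_slave_status(host, port):
    with socket.create_connection((host, port)) as sock:
        rfile = sock.makefile("rb")
        wfile = sock.makefile("wb")
        # Open the conversation and read the Com_ReplyOK acknowledgement.
        CommandProcessor.send_command(wfile, CommandStruct(cmd=ServerCommand.Com_Start))
        wfile.flush()  # in case send_command leaves data buffered
        CommandProcessor.receive_command(rfile)
        # Request a ServerStatus snapshot.
        CommandProcessor.send_command(wfile, CommandStruct(cmd=ServerCommand.Com_Status))
        wfile.flush()
        status_reply = CommandProcessor.receive_command(rfile)
        # Close the conversation; the handler returns without replying.
        CommandProcessor.send_command(wfile, CommandStruct(cmd=ServerCommand.Com_Stop))
        wfile.flush()
        return status_reply.data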