def get_recovery_dir_path(ref: str = ""):
    machine_type = MachineInfo.get_machine_type()
    if machine_type == MachineType.Windows:
        temp_loc = "D:/SQLiteDB/Temp/Recovery/"
    else:
        temp_loc = "/usr/local/DomainFinder/Recovery/"
    return temp_loc + ref
def get_proxy_file_path():
    machine_type = MachineInfo.get_machine_type()
    if machine_type == MachineType.Windows:
        temp_loc = "D:/SQLiteDB/Temp/Proxy/proxy_list.csv"
    else:
        temp_loc = "/usr/local/DomainFinder/Database/proxy_list.csv"
    return temp_loc
def get_marketplace_db_path(ref: str = ""):
    machine_type = MachineInfo.get_machine_type()
    if machine_type == MachineType.Windows:
        temp_loc = "D:/SQLiteDB/Temp/Marketplace/"
    else:
        temp_loc = "/usr/local/DomainFinder/Database/Marketplace/"
    return temp_loc + ref
def get_task_backup_dir(ref: str = ""):
    machine_type = MachineInfo.get_machine_type()
    if machine_type == MachineType.Windows:
        temp_loc = "D:/SQLiteDB/Temp/Task/"
    else:
        temp_loc = "/usr/local/DomainFinder/Task/"
    return temp_loc + ref
def get_db_buffer_default_dir():
    machine_type = MachineInfo.get_machine_type()
    if machine_type == MachineType.Windows:
        temp_loc = "D:/SQLiteDB/Temp/DatabaseBuf/"
    else:
        temp_loc = "/tmp/DatabaseBuf/"
    return temp_loc
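# A minimal usage sketch for the per-OS path helpers above. MachineInfo and
# MachineType come from the surrounding project; the file name passed as ref
# here is purely illustrative, and note that none of the helpers create the
# directories they point at.
import os
def _example_path_helper_usage():
    recovery_db = get_recovery_dir_path("session_backup.db")  # e.g. /usr/local/DomainFinder/Recovery/session_backup.db on Linux
    buffer_dir = get_db_buffer_default_dir()
    os.makedirs(buffer_dir, exist_ok=True)  # callers must create directories themselves
    return recovery_db, get_proxy_file_path(), buffer_dir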
 def run(self):
     # poll the monitored process's memory usage until stopped externally
     #print("running memory monitor")
     while True:
         if self._stop_event.is_set():
             #print("external stop")
             break
         else:
             mem = MachineInfo.get_memory_process(self._pid)
             #print("process use: ", mem, " MB")
             if mem > self._mem_limit:
                 self._exceed_limit = True
                 self._callback(True)
             time.sleep(self._wait)
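# A sketch of the thread class that run() above plausibly belongs to: a
# daemon thread polling one process's memory footprint. The attribute names
# (_pid, _mem_limit, _wait, _stop_event, _callback, _exceed_limit) are
# inferred from run() itself; the constructor signature is an assumption.
import threading
class MemoryMonitorSketch(threading.Thread):
    def __init__(self, pid, mem_limit_mb, callback, wait=1.0):
        super().__init__(daemon=True)
        self._pid = pid                     # process to watch
        self._mem_limit = mem_limit_mb      # threshold in MB, compared against get_memory_process()
        self._callback = callback           # invoked with True once the limit is exceeded
        self._wait = wait                   # polling interval in seconds
        self._stop_event = threading.Event()
        self._exceed_limit = False
    def stop(self):
        self._stop_event.set()              # run() checks this flag and breaks out of its loop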
 def get_default_address(source_type: str):
     DB_prefix = "/usr/local/DomainFinder/Database"  # this is for Linux
     if MachineInfo.get_machine_type() == MachineType.Windows:
         DB_prefix = "D:/SQLiteDB"
     if source_type == SiteSource.Seed:
         return DB_prefix + "/SeedSitesList"
     elif source_type == SiteSource.AllExternal:
         return DB_prefix + "/ResultSitesList"
     elif source_type == SiteSource.Flitered:
         return DB_prefix + "/FilteredSitesList"
     elif source_type == SiteSource.Filtered_bad:
         return DB_prefix + "/FilteredSitesList_Bad"
     else:
         return ":memory:"
def get_spam_filter_keywords_file_path():
    keyword_file = "keywords.txt"
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'D:/SQLiteDB/SpamFilter/' + keyword_file
    else:
        return '/usr/local/DomainFinder/SpamFilter/' + keyword_file
def get_spam_filter_anchors_file_path():
    anchor_file = "anchors.txt"
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'D:/SQLiteDB/SpamFilter/' + anchor_file
    else:
        return '/usr/local/DomainFinder/SpamFilter/' + anchor_file
def get_download_file_path():
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'D:/ChromeDownload/'
    else:
        return '/tmp/download/'
def get_chrome_exe_path():
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'C:/WebDrivers/chromedriver.exe'
    else:
        return '/usr/lib/chromium-browser/chromedriver'
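# A hedged sketch of how the two Chrome helpers above would typically be
# consumed with Selenium 3 (in Selenium 4, executable_path was replaced by a
# Service object). The download-directory pref is standard Chrome; wiring it
# this way is an assumption, not taken from the source.
from selenium import webdriver
def _make_driver_sketch():
    options = webdriver.ChromeOptions()
    options.add_experimental_option("prefs", {"download.default_directory": get_download_file_path()})
    return webdriver.Chrome(executable_path=get_chrome_exe_path(), options=options)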
 def get_optimal_capacity(self) -> int:
     mem_per_process = self.maxpagelimit * 50/1000  # assume roughly 0.05 MB of memory per page
     total_mem = MachineInfo.get_memory()[0]
     return int(total_mem*0.7/mem_per_process)  # budget 70% of total memory for crawler processes
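# Worked example of the sizing arithmetic above, with illustrative numbers:
# maxpagelimit = 1000 pages gives mem_per_process = 1000 * 50/1000 = 50 MB
# per crawler; on a machine reporting 8000 MB of total memory, capacity is
# int(8000 * 0.7 / 50) = 112 processes, leaving 30% of memory as headroom.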
def get_spam_filter_bad_country_path():
    country_file = "bad_country.txt"
    if MachineInfo.get_machine_type() == MachineType.Windows:
        return 'D:/SQLiteDB/SpamFilter/' + country_file
    else:
        return '/usr/local/DomainFinder/SpamFilter/' + country_file
def get_log_dir():
    WIN_PATH = "D:/SQLiteDB/Temp/Logging/"
    LINUX_PATH = "/tmp/Logging/"
    machine_type = MachineInfo.get_machine_type()
    return WIN_PATH if machine_type == MachineType.Windows else LINUX_PATH
    def send_and_receive(self):
        in_buffer = self.rfile
        out_buffer = self.wfile
        s = self.server.addtional_obj
        command = CommandProcessor.receive_command(in_buffer)
        #print("process cmd: ", command.cmd)
        if command is not None:
            reply = CommandStruct(cmd=ServerCommand.Com_ReplyOK)
            if command.cmd == ServerCommand.Com_Start:
                #print("start conversation:")
                CommandProcessor.send_command(out_buffer, reply)
            elif command.cmd == ServerCommand.Com_Stop_Mining:
                if s is not None and isinstance(s, SiteCheckProcessManager):
                    s.set_stop()
                CommandProcessor.send_command(out_buffer, reply)
            elif command.cmd == ServerCommand.Com_Setup:  # test this
                data = command.data
                if isinstance(data, SetupData):
                    cap2 = data.cap2
                    if cap2 == 0:
                        cap2 = SlaveAutoScaler(
                            data.max_page_level,
                            data.max_page_limit).get_optimal_capacity()
                    if isinstance(s,
                                  SiteCheckProcessManager):  # need to fix this
                        if s.is_alive():
                            s.set_stop()
                            s.join()
                    # elif isinstance(s, threading.Thread):
                    #     s.join(0)
                    print("init new process manager with para: ",
                          data.get_serializable(False))
                    total_memory = MachineInfo.get_memory()[0]
                    mem_limit_per_crawler = int(total_memory * 0.8 / cap2)
                    if mem_limit_per_crawler >= SiteCheckProcessManager.MEM_MINIMUM_REQ:
                        self.server.addtional_obj = SiteCheckProcessManager(
                            data.ref,
                            max_procss=cap2,
                            concurrent_page=data.cap3,
                            page_max_level=data.max_page_level,
                            max_page_per_site=data.max_page_limit,
                            memory_limit_per_process=mem_limit_per_crawler)
                        self.server.addtional_obj.start()
                    else:
                        reply.cmd = ServerCommand.Com_ReplyError
                        reply.data = "Not enough memory allocation for each crawler, must at least 100 Mb."
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Clear_Cache:
                if isinstance(s, SiteCheckProcessManager):  # need to fix this
                    if s.is_alive():
                        s.set_stop()
                        s.join()
                    s.clear_cache()
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Data:
                print("incoming data....")
                data = command.data
                if s is not None and isinstance(s, SiteCheckProcessManager):
                    if data.data is not None and len(data.data) > 0:
                        s.put_to_input_queue(data.data)
                else:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "something is wrong with data"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Get_Data:  # test this
                if s is not None and isinstance(s, SiteCheckProcessManager):
                    rawdata = []
                    if s.get_temp_result_count() > 0:
                        rawdata += [
                            ScrapeDomainData(x.link, x.response_code)
                            for x in s.get_temp_result_and_clear()
                            if isinstance(x, OnSiteLink)
                        ]
                    #print("sending back:")
                    #print(rawdata)
                    if s.get_site_info_list_count() > 0:
                        rawdata += [
                            info for info in s.get_site_info_list_and_clear()
                            if isinstance(info, SeedSiteFeedback)
                        ]
                    data = MiningList(s.name, rawdata)
                    reply.data = data
                else:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "something is wrong with return data"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Status:
                #print("send back status!")
                CPU = MachineInfo.get_cpu(1)
                MEM = MachineInfo.get_memory()
                NET = MachineInfo.get_network(1)
                if s is not None and isinstance(s, SiteCheckProcessManager):
                    manager_state = s.get_state()
                    filter_progress = s.get_filter_progress()
                    status = ServerStatus(
                        wait_job=manager_state.job_wait,
                        done_job=manager_state.job_done,
                        all_job=manager_state.job_all,
                        total_page_done=manager_state.total_page_done,
                        page_per_site=manager_state.average_page,
                        result=manager_state.result_count,
                        cpu_cores=CPU[0],
                        cpu_percent=CPU[1],
                        toal_memory=MEM[0],
                        memory_percent=MEM[1],
                        net_recieved=NET[0],
                        net_send=NET[1],
                        cap_slave=1,
                        cap_process=s.max_prcess,
                        cap_concurrent_page=s.concurrent_page,
                        filter_done=filter_progress[0],
                        filter_total=filter_progress[1])
                    reply.data = status
                else:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "something is wrong with the crawler."
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Stop:
                #print("end conversation:")
                return
            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "command is not valid, please try again"
                CommandProcessor.send_command(out_buffer, reply)
            #print("finished cmd", command.cmd)
            self.send_and_receive()
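# The handler above reads self.rfile / self.wfile, which matches
# socketserver.StreamRequestHandler. A minimal wiring sketch; the server
# carrying the worker in an addtional_obj attribute (spelling as in the
# handler code) is taken from the source, while the class names here and the
# assumption that send_and_receive() is defined on the handler are not.
import socketserver
class SlaveServerSketch(socketserver.ThreadingTCPServer):
    def __init__(self, addr, handler_cls):
        super().__init__(addr, handler_cls)
        self.addtional_obj = None  # replaced by a SiteCheckProcessManager on Com_Setup
class SlaveHandlerSketch(socketserver.StreamRequestHandler):
    def handle(self):
        self.send_and_receive()    # loops (by recursion) until Com_Stop arrives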
def get_default_archive_dir():
    DB_prefix = "/usr/local/DomainFinder/Archive/"  # this is for Linux
    if MachineInfo.get_machine_type() == MachineType.Windows:
        DB_prefix = "D:/SQLiteDB/Archive/"
    return DB_prefix
def get_temp_db_dir():
    DB_prefix = "/usr/local/DomainFinder/Database/"  # this is for Linux
    if MachineInfo.get_machine_type() == MachineType.Windows:
        DB_prefix = "D:/SQLiteDB/"
    return DB_prefix
    def send_and_receive(self):
        in_buffer = self.rfile
        out_buffer = self.wfile
        s = self.server.addtional_obj
        command = CommandProcessor.receive_command(in_buffer)
        #print("process cmd: ", command.cmd)
        if command is not None and isinstance(s, MiningMasterController):
            reply = CommandStruct(cmd=ServerCommand.Com_ReplyOK)
            if command.cmd == ServerCommand.Com_Start:
                #print("start conversation:")
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Stop:
                #print("end conversation:")
                return  # exit point

            elif command.cmd == ServerCommand.Com_Get_DB_DATA:
                data = command.data
                if isinstance(data, DBRequestFields):
                    try:
                        reply.data = s.get_db_results(db_type=data.db_type, db_name=data.db_name, index=data.index, length=data.length)
                    except Exception as ex:
                        ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                              "cmd = ServerCommand.Com-Get-DB-DATA")
                        reply.cmd = ServerCommand.Com_ReplyError
                        reply.data = "Get DB data failed"
                else:
                    ErrorLogger.log_error("MasterRequestHandler.send_and_receive()",
                                          Exception("wrong data type recieved."), "cmd = ServerCommand.Com-Get-DB-DATA")
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Get DB data failed"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Remove_DB:
                data = command.data
                if isinstance(data, DBRequestFields):
                    try:
                        s.remove_db(db_type=data.db_type, db_name=data.db_name)
                    except Exception as ex:
                        ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                              "cmd = ServerCommand.Com_DB-RM-DB")
                        reply.cmd = ServerCommand.Com_ReplyError
                        reply.data = "Remove DB failed"
                else:
                    ErrorLogger.log_error("MasterRequestHandler.send_and_receive()",
                                          Exception("wrong data type recieved."), "cmd = ServerCommand.Com_DB-RM-DB")
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Remove DB failed"
                CommandProcessor.send_command(out_buffer, reply)
            elif command.cmd == ServerCommand.Com_Start_Filter:
                data = command.data
                try:
                    if s.is_alive():
                        s.stop()
                        s.join()
                    if isinstance(data, FilteringSetupData):  # MatrixFilterControl.FilteringSetupData
                        self.server.addtional_obj = MiningMasterController(ref=data.ref, accounts=data.accounts,
                                                                           crawl_matrix=data.crawl_matrix,
                                                                           filtering_only_mode=True,
                                                                           filtering_offset=data.offset,
                                                                           filtering_total=data.total
                                                                           )
                        self.server.addtional_obj.start()
                except Exception as ex:
                    print(ex)
                    ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex, "cmd = ServerCommand.Com_Start_Filter()")
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Com_Start_Filter failed"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Setup:  # test this
                data = command.data
                try:
                    if s.is_alive():
                        s.stop()
                        s.join()
                    if isinstance(data, SetupData):
                        self.server.addtional_obj = MiningMasterController(ref=data.ref, accounts=data.accounts,
                                                                           cap_slave=data.cap,
                                                                           cap_slave_process=data.cap2,
                                                                           cap_concurrent_page=data.cap3,
                                                                           all_job=data.total,
                                                                           offset=data.offset,
                                                                           max_page_level=data.max_page_level,
                                                                           max_page_limit=data.max_page_limit,
                                                                           loopback_database=data.loopback,
                                                                           refresh_rate=data.refresh_rate,
                                                                           filters=data.db_filter,
                                                                           crawl_matrix=data.crawl_matrix,
                                                                           )
                        if data.addtional_data is not None and isinstance(data.addtional_data, SlaveOperationData):
                            self.add_slaves(self.server.addtional_obj, data.addtional_data)
                            self.server.addtional_obj.setup_minging_slaves()
                        self.server.addtional_obj.start()
                    else:
                        raise NotImplementedError("other data type is not implemented.")
                except Exception as ex:
                    print(ex)
                    ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex, "cmd = ServerCommand.Com_Setup()")
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Setup failed"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Clear_Cache:
                try:
                    if s.is_alive():
                        s.stop()
                        s.join()
                    s.clear_host_cache()
                    s.clear_slave_cache()
                except Exception as ex:
                    print(ex)
                    ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex, "cmd = ServerCommand.Com_Clear_Cache()")
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Setup failed"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Add_Seed:
                data = command.data
                if isinstance(data, MiningList):
                    s.add_seeds(data)
                else:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Add Seed Failed, format is wrong in server handler."
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Add_Slave: # test this
                try:
                    data = command.data
                    if isinstance(data, SlaveOperationData):
                        self.add_slaves(s, data)
                    else:
                        raise NotImplementedError("other data type is not implemented.")
                except Exception as ex:
                    print(ex)
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Add slave failed"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Del_Slave: # test this
                try:
                    data = command.data
                    if isinstance(data, SlaveOperationData):
                        self.remove_slaves(s, data)
                    else:
                        raise NotImplementedError("other data type is not implemented.")
                except Exception as ex:
                    print(ex)
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Add slave failed"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Begin_Mining:  # not implemented, use Com_Setup to begin mining
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Begin mining is not implemented; use Com_Setup to begin mining"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Stop_Mining:  # test this
                try:
                    EC2 = EC2Controller("")
                    addrs = [slave.address.address for slave in s.slaves if isinstance(slave, Server)]
                    s.pause()
                    #s.slaves.clear()
                    #if s.isAlive:
                    #    s.join(0)
                    #self.server.addtional_obj = MiningMasterController()

                    EC2.shut_down_machines_list(addrs)
                except Exception:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Stop mining failed"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Status:  # test this
                try:
                    CPU = MachineInfo.get_cpu(1)
                    MEM = MachineInfo.get_memory()
                    NET = MachineInfo.get_network(1)
                    slaveStatus = [slave.status for slave in s.slaves]
                    totalPage = sum([slave.total_page_done for slave in slaveStatus])
                    ave_page = 0
                    filter_progress = s.get_filter_progress()
                    if len(s.slaves) > 0:
                        ave_page = int(sum([slave.page_per_site for slave in slaveStatus])/len(s.slaves))
                    total_result = sum([slave.result for slave in slaveStatus])
                    total_cap_slave = sum([slave.cap_slave for slave in slaveStatus])
                    total_cap_process = sum([slave.cap_slave * slave.cap_process for slave in slaveStatus])
                    total_cap_page = sum([slave.cap_slave * slave.cap_process * slave.cap_concurrent_page for slave in slaveStatus])
                    status = ServerStatus(wait_job=s.job_all - s.job_done, done_job=s.job_done, all_job=s.job_all,
                                          total_page_done=totalPage, page_per_site=ave_page,
                                          result=total_result, cpu_cores=CPU[0], cpu_percent=CPU[1],
                                          toal_memory=MEM[0], memory_percent=MEM[1], net_recieved=NET[0], net_send=NET[1],
                                          cap_slave=total_cap_slave, cap_process=total_cap_process, cap_concurrent_page=total_cap_page,
                                          filter_done=filter_progress[0], filter_total=filter_progress[1])
                    server = Server(server_type=ServerType.ty_Host, status=status, address=ServerAddress("localhost", MiningTCPServer.DefaultListenPort))
                    servers = []
                    servers.append(server)
                    servers += s.slaves
                    reply.data = MiningList(s.ref, servers)
                except Exception:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "getting status failed"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_DataBase_Status:  # test this
                reply.data = s.get_db_stats()  # send back a copy
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Set_DB_Filter:
                data = command.data
                if isinstance(data, DBFilterCollection):
                    if data != s.filter_shadow:
                        s.filter_shadow = data
                        s.update_db_stats(True)
                else:
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "wrong data type for filters, should be DBFilterCollection"
                CommandProcessor.send_command(out_buffer, reply)

            elif command.cmd == ServerCommand.Com_Progress:  # test this
                reply.data = PrograssData(ref=s.ref, done=s.job_done, all_job=s.job_all, offset=s.offset,
                                          duration=s.end_time - s.start_time, in_progress=s.in_progress)
                CommandProcessor.send_command(out_buffer, reply)

            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "command is not valid, please try again"
                CommandProcessor.send_command(out_buffer, reply)

            #CommandProcessor.send_command(out_buffer, reply)
            #print("finished cmd ", command.cmd)
            self.send_and_receive()  # recursive to make a conversation
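# The branches above imply a conversational client: open a socket, send
# Com_Start, issue commands, and end with Com_Stop. A hedged sketch of a
# status poll; CommandProcessor's file-object interface is taken from the
# handler code, while the connection plumbing here is illustrative.
import socket
def _poll_status_sketch(host, port):
    with socket.create_connection((host, port)) as sock:
        rfile, wfile = sock.makefile("rb"), sock.makefile("wb")
        CommandProcessor.send_command(wfile, CommandStruct(cmd=ServerCommand.Com_Start))
        CommandProcessor.receive_command(rfile)          # expect Com_ReplyOK
        CommandProcessor.send_command(wfile, CommandStruct(cmd=ServerCommand.Com_Status))
        reply = CommandProcessor.receive_command(rfile)  # MiningList of Server objects on success
        CommandProcessor.send_command(wfile, CommandStruct(cmd=ServerCommand.Com_Stop))
        return reply.data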