def __init__(self, host, port):
    """Initialize the threading TCP url server bound to (host, port).

    Socket setup is delegated to ThreadingTCPServer; the actual bound
    address is then read back from the socket (relevant when port 0 is
    passed to request an ephemeral port).
    """
    SocketServer.ThreadingTCPServer.__init__(
        self, (host, port), harvestManUrlHandler)
    # Record the address the kernel actually assigned, not the one requested.
    self.host, self.port = self.socket.getsockname()
    # Queue holding data pushed by crawlers.
    self.urls = PriorityQueue(0)
    # Queue holding data pushed by fetchers.
    self.urls2 = PriorityQueue(0)
    # Map of bookkeeping values shared with request handlers.
    self.urlmap = {}
    # Run flag; True while the server should keep serving.
    self.flag = True
# Example no. 2
# 0
    def __init__(self, host, port, protocol='tcp'):
        """Create the asyncore server socket, bind to (host, port) and listen.

        Raises socket.error if the address cannot be bound.
        """
        # Queue holding urls pushed by clients.
        self.urls = PriorityQueue(0)
        self.port = port
        self.host = host
        self.protocol = protocol
        self.urlmap = {}
        asyncore.dispatcher_with_send.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        # Let socket.error from bind propagate directly: the original
        # `except socket.error: raise` handler was a no-op.
        self.bind((self.host, port))
        self.listen(5)
# Example no. 3
# 0
class HarvestManUrlServer(asyncore.dispatcher_with_send):
    """ An asynchronous url server class for HarvestMan.
    This class can replace the url queue and work as a url
    server multiplexing several url requests simultaneously """

    def __init__(self, host, port, protocol='tcp'):
        """Create the server socket, bind it to (host, port) and listen.

        Raises socket.error if the address cannot be bound.
        """
        # Queue holding urls pushed by clients.
        self.urls = PriorityQueue(0)
        self.port = port
        self.host = host
        self.protocol = protocol
        # Bookkeeping map; 'last' tracks the most recent url.
        self.urlmap = {}
        asyncore.dispatcher_with_send.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        # Let socket.error from bind propagate directly: the original
        # `except socket.error: raise` handler was a no-op.
        self.bind((self.host, port))
        self.listen(5)

    def get_port(self):
        """Return the port this server was configured with."""
        return self.port

    def seturl(self, url):
        """Record *url* as the most recently seen url."""
        self.urlmap['last'] = url

    def geturl(self):
        """Return the most recently recorded url.

        Raises KeyError if seturl() was never called.
        """
        return self.urlmap['last']

    def handle_accept(self):
        """Accept an incoming connection and delegate it to a
        secondary url server instance."""
        newSocket, address = self.accept()
        secondary_url_server(sock=newSocket, addr=address,
                             url_server=self)

    def handle_close(self):
        """Ignore close events; the dispatcher keeps running."""
        pass

    def notify(self, handler):
        """ Notify method for secondary socket server
        to add urls. (Not Used) """

        for url in handler.urls:
            self.urls.put(url)
# Example no. 4
# 0
class HarvestManUrlServer(asyncore.dispatcher_with_send):
    """ An asynchronous url server class for HarvestMan.
    This class can replace the url queue and work as a url
    server multiplexing several url requests simultaneously """

    def __init__(self, host, port, protocol='tcp'):
        """Create the server socket, bind it to (host, port) and listen.

        Raises socket.error if the address cannot be bound.
        """
        # Queue holding urls pushed by clients.
        self.urls = PriorityQueue(0)
        self.port = port
        self.host = host
        self.protocol = protocol
        # Bookkeeping map; 'last' tracks the most recent url.
        self.urlmap = {}
        asyncore.dispatcher_with_send.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        # Let socket.error from bind propagate directly: the original
        # `except socket.error: raise` handler was a no-op.
        self.bind((self.host, port))
        self.listen(5)

    def get_port(self):
        """Return the port this server was configured with."""
        return self.port

    def seturl(self, url):
        """Record *url* as the most recently seen url."""
        self.urlmap['last'] = url

    def geturl(self):
        """Return the most recently recorded url.

        Raises KeyError if seturl() was never called.
        """
        return self.urlmap['last']

    def handle_accept(self):
        """Accept an incoming connection and delegate it to a
        secondary url server instance."""
        newSocket, address = self.accept()
        secondary_url_server(sock=newSocket, addr=address, url_server=self)

    def handle_close(self):
        """Ignore close events; the dispatcher keeps running."""
        pass

    def notify(self, handler):
        """ Notify method for secondary socket server
        to add urls. (Not Used) """

        for url in handler.urls:
            self.urls.put(url)
# Example no. 5
# 0
    def __init__(self, host, port, protocol='tcp'):
        """Create the asyncore server socket, bind to (host, port) and listen.

        Raises socket.error if the address cannot be bound.
        """
        # Queue holding urls pushed by clients.
        self.urls = PriorityQueue(0)
        self.port = port
        self.host = host
        self.protocol = protocol
        self.urlmap = {}
        asyncore.dispatcher_with_send.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)
        # Let socket.error from bind propagate directly: the original
        # `except socket.error: raise` handler was a no-op.
        self.bind((self.host, port))
        self.listen(5)
    def __init__(self, host, port, protocol='tcp'):
        """Create the asyncore server socket, bind to (host, port) and listen.

        The actual bound port is read back from the socket, so passing
        port 0 requests an ephemeral port.

        Raises socket.error if the address cannot be bound.
        """
        # For storing data from crawlers
        self.urls = PriorityQueue(0)
        # For storing data from fetchers
        self.urls2 = PriorityQueue(0)
        self.port = port
        self.host = host
        self.protocol = protocol
        self.urlmap = {}

        # Count of gets
        self.count1 = 0
        self.count2 = 0

        asyncore.dispatcher_with_send.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)

        # Let socket.error from bind propagate directly: the original
        # `except socket.error: raise` handler was a no-op.
        self.bind((self.host, port))
        # Record the port the kernel actually assigned.
        self.port = self.getsockname()[1]

        self.listen(20)
# Example no. 7
# 0
    def __init__(self, host, port, protocol='tcp'):
        """Create the asyncore server socket, bind to (host, port) and listen.

        The actual bound port is read back from the socket, so passing
        port 0 requests an ephemeral port.

        Raises socket.error if the address cannot be bound.
        """
        # For storing data from crawlers
        self.urls = PriorityQueue(0)
        # For storing data from fetchers
        self.urls2 = PriorityQueue(0)
        self.port = port
        self.host = host
        self.protocol = protocol
        self.urlmap = {}

        # Count of gets
        self.count1 = 0
        self.count2 = 0

        asyncore.dispatcher_with_send.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)

        # Let socket.error from bind propagate directly: the original
        # `except socket.error: raise` handler was a no-op.
        self.bind((self.host, port))
        # Record the port the kernel actually assigned.
        self.port = self.getsockname()[1]

        self.listen(20)
class HarvestManUrlServer(asyncore.dispatcher_with_send):
    """ An asynchronous url server class for HarvestMan.
    This class can replace the url queue and work as a url
    server multiplexing several url requests simultaneously """

    def __init__(self, host, port, protocol='tcp'):
        """Create the server socket, bind it to (host, port) and listen.

        The actual bound port is read back from the socket, so passing
        port 0 requests an ephemeral port.

        Raises socket.error if the address cannot be bound.
        """
        # For storing data from crawlers
        self.urls = PriorityQueue(0)
        # For storing data from fetchers
        self.urls2 = PriorityQueue(0)
        self.port = port
        self.host = host
        self.protocol = protocol
        self.urlmap = {}

        # Count of gets
        self.count1 = 0
        self.count2 = 0

        asyncore.dispatcher_with_send.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)

        # Let socket.error from bind propagate directly: the original
        # `except socket.error: raise` handler was a no-op.
        self.bind((self.host, port))
        # Record the port the kernel actually assigned.
        self.port = self.getsockname()[1]

        self.listen(20)

    def get1(self):
        """Pop the next item from the crawler queue without blocking.

        Raises Queue.Empty if the queue has no items.
        """
        return self.urls.get_nowait()

    def get2(self):
        """Pop the next item from the fetcher queue without blocking.

        Raises Queue.Empty if the queue has no items.
        """
        return self.urls2.get_nowait()

    def get_port(self):
        """Return the port this server is actually listening on."""
        return self.port

    def seturl(self, url):
        """Record *url* as the most recently seen url."""
        self.urlmap['lasturl'] = url

    def seturllist(self, urllist):
        """Record *urllist* as the most recently seen url list."""
        self.urlmap['lastlist'] = urllist

    def geturl(self):
        """Return the most recently recorded url.

        Raises KeyError if seturl() was never called.
        """
        return self.urlmap['lasturl']

    def geturllist(self):
        """Return the most recently recorded url list.

        Raises KeyError if seturllist() was never called.
        """
        return self.urlmap['lastlist']

    def handle_accept(self):
        """Accept an incoming connection and delegate it to a
        secondary url server instance."""
        newSocket, address = self.accept()
        # The secondary server registers itself with the asyncore loop;
        # no need to keep a local reference.
        secondary_url_server(sock=newSocket,
                             addr=address,
                             url_server=self)

    def handle_close(self):
        """Ignore close events; the dispatcher keeps running."""
        pass

    def handle_expt(self):
        """Ignore out-of-band/exception socket events."""
        pass

    def notify(self, handler):
        """ Notify method for secondary socket server
        to add urls. (Not Used) """

        for url in handler.urls:
            self.urls.put(url)
# Example no. 9
# 0
class HarvestManUrlServer(asyncore.dispatcher_with_send):
    """ An asynchronous url server class for HarvestMan.
    This class can replace the url queue and work as a url
    server multiplexing several url requests simultaneously """

    def __init__(self, host, port, protocol='tcp'):
        """Create the server socket, bind it to (host, port) and listen.

        The actual bound port is read back from the socket, so passing
        port 0 requests an ephemeral port.

        Raises socket.error if the address cannot be bound.
        """
        # For storing data from crawlers
        self.urls = PriorityQueue(0)
        # For storing data from fetchers
        self.urls2 = PriorityQueue(0)
        self.port = port
        self.host = host
        self.protocol = protocol
        self.urlmap = {}

        # Count of gets
        self.count1 = 0
        self.count2 = 0

        asyncore.dispatcher_with_send.__init__(self)
        self.create_socket(socket.AF_INET, socket.SOCK_STREAM)

        # Let socket.error from bind propagate directly: the original
        # `except socket.error: raise` handler was a no-op.
        self.bind((self.host, port))
        # Record the port the kernel actually assigned.
        self.port = self.getsockname()[1]

        self.listen(20)

    def get1(self):
        """Pop the next item from the crawler queue without blocking.

        Raises Queue.Empty if the queue has no items.
        """
        return self.urls.get_nowait()

    def get2(self):
        """Pop the next item from the fetcher queue without blocking.

        Raises Queue.Empty if the queue has no items.
        """
        return self.urls2.get_nowait()

    def get_port(self):
        """Return the port this server is actually listening on."""
        return self.port

    def seturl(self, url):
        """Record *url* as the most recently seen url."""
        self.urlmap['lasturl'] = url

    def seturllist(self, urllist):
        """Record *urllist* as the most recently seen url list."""
        self.urlmap['lastlist'] = urllist

    def geturl(self):
        """Return the most recently recorded url.

        Raises KeyError if seturl() was never called.
        """
        return self.urlmap['lasturl']

    def geturllist(self):
        """Return the most recently recorded url list.

        Raises KeyError if seturllist() was never called.
        """
        return self.urlmap['lastlist']

    def handle_accept(self):
        """Accept an incoming connection and delegate it to a
        secondary url server instance."""
        newSocket, address = self.accept()
        # The secondary server registers itself with the asyncore loop;
        # no need to keep a local reference.
        secondary_url_server(sock=newSocket, addr=address, url_server=self)

    def handle_close(self):
        """Ignore close events; the dispatcher keeps running."""
        pass

    def handle_expt(self):
        """Ignore out-of-band/exception socket events."""
        pass

    def notify(self, handler):
        """ Notify method for secondary socket server
        to add urls. (Not Used) """

        for url in handler.urls:
            self.urls.put(url)