def linkStatus(self, url): urlParse = URLparser() r = Request() host, port, path, file = urlParse.parse(url) request = r.createGETReq(host, path, file) ip = self.getIP(host) found = False self.createSocket() if self.sock is None or ip is None or request is None: self.closeSocket() return found gReq = str.encode(request) try: self.sock.settimeout(5.0) self.sock.connect((ip, port)) self.sock.send(gReq) response = self.sock.recv(4096) getList = [] while len(response) > 0: getList.append(response.decode("utf-8", "ignore")) response = self.sock.recv(4096) getResponse = "".join(getList) # print(getResponse) if "HTTP/1.0 2" in getResponse or "HTTP/1.1 2" in getResponse: found = True except socket.error as error: found = False finally: self.closeSocket() return found
def run(self): info = [""] url = [""] connection = TCPsocket() while True: self.threadLock.acquire() if self.urlID[0] > self.totalCount[0]: self.threadLock.release() return url[0] = self.urls.get() self.urlID[0] += 1 info.append("\nURL: " + url[0]) # INFO ADD: URL self.threadLock.release() urlParse = URLparser() r = Request() host, port, path, file = urlParse.parse(url[0]) info.append("\tParsing URL... host " + host + ", port " + str(port) + "\n") # INFO ADD: Host and Port connection.createSocket() if host not in self.uniqueHosts: self.uniqueHosts.add(host) info.append("\tChecking host uniqueness... passed\n") # INFO ADD: Unique Host Passed else: info.append("\tChecking host uniqueness... failed\n") # INFO ADD: Unique Host Failed connection.closeSocket() # self.printInfo(info) continue ip = connection.getIP(host) info.append("\tDoing DNS... " + (("done, found on: " + ip) if ip is not None else "failed") + "\n") if ip is not None: self.threadLock.acquire() self.dnslookups[0] += 1 self.threadLock.release() if ip not in self.uniqueIPs: self.uniqueIPs.add(host) info.append("\tChecking ip uniqueness... passed\n") # INFO ADD: Unique Host Passed else: info.append("\tChecking ip uniqueness... failed\n") # INFO ADD: Unique Host Failed connection.closeSocket() # self.printInfo(info) continue hReq = r.createHEADReq(host) info.append("\tConnecting on robots... done\n") # INFO ADD: Robots robot, hInfo = connection.robots(host, port, str.encode(hReq)) info.append(hInfo) info.append("\tVerifying header... " + ("found" if robot else "failed") + "\n") if robot: self.threadLock.acquire() self.robots[0] += 1 self.threadLock.release() else: info.append("\tConnecting on page... done" + "\n") connection.createSocket() gReq = r.createGETReq(host, path, file) connected, status, count, size = connection.crawl(host, port, str.encode(gReq)) info.append("\tLoading... " + ("success" if connected else "failed") + "\n") self.threadLock.acquire() if status == 2: self.status2xx[0] += 1 elif status == 3: self.status3xx[0] += 1 elif status == 4: self.status4xx[0] += 1 elif status == 5: self.status5xx[0] += 1 else: self.status_o[0] += 1 self.bytes[0] += size self.links[0] += count self.threadLock.release() # for link in pagelinks: # connection.createSocket() # if connection.linkStatus(link): # self.link2xx[0] += 1 self.printInfo(info)