def is404(self, source):
    """Check ``source`` for the marker stored under bundle key 31.

    Args:
        source: Content that is searched with the ``in`` operator.

    Returns:
        True when the marker occurs in ``source`` (after one ``Logging``
        call built from bundle keys 22 and 30), otherwise False.
    """
    marker = Bundle().getString(31)
    if marker not in source:
        # No Logging call is made for the negative case.
        return False

    Logging(4, Bundle().getString(22), Bundle().getString(30))
    return True
def nextLink(self):
    """Issue a GET through ``__http__`` and return whatever it yields.

    The request target is bundle string 19 followed by "127.0.0.2". The
    result is passed to ``Logging``, cut down to its first 75 elements when
    it is longer than that.

    Returns:
        The untruncated value returned by ``GET()``.
    """
    endpoint = Bundle().getString(19) + "127.0.0.2"
    link = __http__(endpoint, "", False, False).GET()

    # Only the logged copy is shortened; the caller gets the full value.
    preview = link[:75] if len(link) > 75 else link
    Logging(1, Bundle().getString(27), preview)
    return link
def __execute__(self, command):
    """Run ``command`` through ``self.chrome.execute_script``.

    The command is converted with ``str`` and reported through ``Logging``
    before each attempt. A ``JavascriptException`` triggers exactly one
    retry. ``TypeError`` is ignored and ``NoSuchWindowException`` only
    prints "CLOSED JS".

    Previously the handler recursed without bound and then executed
    ``raise <str>``, which Python 3 rejects with its own ``TypeError``; a
    successful retry therefore still ended in an error.

    Args:
        command: Script to execute; ``str(command)`` is what is sent.

    Raises:
        RuntimeError: ``execute_script`` raised ``JavascriptException`` on
            the first attempt and again on the retry. The message is bundle
            string 48 and the original error is chained as ``__cause__``.
    """
    script = str(command)
    Logging(1, Bundle().getString(28), script)
    try:
        try:
            self.chrome.execute_script(script)
        except JavascriptException:
            # Single bounded retry instead of unbounded recursion.
            Logging(1, Bundle().getString(28), script)
            self.chrome.execute_script(script)
    except JavascriptException as error:
        # Exceptions must derive from BaseException; wrap the message.
        raise RuntimeError(Bundle().getString(48)) from error
    except TypeError:
        pass
    except NoSuchWindowException:
        print("CLOSED JS")
def getCapcha(self):
    """Extract a captcha URL and hand it to ``Murphy().noCapcha``.

    The XPath stored under bundle key 43 is evaluated via
    ``Stacked_extraction``; the first result is used when it contains bundle
    string 44 or bundle string 45.

    Returns:
        The value returned by ``Murphy().noCapcha(url)``, or None when the
        first result does not contain either string or when a KeyError,
        IndexError or ValueError is raised while processing it.
    """
    matches = self.Stacked_extraction(Bundle().getString(43))
    try:
        url = matches[0]
        # The generator keeps the original short-circuit: key 45 is only
        # looked up when key 44 did not match.
        if any(Bundle().getString(key) in url for key in (44, 45)):
            return Murphy().noCapcha(url)
    except (KeyError, IndexError, ValueError):
        pass
    return None
def Stacked_extraction(self, __target__):
    """Evaluate the XPath ``__target__`` against ``self.source``.

    Nothing is parsed when ``self.is404`` or ``self.isBlocked`` reports True
    for ``self.source``; an empty string is returned instead.

    Args:
        __target__: XPath expression passed to ``xpath()`` on the document
            built by ``html.fromstring(self.source)``.

    Returns:
        The result of ``xpath(__target__)``, or "" when the source was
        reported as 404 or blocked.

    Raises:
        RuntimeError: Parsing raised ``etree.ParseError`` (message: bundle
            string 47) or the expression raised ``etree.XPathEvalError``
            (message: bundle string 47, "\\r\\n", then ``__target__``). The
            original error is chained as ``__cause__``. The previous code
            used ``raise <str>``, which Python 3 turns into a ``TypeError``.
    """
    try:
        if self.is404(self.source) or self.isBlocked(self.source):
            return ""
        document = html.fromstring(self.source)
        return document.xpath(__target__)
    except etree.ParseError as error:
        raise RuntimeError(Bundle().getString(47)) from error
    except etree.XPathEvalError as error:
        raise RuntimeError(
            Bundle().getString(47) + '\r\n' + __target__
        ) from error
def __new_ip__(self):
    """Send ``Signal.NEWNYM`` through the controller on port 9051.

    After authenticating, the default ``socks`` proxy is set to a SOCKS5
    proxy at 127.0.0.1:9050 and the signal is sent.

    Returns:
        True when every step completed; False when any ``Exception`` was
        raised, in which case it is reported through ``Logging``.
    """
    try:
        with Controller.from_port(port=9051) as control:
            control.authenticate(password='******')
            socks.setdefaultproxy(
                proxy_type=socks.PROXY_TYPE_SOCKS5,
                addr="127.0.0.1",
                port=9050,
            )
            control.signal(Signal.NEWNYM)
    except Exception as failure:
        tag = Bundle().getString(57)
        details = Bundle().getString(58) + '\n' + str(failure)
        Logging(3, tag, details)
        return False
    return True
def Navigate(self):
    """Request a new IP, look up the current one, then load ``self.address``.

    On ``exceptions.MaxRetryError`` the method sleeps ten seconds and calls
    itself again. A closed browser window only prints "CLOSED".

    Raises:
        RuntimeError: ``self.chrome.get`` (or one of the preceding calls)
            raised a ``WebDriverException`` other than
            ``NoSuchWindowException``. The message is bundle string 53 and
            the original error is chained as ``__cause__``. The previous
            code used ``raise <str>``, which Python 3 turns into a
            ``TypeError``.
    """
    try:
        __tor__().__new_ip__()
        Anonimity().__get_ip__()
        self.chrome.get(self.address)
    except exceptions.MaxRetryError:
        # NOTE(review): the two getString results are discarded, exactly as
        # before; they were presumably meant to be logged - confirm.
        Bundle().getString(51)
        time.sleep(10)
        self.Navigate()
        Bundle().getString(52)
    except NoSuchWindowException:
        # Must precede WebDriverException: assuming these are Selenium's
        # exception classes, NoSuchWindowException derives from
        # WebDriverException, so the old order made this branch unreachable.
        print("CLOSED")
    except WebDriverException as error:
        raise RuntimeError(Bundle().getString(53)) from error
def __init__(self, selector):
    """Load the selector file named after ``selector``.

    The file is looked up at
    ``<cwd>/Spider/Extractor/Selectors/<selector>.cort``. When it does not
    exist, one ``Logging`` call is made and ``exit()`` is called. Otherwise
    the path is printed and the file is read through ``fs().Read``.

    Args:
        selector: Name of the selector file without extension; converted
            with ``str``.
    """
    self.delimiter, self.delimiter2, self.newline = ":", '~', "\n"

    self.path = "".join(
        (os.getcwd(), "/Spider/Extractor/Selectors/", str(selector), ".cort")
    )
    if not os.path.isfile(self.path):
        Logging(2, Bundle().getString(60), Bundle().getString(61))
        exit()

    print(self.path)
    self.stack = fs().Read(self.path)
    self.cluster = dict(Keys=[], Values=[])
    self.__xpath__ = None
    self.end = False
def Read(self, path):
    """Return the UTF-8 text content of the file at ``path``.

    Args:
        path: Location of the file to read.

    Returns:
        The full file content, or "" (after one ``Logging`` call) when
        ``path`` is not an existing regular file.
    """
    if not os.path.isfile(path):
        Logging("Error", 'read', Bundle().getString(13))
        return ""

    with open(path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents
def Start(self, source):
    """Store ``source`` and trigger captcha handling when needed.

    ``self.setCaptcha()`` is called when ``source`` is the empty string or
    ``self.isCaptcha(source)`` does not compare equal to False. In the other
    case bundle string 49 is fetched and its value discarded.

    Args:
        source: Content to store in ``self.source``.
    """
    # Evaluated before the assignment, as in the original statement order.
    captcha_free = source != "" and self.isCaptcha(source) == False
    self.source = source

    if captcha_free:
        Bundle().getString(49)
        return
    self.setCaptcha()
def __init__(self, address, data, PROXY_ROTATION, AGENT_ROTATION):
    """Store the request settings on the instance.

    A ``Logging`` call reports ``address`` when it is not the empty string.

    Args:
        address: Stored as ``self.address``.
        data: Stored as ``self.data``.
        PROXY_ROTATION: Stored as ``self.PROXY_ROTATION``.
        AGENT_ROTATION: Stored as ``self.AGENT_ROTATION``.
    """
    has_address = address != ""
    if has_address:
        Logging(1, Bundle().getString(22), address)

    # NOTE(review): the flattened source does not show whether these
    # assignments were nested under the `if`; they are treated as
    # unconditional here - confirm against the original layout.
    self.address, self.data = address, data
    self.PROXY_ROTATION, self.AGENT_ROTATION = PROXY_ROTATION, AGENT_ROTATION
    self.OUTPUT = ''
def lxmlExtract(self, xpath: str):
    """Return the first non-None XPath match in the browser's page source.

    The source is read once from ``self.chrome.Source()`` and parsed with
    ``html.fromstring``.

    Args:
        xpath: Expression to evaluate; None short-circuits to None.

    Returns:
        The first match that is not None. 1 is returned instead when that
        match equals "" or when comparing it with "" raises ``TypeError``.
        None when ``xpath`` or the page source is None, or when there is no
        non-None match.

    Raises:
        RuntimeError: Parsing raised ``etree.ParseError`` (message: bundle
            string 47) or the expression raised ``etree.XPathEvalError``
            (message: bundle string 47, "\\r\\n", then ``xpath``). The
            original error is chained as ``__cause__``. The previous code
            used ``raise <str>``, which Python 3 turns into a ``TypeError``.
    """
    if xpath is None:
        return None
    # Read once so the value that was None-checked is the value parsed.
    source = self.chrome.Source()
    if source is None:
        return None

    try:
        document = html.fromstring(source)
        for item in document.xpath(xpath):
            if item is None:
                continue
            # NOTE(review): the flattened source leaves the `else: return 1`
            # pairing ambiguous; it is read as belonging to the empty-string
            # check - confirm.
            try:
                return item if item != "" else 1
            except TypeError:
                return 1
    except etree.ParseError as error:
        raise RuntimeError(Bundle().getString(47)) from error
    except etree.XPathEvalError as error:
        raise RuntimeError(Bundle().getString(47) + '\r\n' + xpath) from error
    return None
def isBlocked(self, source):
    """Check ``source`` for the marker stored under bundle key 33.

    When the marker is present, two ``Logging`` calls are made (bundle keys
    34 and 35), ``Tor().__new_ip__()`` is invoked and ``Clock().waitM(.5)``
    is called before returning.

    Args:
        source: Content that is searched with the ``in`` operator.

    Returns:
        True when the marker occurs in ``source``, otherwise False (after
        one ``Logging`` call with bundle key 32).
    """
    if Bundle().getString(33) not in source:
        Logging(1, Bundle().getString(22), Bundle().getString(32))
        return False

    for message_key in (34, 35):
        Logging(4, Bundle().getString(22), Bundle().getString(message_key))
    Tor().__new_ip__()
    Clock().waitM(.5)
    return True
def noCapcha(self, url):
    """Download the image at ``url`` and read its text with pytesseract.

    The image comes from ``__http__(url, "", True, True).getImage()`` and is
    opened with ``Image.open``. The recognised text is stripped and all
    spaces are removed before it is passed to ``Logging`` and returned.

    Args:
        url: Address handed to ``__http__``.

    Returns:
        The recognised text without surrounding whitespace and without
        spaces.
    """
    image_source = __http__(url, "", True, True).getImage()
    picture = Image.open(image_source)
    recognised = pytesseract.image_to_string(picture, config='--psm 7')
    captcha = recognised.strip().replace(" ", "")
    Logging(2, Bundle().getString(58), captcha)
    return captcha
def getImage(self):
    """Fetch ``self.address`` as a stream and return the raw response.

    A ``req.Session`` is opened; when ``self.PROXY_ROTATION`` is truthy a
    new IP is requested and the session proxies are replaced, and when
    ``self.AGENT_ROTATION`` is truthy the session headers are replaced. The
    ``raw`` attribute of the streamed GET response is stored in
    ``self.OUTPUT``.

    The previous version returned from a ``finally`` block. That discarded
    every exception (including ``KeyboardInterrupt``), so none of its
    ``raise`` handlers could take effect, and one ``URLRequired`` handler
    was an unreachable duplicate. The listed request failures still result
    in "" for the caller, but they are now logged with their bundle message,
    and any other exception propagates instead of vanishing.

    Returns:
        ``self.OUTPUT`` when it is neither None nor "", otherwise "" after
        one ``Logging`` call. If the request fails while ``self.OUTPUT``
        still holds a value from an earlier call, that earlier value is
        returned, as before.
    """
    # Exception class -> bundle key, most specific class first, in the same
    # order as the original handlers. The duplicate URLRequired entry (key
    # "2") could never match and is omitted.
    failure_keys = (
        (exceptions.URLRequired, "1"),
        (exceptions.MissingSchema, "3"),
        (exceptions.InvalidSchema, "4"),
        (exceptions.TooManyRedirects, "5"),
        (exceptions.ConnectTimeout, "6"),
        (exceptions.Timeout, "7"),
        (exceptions.HTTPError, "8"),
        (exceptions.SSLError, "9"),
        (exceptions.ProxyError, "10"),
        (exceptions.ConnectionError, "11"),
    )
    handled = tuple(kind for kind, _ in failure_keys)

    try:
        with req.Session() as session:
            if self.PROXY_ROTATION:
                Tor().__new_ip__()
                session.proxies = ProxyFactory().__tor_gen__()
            if self.AGENT_ROTATION:
                session.headers = HeadersFactory().__aws_random__()
            self.OUTPUT = session.get(self.address, stream=True).raw
    except handled as error:
        message_key = next(
            key for kind, key in failure_keys if isinstance(error, kind)
        )
        # NOTE(review): level 3 is an assumption, chosen because this code
        # base passes 3 to Logging when reporting a caught exception -
        # confirm the intended level.
        Logging(
            3,
            Bundle().getString(26),
            Bundle().getString(message_key) + '\n' + str(error),
        )

    if self.OUTPUT is not None and self.OUTPUT != "":
        return self.OUTPUT
    Logging(1, Bundle().getString(26), Bundle().getString(25))
    return ""
def setCaptcha(self, captcha, _by):
    """Type ``captcha`` into the input named by bundle string 46.

    Nothing happens when ``captcha`` is None or has three or fewer
    elements. Otherwise ``Clock().waitS(5)`` is called before the value is
    passed to ``self.element.Input``.

    Args:
        captcha: Text to enter.
        _by: Forwarded unchanged as the third argument of
            ``self.element.Input``.
    """
    if captcha is None or len(captcha) <= 3:
        return

    Clock().waitS(5)
    self.element.Input(Bundle().getString(46), captcha, _by)
def isCaptcha(self, source):
    """Check ``source`` for the marker stored under bundle key 36.

    Args:
        source: Content that is searched with the ``in`` operator.

    Returns:
        True when the marker occurs in ``source``, otherwise False.
    """
    marker = Bundle().getString(36)
    return marker in source