def screenshot(data):
    params = Command.__get_data(data)
    if params:
        try:
            robot = Hrobot()
            robot.go_to(params.get('url'))
            name = uuid.uuid4().hex + '.png'
            robot.save_as_png(name)
            f = open(name, "rb")
            file_data = f.read()
            f.close()
            file_data = base64.b64encode(file_data)
            os.remove(name)
            return {
                'data': file_data.decode(),
                'status': constants.STATUS_SUCCESS
            }
        except Exception as e:
            return {
                'data': 'url not found',
                'status': constants.STATUS_ERROR
            }
    return {
        'data': 'format or url is wrong',
        'status': constants.STATUS_ERROR
    }
def run(self):
    # extract URL from json input
    parsed_data = self.params.get('data')
    if parsed_data is None:
        raise InvalidInputError('missing data keyword')
    # if the input data has no method_id keyword, raise an exception
    if 'method_id' not in parsed_data.keys():
        raise InvalidInputError('missing method_id keyword')
    if parsed_data['method_id'] == 1:
        if 'url' not in parsed_data.keys():
            raise InvalidInputError('missing url keyword')
        robot = Hrobot()
        # go to URL
        try:
            if not (validators.url(parsed_data['url'])):
                raise InvalidInputError('invalid url')
            robot.go_to(parsed_data.get('url'))
            self.update_progressbar(" Opening URL: ", 50)
            name = uuid.uuid4().hex + '.png'
            # save a screenshot of the URL to file
            robot.save_as_png(name)
            self.check_point()
        except InvalidResponseError:
            raise NetworkError('Unable to find the server')
        f = open(name, "rb")
        file_data = f.read()
        f.close()
        self.update_progressbar(" Saving photo ", 100)
        file_data = base64.b64encode(file_data)
        self.result = file_data
        os.remove(name)
    else:
        raise InvalidInputError('invalid method_id')
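# Illustrative only (not part of the original module): a minimal sketch of the
# payload shape that run() above expects, inferred from its key checks. The URL
# is a placeholder, and how self.params is populated by the surrounding task
# framework is an assumption not shown here.
params = {
    'data': {
        'method_id': 1,                # 1 selects the screenshot behaviour
        'url': 'https://example.com',  # must pass validators.url()
    }
}
# After run() finishes, self.result holds the screenshot PNG encoded with
# base64.b64encode(); any other method_id raises InvalidInputError.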
class GoogleSearch(object):
    def __init__(self, parent=None):
        self.robot = Hrobot(None)
        self.robot.set_skip_image_loading(True)
        self.page = 3
        self.parent = parent  # type: BaseModule
        self.progressbar = {'state': 'initializing', 'percent': 0.0}

    # update the progressbar message and percent to a suitable value
    def update_progressbar(self, message, percent):
        """
        :param message: message of the new state
        :param percent: total percent
        update the progressbar value of the request
        """
        self.parent.progress = {'state': message, 'percent': percent}

    def search(self, query, pages=None):
        """
        :param query: the search term
        :param pages: the number of result pages requested for the search term
        :return: list of search results in the form of entity_property
        """
        result_list = []
        return_result = {}
        if pages:
            self.page = pages
        try:
            self.parent.check_point()
            # open google.com and set the query value
            try:
                self.robot.go_to('https://www.google.com/ncr')
            except InvalidResponseError:
                raise NetworkError(
                    'Unable to find the server at www.google.com')
            query_field = self.robot.find_by_css("input[name='q']")
            if query_field is not None:
                query_field.set_value(query)
                query_field.get_form().submit()
            # iterate over the number of expected pages, saving one page per iteration
            for i in range(self.page):
                self.parent.check_point()
                result1 = self.robot.find_by_xpath('//div[@id="search"]')
                result = result1.find_all_by_css('div[class="g"]')
                pagination = self.robot.find_by_xpath(
                    "//*[contains(text(), 'Next')]")
                result_list.extend(self.parse_result(result))
                # update progressbar value
                self.update_progressbar(
                    'Pages searched: ' + str(i + 1),
                    (100 * (i + 1) / self.page))
                if pagination is not None:
                    pagination.click()
                else:
                    break
            if len(result_list) == 0:
                # if either captcha field is present, the search was blocked by a captcha
                captcha_field1 = self.robot.find_by_xpath(
                    '/html/body/div[1]/form/input[3]')
                captcha_field2 = self.robot.find_by_css(
                    '#ctl00_ContentPlaceHolder1_TextBoxCode')
                if captcha_field1 or captcha_field2:
                    raise CaptchaNeededError(
                        "a captcha needs to be resolved")
                else:
                    # no result found
                    return_result["results"] = [{
                        "data": " ",
                        "properties": [{'title': '', 'type': 0},
                                       {'description': '', 'type': 0}],
                        "type": 1
                    }]
                    return return_result
            return_result['results'] = result_list
            return return_result
        finally:
            self.robot.cleanup()

    @staticmethod
    def parse_result(unstructured_data):
        """
        :param unstructured_data: list of results to parse
        :return: list of parsed results
        """
        i = 0
        final_result = []
        resul = {}
        try:
            # creating data in json format
            for res in unstructured_data:
                properties = []
                resul[i] = {'type': 1}
                # add data key to result
                if res.find_by_css('h3 a'):
                    resul[i].update({
                        'data': res.find_by_css('h3 a').get_attr("href").replace(
                            "/url?q=", " ")
                    })
                else:
                    resul[i]['data'] = ''
                # add data properties: description, title, type 0 to result
                if res.find_by_css('h3').get_text():
                    properties.append({
                        'title': res.find_by_css('h3').get_text(),
                        'type': 0
                    })
                else:
                    properties.append({'title': '', 'type': 0})
                if res.find_by_css('span[class="st"]'):
                    properties.append({
                        'description': res.find_by_css('div [class="st"]').get_text(),
                        'type': 0
                    })
                else:
                    properties.append({'description': '', 'type': 0})
                resul[i]['properties'] = properties
                final_result.append(resul[i])
                i = i + 1
            return final_result
        except Exception as e:
            raise InternalModuleError('bad content to parse: ' + str(e))
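# Usage sketch (not part of the original module): GoogleSearch only needs a
# collaborator exposing check_point() and a writable `progress` attribute, as
# used above. _StubParent and the query below are illustrative assumptions.
class _StubParent(object):
    def __init__(self):
        self.progress = {'state': 'initializing', 'percent': 0.0}

    def check_point(self):
        # the real BaseModule presumably uses this hook to abort cancelled tasks
        pass


searcher = GoogleSearch(parent=_StubParent())
results = searcher.search('hrobot web automation', pages=2)
# results['results'] is a list of {'data': <url>, 'properties': [...], 'type': 1}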
class BingSearch:
    def __init__(self, parent=None):
        self.robot = Hrobot(None)
        self.robot.set_skip_image_loading(True)
        self.page = 3
        self.parent = parent  # type: BaseModule

    def update_progressbar(self, message, percent):
        """
        :param message: message of the new state
        :param percent: total percent
        update the progressbar value of the request
        """
        self.parent.progress = {'state': message, 'percent': percent}

    def search(self, query, pages=None):
        """
        :param query: the search term
        :param pages: the number of result pages requested for the search term
        :return: list of search results in the form of entity_property
        """
        final_result = []
        return_result = {}
        if pages:
            self.page = pages
        try:
            # open bing.com and set the query value
            try:
                self.robot.go_to('https://www.bing.com')
            except InvalidResponseError:
                raise NetworkError('Unable to find the server at www.bing.com')
            query_field = self.robot.find_by_xpath('//*[@id="sb_form_q"]')
            if query_field:
                query_field.set_value(query)
            search_button = self.robot.find_by_xpath('//*[@id="sb_form_go"]')
            search_button.click()
            # iterate over the number of expected pages, saving one page per iteration
            for i in range(self.page):
                self.parent.check_point()
                result = self.robot.find_by_xpath('/html/body/div[1]')
                res = result.find_all_by_css('li[class="b_algo"]')
                final_result.extend(self.parse_result(res))
                pagination = self.robot.find_by_xpath("//a[@title='Next page']")
                if pagination:
                    pagination.click()
                # update progressbar
                self.update_progressbar(" Pages searched: " + str(i + 1),
                                        (100 * (i + 1) / self.page))
            # no result found
            if len(final_result) == 0:
                return_result["results"] = [
                    {"data": " ",
                     "properties": [{'title': '', 'type': 0},
                                    {'description': '', 'type': 0}],
                     "type": 1}]
                return return_result
            return_result["results"] = final_result
            return return_result
        finally:
            self.robot.cleanup()

    @staticmethod
    def parse_result(unstructured_data):
        """
        :param unstructured_data: list of results to parse
        :return: list of parsed results
        """
        i = 0
        final_result = []
        resul = {}
        try:
            # creating data in json format
            for res in unstructured_data:
                properties = []
                resul[i] = {'type': 1}
                # add data key to result
                if res.find_by_css('h2 a').get_attr("href"):
                    resul[i].update({'data': res.find_by_css('h2 a').get_attr("href")})
                else:
                    resul[i]['data'] = ""
                # add data properties: description, title, type 0 to result
                if res.find_by_css('h2').get_text():
                    properties.append({'title': res.find_by_css('h2').get_text(), "type": 0})
                else:
                    properties.append({'title': "", "type": 0})
                if res.find_by_css('div p').get_text():
                    properties.append({'description': res.find_by_css('div p').get_text(), "type": 0})
                else:
                    properties.append({'description': "", "type": 0})
                resul[i]["properties"] = properties
                final_result.append(resul[i])
                i = i + 1
            return final_result
        except Exception as e:
            raise InternalModuleError('bad content: ' + str(e))
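# For reference, a sketch of the entity_property shape that parse_result()
# builds (GoogleSearch and BingSearch emit the same structure); the values
# below are placeholders, not real search output.
example_entry = {
    'data': 'https://example.com/page',  # href of the result link
    'properties': [
        {'title': 'Example page title', 'type': 0},
        {'description': 'Snippet text shown under the result', 'type': 0},
    ],
    'type': 1,
}
# search() returns {'results': [example_entry, ...]}.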
class Processor(object):
    def __init__(self, first_url, limit, base_url, dep, parent_instance, storage,
                 mode='simple', has_next_dep=True):
        """
        :param limit: limit on the number of saved pages
        :param first_url: base url of the first page
        :param base_url: base url
        :param dep: depth of the crawl
        :param parent_instance: instance of the calling parent class
        :param storage: storage path
        :param has_next_dep: boolean flag, True when there is a next depth to crawl
        :param mode: flag choosing the crawling mode, from [simple, advance]
        """
        self.first_url = first_url
        self.limit = limit
        self.xpaths = []
        self.urls = []
        self.has_next_dep = has_next_dep
        self.dep = dep
        self.parent_instance = parent_instance
        self.storage = storage
        self.robot = Hrobot(cookie_path=os.path.dirname(__file__) + '/test_cookie',
                            base_url=base_url)
        self.robot.set_skip_image_loading(True)
        # add google ads to the black list, this site generates bad request exceptions
        self.robot.block_url("googleads.g.doubleclick.net")
        self.base_url = None
        self.domain_hash = None
        self.mode = mode
        self.exceed_limit = False
        self.not_saved_url = []
        self.files_url = []

    def __reload(self):
        self.robot.go_to(self.base_url)

    def __invalid_page(self):
        return len(self.robot.find_by_css('body').get_text()) < 30

    def __url_changed(self):
        return not urltools.compare(self.base_url, self.robot.get_url())

    def __find_clickables(self):
        if self.mode == "advance":
            css_selector_list = ['a', 'div']
        else:
            css_selector_list = ['a']
        for selector in css_selector_list:
            nodes = self.robot.find_all_by_css(selector)
            self.__save_xpath(nodes)

    def __save_xpath(self, nodes):
        """
        :param nodes: list of nodes
        :return: list of xpaths for the input nodes
        """
        for node in nodes:
            xpath = node.get_xpath()
            if xpath not in self.xpaths:
                self.xpaths.append(xpath)

    def __compare_attrs(self, old_attrs, new_attrs):
        """
        :param old_attrs: first attribute set
        :param new_attrs: second attribute set
        :return: result of comparing the two attribute sets, as a boolean
        """
        attrs = ['href', 'id', 'class', 'part_of_text']
        for attr in attrs:
            if old_attrs.get(attr) != new_attrs.get(attr):
                return False
        return True

    def __try_find_node(self, xpath, timeout=5):
        """
        :param xpath: xpath of one node
        :param timeout: timeout
        :return: node related to the input xpath
        """
        timeout = timeout if timeout > 0 else 1
        timeout = timeout / 2
        node = self.robot.wait_until_xpath(xpath, timeout)
        if not node:
            self.__reload()
            node = self.robot.wait_until_xpath(xpath, timeout)
            return node
        return node

    def __find_url(self):
        for xpath in self.xpaths:
            self.parent_instance.check_point()
            try:
                node = self.__try_find_node(xpath, 2)
                if node:
                    xpath_black_list = self.parent_instance.get_xpath_blacklist()
                    xpath_hash = hashlib.md5(xpath.encode('utf-8')).hexdigest()
                    black_list_group = xpath_black_list.get(self.domain_hash, None)
                    node.set_attr('target', '_self')
                    attrs = {'href': node.get_attr('href'),
                             'id': node.get_attr('id'),
                             'class': node.get_attr('class'),
                             'part_of_text': node.get_text()[:20]}
                    if black_list_group and xpath_hash in black_list_group and \
                            self.__compare_attrs(attrs, black_list_group[xpath_hash]):
                        continue
                    else:
                        self.parent_instance.add_xpath_blacklist(xpath_hash, attrs,
                                                                 self.domain_hash)
                    self.robot.set_unknown_url_mode('block')
                    node.eval_script('node.click()')  # FIXME: may need to use a physical pointer and js click as redundancy
                    self.robot.set_unknown_url_mode('warn')
                    url = urltools.normalize(urllib.parse.unquote(self.robot.get_url()))
                    if self.__url_changed():
                        # new section faces an invalid page, including: 404, not found, access denied
                        if self.__invalid_page():
                            self.not_saved_url.append(url)
                            self.__go_back()
                            continue
                        if url not in self.urls:
                            if self.first_url is None:
                                self.urls.append(url)
                            else:
                                # in case of only crawling pages from the base url
                                data = urlparse(url)
                                base_current_url = data.scheme + '://' + data.netloc
                                if urltools.compare(self.first_url, base_current_url):
                                    self.urls.append(url)
                    else:
                        pass  # FIXME: if the click action did not change the url, ajax content loading may have happened
                    if self.__url_changed() or self.__invalid_page():
                        self.__go_back()
            except Exception as msg:
                pass  # can't find the url of this node, because of a timeout or some similar error

    def __go_back(self):
        self.robot.go_back()
        if self.__url_changed() or self.__invalid_page():
            self.__reload()  # reload the page to fix url_replace with javascript
            # (url_replace clears history, so go_back would fail)

    def __save_html(self):
        base_path = self.storage[0]
        relative_path = self.storage[1]
        full_path = base_path + '/' + relative_path + '/'
        file = str(uuid.uuid4()) + '.html'
        os.makedirs(full_path, exist_ok=True)
        self.parent_instance.add_html_list({
            'type': 9,
            'data': base_path + '/' + relative_path + '/' + file,
            'properties': [{'url': self.base_url, 'type': 1}],
            'ref': {
                'task': 'crawl',
                'depth': self.dep
            }
        })
        html = open(full_path + '/' + file, 'w+')
        html.write(self.robot.get_body())
        html.close()
        if len(self.parent_instance.html_list) > self.limit:
            self.exceed_limit = True
            raise CrawlLimit('reach crawler limit: ', self.limit)

    def get_urls(self):
        try:
            # todo: if base_url is a file url, append it to the file_url list, don't save as html
            # if self.base_url.endswith('file format'):
            #     file_urls.append(self.base_urls)
            # else:
            self.robot.go_to()
            self.base_url = urltools.normalize(urllib.parse.unquote(self.robot.get_url()))
            # assign base_url here, because the actual url may not equal the url
            # requested by the user, for example after a redirect to a login page
            if self.has_next_dep:
                domain_slit = urltools.parse(self.base_url)
                domain = domain_slit.domain + '.' + domain_slit.tld
                self.domain_hash = hashlib.md5(domain.encode('utf-8')).hexdigest()
                self.__find_clickables()
                self.__save_html()
                self.__find_url()
            else:
                self.__save_html()
        except Exception:
            # cannot save this url as html, so add it to the crashed urls
            self.not_saved_url.append(self.base_url)
        finally:
            self.robot.cleanup()
        if self.exceed_limit:
            raise CrawlLimit('reach crawler limit: ', self.limit)
        if not self.parent_instance.html_list:
            raise ResultNotFoundError
        return self.urls
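# Usage sketch (not part of the original module): Processor expects a
# parent_instance exposing the methods called above; _CrawlParent below is an
# illustrative stub, and all URLs/paths are placeholders.
class _CrawlParent(object):
    def __init__(self):
        self.html_list = []
        self._xpath_blacklist = {}

    def check_point(self):
        # presumably lets the real caller abort a cancelled crawl
        pass

    def get_xpath_blacklist(self):
        return self._xpath_blacklist

    def add_xpath_blacklist(self, xpath_hash, attrs, domain_hash):
        self._xpath_blacklist.setdefault(domain_hash, {})[xpath_hash] = attrs

    def add_html_list(self, entry):
        self.html_list.append(entry)


processor = Processor(first_url='https://example.com', limit=20,
                      base_url='https://example.com', dep=0,
                      parent_instance=_CrawlParent(),
                      storage=('/tmp/crawl', 'example.com'),  # (base_path, relative_path)
                      mode='simple', has_next_dep=True)
urls = processor.get_urls()  # URLs discovered for the next crawl depth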
class Hash(object):
    def __init__(self, username, password):
        self.email = username
        self.password = password
        self.current_milli_time = lambda: int(round(time.time() * 1000))
        self.unique_time = self.current_milli_time()
        cookie_path = os.path.dirname(__file__)
        self.robot = Hrobot(cookie_path, "http://cmd5.org")

    def is_logeed_in(self):
        self.unique_time = self.current_milli_time()
        if self.robot.find_by_xpath('//a[@href="exit.aspx"]') is not None:
            self.robot.save_cookies_to_file(self.robot.get_cookies())
            return True
        else:
            self.set_cookie()
            if self.robot.find_by_xpath('//a[@href="exit.aspx"]') is not None:
                self.robot.save_cookies_to_file(self.robot.get_cookies())
                return True
        return False

    def set_cookie(self):
        for cookie in self.robot.load_cookies_from_file():
            self.robot.set_cookie(cookie)
        self.robot.set_timeout(30)
        self.robot.go_to('/')

    def login(self):
        if self.is_logeed_in():
            ApiLogging.info('cookie login')
            return True
        else:
            ApiLogging.info('captcha login')
            self.robot.go_to('/login.aspx')
            email_field = self.robot.find_by_css(
                '#ctl00_ContentPlaceHolder1_TextBoxCmd5_E')
            password_field = self.robot.find_by_css(
                '#ctl00_ContentPlaceHolder1_TextBoxCmd5_P')
            email_field.set_value(self.email)
            password_field.set_value(self.password)
            self.fill_captcha_if_needed()
            submit_button = self.robot.find_by_css(
                "#ctl00_ContentPlaceHolder1_Button1")
            submit_button.click()
            self.robot.save_cookies_to_file(self.robot.get_cookies())
            if self.is_logeed_in():
                ApiLogging.info('logged in')
                return True
        return False

    def decode(self, hash_type, hash_code):
        if self.login():
            hash_field = self.robot.find_by_css(
                '#ctl00_ContentPlaceHolder1_TextBoxInput')
            if hash_field is not None:
                type_field = self.robot.find_by_css(
                    '#ctl00_ContentPlaceHolder1_InputHashType')
                hash_field.set_value(hash_code)
                type_field.set_value(hash_type)
                self.fill_captcha_if_needed()
                submit_button = self.robot.find_by_css(
                    "#ctl00_ContentPlaceHolder1_Button1")
                submit_button.click()
                result = self.robot.find_by_css(
                    '#ctl00_ContentPlaceHolder1_LabelAnswer')
                ApiLogging.info("result in hash: %s" % result.get_text())
                ApiLogging.info('type: ' + str(hash_type) + ' code: ' + str(hash_code))
                chk_result = self.check_result(result)
                if chk_result == VERIFY:
                    self.decode(hash_type, hash_code)
                elif chk_result == PAYMENT:
                    pr = self.robot.find_by_contain_text('a', 'Purchase')
                    ApiLogging.info('click payment' + str(pr.get_text()))
                    if pr:
                        pr.click()
                        result = self.robot.find_by_css(
                            '#ctl00_ContentPlaceHolder1_LabelAnswer')
                        chk_result = self.check_result(result)
                        if chk_result is None:
                            return result.get_text()
                elif chk_result == NOT_FOUND:
                    return None
                else:
                    return result.get_text().split('\n')[0]
        else:
            ApiLogging.warning('login fail')

    def check_result(self, result):
        if result.get_text() == 'Verify code error!':
            return VERIFY
        elif 'payment' in result.get_text():
            ApiLogging.info('found payment')
            return PAYMENT
        elif 'Not Found' in result.get_text():
            return NOT_FOUND
        else:
            return None

    def fill_captcha_if_needed(self):
        captcha_field = self.robot.find_by_css(
            '#ctl00_ContentPlaceHolder1_TextBoxCode')
        if captcha_field is not None:
            ApiLogging.warning('captcha needed')
            self.robot.set_viewport_size(1280, 800)
            img = self.robot.find_by_css("#Image1")
            rect = img.get_position()
            box = (int(rect['left']), int(rect['top']),
                   int(rect['right']), int(rect['bottom']))
            filename = tempfile.mktemp('.png')
            self.robot.save_as_png(filename, 1280, 800)
            image = Image.open(filename)
            os.unlink(filename)
            captcha_image = image.crop(box)
            captcha_image.save('%s.png' % self.unique_time, 'png')
            captcha_field.set_value(
                self.resolve_captcha('%s.png' % self.unique_time))
            os.remove('%s.png' % self.unique_time)

    def resolve_captcha(self, file):
        api_key = "2632143214b9b24e9dc7590396f1dd22"
        captcha_object = CaptchaUpload(key=api_key, waittime=3)
        captcha = captcha_object.solve(file)
        ApiLogging.info('found captcha: ' + str(captcha))
        return captcha

    @staticmethod
    def get_result_by_api(api_key, email, hash_code):
        url = 'https://www.cmd5.org/api.ashx?email=' + email + '&key=' + api_key + '&hash=' + hash_code
        result = Qhttp.get(url)
        if result.status_code == 200:
            if ':' in result.content.decode():
                error_code = result.content.decode().split(':')[-1]
                if error_code == '-1':
                    raise InvalidInputError('invalid input')
                if error_code == '-2':
                    raise InsufficientCredit('InsufficientCredit')
                if error_code == '-3':
                    raise NetworkError('server failed on cmd5.org')
                if error_code == '-4':
                    raise InvalidInputError('unknown cipher text')
                if error_code == '-7':
                    raise InvalidInputError('hash type not supported')
                if error_code == '-999':
                    raise NetworkError('something wrong with cmd5.org')
            try:
                return_result = {'results': result.json()}
                return return_result
            except Exception:
                raise ResultNotFoundError('unknown result format')
        else:
            raise NetworkError(result.status_code)
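# Usage sketch (not part of the original module): credentials, the API key and
# the hash value are placeholders, and 'md5' stands in for whatever value the
# cmd5.org hash-type field expects.
cracker = Hash('user@example.com', 'secret-password')
plain = cracker.decode('md5', '5f4dcc3b5aa765d61d8327deb882cf99')
# decode() returns the recovered plaintext, or None when cmd5.org answers
# "Not Found"; captchas are solved on the fly via resolve_captcha().

# The paid API can also be queried directly without a browser session:
# Hash.get_result_by_api(api_key='<key>', email='user@example.com',
#                        hash_code='5f4dcc3b5aa765d61d8327deb882cf99')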
import time

from extensions.hrobot.v_1_0.Hrobot import Hrobot

robot = Hrobot('.', 'https://www.linkedin.com')
robot.go_to('/')
# robot.save_as_png('/home/dpe/Desktop/tttt.png')
email = robot.find_by_css("input[id='login-email']")
password = robot.find_by_css("input[id='login-password']")
log_in = robot.find_by_css("input[id='login-submit']")
email.set_value('*****@*****.**')
password.set_value('thai2$1dfrg5d@Hivai')
if log_in:
    robot.save_as_png('before_login.png')
    password.submit()
    robot.go_to('/feed/')
    # robot.wait(10000)
    robot.go_to('/feed/')
    time.sleep(10)
    print('log in')
    robot.save_as_png('login.png')
    print(robot.get_url())
    robot.go_to('/in/ramin-fatourehchi-39931a9a')
    robot.save_as_png('tttt.png')
    print(robot.find_by_contain_text('*', 'dana insurance '))
    print(robot.find_by_css('#ember3655 > div:nth-child(2) > h3:nth-child(1)'))
import tempfile
import time
import os

from PIL import Image

from components.utils import ApiLogging
from extensions.hrobot.v_1_0.Hrobot import Hrobot
from test.cmd.captcha2upload import CaptchaUpload

email = '*****@*****.**'
password = '******'
current_milli_time = lambda: int(round(time.time() * 1000))
unique_time = current_milli_time()
cookie_path = os.path.dirname(__file__)
robot = Hrobot(cookie_path, "http://cmd5.org")


def log(txt):
    f = open("result.txt", "a")
    f.write("%s\n" % txt)
    f.close()


def is_logeed_in():
    global unique_time
    unique_time = current_milli_time()
    if robot.find_by_xpath('//a[@href="exit.aspx"]') is not None:
        robot.save_cookies_to_file(robot.get_cookies())
        return True