Exemple #1
0
 def __init__(self, first_url, limit, base_url, dep, parent_instance, storage, mode='simple', has_next_dep=True):
     """Set up crawler state and the underlying browser robot.

     :param first_url: base url of the first page
     :param limit: limit of saved pages
     :param base_url: base url the robot session opens against
     :param dep: depth of crawl
     :param parent_instance: owning module instance (progress / checkpoints)
     :param storage: storage path
     :param mode: crawl mode, one of [simple, advanced]
     :param has_next_dep: whether a further depth level follows
     """
     # plain configuration state
     self.first_url = first_url
     self.limit = limit
     self.dep = dep
     self.parent_instance = parent_instance
     self.storage = storage
     self.mode = mode
     self.has_next_dep = has_next_dep
     self.base_url = None
     self.domain_hash = None
     self.exceed_limit = False
     # accumulators filled during the crawl
     self.xpaths = []
     self.urls = []
     self.not_saved_url = []
     self.files_url = []
     # browser robot; google ads host is blocked because it generates
     # bad-request exceptions
     self.robot = Hrobot(cookie_path=os.path.dirname(__file__) + '/test_cookie', base_url=base_url)
     self.robot.set_skip_image_loading(True)
     self.robot.block_url("googleads.g.doubleclick.net")
Exemple #2
0
 def __init__(self, username, password):
     """Store credentials and open a cookie-backed robot session on cmd5.org.

     :param username: account email for cmd5.org
     :param password: account password
     """
     self.email = username
     self.password = password
     # millisecond timestamp helper, used to name per-session captcha images
     self.current_milli_time = lambda: int(round(time.time() * 1000))
     self.unique_time = self.current_milli_time()
     self.robot = Hrobot(os.path.dirname(__file__), "http://cmd5.org")
Exemple #3
0
 def screenshot(data):
     """Take a screenshot of the url carried in *data*.

     :param data: raw request payload, parsed by Command.__get_data
     :return: dict with the base64-encoded png under 'data' on success,
              or an error message with STATUS_ERROR otherwise
     """
     params = Command.__get_data(data)
     if params:
         try:
             robot = Hrobot()
             robot.go_to(params.get('url'))
             name = uuid.uuid4().hex + '.png'
             robot.save_as_png(name)
             # BUG FIX: use a context manager so the file handle is closed
             # even if read() raises (it used to leak on failure)
             with open(name, "rb") as f:
                 file_data = f.read()
             file_data = base64.b64encode(file_data)
             os.remove(name)
             return {
                 'data': file_data.decode(),
                 'status': constants.STATUS_SUCCESS
             }
         except Exception:
             # best-effort API: any failure is reported as 'url not found'
             return {
                 'data': 'url not found',
                 'status': constants.STATUS_ERROR
             }
     return {
         'data': 'format or url is wrong',
         'status': constants.STATUS_ERROR
     }
Exemple #4
0
    def run(self):
        """Execute the screenshot method described by the request json.

        Expects self.params['data'] to carry 'method_id' (only 1 is
        supported) and, for method 1, a valid 'url'.  Stores the base64
        encoded png bytes in self.result.

        :raises InvalidInputError: missing/invalid keywords or url
        :raises NetworkError: when the server cannot be reached
        """
        # extract URL from json file
        parsed_data = self.params.get('data')
        if parsed_data is None:
            raise InvalidInputError('missing data keyword')
        # if input data has not method_id keyword raise exception
        if 'method_id' not in parsed_data:
            raise InvalidInputError('missing method_id keyword')
        if parsed_data['method_id'] == 1:
            if 'url' not in parsed_data:
                raise InvalidInputError('missing url keyword')

            robot = Hrobot()
            # go to URL
            try:
                if not validators.url(parsed_data['url']):
                    raise InvalidInputError('invalid url')
                robot.go_to(parsed_data.get('url'))
                self.update_progressbar(" Opening URL: ", 50)
                name = uuid.uuid4().hex + '.png'
                # saving screenshot of URL to file
                robot.save_as_png(name)
                self.check_point()

            except InvalidResponseError:
                raise NetworkError('Unable to find the server')
            # BUG FIX: context manager closes the handle even if read() fails
            with open(name, "rb") as f:
                file_data = f.read()
            self.update_progressbar(" Saving photo ", 100)
            file_data = base64.b64encode(file_data)
            self.result = file_data
            os.remove(name)
        else:
            raise InvalidInputError('invalid method_id')
 def __init__(self, parent=None):
     """Prepare a robot session and default paging/progress state.

     :param parent: owning BaseModule used for progress and checkpoints
     """
     self.parent = parent  # type: BaseModule
     self.page = 3  # default number of result pages to fetch
     self.progressbar = {'state': 'initializing', 'percent': 0.0}
     self.robot = Hrobot(None)
     self.robot.set_skip_image_loading(True)
class GoogleSearch(object):
    """Scrape google.com search results through an Hrobot browser session."""

    def __init__(self, parent=None):
        """
        :param parent: owning BaseModule used for progress and checkpoints
        """
        self.robot = Hrobot(None)
        self.robot.set_skip_image_loading(True)
        self.page = 3  # default number of result pages to fetch
        self.parent = parent  # type: BaseModule
        self.progressbar = {'state': 'initializing', 'percent': 0.0}

    # update progressbar message and percent to suitable value
    def update_progressbar(self, message, percent):
        """
        :param message: message of new state
        :param percent: total percent
        update progressbar value of request
        """
        self.parent.progress = {'state': message, 'percent': percent}

    def search(self, query, pages=None):
        """
        :param query: this parameter show searching term
        :param pages: the number of requested page for result of searching term
        :return: list of searching result in form of entity_property
        :raises NetworkError: when www.google.com cannot be reached
        :raises CaptchaNeededError: when google answers with a captcha page
        """
        result_list = []
        return_result = {}

        if pages:
            self.page = pages
        try:
            self.parent.check_point()
            # open google.com and set query value
            try:
                self.robot.go_to('https://www.google.com/ncr')
            except InvalidResponseError:
                raise NetworkError(
                    'Unable to find the server at www.google.com')
            query_field = self.robot.find_by_css("input[name='q']")
            if query_field is not None:
                query_field.set_value(query)
                query_field.get_form().submit()
                # iterate on number of excepted result, save one page on each iteration
                for i in range(self.page):
                    self.parent.check_point()
                    result1 = self.robot.find_by_xpath('//div[@id="search"]')
                    result = result1.find_all_by_css('div[class="g"]')
                    pagination = self.robot.find_by_xpath(
                        "//*[contains(text(), 'Next')]")
                    result_list.extend(self.parse_result(result))
                    # update progressbar value
                    self.update_progressbar(
                        'Pages has been searched:' + str(i + 1),
                        (100 * (i + 1) / self.page))
                    if pagination is not None:
                        pagination.click()
                    else:
                        # no next-page link: stop instead of re-parsing
                        break
            if len(result_list) == 0:
                # if the following condition become true, we are faced captcha in search progress
                captcha_field1 = self.robot.find_by_xpath(
                    '/html/body/div[1]/form/input[3]')
                captcha_field2 = self.robot.find_by_css(
                    '#ctl00_ContentPlaceHolder1_TextBoxCode')
                if captcha_field1 or captcha_field2:
                    raise CaptchaNeededError(
                        "it is needed to resolve  captcha")
                else:
                    # no result found: return a single empty placeholder entry
                    return_result["results"] = [{
                        "data":
                        " ",
                        "properties": [{
                            'title': '',
                            'type': 0
                        }, {
                            'description': '',
                            'type': 0
                        }],
                        "type":
                        1
                    }]
                    return return_result

            return_result['results'] = result_list
            return return_result
        finally:
            self.robot.cleanup()

    @staticmethod
    def parse_result(unstructured_data):
        """
        :param unstructured_data: list of result to parse
        :return: list of parsed result
        :raises InternalModuleError: when a result node has unexpected content
        """
        final_result = []
        try:
            # creating data in json format
            for res in unstructured_data:
                entry = {'type': 1}
                properties = []
                # data key: the target link with google's '/url?q=' redirect
                # prefix replaced
                link = res.find_by_css('h3 a')
                if link:
                    entry['data'] = link.get_attr("href").replace(
                        "/url?q=", " ")
                else:
                    entry['data'] = ''

                # title property (type 0); guard against a missing h3 node,
                # which previously crashed with AttributeError
                title_node = res.find_by_css('h3')
                if title_node and title_node.get_text():
                    properties.append({
                        'title': title_node.get_text(),
                        'type': 0
                    })
                else:
                    properties.append({'title': '', 'type': 0})
                # BUG FIX: the description used to be read from a different
                # selector ('div [class="st"]') than the one checked
                # ('span[class="st"]'), crashing when only the span exists
                snippet = res.find_by_css('span[class="st"]')
                if snippet:
                    properties.append({
                        'description': snippet.get_text(),
                        'type': 0
                    })
                else:
                    properties.append({'description': '', 'type': 0})
                entry['properties'] = properties
                final_result.append(entry)

            return final_result
        except Exception as e:
            # BUG FIX: 'str' + Exception raised TypeError and masked the
            # original parse error; convert explicitly
            raise InternalModuleError('bad content to parse' + str(e))
Exemple #7
0
 def __init__(self, parent=None):
     """Prepare a robot session and default paging state.

     :param parent: owning BaseModule used for progress and checkpoints
     """
     self.parent = parent  # type: BaseModule
     self.page = 3  # default number of result pages to fetch
     self.robot = Hrobot(None)
     self.robot.set_skip_image_loading(True)
Exemple #8
0
class BingSearch:
    """Scrape bing.com search results through an Hrobot browser session."""

    def __init__(self, parent=None):
        """
        :param parent: owning BaseModule used for progress and checkpoints
        """
        self.robot = Hrobot(None)
        self.robot.set_skip_image_loading(True)
        self.page = 3  # default number of result pages to fetch
        self.parent = parent  # type: BaseModule

    def update_progressbar(self, message, percent):
        """
        :param message: message of new state
        :param percent: total percent
        update progressbar value of request
        """
        self.parent.progress = {'state': message, 'percent': percent}

    def search(self, query, pages=None):
        """
        :param query: this parameter show searching term
        :param pages: the number of requested page for result of searching term
        :return: list of searching result in form of entity_property
        :raises NetworkError: when www.bing.com cannot be reached
        """
        final_result = []
        return_result = {}
        if pages:
            self.page = pages
        try:
            # open bing.com and set query value
            try:
                self.robot.go_to('https://www.bing.com')
            except InvalidResponseError:
                raise NetworkError('Unable to find the server at www.bing.com')
            query_field = self.robot.find_by_xpath('//*[@id="sb_form_q"]')
            if query_field:
                query_field.set_value(query)
                search_button = self.robot.find_by_xpath('//*[@id="sb_form_go"]')
                search_button.click()
                # iterate on number of excepted result, save one page on each iteration
                for i in range(self.page):
                    self.parent.check_point()
                    result = self.robot.find_by_xpath('/html/body/div[1]')
                    res = result.find_all_by_css('li[class="b_algo"]')
                    final_result.extend(self.parse_result(res))
                    pagination = self.robot.find_by_xpath("//a[@title='Next page']")
                    # update progressbar
                    self.update_progressbar(" Pages has been searched: " + str(i + 1),
                                            (100 * (i + 1) / self.page))
                    if pagination:
                        pagination.click()
                    else:
                        # BUG FIX: without a break the same last page was
                        # re-parsed on every remaining iteration, producing
                        # duplicate results
                        break

                # no result found
                if len(final_result) == 0:
                    return_result["results"] = [
                        {"data": " ", "properties": [{'title': '', 'type': 0}, {'description': '', 'type': 0}],
                         "type": 1}]
                    return return_result
            return_result["results"] = final_result
            return return_result

        finally:
            self.robot.cleanup()

    @staticmethod
    def parse_result(unstructured_data):
        """
        :param unstructured_data: list of result to parse
        :return: list of parsed result
        :raises InternalModuleError: when a result node has unexpected content
        """
        final_result = []
        try:
            # creating data in json format
            for res in unstructured_data:
                entry = {'type': 1}
                properties = []
                # data key; BUG FIX: guard against a missing 'h2 a' node,
                # which previously crashed with AttributeError (the google
                # scraper already guards this way)
                link = res.find_by_css('h2 a')
                if link and link.get_attr("href"):
                    entry['data'] = link.get_attr("href")
                else:
                    entry['data'] = ""

                # add data properties: description, title, type 0 to result
                title_node = res.find_by_css('h2')
                if title_node and title_node.get_text():
                    properties.append({'title': title_node.get_text(), "type": 0})
                else:
                    properties.append({'title': "", "type": 0})
                snippet = res.find_by_css('div p')
                if snippet and snippet.get_text():
                    properties.append({'description': snippet.get_text(), "type": 0})
                else:
                    properties.append({'description': "", "type": 0})

                entry["properties"] = properties
                final_result.append(entry)

            return final_result
        except Exception as e:
            raise InternalModuleError('bad content' + str(e))
Exemple #9
0
class Processor(object):
    """Depth-limited crawler: opens a page with Hrobot, snapshots its html,
    then clicks every discovered clickable element to harvest new urls."""

    def __init__(self, first_url, limit, base_url, dep, parent_instance, storage, mode='simple', has_next_dep=True):
        """
        :param limit: determine limit of saved page
        :param first_url: determine base url of first page
        :param base_url: determine base url
        :param dep: determine depth of crawl
        :param parent_instance: show instance of obligatory class
        :param storage: show storage path
        :param has_next_dep: boolean flag to determine status of having next depth or not
        :param mode: a flag to choose state of crawling from the list of [simple,advanced]
        """
        self.first_url = first_url
        self.limit = limit
        self.xpaths = []
        self.urls = []
        self.has_next_dep = has_next_dep
        self.dep = dep
        self.parent_instance = parent_instance
        self.storage = storage
        self.robot = Hrobot(cookie_path=os.path.dirname(__file__) + '/test_cookie', base_url=base_url)
        self.robot.set_skip_image_loading(True)
        # add google ads to black list, this site generate bad request exception
        self.robot.block_url("googleads.g.doubleclick.net")
        self.base_url = None
        self.domain_hash = None
        self.mode = mode
        self.exceed_limit = False
        self.not_saved_url = []
        self.files_url = []

    def __reload(self):
        # re-open the resolved base url (recovery after navigation)
        self.robot.go_to(self.base_url)

    def __invalid_page(self):
        # heuristic: pages with almost no body text are 404/denied pages
        return len(self.robot.find_by_css('body').get_text()) < 30

    def __url_changed(self):
        # True when the robot navigated away from the resolved base url
        return not urltools.compare(self.base_url, self.robot.get_url())

    def __find_clickables(self):
        # NOTE(review): this compares against "advance" while the __init__
        # docstring lists the modes as [simple, advanced] — confirm which
        # spelling callers actually pass
        if self.mode == "advance":
            css_selector_list = ['a', 'div']
        else:
            css_selector_list = ['a']
        for selector in css_selector_list:
            nodes = self.robot.find_all_by_css(selector)
            self.__save_xpath(nodes)

    def __save_xpath(self, nodes):
        """
        :param nodes: show list of nodes
        :return: list of xpath for input nodes
        """
        for node in nodes:
            xpath = node.get_xpath()
            if xpath not in self.xpaths:
                self.xpaths.append(xpath)

    def __compare_attrs(self, old_attrs, new_attrs):
        """
        :param old_attrs: determine first attribute
        :param new_attrs: determine second attribute
        :return: result of comparing two attribute in the form of boolean
        """
        attrs = ['href', 'id', 'class', 'part_of_text']
        for attr in attrs:
            if old_attrs.get(attr) != new_attrs.get(attr):
                return False
        return True

    def __try_find_node(self, xpath, timeout=5):
        """
        :param xpath: determine xpath of one node
        :param timeout: show timeout
        :return: node related to input xpath, or None when not found
        """
        timeout = timeout if timeout > 0 else 1
        # split the budget: half before a page reload, half after
        timeout = timeout / 2
        node = self.robot.wait_until_xpath(xpath, timeout)
        if not node:
            self.__reload()
            node = self.robot.wait_until_xpath(xpath, timeout)
        return node

    def __find_url(self):
        # click every saved xpath once, recording the urls reached
        for xpath in self.xpaths:
            self.parent_instance.check_point()
            try:
                node = self.__try_find_node(xpath, 2)
                if node:
                    xpath_black_list = self.parent_instance.get_xpath_blacklist()
                    xpath_hash = hashlib.md5(xpath.encode('utf-8')).hexdigest()
                    black_list_group = xpath_black_list.get(self.domain_hash, None)
                    node.set_attr('target', '_self')
                    attrs = {'href': node.get_attr('href'), 'id': node.get_attr('id'), 'class': node.get_attr('class'),
                             'part_of_text': node.get_text()[:20]}
                    # skip nodes already clicked: same xpath hash and same attrs
                    if black_list_group and xpath_hash in black_list_group and self.__compare_attrs(attrs,
                                                                                                    black_list_group[
                                                                                                        xpath_hash]):
                        continue
                    else:
                        self.parent_instance.add_xpath_blacklist(xpath_hash, attrs, self.domain_hash)
                        self.robot.set_unknown_url_mode('block')
                        node.eval_script(
                            'node.click()')  # FIXME: may be need to use physical pointer and js click as redundancy
                        self.robot.set_unknown_url_mode('warn')
                        url = urltools.normalize(urllib.parse.unquote(self.robot.get_url()))
                        if self.__url_changed():
                            # new section faces invalid page, include: 404, not found, access denied
                            if self.__invalid_page():
                                self.not_saved_url.append(url)
                                self.__go_back()
                                continue

                            if url not in self.urls:
                                if self.first_url is None:
                                    self.urls.append(url)
                                else:  # in case of just crawling pages from base url
                                    data = urlparse(url)
                                    base_current_url = data.scheme + '://' + data.netloc
                                    if urltools.compare(self.first_url, base_current_url):
                                        self.urls.append(url)
                        else:
                            pass  # FIXME: if click action not change url, maybe ajax content loading happen,

                        if self.__url_changed() or self.__invalid_page():
                            self.__go_back()

            except Exception:
                # best-effort: a timeout or similar error on one node must
                # not stop the crawl of the remaining xpaths
                pass

    def __go_back(self):
        self.robot.go_back()
        if self.__url_changed() or self.__invalid_page():
            self.__reload()  # reload page for fix url_replace with javascript
            # (url_replace clear history and go_back will be failed)

    def __save_html(self):
        """Snapshot the current page to <storage>/<uuid>.html, register it
        with the parent, and raise CrawlLimit when the page budget is hit."""
        base_path = self.storage[0]
        relative_path = self.storage[1]
        full_path = base_path + '/' + relative_path + '/'
        file = str(uuid.uuid4()) + '.html'
        os.makedirs(full_path, exist_ok=True)
        self.parent_instance.add_html_list({
            'type': 9,
            'data': base_path + '/' + relative_path + '/'
                    + file,
            'properties': [{'url': self.base_url, 'type': 1}],
            'ref': {
                'task': 'crawl',

                'depth': self.dep
            }
        }
        )
        # BUG FIX: context manager closes the handle even if get_body() or
        # write() raises (the handle used to leak in that case)
        with open(full_path + '/' + file, 'w+') as html:
            html.write(self.robot.get_body())
        if len(self.parent_instance.html_list) > self.limit:
            self.exceed_limit = True
            raise CrawlLimit('reach crawler limit: ', self.limit)

    def get_urls(self):
        """Crawl the configured page and return the list of discovered urls.

        :raises CrawlLimit: when the saved-page limit was exceeded
        :raises ResultNotFoundError: when nothing could be saved at all
        """
        try:
            # todo: if base_url is file_url, append that to file_url list, don't save as html
            # if self.base_url.endswith('file format'):
            #   file_urls.append(self.base_urls)
            # else:
            # NOTE(review): no url passed — presumably Hrobot falls back to
            # the base_url given in __init__; confirm against Hrobot's API
            self.robot.go_to()
            self.base_url = urltools.normalize(urllib.parse.unquote(
                self.robot.get_url()))  # assign base_url here, because actual url may not
            #  equal to url requested by user. for example: redirect to login page
            if self.has_next_dep:
                domain_split = urltools.parse(self.base_url)
                domain = domain_split.domain + '.' + domain_split.tld
                self.domain_hash = hashlib.md5(domain.encode('utf-8')).hexdigest()
                self.__find_clickables()
                self.__save_html()
                self.__find_url()
            else:
                self.__save_html()
        except Exception:
            # can not save this url to html, so add this url to crashed url
            self.not_saved_url.append(self.base_url)
        finally:
            self.robot.cleanup()
            if self.exceed_limit:
                raise CrawlLimit('reach crawler limit: ', self.limit)
            if not self.parent_instance.html_list:
                raise ResultNotFoundError
            # NOTE(review): a return inside finally would swallow exceptions
            # raised in the try body, but the broad except above already
            # catches them all; left as-is to preserve behavior
            return self.urls
Exemple #10
0
class Hash(object):
    """Decode hashes through cmd5.org using a cookie-backed Hrobot session."""

    def __init__(self, username, password):
        """
        :param username: account email for cmd5.org
        :param password: account password
        """
        self.email = username
        self.password = password
        # millisecond timestamp used to name per-session captcha images
        self.current_milli_time = lambda: int(round(time.time() * 1000))
        self.unique_time = self.current_milli_time()
        cookie_path = os.path.dirname(__file__)
        self.robot = Hrobot(cookie_path, "http://cmd5.org")

    def is_logeed_in(self):
        """Return True when the session is authenticated (the page shows the
        exit.aspx link), retrying once after restoring saved cookies."""
        self.unique_time = self.current_milli_time()
        if self.robot.find_by_xpath('//a[@href="exit.aspx"]') is not None:
            self.robot.save_cookies_to_file(self.robot.get_cookies())
            return True
        else:
            self.set_cookie()
            if self.robot.find_by_xpath('//a[@href="exit.aspx"]') is not None:
                self.robot.save_cookies_to_file(self.robot.get_cookies())
                return True
        return False

    def set_cookie(self):
        """Restore cookies from disk into the robot, then reload the root."""
        for cookie in self.robot.load_cookies_from_file():
            self.robot.set_cookie(cookie)
        self.robot.set_timeout(30)
        self.robot.go_to('/')

    def login(self):
        """Log in via saved cookies when possible, otherwise via the login
        form (solving the captcha if one is shown).

        :return: True on success, False otherwise
        """
        if self.is_logeed_in():
            ApiLogging.info('cookie login')
            return True
        else:
            ApiLogging.info('captcha login')
            self.robot.go_to('/login.aspx')
            email_field = self.robot.find_by_css(
                '#ctl00_ContentPlaceHolder1_TextBoxCmd5_E')
            password_field = self.robot.find_by_css(
                '#ctl00_ContentPlaceHolder1_TextBoxCmd5_P')
            email_field.set_value(self.email)
            password_field.set_value(self.password)
            self.fill_captcha_if_needed()
            submit_button = self.robot.find_by_css(
                "#ctl00_ContentPlaceHolder1_Button1")
            submit_button.click()
            self.robot.save_cookies_to_file(self.robot.get_cookies())
            if self.is_logeed_in():
                ApiLogging.info('logged in')
                return True
        return False

    def decode(self, hash_type, hash_code):
        """Submit *hash_code* of *hash_type* and return the decoded text,
        or None when the hash is unknown or login failed."""
        if self.login():
            hash_field = self.robot.find_by_css(
                '#ctl00_ContentPlaceHolder1_TextBoxInput')
            if hash_field is not None:
                type_field = self.robot.find_by_css(
                    '#ctl00_ContentPlaceHolder1_InputHashType')
                hash_field.set_value(hash_code)
                type_field.set_value(hash_type)
                self.fill_captcha_if_needed()
                submit_button = self.robot.find_by_css(
                    "#ctl00_ContentPlaceHolder1_Button1")
                submit_button.click()
                result = self.robot.find_by_css(
                    '#ctl00_ContentPlaceHolder1_LabelAnswer')
                ApiLogging.info("result in hash: %s" % result.get_text())
                ApiLogging.info('type: ' + str(hash_type) + ' code: ' +
                                str(hash_code))
                chk_result = self.check_result(result)
                if chk_result == VERIFY:
                    # BUG FIX: the retry's value used to be discarded
                    # (bare self.decode(...)), so a captcha-verify retry
                    # always returned None
                    return self.decode(hash_type, hash_code)
                elif chk_result == PAYMENT:
                    pr = self.robot.find_by_contain_text('a', 'Purchase')
                    # BUG FIX: guard before dereferencing; the log line used
                    # to call pr.get_text() even when pr was None
                    if pr:
                        ApiLogging.info('click payment' + str(pr.get_text()))
                        pr.click()
                    result = self.robot.find_by_css(
                        '#ctl00_ContentPlaceHolder1_LabelAnswer')
                    chk_result = self.check_result(result)
                    if chk_result is None:
                        return result.get_text()
                elif chk_result == NOT_FOUND:
                    return None
                else:
                    return result.get_text().split('\n')[0]

        else:
            ApiLogging.warning('login fail')

    def check_result(self, result):
        """Classify the answer label node.

        :param result: node wrapping the answer label
        :return: VERIFY (captcha error), PAYMENT, NOT_FOUND, or None for a
                 plain decoded answer
        """
        if result.get_text() == 'Verify code error!':
            return VERIFY
        elif 'payment' in result.get_text():
            ApiLogging.info('found payment')
            return PAYMENT
        elif 'Not Found' in result.get_text():
            return NOT_FOUND
        else:
            return None

    def fill_captcha_if_needed(self):
        """If a captcha field is present, crop the captcha image out of a
        page screenshot, solve it via the upload service, fill the field."""
        captcha_field = self.robot.find_by_css(
            '#ctl00_ContentPlaceHolder1_TextBoxCode')
        if captcha_field is not None:
            ApiLogging.warning('captcha needed')
            self.robot.set_viewport_size(1280, 800)
            img = self.robot.find_by_css("#Image1")
            rect = img.get_position()
            box = (int(rect['left']), int(rect['top']), int(rect['right']),
                   int(rect['bottom']))
            # NOTE(review): tempfile.mktemp is race-prone and deprecated;
            # NamedTemporaryFile would be safer — kept for behavior parity
            filename = tempfile.mktemp('.png')
            self.robot.save_as_png(filename, 1280, 800)
            image = Image.open(filename)
            os.unlink(filename)
            captcha_image = image.crop(box)
            captcha_image.save('%s.png' % self.unique_time, 'png')
            captcha_field.set_value(
                self.resolve_captcha('%s.png' % self.unique_time))
            os.remove('%s.png' % self.unique_time)

    def resolve_captcha(self, file):
        """Upload the captcha image file and return the solved text."""
        # NOTE(review): hard-coded third-party API key — move to config
        api_key = "2632143214b9b24e9dc7590396f1dd22"
        captcha_object = CaptchaUpload(key=api_key, waittime=3)
        captcha = captcha_object.solve(file)
        ApiLogging.info('finded capcha: ' + str(captcha))
        return captcha

    @staticmethod
    def get_result_by_api(api_key, email, hash_code):
        """Query the cmd5.org HTTP api and return {'results': <json>}.

        :raises InvalidInputError: bad input / unknown cipher / bad type
        :raises InsufficientCredit: account out of credit
        :raises NetworkError: server failure or unexpected status code
        :raises ResultNotFoundError: answer body was not valid json
        """
        url = 'https://www.cmd5.org/api.ashx?email=' + email + '&key=' + api_key + '&hash=' + hash_code
        result = Qhttp.get(url)
        if result.status_code == 200:
            if ':' in result.content.decode():
                error_code = result.content.decode().split(':')[-1]
                if error_code == '-1':
                    raise InvalidInputError(' invalid input ')
                if error_code == '-2':
                    raise InsufficientCredit('InsufficientCredit')
                if error_code == '-3':
                    raise NetworkError('server failed on cmd5.org')
                if error_code == '-4':
                    raise InvalidInputError('unknown sipher text')
                if error_code == '-7':
                    raise InvalidInputError('hash type not supported')
                if error_code == '-999':
                    raise NetworkError('some thing wrong with cmd5.org')
            try:
                return_result = {'results': result.json()}
                return return_result
            except Exception:
                # BUG FIX: the exception object was instantiated but never
                # raised, so a bad payload silently returned None
                raise ResultNotFoundError(' unknown result format ')
        else:
            raise NetworkError(result.status_code)
import time

from extensions.hrobot.v_1_0.Hrobot import Hrobot

# Manual smoke-test script: log in to LinkedIn with an Hrobot session,
# take before/after screenshots and probe a profile page.
# NOTE(review): the email is a masked placeholder and the password below is
# a hard-coded credential — this must not be committed to version control.
robot = Hrobot('.', 'https://www.linkedin.com')
robot.go_to('/')
# robot.save_as_png('/home/dpe/Desktop/tttt.png')
email = robot.find_by_css("input[id='login-email']")
password = robot.find_by_css("input[id='login-password']")
log_in = robot.find_by_css("input[id='login-submit']")
email.set_value('*****@*****.**')
password.set_value('thai2$1dfrg5d@Hivai')
if log_in:
    # snapshot the filled login form before submitting
    robot.save_as_png('before_login.png')
    password.submit()
    robot.go_to('/feed/')
    #robot.wait(10000)
    # load the feed again and sleep to give the page time to render
    robot.go_to('/feed/')
    time.sleep(10)
    print('log in')
    robot.save_as_png('login.png')

print(robot.get_url())

robot.go_to('/in/ramin-fatourehchi-39931a9a')

robot.save_as_png('tttt.png')

# probe two elements on the profile page (text match and a css id)
print(robot.find_by_contain_text('*', 'dana insurance '))
print(robot.find_by_css('#ember3655 > div:nth-child(2) > h3:nth-child(1)'))
Exemple #12
0
import tempfile
import time
import os

from PIL import Image

from components.utils import ApiLogging
from extensions.hrobot.v_1_0.Hrobot import Hrobot
from test.cmd.captcha2upload import CaptchaUpload

# masked credential placeholders for the cmd5.org account
email = '*****@*****.**'
password = '******'
# millisecond timestamp helper, used to name per-session captcha images
current_milli_time = lambda: int(round(time.time() * 1000))
unique_time = current_milli_time()
# cookies are persisted next to this script
cookie_path = os.path.dirname(__file__)
robot = Hrobot(cookie_path,"http://cmd5.org")



def log(txt):
    """Append *txt* as one line to result.txt in the working directory.

    :param txt: value to record; formatted with %s, so any object works
    """
    # BUG FIX: context manager closes the handle even if write() raises
    with open("result.txt", "a") as f:
        f.write("%s\n" % txt)


def is_logeed_in():
    global unique_time
    unique_time = current_milli_time()
    if robot.find_by_xpath('//a[@href="exit.aspx"]') is not None:
        robot.save_cookies_to_file(robot.get_cookies())
        return True