Example #1
0
 def submit_url(url):
     """
     Validate and persist the submission url.
     :param url: The url to store
     :return: True when the url was stored, False when it is invalid
     """
     if APP_DEBUG:
         Log.info('CALLED: Set.submit_url(' + str(url) + ')')
     if validators.is_url(url):
         return Set.__set__(keys.SUBMIT_URL, url)
     Log.error(str(url) + ' is not a valid url')
     return False
    def request(url: str,
                request_type: str = Type.GET,
                data=None,
                json: "dict | list | None" = None,
                headers: "dict | None" = None) -> "requests.Response | None":
        """
        Make a request to the chosen url.

        NOTE: the annotations are string forward-refs on purpose: the previous
        `dict or list` form evaluated to plain `dict` at def time, hiding the
        real contract.

        :param url: The target url
        :param request_type: get|post|put|patch|delete
        :param data: (optional) Dictionary, list of tuples, bytes, or file-like
            object to send in the body of the :class:`Request`
        :param json: (optional) json data to send in the body of the :class:`Request`
        :param headers: The headers to send
        :return: The response of the request, or None (if the request fails)
        """
        if headers is None:
            headers = {}
        # Identify the application, but let caller-supplied headers win
        req_headers = {'User-Agent': str(APP_NAME) + ' ' + str(APP_VERSION)}
        req_headers.update(headers)

        if data is None:
            data = {}

        request_type = request_type.lower()
        if not is_url(url):
            Log.error(str(url) + ' is not a valid url!')
            return None
        try:
            if request_type == HttpRequest.Type.GET:
                # GET has no body: requests.get's 2nd positional arg is the
                # query string, so make that explicit
                response = requests.get(url, params=data, headers=req_headers)
            elif request_type == HttpRequest.Type.POST:
                response = requests.post(url,
                                         data=data,
                                         json=json,
                                         headers=req_headers)
            elif request_type == HttpRequest.Type.PUT:
                response = requests.put(url, data=data, headers=req_headers)
            elif request_type == HttpRequest.Type.PATCH:
                response = requests.patch(url, data=data, headers=req_headers)
            elif request_type == HttpRequest.Type.DELETE:
                response = requests.delete(url, headers=req_headers)
            else:
                Log.error(str(request_type) + ' is not a valid request type!')
                return None
            if APP_DEBUG:
                HttpRequest.print_response(response)
            return response
        except (requests.exceptions.ConnectionError,
                requests.exceptions.TooManyRedirects) as e:
            Log.error('Unable to connect to ' + str(url))
            Log.error('Exception: ' + str(e))
        return None
 def handle_starttag(self, tag: str, attrs):
     """
     Handle an opening tag: normalize its name, resolve relative urls found
     in its attributes and push the result onto the tag queue.
     :param tag: The opened tag
     :param attrs: The attributes of the opened tag as a list of 2-tuples
     """
     tag = str(tag).lower()
     if self.relevant and tag not in HtmlParser._relevant_tags.keys():
         # Irrelevant tag: remember it so its closing tag is ignored too
         self.queue_tag_ignored.append(tag)
         if tag in HtmlParser._not_closed_tags:
             self.handle_endtag(tag)
         return
     collected_attrs = {}
     for raw_key, raw_value in attrs:
         key = str(raw_key).lower()
         value = str(raw_value)
         if self.relevant and key not in HtmlParser._relevant_tags.get(tag):
             continue
         needs_resolving = (self.base_url is not None
                            and key in HtmlParser._url_attrs
                            and not is_url(value))
         if needs_resolving:
             if len(value) == 0:
                 # Nothing to resolve
                 continue
             if value[0] == '#':
                 # In-page fragment: skip it
                 continue
             if value[0:2] == '//':
                 # Protocol-relative url: prepend the page scheme
                 value = self.url_scheme + ':' + value
             else:
                 starts_with_slash = value[0] == '/'
                 base_ends_with_slash = self.base_url[-1] == '/'
                 # Join base and path with exactly one slash between them
                 if not starts_with_slash and not base_ends_with_slash:
                     value = '/' + value
                 elif starts_with_slash and base_ends_with_slash:
                     value = value[1:]
                 value = self.base_url + value
         if 'email-protection' in value:
             value = 'email-protection'
         collected_attrs[key] = value
     self.queue_tag.append({'tag': tag, 'attrs': collected_attrs})
     if tag in HtmlParser._not_closed_tags:
         self.handle_endtag(tag)
Example #4
0
    def request(url: str,
                request_type: str = Type.GET,
                data=None,
                json: "dict | list | None" = None,
                headers: "dict | None" = None,
                timeout: int = DEFAULT_TIMEOUT,
                cookies: "str | dict | None" = None) -> "requests.Response | None":
        """
        Make a request to the chosen url.

        NOTE: the annotations are string forward-refs on purpose: the previous
        `dict or list` form evaluated to plain `dict` at def time, hiding the
        real contract.

        :param url: The target url
        :param request_type: get|post|put|patch|delete
        :param data: (optional) Dictionary, list of tuples, bytes, or file-like
            object to send in the body of the :class:`Request`
        :param json: (optional) json data to send in the body of the :class:`Request`
        :param headers: The headers to send
        :param timeout: The request timeout
        :param cookies: The request cookies, as a dict or a 'k=v; k2=v2' string
        :return: The response of the request, or None (if the request fails)
        """
        if headers is None:
            headers = {}
        if isinstance(cookies, str):
            try:
                # Split only on the FIRST '=' so cookie values that themselves
                # contain '=' (e.g. base64 tokens) are parsed correctly
                cookies = dict(
                    (k.strip(), v.strip())
                    for k, v in (c.split('=', 1) for c in cookies.split(';')))
            except ValueError:
                # Wrong or empty cookies: send none at all
                cookies = None

        # Identify the application, but let caller-supplied headers win
        req_headers = {'User-Agent': str(APP_NAME) + ' ' + str(APP_VERSION)}
        req_headers.update(headers)

        if data is None:
            data = {}

        request_type = request_type.lower()
        if not is_url(url):
            Log.error(str(url) + ' is not a valid url!')
            return None
        try:
            if request_type == HttpRequest.Type.GET:
                # GET has no body: requests.get's 2nd positional arg is the
                # query string, so make that explicit
                response = requests.get(url,
                                        params=data,
                                        headers=req_headers,
                                        timeout=timeout,
                                        cookies=cookies)
            elif request_type == HttpRequest.Type.POST:
                response = requests.post(url,
                                         data=data,
                                         json=json,
                                         headers=req_headers,
                                         timeout=timeout,
                                         cookies=cookies)
            elif request_type == HttpRequest.Type.PUT:
                response = requests.put(url,
                                        data=data,
                                        headers=req_headers,
                                        timeout=timeout,
                                        cookies=cookies)
            elif request_type == HttpRequest.Type.PATCH:
                response = requests.patch(url,
                                          data=data,
                                          headers=req_headers,
                                          timeout=timeout,
                                          cookies=cookies)
            elif request_type == HttpRequest.Type.DELETE:
                response = requests.delete(url,
                                           headers=req_headers,
                                           timeout=timeout,
                                           cookies=cookies)
            else:
                Log.error(str(request_type) + ' is not a valid request type!')
                return None
            if APP_DEBUG:
                HttpRequest.print_response(response)
            return response
        except (requests.exceptions.ConnectionError,
                requests.exceptions.TooManyRedirects,
                requests.exceptions.ReadTimeout) as e:
            Log.error('Unable to complete request to ' + str(url))
            Log.error('Exception: ' + str(e))
        return None
Example #5
0
 def handle_starttag(self, tag: str, attrs):
     """
     Handle an opening tag: normalize its name, resolve relative urls found
     in its attributes and push the result onto the tag queue.
     :param tag: The opened tag
     :param attrs: The attributes of the opened tag as a list of 2-tuples
     """
     tag = str(tag).lower()
     if self.relevant and tag not in HtmlParser._relevant_tags.keys():
         # Irrelevant tag: just record it as ignored
         self.queue_tag_ignored.append(tag)
         return
     collected_attrs = {}
     for raw_key, raw_value in attrs:
         key = str(raw_key).lower()
         value = str(raw_value)
         if self.relevant and key not in HtmlParser._relevant_tags.get(tag):
             continue
         needs_resolving = (self.base_url is not None
                            and key in HtmlParser._url_attrs
                            and not is_url(value))
         if needs_resolving:
             if value[0:2] == '//':
                 # Protocol-relative url: prepend the page scheme
                 value = self.url_scheme + ':' + value
             else:
                 # Root-relative join against the base url
                 if value[0:1] != '/':
                     value = '/' + value
                 value = self.base_url + value
         collected_attrs[key] = value
     self.queue_tag.append({'tag': tag, 'attrs': collected_attrs})
     if tag in HtmlParser._not_closed_tags:
         self.handle_endtag(tag)
    def crawl(url: str,
              parsing_type: str,
              callback,
              depth: int = 0,
              cookies: str = None):
        """
        Crawl the pages reachable from `url` (same host only) and invoke
        `callback` with the parsed content of each visited page.
        :param url: The url to crawl/parse
        :param parsing_type: HtmlParse.TYPE_ALL | HtmlParse.TYPE_RELEVANT | HtmlParse.TYPE_FORM | HtmlParse.TYPE_META
        :param callback: The callback method to call foreach visited page
        :param depth: The max crawling depth (0 to execute a normal page parsing, < 0 for no limit)
        :param cookies: The cookies to use on parsing
        :raise ValueError: If any argument fails validation
        """
        if not is_url(url):
            raise ValueError('url must be a valid url')
        if parsing_type not in HtmlParser.types():
            raise ValueError('parsing_type must be one of ' +
                             str(HtmlParser.types()))
        if not callable(callback):
            raise ValueError('callback is not callable')
        if not isinstance(depth, int):
            raise ValueError('depth must be an integer')
        if cookies is not None and not isinstance(cookies, str):
            raise ValueError('cookies must be a string')

        # Restrict the crawl to the starting host, with and without 'www.'
        base_url = urlparse(url).netloc
        base_urls = (base_url, )
        if base_url[0:4] != 'www.':
            base_urls += ('www.' + str(base_url), )
        parsed_urls = set()    # hrefs already visited
        parsed_hashes = set()  # hashes of already-seen parsed content

        def _crawl(href: str, curr_depth: int = 0):
            # Skip already-visited pages, foreign hosts and too-deep links
            # (a negative depth disables the depth limit entirely)
            if href in parsed_urls or \
                    urlparse(href).netloc not in base_urls or \
                    (0 <= depth and (depth < curr_depth)):
                return

            # Visit the current href
            if parsing_type == HtmlParser.TYPE_ALL:
                parsed, _ = HtmlParser.all_parse(href, cookies=cookies)
            else:
                parsed, _ = HtmlParser.relevant_parse(href, cookies=cookies)

            # Deduplicate pages whose parsed content is identical
            parsed_hash = hash(JsonSerializer.dump_json(parsed))
            if parsed_hash in parsed_hashes:
                return

            parsed_hashes.add(parsed_hash)
            parsed_urls.add(href)

            if parsing_type == HtmlParser.TYPE_FORM:
                # Find forms in page
                parsed_page = HtmlParser.find_forms(parsed, href)
            elif parsing_type == HtmlParser.TYPE_META:
                # Find metadata in page
                parsed_page = HtmlParser.find_meta(parsed)
            else:
                parsed_page = parsed

            # Normalize a single tag dict into an indexed mapping
            if parsed_page.get('tag') is not None:
                parsed_page = {0: parsed_page}

            parsed_page['url'] = href
            callback(parsed_page)

            # Recurse into adjacent links found on this page
            links = HtmlParser.find_links(parsed)
            for link in links:
                _crawl(link, curr_depth + 1)

        _crawl(url)
        Log.success(url + ' crawling done!')