def urls(start, target):
    if validate_url(start) and validate_url(
            target) and 'wikipedia' in start and 'wikipedia' in target:
        start_url = parse.urlparse(start).path[1:]
        target_url = parse.urlparse(target).path[1:]
        return start_url, target_url
    else:
        print('invalid URLs')
        exit()
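The example above only extracts the Wikipedia path component of each URL; a minimal stand-alone sketch of that parsing step using only the standard library (the wikipedia/validation checks from the original are assumed away):

from urllib import parse

def wiki_slug(url):
    # "https://en.wikipedia.org/wiki/Python_(programming_language)" -> "wiki/Python_(programming_language)"
    return parse.urlparse(url).path[1:]

print(wiki_slug("https://en.wikipedia.org/wiki/Python_(programming_language)"))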
Example #2
    def post(self):
        self.check_xsrf_cookie()

        upstream_server = self.get_argument("upstream_server", "")
        password = self.get_argument("password", "")
        http_username = self.get_argument("http_username", "")
        http_password = self.get_argument("http_password", "")

        if upstream_server and not validate_url(upstream_server):
            self.set_flash_message("error", "Url is not valid.")
            self.redirect("/__manage")
        else:
            self.api_data.upstream_server = upstream_server
            self.api_data.http_username = http_username

            if password:
                password_hash = yield self.application.pool.submit(
                    bcrypt.hashpw, password, bcrypt.gensalt())

                self.api_data.password = password_hash

            if not http_username:
                self.api_data.http_password = ""
            elif http_password:
                http_password_hash = yield self.application.pool.submit(
                    crypt, http_password, gencryptsalt())

                self.api_data.http_password = http_password_hash

            self.api_data.save()

            self.set_flash_message("success",
                                   "Settings has been successfully saved.")
            self.redirect("/__manage/settings")
Example #3
    def validate(url: str) -> bool:
        """
		Performs a check if the string provided is url or not.
		"""
        validation: Union[bool, ValidationFailure] = validate_url(url)
        if type(validation) is bool:
            return validation

        return False
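For context, validate_url here behaves like validators.url: it returns True for a valid URL and a falsy ValidationFailure object otherwise, which is why the type check is needed. A minimal usage sketch, assuming the validators package:

import validators

def is_valid(url: str) -> bool:
    result = validators.url(url)  # True, or a falsy ValidationFailure
    return result is True

print(is_valid("https://example.com"))  # True
print(is_valid("not a url"))            # False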
Example #4
def is_url(url):

    # regex could be imprecise
    #regex = re.compile(
    #    r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")

    if validate_url(url):
        return True

    return False
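If pulling in a validator dependency is undesirable, a stdlib-only alternative sketch (not the original helper) is to accept a string as a URL when it parses with both a scheme and a network location:

from urllib.parse import urlparse

def looks_like_url(candidate):
    parts = urlparse(candidate)
    return parts.scheme in ("http", "https") and bool(parts.netloc)

print(looks_like_url("https://example.com/page"))  # True
print(looks_like_url("example.com"))               # False (no scheme)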
Example #5
def index_view(request):
    if request.method == 'POST':
        long_url = request.POST.get('original_url', '')
        # get_or_create is used to avoid duplicates in the database
        if not validate_url(long_url):
            return HttpResponse("Not valid url is got",
                                status=status.HTTP_400_BAD_REQUEST)
        url = UrlModel.objects.get_or_create(original_url=long_url)
        short_url = SITE_NAME + encode(url[0].id)
        return HttpResponse(short_url, status=status.HTTP_200_OK)
Example #6
def get_aws_function(text):
    """
    Requests the open API to get the URL of the AWS function that produces audio
    :param text: text to convert to speech
    """
    try:
        response = get(
            "https://nextup.com/ivona/php/nextup-polly"
            "/CreateSpeech/CreateSpeechGet3.php",
            allow_redirects=True,
            params={
                "voice": "Maxim",
                "language": "ru-RU",
                "text": text
            })
        response.raise_for_status()
        validate_url(response.text)
        return response.text
    except Exception as e:
        raise ApiRequestError("API call error!", e)
Example #7
def validate_project(project, is_edit=False):
    """
    Validates important bits of a project database entry

    :param project: The json from the request, containing the data for a project.
    :param is_edit: Whether we are updating an existing entry; if so, we skip the duplicate-URL check.
    :return: A response with the error or None if the project is valid.
    """
    # Validate length
    if not len(project['name']) and \
            not len(project['blogurl']) and \
            not len(project['output']):
        return make_response(jsonify(status='too-short'), 400)

    # Validate project name
    if not re.match('^\\w+$', project['name']):
        return make_response(jsonify(status='invalid-project-name'), 400)

    # Check if project name already exists
    name_check = db.session.query(
        TProjects.name).filter(TProjects.name == project['name']).first()
    if name_check and not is_edit:
        return make_response(jsonify(status='project-exists'), 400)

    # Validate existing only if we are not editing
    url_exists = False
    if not is_edit:
        url_exists = db.session.query(TProjects.blogurl).filter(
            TProjects.blogurl == project['blogurl']).first()

    if not validate_url(project['blogurl']) or url_exists:
        return make_response(jsonify(status='invalid-blog-url'), 400)

    # Validate output path
    output = Path(project['output']).absolute()
    # The second check is needed, since javascript is passing an empty string instead of
    # null. For some reason, this makes SQLAlchemy accept the data and commit it to the db
    # without exception. This check prevents this issue from happening.
    if not output.exists() or len(project['output'].strip()) == 0:
        return make_response(jsonify(status='invalid-path-output'), 400)

    # Validate cache path, if caching is enabled
    if project['gravatar_cache']:
        cache = Path(project['gravatar_cache_dir']).absolute()
        if not cache.exists() or len(
                project['gravatar_cache_dir'].strip()) == 0:
            return make_response(jsonify(status='invalid-path-cache'), 400)

    return None
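The output-path check above relies on the fact that an empty string still resolves to an existing directory; a minimal isolated sketch of that check (names are illustrative):

from pathlib import Path

def output_path_is_valid(raw):
    # An empty string would resolve to the current directory, which exists,
    # so the emptiness check must be done explicitly.
    if len(raw.strip()) == 0:
        return False
    return Path(raw).absolute().exists()

print(output_path_is_valid(""))   # False
print(output_path_is_valid("."))  # True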
Example #8
def find_extern_links(content, domain, suffix, url):
    found = False
    number_extern_links = 0

    try:
        soup = bs4.BeautifulSoup(content.lower(), 'lxml')

    except Exception as e:
        log(action_logging_enum=ERROR, logging_text="Couldn't load content of website for external content check.")
        # bail out early: soup is undefined if parsing failed
        return found, number_extern_links

    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue

        try:
            if not validate_url(href):
                if validate_url(urljoin(url, href)):
                    href = urljoin(url, href)
                else:
                    continue

            components = get_url_components(href)

            extracted_domain = components[3]
            extracted_suffix = components[4]

            if extracted_domain.__ne__(domain) or extracted_suffix.__ne__(suffix):
                number_extern_links += 1
                found = True

        except Exception as e:
            continue

    return found, number_extern_links
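A compact stand-alone sketch of the external-link test, simplified to compare parsed hostnames (the original compares the registered domain and suffix extracted by get_url_components):

from urllib.parse import urljoin, urlparse

def is_external(href, base_url):
    absolute = urljoin(base_url, href)  # resolves relative links against the page URL
    return urlparse(absolute).hostname != urlparse(base_url).hostname

print(is_external("/about", "https://example.com/"))               # False
print(is_external("https://other.org/x", "https://example.com/"))  # True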
Example #9
    def parseLinks(self, url):
        # Appends to CRAWLQUEUE the valid links on the page that URL links to.
        try:
            links = BeautifulSoup(get(url).text, 'lxml').findAll("a", href=True)
            for l in links:
                if l['href'].split('/')[0] == '':
                    # Prepends host to path to make internal links valid.
                    link = ''.join([url, l['href']])
                else:
                    link = l['href']
                link = link.rsplit('?', 1)[0]  # Removes query strings from URL.
                if (validate_url(link) and link not in self.crawlQueue
                        and link not in self.visited):
                    self.crawlQueue.append(link)
        except:
            pass
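A small sketch of the link normalisation done in the loop above, using urljoin instead of manual string concatenation (an equivalent approach, not the original code):

from urllib.parse import urljoin

def normalise(base_url, href):
    link = urljoin(base_url, href) if href.startswith("/") else href
    return link.rsplit("?", 1)[0]  # drop any query string

print(normalise("https://example.com", "/page?utm=1"))  # https://example.com/page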
Example #10
def add_link(url, url_type, login, custom_short_url=None):
    '''Adds a link to the database.'''
    if url_type not in url_types:
        url_type = 'public'
    if url[:4] != 'http':
        url = 'http://' + url
    if not validate_url(url):
        return jsonify({"msg": "Bad url"}), 400
    try:
        last_id = db.request_insert_three('Urls', 'url, url_type, user_id',
                                          url, url_type, login)
        hashed = getHash(last_id[0][0])
        db.request_update('Urls', 'short_url', hashed, 'id', last_id[0][0])
        if custom_short_url is not None:
            db.request_update('Urls', 'custom_short_url', custom_short_url,
                              'id', last_id[0][0])
    except sqlite3.IntegrityError:
        custom_short_url = None
    # TODO: rework to return the short URL in JSON via jsonify
    return jsonify(short_url=hashed, custom_short_url=custom_short_url), 200
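A minimal sketch of the scheme-prefixing step in isolation; the original only checks the first four characters, which also covers 'https://' URLs:

def ensure_scheme(url):
    return url if url.startswith(("http://", "https://")) else "http://" + url

print(ensure_scheme("example.com"))          # http://example.com
print(ensure_scheme("https://example.com"))  # https://example.com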
Example #11
    def do_content_search(content):
        soup = bs4.BeautifulSoup(content.lower(), 'html.parser')
        if soup.find('a', href=True):
            for link in soup.find_all('a', href=True):
                if any(link.text.__eq__(token.lower()) for token in login_token_list):
                    login_url = str(link['href']).strip()

                    if login_url.startswith('/') and url.endswith('/'):
                        login_url = url + login_url.replace('/', '', 1)

                    if login_url.startswith('./'):
                        if url.endswith('/'):
                            login_url = url + login_url.replace('./', '', 1)
                        else:
                            login_url = url + login_url.replace('.', '', 1)

                    if validate_url(login_url):
                        log(action_logging_enum=INFO,
                            logging_text="Login page found by content analysis.")
                        return login_url, True

        return url, False
Example #12
    def post(self):
        self.check_xsrf_cookie()

        upstream_server = self.get_argument("upstream_server", "")
        password = self.get_argument("password", "")
        http_username = self.get_argument("http_username", "")
        http_password = self.get_argument("http_password", "")

        if upstream_server and not validate_url(upstream_server):
            self.set_flash_message(
                "error",
                "Url is not valid.")
            self.redirect("/__manage")
        else:
            self.api_data.upstream_server = upstream_server
            self.api_data.http_username = http_username

            if password:
                password_hash = yield self.application.pool.submit(
                    bcrypt.hashpw, password, bcrypt.gensalt())

                self.api_data.password = password_hash

            if not http_username:
                self.api_data.http_password = ""
            elif http_password:
                http_password_hash = yield self.application.pool.submit(
                    crypt, http_password, gencryptsalt())

                self.api_data.http_password = http_password_hash

            self.api_data.save()

            self.set_flash_message(
                "success",
                "Settings has been successfully saved.")
            self.redirect("/__manage/settings")
Example #13
def do_redirect():
    url = request.args.get('url', '')
    if not validate_url(url):
        return render_template('invalid-url.html', url=url)
    return redirect(url)
Example #14
def test_validate_url_nok_malformed_http():
    assert validators.validate_url('http/:/asdf.com/') is False
Example #15
def test_validate_url_nok_dot_trailing_slash():
    assert validators.validate_url('http://../') is False
Example #16
def extract_features_from_website(url, label, predict):
    """
        extract all features from website, if predict set to true a pandas dataframe is created
    """

    try:
        global brand_list
        global phishy_list
        global login_list
        global tld_list
        # save original url for object instance
        url_orig = url

        # get different components of url
        components = get_url_components(url)

        fqdn = components[0]
        scheme = components[1]
        subdomain = components[2]
        domain = components[3]
        suffix = components[4]
        port = components[5]
        path = components[6]
        query = components[7]
        fragment = components[8]

        netloc = fqdn
        url_no_prot = url

        if scheme:
            netloc = scheme + "://" + fqdn

            if port:
                netloc = netloc + ":" + port

            url_no_prot = url.replace(scheme + "://", "", 1)

        # check for redirects of url
        resp_url, num_redirects, protocol, content = get_redirects(url)

        # try again if no connection could have been established
        if content == -1:
            time.sleep(3)
            resp_url, num_redirects, protocol, content = get_redirects(url)

            if content == -1:
                return None

        # get content for homepage
        hp_url, hp_num_redirects, hp_protocol, hp_content = get_redirects(
            "{}://www.{}.{}".format(scheme, domain, suffix))

        if hp_content == -1:
            time.sleep(3)
            hp_url, hp_num_redirects, hp_protocol, hp_content = get_redirects(
                "{}://www.{}.{}".format(scheme, domain, suffix))

        # read content in parser
        if not hp_content == -1:
            hp_soup = bs4.BeautifulSoup(hp_content.lower(), 'html.parser')

        soup = bs4.BeautifulSoup(content.lower(), 'html.parser')

        url = resp_url

        # number of redirects done by website
        if num_redirects > 0:
            bool_redirect_website = True
        else:
            bool_redirect_website = False

        # check if the website has a favicon
        bool_favicon_website = False

        try:
            icon = favicon.get(url, timeout=3)
            bool_favicon_website = True
        except Exception as e:
            bool_favicon_website = False

        # website has links pointing to extern content
        bool_content_extern_website = False

        # number of links pointing to extern content
        int_links_extern_website = 0
        bool_content_extern_website, int_links_extern_website = find_extern_links(
            content.lower(), domain, suffix, url)

        # check for custom status bar
        bool_custom_statusbar_website = bool(
            str(content).lower().replace(" ",
                                         "").__contains__("window.status="))

        # custom right click
        bool_disable_rightclick_website = False

        if str(content).replace(
                " ", "").lower().__contains__("document.oncontextmenu="):
            bool_disable_rightclick_website = True

        res = soup.findAll("body")

        if res:
            for element in res:
                try:
                    right_click_arg = element['oncontextmenu']
                    if str(right_click_arg) == "return false":
                        bool_disable_rightclick_website = True
                except Exception as e:
                    continue

        # has pop up window
        bool_popup_website = False
        hidden_count = 0
        res = soup.findAll("div")

        if res:
            for tag in res:
                try:
                    arg = tag['class']
                    if "popup" in arg:
                        bool_popup_website = True
                except Exception as e:
                    pass
                try:
                    arg = tag['style']
                    arg = str(arg).replace(" ", "")

                    if arg.__contains__("display:none") or arg.__contains__(
                            "visibility:hidden"):
                        hidden_count += 1
                except Exception as e:
                    continue

        # has iframe
        bool_iframe_website = False
        res = soup.findAll("iframe")
        if res:
            bool_iframe_website = True

        # has action tag > second custom feature: is the action extern?
        bool_action_website = False
        bool_action_extern_website = False

        # has bool form post
        bool_form_post_website = False

        res = soup.findAll("form")

        if res:
            for element in res:
                try:
                    if element["action"]:
                        bool_action_website = True
                        action_url = element["action"]

                        if validate_url(action_url) or validate_url(
                                urljoin(netloc, action_url)):

                            if validate_url(urljoin(netloc, action_url)):
                                action_url = urljoin(netloc, action_url)

                            extracted_action_url = get_url_components(
                                action_url)

                            domain_action_url = extracted_action_url[3]
                            suffix_action_url = extracted_action_url[4]

                            if not suffix == suffix_action_url or not domain == domain_action_url:
                                bool_action_extern_website = True
                                break

                    if element["method"] == "post":
                        bool_form_post_website = True
                except Exception as e:
                    continue

        # has phishy tokens in visible content
        int_phishy_tokens_website = 0

        for text in soup.stripped_strings:
            int_phishy_tokens_website += sum(1 for word in phishy_list
                                             if text.__contains__(word))

        # has input tag
        bool_input_website = False
        if get_element_count("input", soup) > 0: bool_input_website = True

        # find meta description
        res = soup.find('meta', attrs={'name': 'og:description'})
        if not res:
            res = soup.find('meta', attrs={'property': 'description'})
        if not res:
            res = soup.find('meta', attrs={'name': 'description'})

        if not hp_content == -1:
            hp_res = hp_soup.find('meta', attrs={'name': 'og:description'})
            if not hp_res:
                hp_res = hp_soup.find('meta',
                                      attrs={'property': 'description'})
            if not hp_res:
                hp_res = hp_soup.find('meta', attrs={'name': 'description'})

        float_description_sim_website = 0

        if hp_content == -1:
            float_description_sim_website = -1

        if not hp_content == -1:
            if res and hp_res:
                try:
                    hp_desc = hp_res['content']
                    desc = res['content']

                    # compute similarity of description from home and login page
                    float_description_sim_website = string_similarity(
                        desc, hp_desc)
                except Exception:
                    pass

        # bond status login and homepage
        bool_bond_status_website = False

        # most frequent domain is extern > true/false
        bool_freq_domain_extern_website = False
        res = soup.findAll("a")
        domain_list = []
        link_list = []
        href_count = 0
        redirect_object_list = []

        if res:
            for a_tag in res:
                try:
                    href = a_tag.attrs.get("href")

                    href_count += 1

                    if validate_url(href) or validate_url(urljoin(
                            netloc, href)):

                        if validate_url(urljoin(netloc, href)):
                            href = urljoin(netloc, href)

                        if href == hp_url:
                            bool_bond_status_website = True

                        components_href = get_url_components(href)

                        domain_href = components_href[3]
                        suffix_href = components_href[4]

                        if is_IP(domain):
                            continue
                        link_list.append(href)
                        domain_list.append("{},{}".format(
                            domain_href, suffix_href))

                except Exception as e:
                    continue

            link_list = list(set(link_list))
            link_list = link_list[:10]
            if not hp_content == -1:
                try:
                    redirect_object_list = get_redirects_list(link_list)

                except Exception as e:
                    log(action_logging_enum=ERROR, logging_text=str(e))

                if redirect_object_list:
                    for redirect_object in redirect_object_list:

                        if not bool_bond_status_website and not hp_content == -1 and redirect_object_list:
                            try:
                                website_sim = html_similarity.similarity(
                                    str(hp_content).lower(),
                                    str(redirect_object.content).lower(),
                                    k=0.3)

                                if website_sim == 1:
                                    bool_bond_status_website = True
                            except Exception:
                                continue

        if domain_list:
            occure_count = Counter(domain_list)
            most_freq = occure_count.most_common(1)[0][0]
            most_frq_domain, most_freq_suffix = most_freq.split(",", 1)

            if not str(most_frq_domain) == domain or not str(
                    suffix) == most_freq_suffix:
                bool_freq_domain_extern_website = True

        # jaccard similarity between homepage and login page
        float_login_home_website = 0
        if not hp_content == -1:
            try:
                float_login_home_website = html_similarity.similarity(
                    str(content).lower(), str(hp_content).lower(), k=0.3)
            except Exception:
                pass
        # website has copyright
        bool_copyright_website = False

        # similarity from copyright of login page and home page
        copy = ""
        hp_copy = ""
        if not hp_content == -1:
            float_copyright_sim_website = 0
            for text in soup.stripped_strings:
                if '©' in text:
                    copy = re.sub(r'\s+', ' ', text)
                    bool_copyright_website = True

            for text in hp_soup.stripped_strings:
                if '©' in text:
                    hp_copy = re.sub(r'\s+', ' ', text)

            if copy and hp_copy:
                float_copyright_sim_website = string_similarity(copy, hp_copy)
        else:
            float_copyright_sim_website = 0

        # similarity from title of login page and home page
        float_title_sim_website = 0
        if not hp_content == -1:
            try:
                title = soup.title.text
                hp_title = hp_soup.title.text
                float_title_sim_website = string_similarity(title, hp_title)
            except Exception:
                float_title_sim_website = 0
                pass

        # unique links/all links on page
        float_unique_links_website = 0
        if link_list:
            float_unique_links_website = len(list(
                set(link_list))) / len(link_list)

        # lexical analysis for all links on website
        bool_link_analysis_website = True
        # dataframe = pd.DataFrame()
        # try:
        # redirect_object = RedirectEntry(url=url, redirects=num_redirects, content=content, protocol=protocol)
        # dataframe = pd.DataFrame(extract_features_from_URL(redirect_object, "Predict", brand_list=brand_list,
        # tld_list=tld_list, phishy_list=phishy_list, predict=True))
        # except Exception as e:
        # pass

        # if not dataframe.empty:
        # try:
        # df = pd.DataFrame(dataframe.iloc[0]).transpose()
        # prediction = predict_url(df)

        # if int(prediction) == 0:
        # bool_link_analysis_website = False
        # except Exception:
        # pass

        # number of input elements
        int_input_website = 0

        # find form accompanied by labels with loginwords
        bool_input_login_website = False
        form = soup.find("form")
        try:
            if form:
                inputs = form.find_all("input")

                if inputs:

                    int_input_website = len(inputs)

                    for inp in inputs:
                        try:
                            if inp["type"] == "hidden":
                                hidden_count += 1
                        except Exception:
                            continue

                    label_tags = form.findAll("label")

                    if label_tags:
                        for label_entry in label_tags:
                            if any(
                                    str(label_entry.text).__contains__(word)
                                    for word in login_list):
                                bool_input_login_website = True

        except Exception:
            pass

        # website has button
        bool_button_website = False
        button_count = get_element_count("button", soup)
        if button_count > 0:
            bool_button_website = True

        # website has meta information
        bool_meta_website = False

        if soup.find("meta"):
            bool_meta_website = True

        # has hidden elements
        bool_hidden_element_website = False
        if hidden_count > 0:
            bool_hidden_element_website = True

        # number of option tags
        int_option_website = get_element_count("option", soup)
        int_option_website = get_element_count("option", soup)

        # number select tags
        int_select_website = get_element_count("select", soup)

        # number th tags
        int_th_website = get_element_count("th", soup)

        # number of tr tags
        int_tr_website = get_element_count("tr", soup)

        # number of table tags
        int_table_website = get_element_count("table", soup)

        # number of href in a tag
        int_href_website = href_count

        # number of list item tags
        int_li_website = get_element_count("li", soup)

        # number of unordered list tags
        int_ul_website = get_element_count("ul", soup)

        # number of ordered list tags
        int_ol_website = get_element_count("ol", soup)

        # number of div tags
        int_div_website = get_element_count("div", soup)

        # number of span tags
        int_span_website = get_element_count("span", soup)

        # number of article tags
        int_article_website = get_element_count("article", soup)

        # number of p tags
        int_p_website = get_element_count("p", soup)

        # number of checkbox tags
        int_checkbox_website = get_element_count("input", soup, "type",
                                                 "checkbox")

        # number of buttons
        int_button_website = button_count

        # number of images
        int_image_website = get_element_count("img", soup)

        if predict == False:
            entry = FeatureEntryContent(
                bool_redirect_website=bool_redirect_website,
                bool_favicon_website=bool_favicon_website,
                bool_content_extern_website=bool_content_extern_website,
                int_links_extern_website=int_links_extern_website,
                bool_custom_statusbar_website=bool_custom_statusbar_website,
                bool_disable_rightclick_website=bool_disable_rightclick_website,
                bool_popup_website=bool_popup_website,
                bool_iframe_website=bool_iframe_website,
                bool_action_website=bool_action_website,
                bool_action_extern_website=bool_action_extern_website,
                bool_form_post_website=bool_form_post_website,
                int_phishy_tokens_website=int_phishy_tokens_website,
                bool_input_website=bool_input_website,
                float_description_sim_website=float_description_sim_website,
                bool_bond_status_website=bool_bond_status_website,
                bool_freq_domain_extern_website=bool_freq_domain_extern_website,
                float_login_home_website=float_login_home_website,
                bool_copyright_website=bool_copyright_website,
                float_copyright_sim_website=float_copyright_sim_website,
                float_title_sim_website=float_title_sim_website,
                float_unique_links_website=float_unique_links_website,
                # bool_link_analysis_website=bool_link_analysis_website,
                int_input_website=int_input_website,
                bool_input_login_website=bool_input_login_website,
                bool_button_website=bool_button_website,
                bool_meta_website=bool_meta_website,
                bool_hidden_element_website=bool_hidden_element_website,
                int_option_website=int_option_website,
                int_select_website=int_select_website,
                int_th_website=int_th_website,
                int_tr_website=int_tr_website,
                int_table_website=int_table_website,
                int_href_website=int_href_website,
                int_li_website=int_li_website,
                int_ul_website=int_ul_website,
                int_ol_website=int_ol_website,
                int_div_website=int_div_website,
                int_span_website=int_span_website,
                int_article_website=int_article_website,
                int_p_website=int_p_website,
                int_checkbox_website=int_checkbox_website,
                int_button_website=int_button_website,
                int_image_website=int_image_website,
                label=label,
                url=url_orig,
                final_url=url)

            log(action_logging_enum=INFO,
                logging_text="Processed datapoint. {}".format(url))

            return entry

        elif predict:
            data = {
                "ID": [0],
                "Has Redirect": [bool_redirect_website],
                "Has Favicon": [bool_favicon_website],
                "Has Extern Content": [bool_content_extern_website],
                "Number Extern Links": [int_links_extern_website],
                "Has Custom StatusBar": [bool_custom_statusbar_website],
                "Has Disabled RightClick": [bool_disable_rightclick_website],
                "Has PopUp": [bool_popup_website],
                "Has iFrame": [bool_iframe_website],
                "Has Action": [bool_action_website],
                "Has Extern Action": [bool_action_extern_website],
                "Has Form with POST": [bool_form_post_website],
                "Number PhishyTokens": [int_phishy_tokens_website],
                "Has Input": [bool_input_website],
                "Ratio Description Sim": [float_description_sim_website],
                "Has Bond Status": [bool_bond_status_website],
                "Has Freq Domain Extern": [bool_freq_domain_extern_website],
                "Ratio Similarity": [float_login_home_website],
                "Has Copyright": [bool_copyright_website],
                "Ratio Copyright Sim": [float_copyright_sim_website],
                "Ratio Title Sim": [float_title_sim_website],
                "Ratio Unique Links": [float_unique_links_website],
                "Number Inputs": [int_input_website],
                "Has Input for Login": [bool_input_login_website],
                "Has Button": [bool_button_website],
                "Has Meta": [bool_meta_website],
                "Has Hidden Element": [bool_hidden_element_website],
                "Number Option": [int_option_website],
                "Number Select": [int_select_website],
                "Number TH": [int_th_website],
                "Number TR": [int_tr_website],
                "Number Table": [int_table_website],
                "Number HREF": [int_href_website],
                "Number LI": [int_li_website],
                "Number UL": [int_ul_website],
                "Number OL": [int_ol_website],
                "Number DIV": [int_div_website],
                "Number Span": [int_span_website],
                "Number Article": [int_article_website],
                "Number Paragr": [int_p_website],
                "Number Checkbox": [int_checkbox_website],
                "Number Button": [int_checkbox_website],
                "Number Image": [int_image_website],
                "Label": [label],
                "URL": [url_orig],
                "Final URL": [url]
            }

            columns = list(CONTENT_FEATURE_LIST_COLUMN_NAMES)

            df = pd.DataFrame(data, columns=columns)

            return df

    except Exception as e:
        log(action_logging_enum=WARNING, logging_text=str(e))
        log(action_logging_enum=WARNING, logging_text=str(e.__traceback__))
        exc_type, exc_obj, tb = sys.exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        log(
            ERROR, 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(
                filename, lineno, line.strip(), exc_obj))
        log(action_logging_enum=WARNING,
            logging_text="Could not extract content features for {}".format(
                url))

    log(action_logging_enum=INFO,
        logging_text="Failed datapoint. {}".format(url))
    return None
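The example relies on a get_element_count helper that is not shown; based only on how it is called above (tag name, soup, optional attribute name and value), a hedged sketch of what such a helper might look like:

import bs4

def get_element_count(tag, soup, attr=None, value=None):
    # Count occurrences of a tag, optionally restricted to a given attribute value.
    if attr is None:
        return len(soup.find_all(tag))
    return len(soup.find_all(tag, attrs={attr: value}))

soup = bs4.BeautifulSoup("<input type='checkbox'><input type='text'>", "html.parser")
print(get_element_count("input", soup))                      # 2
print(get_element_count("input", soup, "type", "checkbox"))  # 1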
Example #17
def test_validate_url_nok_double_question():
    assert validators.validate_url('http://??') is False
Example #18
def test_validate_url_nok_space():
    assert validators.validate_url(
        'http://foo.bar?q=Spaces should be encoded') is False
Example #19
                url = argv.pop(i + 1)
            except:
                pass

        if arg == "-l":
            try:
                limit = int(argv.pop(i + 1))
            except ValueError:
                print(
                    "Error: the LIMIT amount you provided wasn't an integer.")
                exit()

        if arg == "-o":
            try:
                out_file = argv.pop(i + 1)
            except:
                pass

    # If URL is valid, begin crawling.
    if url:
        if validate_url(url):
            WebCrawler(url, limit, out_file)
        else:
            print(
                "Error: the URL you provided is not valid (make sure you include 'http://' or 'https://')."
            )
    else:
        print(
            "Error: you must provide a valid URL (string) following the '-u' flag."
        )
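The fragment above parses flags by popping from argv by hand; an argparse-based alternative sketch covering the same -u/-l/-o flags (an assumption about their meaning, not the original code):

import argparse

parser = argparse.ArgumentParser(description="Simple crawler front-end")
parser.add_argument("-u", dest="url", required=True, help="start URL")
parser.add_argument("-l", dest="limit", type=int, default=None, help="crawl limit")
parser.add_argument("-o", dest="out_file", default=None, help="output file")
args = parser.parse_args()
# WebCrawler(args.url, args.limit, args.out_file)  # hypothetical, mirroring the original call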
Example #20
def test_validate_url_nok_double_slash_a():
    assert validators.validate_url('//a') is False
Example #21
def test_validate_url_nok_malformed_https():
    assert validators.validate_url('https/:/fdsa.com') is False
Example #22
def test_validate_url_nok_rdar():
    assert validators.validate_url('rdar://1234') is False
Example #23
def test_validate_url_nok_domain_only():
    assert validators.validate_url('foo.com') is False
Example #24
def test_validate_url_nok_proto_triple_slash_a():
    assert validators.validate_url('http:///a') is False
Example #25
def test_validate_url_nok_triple_slash():
    assert validators.validate_url('///') is False
Example #26
    def get_files(self, arguments, key):
        list_target_paths = list()
        list_final_target_paths = list()
        str_kustomization_path = str()

        try:
            # test if the key to configure is even defined in input helmizer config
            list_kustomization_children = self.helmizer_config["kustomize"][
                key].get(list)
            str_kustomization_directory = str()
            try:
                str_kustomization_directory = self.helmizer_config["helmizer"][
                    "kustomization-directory"].get(str)
            except NotFoundError:
                str_kustomization_directory = "."

            str_kustomization_path = path.dirname(
                path.abspath(
                    path.normpath(
                        path.join(
                            arguments.helmizer_config,
                            str_kustomization_directory,
                        ))))

            if len(list_kustomization_children) > 0:
                for target_path in list_kustomization_children:
                    str_child_path = path.abspath(
                        path.join(str_kustomization_path, target_path))

                    # walk directory
                    if path.isdir(str_child_path):
                        for (dirpath, _, filenames) in walk(str_child_path):
                            for filename in filenames:
                                list_target_paths.append(
                                    path.join(dirpath, filename))

                    # file
                    elif path.isfile(str_child_path):
                        list_target_paths.append(str_child_path)

                    # url
                    elif validate_url(str_child_path):
                        list_target_paths.append(str_child_path)

                # convert absolute paths into paths relative to the kustomization directory
                for final_target_path in list_target_paths:
                    list_final_target_paths.append(
                        path.relpath(final_target_path,
                                     str_kustomization_path))

                # remove any ignored files
                try:
                    for ignore in self.helmizer_config["helmizer"][
                            "ignore"].get(list):
                        logging.debug(
                            f"Removing ignored file from final list: {ignore}")
                        list_final_target_paths.remove(ignore)
                except ValueError:
                    pass
                except NotFoundError:
                    pass

                return list_final_target_paths

        except (NotFoundError, KeyError):
            logging.debug(f"key not found: {key}")
        except TypeError:
            pass
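A stand-alone sketch of the "walk a directory, then make every file path relative to a base directory" idea used above (paths here are illustrative):

from os import path, walk

def collect_relative_files(base_dir):
    collected = []
    for dirpath, _, filenames in walk(base_dir):
        for filename in filenames:
            collected.append(path.relpath(path.join(dirpath, filename), base_dir))
    return collected

print(collect_relative_files("."))  # every file below the current directory, relative to it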
Example #27
    def get_files(self: object, arguments: object, key: str) -> list:
        target_paths: list = list()
        final_target_paths: list = list()
        kustomization_path: str = str()

        try:
            # test if the key to configure is even defined in input helmizer config
            kustomization_children: list = self.helmizer_config["kustomize"][
                key].get(list)
            kustomization_directory: str = str()
            try:
                kustomization_directory: str = self.helmizer_config[
                    "helmizer"]["kustomization-directory"].get(str)
            except NotFoundError:
                kustomization_directory: str = "."

            kustomization_path = path.dirname(
                path.abspath(
                    path.normpath(
                        path.join(
                            arguments.helmizer_config,
                            kustomization_directory,
                        ))))

            if len(kustomization_children) > 0:
                # each target path
                for target_path in kustomization_children:
                    child_path: str = path.abspath(
                        path.join(kustomization_path, target_path))

                    # walk directory
                    if path.isdir(child_path):
                        for (dirpath, _, filenames) in walk(child_path):
                            for filename in filenames:
                                target_paths.append(
                                    path.join(dirpath, filename))

                    # file
                    elif path.isfile(child_path):
                        target_paths.append(child_path)

                    # url
                    elif validate_url(child_path):
                        target_paths.append(child_path)

                # remove any ignored files
                try:
                    # walk directory to remove multiple files
                    for ignore in self.helmizer_config["helmizer"][
                            "ignore"].get(list):
                        ignore_abspath: str = path.abspath(
                            path.join(kustomization_path, ignore))
                        if path.isdir(ignore_abspath):
                            for (dirpath, _,
                                 filenames) in walk(ignore_abspath):
                                for filename in filenames:
                                    file_path: str = path.join(
                                        dirpath, filename)
                                    logging.debug(
                                        f"Removing ignored file from final list: {file_path}"
                                    )
                                    target_paths.remove(file_path)
                        # remove a file
                        else:
                            logging.debug(
                                f"Removing ignored file from final list: {path.join(kustomization_path, ignore)}"
                            )
                            target_paths.remove(
                                path.join(kustomization_path,
                                          ignore))  # just one file
                except ValueError:
                    pass
                except NotFoundError:
                    pass

                # convert absolute paths into paths relative to the kustomization directory
                for final_target_path in target_paths:
                    final_target_paths.append(
                        path.relpath(final_target_path, kustomization_path))

                return final_target_paths

        except (NotFoundError, KeyError, TypeError):
            logging.debug(f"key not found: {key}")
            return final_target_paths
Example #28
    def url(self, value):
        validate_url(value)
        self._url = value
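A self-contained sketch of the same validate-then-assign property pattern, with a stdlib check standing in for validate_url (an assumption, since the original validator is not shown):

from urllib.parse import urlparse

class Resource:
    @property
    def url(self):
        return self._url

    @url.setter
    def url(self, value):
        parts = urlparse(value)
        if not (parts.scheme and parts.netloc):
            raise ValueError(f"invalid url: {value!r}")
        self._url = value

r = Resource()
r.url = "https://example.com"
print(r.url)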
Example #29
def test_validate_url_nok_double_question_trailing_slash():
    assert validators.validate_url('http://??/') is False
Example #30
def test_validate_url_nok_h():
    assert validators.validate_url('h://test') is False
Example #31
def test_validate_url_nok_double_hash():
    assert validators.validate_url('http://##') is False