Code Example #1
def Checking_IP(domain):
	A = Checking_DNS(ns, dns)
	print(Fore.GREEN + '   [+]' + Fore.RESET + ' Possible IP:', A)
	url = 'http://' + domain
	print((Fore.BLUE + '[*]') + Fore.RESET + ' Retrieving target homepage at: %s' % url)
	try:
		org_response = requests.get(url, timeout=config['http_timeout_seconds'])
	except requests.exceptions.Timeout:
		sys.stderr.write((Fore.RED + "   [-]" + Fore.RESET) + " %s timed out after %d seconds.\n" % (url, config['http_timeout_seconds']))
		exit(1)
	except requests.exceptions.RequestException as e:
		sys.stderr.write((Fore.RED + "   [-]" + Fore.RESET) + " Failed to retrieve %s.\n" % url)
		exit(1)

	if org_response.status_code != 200:
		print(Fore.RED + '   [-]' + Fore.RESET + ' %s responded with an unexpected HTTP status code %d' % (url, org_response.status_code))
		exit(1)

	if org_response.url != url:
		print ((Fore.GREEN + '[+]' + Fore.RESET) + ' %s redirects to %s.' % (url, org_response.url))
		print(Fore.GREEN + "   [+]" + Fore.RESET + " Request redirected successfully to %s." % org_response.url)
	print(Fore.BLUE + '[*]' + Fore.RESET + ' Testing whether the body content is the same on both sites.')

	sec_response = requests.get('http://' + str(A), timeout=config['http_timeout_seconds'])
	if sec_response.status_code != 200:
		print(Fore.RED + '   [-]' + Fore.RESET + ' %s responded with an unexpected HTTP status code %d' % ('http://' + str(A), sec_response.status_code))
		exit(1)
	if sec_response.text == org_response.text:
		print('%s: HTML content identical to %s' % (A, domain))
	page_similarity = similarity(sec_response.text, org_response.text)
	if page_similarity > config['response_similarity_threshold']:
		print (((Fore.GREEN + '   [+]' + Fore.RESET) + ' HTML content is %d%% structurally similar to: %s' % (round(100 *page_similarity, 2), org_response.url)))
	return org_response
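
Note: this snippet (and several below) assumes module-level setup that is not shown: colorama's Fore, the html_similarity similarity function, and a config dict holding a timeout and a similarity threshold. A minimal sketch of that assumed setup, with illustrative values:

# Assumed module-level setup for these snippets (values are illustrative, not from the original project).
import sys
import requests
from colorama import Fore
from html_similarity import similarity

config = {
    'http_timeout_seconds': 3,             # hypothetical default
    'response_similarity_threshold': 0.9   # hypothetical default
}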
Code Example #2
def html_comparison(goodurl, badurl):
    if not goodurl.startswith("http://"):
        goodurl = "http://" + goodurl

    if not badurl.startswith("http://"):
        badurl = "http://" + badurl

    goodpage_html = html_cache.get(goodurl)
    badpage_html = html_cache.get(badurl)
    if not goodpage_html:
        try:
            r = requests.get(goodurl)
            goodpage_html = r.text
            html_cache[goodurl] = goodpage_html
        except Exception as error:
            return (False, error)

    if not badpage_html:
        try:
            r = requests.get(badurl)
            badpage_html = r.text
            html_cache[badurl] = badpage_html
        except Exception as error:
            return (False, error)

    try:
        print("Testing {} {}".format(goodurl, badurl))
        sim = similarity(goodpage_html, badpage_html)
        return (True, sim)
    except Exception as error:
        return (False, error)
Code Example #3
    def compare_pairwise(self, content):
        # compare pairwise
        pairs = {}

        for url1 in content:
            for url2 in content:

                if url1 == url2:
                    continue

                # avoid checking pairs twice
                pair_key = " ".join(sorted([url1, url2]))
                if pair_key in pairs:
                    continue

                try:
                    s = html_similarity.similarity(content[url1],
                                                   content[url2])
                    logging.debug(
                        "Comparing pages for URLs %s and %s: similarity=%s",
                        url1, url2, s)
                    pairs[pair_key] = {
                        'similarity': s,
                        'exception': None,
                    }
                except (AttributeError, ValueError) as e:
                    logging.error(
                        "html_similarity.similarity thre exception for URL pair %s and %s: %s",
                        url1, url2, e)
                    pairs[pair_key] = {
                        'similarity': None,
                        'exception': str(e),
                    }

        return pairs
Code Example #4
def make_score_similarity_file(structural_weight, similarity_file_output):
    results = []

    html_paths = glob.glob('{}/*.html'.format(HTML_CLUSTER_DATA_DIRECTORY))

    for file_path_1, file_path_2 in combinations(html_paths, 2):
        # TODO: Remove the data directory
        print('Calculating the similarity of {} and {}'.format(file_path_1, file_path_2))
        with open(file_path_1) as file_1, open(file_path_2) as file_2:
            html_1 = file_1.read()
            html_2 = file_2.read()

            similarity_score = similarity(html_1, html_2, k=structural_weight) * 100
            click.echo(
                '   The similarity between them is ' + click.style('{0:.2g}%'.format(
                    similarity_score), fg=similarity_color(similarity_score)
                )
            )
            results.append({
                'path1': file_path_1,
                'path2': file_path_2,
                'similarity': similarity_score
            })

    with open(similarity_file_output, 'w') as json_out:
        json.dump(results, json_out, indent=4)
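
The similarity_color helper used above is not shown in this example; a plausible, purely hypothetical sketch that maps a percentage score to a click color name:

# Hypothetical helper (not from the original project): pick a click color for a score.
def similarity_color(score):
    if score >= 90:
        return 'green'
    if score >= 50:
        return 'yellow'
    return 'red'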
Code Example #5
def find_origins(domain, candidates):
    original_response = retrieve_original_page(domain)
    host_header_value = original_response.url.replace('https://',
                                                      '').split('/')[0]
    origins = []
    for host in candidates:
        try:
            url = 'https://' + host
            headers = {'Host': host_header_value}
            response = requests.get(url,
                                    timeout=3,
                                    headers=headers,
                                    verify=False)
        except requests.exceptions.Timeout:
            continue
        except requests.exceptions.RequestException:
            continue
        if response.status_code != 200:
            continue
        if response.text == original_response.text:
            origins.append((host, 'HTML content identical to %s' % domain))
            continue
        page_similarity = similarity(response.text, original_response.text)
        if page_similarity > 0.9:
            origins.append(
                (host, 'HTML content is %d %% structurally similar to %s' %
                 (round(100 * page_similarity, 2), domain)))
    return origins
Code Example #6
def Assess_HTML_SIM(generatedHTML, targetHTML):
    from html_similarity import style_similarity, structural_similarity, similarity
    print("---------------------------")
    print(style_similarity(generatedHTML, targetHTML))
    print("---------------------------")
    print(structural_similarity(generatedHTML, targetHTML))
    print("---------------------------")
    print(similarity(generatedHTML, targetHTML))
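
similarity combines the two metrics printed above; according to the html_similarity documentation it is a weighted sum controlled by k (default 0.3). A small sketch of that assumed relationship:

# Assumed weighting used by html_similarity.similarity (k defaults to 0.3).
from html_similarity import style_similarity, structural_similarity

def weighted_similarity(a, b, k=0.3):
    return k * structural_similarity(a, b) + (1 - k) * style_similarity(a, b)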
Code Example #7
File: views.py  Project: karayel/crawler-challenge
    def create(self, request, *args, **kwargs):
        serializer = self.get_serializer(data=request.data)
        serializer.is_valid(raise_exception=True)

        from_product: Product = serializer.validated_data['from_product']
        to_product: Product = serializer.validated_data['to_product']

        result = similarity(from_product.html_content, to_product.html_content)

        return Response(data={'similarity': result})
Code Example #8
def find_origins(domain, candidates):
    print('\n[*] Testing candidate origin servers')
    original_response = retrieve_original_page(domain)
    host_header_value = original_response.url.replace('https://',
                                                      '').split('/')[0]
    origins = []
    for host in candidates:
        try:
            print('  - %s' % host)
            url = 'https://' + host
            headers = {
                'Host': host_header_value,  # only keep the TLD, without any slashes
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Upgrade-Insecure-Requests": "1",
                "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0",
                "Connection": "close",
                "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate"
            }
            response = requests.get(url,
                                    timeout=config['http_timeout_seconds'],
                                    headers=headers,
                                    verify=False)
        except requests.exceptions.Timeout:
            print('      timed out after %d seconds' %
                  config['http_timeout_seconds'])
            continue
        except requests.exceptions.RequestException as e:
            print('      unable to retrieve')
            continue

        if response.status_code != 200:
            print('      responded with an unexpected HTTP status code %d' %
                  response.status_code)
            continue

        if response.text == original_response.text:
            origins.append((host, 'HTML content identical to %s' % domain))
            continue

        page_similarity = similarity(response.text, original_response.text)
        if page_similarity > config['response_similarity_threshold']:
            origins.append(
                (host, 'HTML content is %d %% structurally similar to %s' %
                 (round(100 * page_similarity, 2), domain)))

    return origins
Code Example #9
def find_origins(domain, candidates):
    print('\n[*] Testing candidate origin servers')
    original_response = retrieve_original_page(domain)
    host_header_value = original_response.url.replace('https://',
                                                      '').split('/')[0]
    origins = []
    for host in candidates:
        try:
            print('  - %s' % host)
            url = 'https://' + host
            headers = {
                'Host':
                host_header_value,  # only keep the TLD, without any slashes
                'User-Agent': get_user_agent()
            }
            response = requests.get(url,
                                    timeout=config['http_timeout_seconds'],
                                    headers=headers,
                                    verify=False)
        except requests.exceptions.Timeout:
            print('      timed out after %d seconds' %
                  config['http_timeout_seconds'])
            continue
        except requests.exceptions.RequestException as e:
            print('      unable to retrieve')
            continue

        if response.status_code != 200:
            print('      responded with an unexpected HTTP status code %d' %
                  response.status_code)
            continue

        if response.text == original_response.text:
            origins.append((host, 'HTML content identical to %s' % domain))
            continue

        if len(response.text) > 0:
            try:
                page_similarity = similarity(response.text,
                                             original_response.text)
            except Exception:
                page_similarity = 0

            if page_similarity > config['response_similarity_threshold']:
                origins.append(
                    (host, 'HTML content is %d %% structurally similar to %s' %
                     (round(100 * page_similarity, 2), domain)))

    return origins
Code Example #10
File: gen_sample.py  Project: hiropppe/strand-aligner
def main(lang1, lang2, url1, url2, file1, file2, enc1, enc2):
    if url1 and url2:
        html1 = requests.get(url1).content.decode(
            enc1, "replace").replace("\n", " ").replace("\t", " ")
        html2 = requests.get(url2).content.decode(
            enc2, "replace").replace("\n", " ").replace("\t", " ")
    else:
        html1 = open(file1).read().replace("\n", " ").replace("\t", " ")
        html2 = open(file2).read().replace("\n", " ").replace("\t", " ")
        url1 = "http://test.com/{:s}".format(lang1)
        url2 = "http://test.com/{:s}".format(lang2)

    print("style_similarity={:f} structural_similarity={:f} similarity={:f}".format(
        style_similarity(html1, html2), structural_similarity(html1, html2), similarity(html1, html2)), file=sys.stderr)

    print("\t".join(["dummy_key", lang1, url1, html1, lang2, url2, html2]))
Code Example #11
def first_scan():
	try:
		print(Fore.YELLOW + Style.BRIGHT +"Cloudflare IP Catcher (Auto DIG)...\n")
		print(Fore.BLUE + "[*]" + Fore.RESET + " Checking if {0} are similar to {1}".format(ns, domain))
		test1 = requests.get('http://' + domain, timeout=config['http_timeout_seconds'])
		test2 = requests.get('http://' + ns, timeout=config['http_timeout_seconds'])
		page_similarity2 = similarity(test1.text, test2.text)
		if page_similarity2 > config['response_similarity_threshold']:
			print (((Fore.GREEN + '   [+]' + Fore.RESET) + ' HTML content is %d%% structurally similar to: %s' % (round(100 *page_similarity2, 2), domain)))
		else:
			print (((Fore.RED + '   [-]' + Fore.RESET + ' Sorry, but HTML content is %d%% structurally similar to %s' % (round(100 *page_similarity2, 2), domain))))
			print ("\n - Trying to check with IP... \n")
	except requests.exceptions.Timeout:
		sys.stderr.write((Fore.RED + "   [-]" + Fore.RESET) + " Connection timed out... Try specifying a NS manually\n")
		exit(1)
	except requests.exceptions.RequestException:
		sys.stderr.write((Fore.RED + "   [-]" + Fore.RESET) + " Connection could not be established... Try specifying a NS manually\n")
		exit(1)
Code Example #12
def extract_features_from_website(url, label, predict):
    """
        extract all features from website, if predict set to true a pandas dataframe is created
    """

    try:
        global brand_list
        global phishy_list
        global login_list
        global tld_list
        # save original url for object instance
        url_orig = url

        # get different components of url
        components = get_url_components(url)

        fqdn = components[0]
        scheme = components[1]
        subdomain = components[2]
        domain = components[3]
        suffix = components[4]
        port = components[5]
        path = components[6]
        query = components[7]
        fragment = components[8]

        netloc = fqdn
        url_no_prot = url

        if scheme:
            netloc = scheme + "://" + fqdn

            if port:
                netloc = netloc + ":" + port

            url_no_prot = url.replace(scheme + "://", "", 1)

        # check for redirects of url
        resp_url, num_redirects, protocol, content = get_redirects(url)

        # try again if no connection could have been established
        if content == -1:
            time.sleep(3)
            resp_url, num_redirects, protocol, content = get_redirects(url)

            if content == -1:
                return None

        # get content for homepage
        hp_url, hp_num_redirects, hp_protocol, hp_content = get_redirects(
            "{}://www.{}.{}".format(scheme, domain, suffix))

        if hp_content == -1:
            time.sleep(3)
            hp_url, hp_num_redirects, hp_protocol, hp_content = get_redirects(
                "{}://www.{}.{}".format(scheme, domain, suffix))

        # read content in parser
        if not hp_content == -1:
            hp_soup = bs4.BeautifulSoup(hp_content.lower(), 'html.parser')

        soup = bs4.BeautifulSoup(content.lower(), 'html.parser')

        url = resp_url

        # number of redirects done by website
        if num_redirects > 0:
            bool_redirect_website = True
        else:
            bool_redirect_website = False

        # website has favicon/ check if website has favicon
        bool_favicon_website = False

        try:
            icon = favicon.get(url, timeout=3)
            bool_favicon_website = True
        except Exception as e:
            bool_favicon_website = False

        # website has links pointing to extern content
        bool_content_extern_website = False

        # number of links pointing to extern content
        int_links_extern_website = 0
        bool_content_extern_website, int_links_extern_website = find_extern_links(
            content.lower(), domain, suffix, url)

        # check for custom status bar
        bool_custom_statusbar_website = bool(
            str(content).lower().replace(" ",
                                         "").__contains__("window.status="))

        # custom right click
        bool_disable_rightclick_website = False

        if str(content).replace(
                " ", "").lower().__contains__("document.oncontextmenu="):
            bool_disable_rightclick_website = True

        res = soup.findAll("body")

        if res:
            for element in res:
                try:
                    right_click_arg = element['oncontextmenu']
                    if str(right_click_arg) == "return false":
                        bool_disable_rightclick_website = True
                except Exception as e:
                    continue

        # has pop up window
        bool_popup_website = False
        hidden_count = 0
        res = soup.findAll("div")

        if res:
            for tag in res:
                try:
                    arg = tag['class']
                    if "popup" in arg:
                        bool_popup_website = True
                except Exception as e:
                    pass
                try:
                    arg = tag['style']
                    arg = str(arg).replace(" ", "")

                    if arg.__contains__("display:none") or arg.__contains__(
                            "visibility:hidden"):
                        hidden_count += 1
                except Exception as e:
                    continue

        # has iframe
        bool_iframe_website = False
        res = soup.findAll("iframe")
        if res:
            bool_iframe_website = True

        # has action tag > custom 2. feature - is action extern?
        bool_action_website = False
        bool_action_extern_website = False

        # has bool form post
        bool_form_post_website = False

        res = soup.findAll("form")

        if res:
            for element in res:
                try:
                    if element["action"]:
                        bool_action_website = True
                        action_url = element["action"]

                        if validate_url(action_url) or validate_url(
                                urljoin(netloc, action_url)):

                            if validate_url(urljoin(netloc, action_url)):
                                action_url = urljoin(netloc, action_url)

                            extracted_action_url = get_url_components(
                                action_url)

                            domain_action_url = extracted_action_url[3]
                            suffix_action_url = extracted_action_url[4]

                            if not suffix == suffix_action_url or not domain == domain_action_url:
                                bool_action_extern_website = True
                                break

                    if element["method"] == "post":
                        bool_form_post_website = True
                except Exception as e:
                    continue

        # has phishy tokens in visible content
        int_phishy_tokens_website = 0

        for text in soup.stripped_strings:
            int_phishy_tokens_website += sum(1 for word in phishy_list
                                             if text.__contains__(word))

        # has input tag
        bool_input_website = False
        if get_element_count("input", soup) > 0: bool_input_website = True

        # find meta description
        res = soup.find('meta', attrs={'name': 'og:description'})
        if not res:
            res = soup.find('meta', attrs={'property': 'description'})
        if not res:
            res = soup.find('meta', attrs={'name': 'description'})

        if not hp_content == -1:
            hp_res = hp_soup.find('meta', attrs={'name': 'og:description'})
            if not hp_res:
                hp_res = hp_soup.find('meta',
                                      attrs={'property': 'description'})
            if not hp_res:
                hp_res = hp_soup.find('meta', attrs={'name': 'description'})

        float_description_sim_website = 0

        if hp_content == -1:
            float_description_sim_website = -1

        if not hp_content == -1:
            if res and hp_res:
                try:
                    hp_desc = hp_res['content']
                    desc = res['content']

                    # compute similarity of description from home and login page
                    float_description_sim_website = string_similarity(
                        desc, hp_desc)
                except Exception:
                    pass

        # bond status login and homepage
        bool_bond_status_website = False

        # most frequent domain is extern > true/false
        bool_freq_domain_extern_website = False
        res = soup.findAll("a")
        domain_list = []
        link_list = []
        href_count = 0
        redirect_object_list = []

        if res:
            for a_tag in res:
                try:
                    href = a_tag.attrs.get("href")

                    href_count += 1

                    if validate_url(href) or validate_url(urljoin(
                            netloc, href)):

                        if validate_url(urljoin(netloc, href)):
                            href = urljoin(netloc, href)

                        if href == hp_url:
                            bool_bond_status_website = True

                        components_href = get_url_components(href)

                        domain_href = components_href[3]
                        suffix_href = components_href[4]

                        if is_IP(domain):
                            continue
                        link_list.append(href)
                        domain_list.append("{},{}".format(
                            domain_href, suffix_href))

                except Exception as e:
                    continue

            link_list = list(set(link_list))
            link_list = link_list[:10]
            if not hp_content == -1:
                try:
                    redirect_object_list = get_redirects_list(link_list)

                except Exception as e:
                    log(action_logging_enum=ERROR, logging_text=str(e))

                if redirect_object_list:
                    for redirect_object in redirect_object_list:

                        if not bool_bond_status_website and not hp_content == -1 and redirect_object_list:
                            try:
                                website_sim = html_similarity.similarity(
                                    str(hp_content).lower(),
                                    str(redirect_object.content).lower(),
                                    k=0.3)

                                if website_sim == 1:
                                    bool_bond_status_website = True
                            except Exception:
                                continue

        if domain_list:
            occure_count = Counter(domain_list)
            most_freq = occure_count.most_common(1)[0][0]
            most_frq_domain, most_freq_suffix = most_freq.split(",", 1)

            if not str(most_frq_domain) == domain or not str(
                    suffix) == most_freq_suffix:
                bool_freq_domain_extern_website = True

        # jaccard similarity between homepage and login page
        float_login_home_website = 0
        if not hp_content == -1:
            try:
                float_login_home_website = html_similarity.similarity(
                    str(content).lower(), str(hp_content).lower(), k=0.3)
            except Exception:
                pass
        # website has copyright
        bool_copyright_website = False

        # similarity from copyright of login page and home page
        copy = ""
        hp_copy = ""
        if not hp_content == -1:
            float_copyright_sim_website = 0
            for text in soup.stripped_strings:
                if '©' in text:
                    copy = re.sub(r'\s+', ' ', text)
                    bool_copyright_website = True

            for text in hp_soup.stripped_strings:
                if '©' in text:
                    hp_copy = re.sub(r'\s+', ' ', text)

            if copy and hp_copy:
                float_copyright_sim_website = string_similarity(copy, hp_copy)
        else:
            float_copyright_sim_website = 0

        # similarity from title of login page and home page
        float_title_sim_website = 0
        if not hp_content == -1:
            try:
                title = soup.title.text
                hp_title = hp_soup.title.text
                float_title_sim_website = string_similarity(title, hp_title)
            except Exception:
                float_title_sim_website = 0
                pass

        # unique links/all links on page
        float_unique_links_website = 0
        if link_list:
            float_unique_links_website = len(list(
                set(link_list))) / len(link_list)

        # lexical analysis for all links on website
        bool_link_analysis_website = True
        # dataframe = pd.DataFrame()
        # try:
        # redirect_object = RedirectEntry(url=url, redirects=num_redirects, content=content, protocol=protocol)
        # dataframe = pd.DataFrame(extract_features_from_URL(redirect_object, "Predict", brand_list=brand_list,
        # tld_list=tld_list, phishy_list=phishy_list, predict=True))
        # except Exception as e:
        # pass

        # if not dataframe.empty:
        # try:
        # df = pd.DataFrame(dataframe.iloc[0]).transpose()
        # prediction = predict_url(df)

        # if int(prediction) == 0:
        # bool_link_analysis_website = False
        # except Exception:
        # pass

        # number of input elements
        int_input_website = 0

        # find form accompanied by labels with loginwords
        bool_input_login_website = False
        form = soup.find("form")
        try:
            if form:
                inputs = form.find_all("input")

                if inputs:

                    int_input_website = len(inputs)

                    for inp in inputs:
                        try:
                            if inp["type"] == "hidden":
                                hidden_count += 1
                        except Exception:
                            continue

                    label_tags = form.findAll("label")

                    if label_tags:
                        for label_entry in label_tags:
                            if any(
                                    str(label_entry.text).__contains__(word)
                                    for word in login_list):
                                bool_input_login_website = True

        except Exception:
            pass

        # website has button
        bool_button_website = False
        button_count = get_element_count("button", soup)
        if button_count > 0:
            bool_button_website = True

        # website has meta information
        bool_meta_website = False

        if soup.find("meta"):
            bool_meta_website = True

        # has hidden elements
        bool_hidden_element_website = False
        if hidden_count > 0:
            bool_hidden_element_website = True

        # number of option tags
        int_option_website = get_element_count("option", soup)

        # number select tags
        int_select_website = get_element_count("select", soup)

        # number th tags
        int_th_website = get_element_count("th", soup)

        # number of tr tags
        int_tr_website = get_element_count("tr", soup)

        # number of table tags
        int_table_website = get_element_count("table", soup)

        # number of href in a tag
        int_href_website = href_count

        # number of list item tags
        int_li_website = get_element_count("li", soup)

        # number of unordered list tags
        int_ul_website = get_element_count("ul", soup)

        # number of ordered list tags
        int_ol_website = get_element_count("ol", soup)

        # number of div tags
        int_div_website = get_element_count("div", soup)

        # number of span tags
        int_span_website = get_element_count("span", soup)

        # number of article tags
        int_article_website = get_element_count("article", soup)

        # number of p tags
        int_p_website = get_element_count("p", soup)

        # number of checkbox tags
        int_checkbox_website = get_element_count("input", soup, "type",
                                                 "checkbox")

        # number of buttons
        int_button_website = button_count

        # number of images
        int_image_website = get_element_count("img", soup)

        if predict == False:
            entry = FeatureEntryContent(
                bool_redirect_website=bool_redirect_website,
                bool_favicon_website=bool_favicon_website,
                bool_content_extern_website=bool_content_extern_website,
                int_links_extern_website=int_links_extern_website,
                bool_custom_statusbar_website=bool_custom_statusbar_website,
                bool_disable_rightclick_website=bool_disable_rightclick_website,
                bool_popup_website=bool_popup_website,
                bool_iframe_website=bool_iframe_website,
                bool_action_website=bool_action_website,
                bool_action_extern_website=bool_action_extern_website,
                bool_form_post_website=bool_form_post_website,
                int_phishy_tokens_website=int_phishy_tokens_website,
                bool_input_website=bool_input_website,
                float_description_sim_website=float_description_sim_website,
                bool_bond_status_website=bool_bond_status_website,
                bool_freq_domain_extern_website=bool_freq_domain_extern_website,
                float_login_home_website=float_login_home_website,
                bool_copyright_website=bool_copyright_website,
                float_copyright_sim_website=float_copyright_sim_website,
                float_title_sim_website=float_title_sim_website,
                float_unique_links_website=float_unique_links_website,
                # bool_link_analysis_website=bool_link_analysis_website,
                int_input_website=int_input_website,
                bool_input_login_website=bool_input_login_website,
                bool_button_website=bool_button_website,
                bool_meta_website=bool_meta_website,
                bool_hidden_element_website=bool_hidden_element_website,
                int_option_website=int_option_website,
                int_select_website=int_select_website,
                int_th_website=int_th_website,
                int_tr_website=int_tr_website,
                int_table_website=int_table_website,
                int_href_website=int_href_website,
                int_li_website=int_li_website,
                int_ul_website=int_ul_website,
                int_ol_website=int_ol_website,
                int_div_website=int_div_website,
                int_span_website=int_span_website,
                int_article_website=int_article_website,
                int_p_website=int_p_website,
                int_checkbox_website=int_checkbox_website,
                int_button_website=int_button_website,
                int_image_website=int_image_website,
                label=label,
                url=url_orig,
                final_url=url)

            log(action_logging_enum=INFO,
                logging_text="Processed datapoint. {}".format(url))

            return entry

        elif predict:
            data = {
                "ID": [0],
                "Has Redirect": [bool_redirect_website],
                "Has Favicon": [bool_favicon_website],
                "Has Extern Content": [bool_content_extern_website],
                "Number Extern Links": [int_links_extern_website],
                "Has Custom StatusBar": [bool_custom_statusbar_website],
                "Has Disabled RightClick": [bool_disable_rightclick_website],
                "Has PopUp": [bool_popup_website],
                "Has iFrame": [bool_iframe_website],
                "Has Action": [bool_action_website],
                "Has Extern Action": [bool_action_extern_website],
                "Has Form with POST": [bool_form_post_website],
                "Number PhishyTokens": [int_phishy_tokens_website],
                "Has Input": [bool_input_website],
                "Ratio Description Sim": [float_description_sim_website],
                "Has Bond Status": [bool_bond_status_website],
                "Has Freq Domain Extern": [bool_freq_domain_extern_website],
                "Ratio Similarity": [float_login_home_website],
                "Has Copyright": [bool_copyright_website],
                "Ratio Copyright Sim": [float_copyright_sim_website],
                "Ratio Title Sim": [float_title_sim_website],
                "Ratio Unique Links": [float_unique_links_website],
                "Number Inputs": [int_input_website],
                "Has Input for Login": [bool_input_login_website],
                "Has Button": [bool_button_website],
                "Has Meta": [bool_meta_website],
                "Has Hidden Element": [bool_hidden_element_website],
                "Number Option": [int_option_website],
                "Number Select": [int_select_website],
                "Number TH": [int_th_website],
                "Number TR": [int_tr_website],
                "Number Table": [int_table_website],
                "Number HREF": [int_href_website],
                "Number LI": [int_li_website],
                "Number UL": [int_ul_website],
                "Number OL": [int_ol_website],
                "Number DIV": [int_div_website],
                "Number Span": [int_span_website],
                "Number Article": [int_article_website],
                "Number Paragr": [int_p_website],
                "Number Checkbox": [int_checkbox_website],
                "Number Button": [int_checkbox_website],
                "Number Image": [int_image_website],
                "Label": [label],
                "URL": [url_orig],
                "Final URL": [url]
            }

            columns = list(CONTENT_FEATURE_LIST_COLUMN_NAMES)

            df = pd.DataFrame(data, columns=columns)

            return df

    except Exception as e:
        log(action_logging_enum=WARNING, logging_text=str(e))
        log(action_logging_enum=WARNING, logging_text=str(e.__traceback__))
        exc_type, exc_obj, tb = sys.exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        log(
            ERROR, 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(
                filename, lineno, line.strip(), exc_obj))
        log(action_logging_enum=WARNING,
            logging_text="Could not extract content features for {}".format(
                url))

    log(action_logging_enum=INFO,
        logging_text="Failed datapoint. {}".format(url))
    return None
Code Example #13
 def calculate_similarity(self, html_content1, html_content2):
     return similarity(html_content1, html_content2)
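
Finally, a minimal, self-contained usage sketch of the library on two small HTML strings (the markup here is illustrative):

from html_similarity import similarity

page_a = '<html><body><div class="hero"><h1>Welcome</h1></div></body></html>'
page_b = '<html><body><div class="hero"><h1>Hello</h1></div></body></html>'

score = similarity(page_a, page_b)  # returns a value between 0.0 and 1.0
print('similarity = {:.2f}'.format(score))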