def Checking_IP(domain):
    A = Checking_DNS(ns, dns)
    print(Fore.GREEN + ' [+]' + Fore.RESET + ' Possible IP: %s' % A)
    url = 'http://' + domain
    print(Fore.BLUE + '[*]' + Fore.RESET + ' Retrieving target homepage at: %s' % url)
    try:
        org_response = requests.get(url, timeout=config['http_timeout_seconds'])
    except requests.exceptions.Timeout:
        sys.stderr.write(Fore.RED + ' [-]' + Fore.RESET +
                         ' %s timed out after %d seconds.\n' % (url, config['http_timeout_seconds']))
        sys.exit(1)
    except requests.exceptions.RequestException:
        sys.stderr.write(Fore.RED + ' [-]' + Fore.RESET + ' Failed to retrieve %s.\n' % url)
        sys.exit(1)
    if org_response.status_code != 200:
        print(Fore.RED + ' [-]' + Fore.RESET +
              ' %s responded with an unexpected HTTP status code %d' % (url, org_response.status_code))
        sys.exit(1)
    if org_response.url != url:
        print(Fore.GREEN + '[+]' + Fore.RESET + ' %s redirects to %s.' % (url, org_response.url))
        print(Fore.GREEN + ' [+]' + Fore.RESET + ' Request redirected successfully to %s.' % org_response.url)
    print(Fore.BLUE + '[*]' + Fore.RESET + ' Testing whether the body content is the same on both sites.')
    sec_response = requests.get('http://' + str(A), timeout=config['http_timeout_seconds'])
    if sec_response.status_code != 200:
        # Report the candidate's status code (the original mistakenly reused org_response here).
        print(Fore.RED + ' [-]' + Fore.RESET +
              ' %s responded with an unexpected HTTP status code %d' % (url, sec_response.status_code))
        sys.exit(1)
    if sec_response.text == org_response.text:
        print('%s: HTML content identical to %s' % (A, domain))
    page_similarity = similarity(sec_response.text, org_response.text)
    if page_similarity > config['response_similarity_threshold']:
        print(Fore.GREEN + ' [+]' + Fore.RESET +
              ' HTML content is %.2f%% structurally similar to: %s' % (round(100 * page_similarity, 2), org_response.url))
    return org_response
def html_comparison(goodurl, badurl):
    if not goodurl.startswith("http://"):
        goodurl = "http://" + goodurl
    if not badurl.startswith("http://"):
        badurl = "http://" + badurl
    goodpage_html = html_cache.get(goodurl)
    badpage_html = html_cache.get(badurl)
    if not goodpage_html:
        try:
            r = requests.get(goodurl)
            # Cache the page and keep a local reference (the original only wrote to the
            # cache, leaving goodpage_html as None for the comparison below).
            goodpage_html = html_cache[goodurl] = r.text
        except Exception as error:
            return (False, error)
    if not badpage_html:
        try:
            r = requests.get(badurl)
            badpage_html = html_cache[badurl] = r.text
        except Exception as error:
            return (False, error)
    # print("Testing {} {}".format(goodpage_html, badpage_html))  # debug: dumps both full pages
    try:
        sim = similarity(goodpage_html, badpage_html)
        return (True, sim)
    except Exception as error:
        return (False, error)
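A minimal usage sketch for html_comparison; the URLs are placeholders, and it assumes a module-level html_cache dict plus the requests/similarity imports the function already relies on:

# Hypothetical usage: the function returns (True, score) on success
# or (False, exception) when a fetch or the comparison fails.
html_cache = {}

ok, result = html_comparison("example.com", "example.org")
if ok:
    print("similarity: {:.2f}".format(result))   # result is a float in [0, 1]
else:
    print("comparison failed: {}".format(result))  # result is the exception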
def compare_pairwise(self, content):
    # Compare every pair of URLs exactly once.
    pairs = {}
    for url1 in content:
        for url2 in content:
            if url1 == url2:
                continue
            # Avoid checking pairs twice: key each unordered pair by its sorted URLs.
            pair_key = " ".join(sorted([url1, url2]))
            if pair_key in pairs:
                continue
            try:
                s = html_similarity.similarity(content[url1], content[url2])
                logging.debug(
                    "Comparing pages for URLs %s and %s: similarity=%s",
                    url1, url2, s)
                pairs[pair_key] = {
                    'similarity': s,
                    'exception': None,
                }
            except (AttributeError, ValueError) as e:
                logging.error(
                    "html_similarity.similarity threw an exception for URL pair %s and %s: %s",
                    url1, url2, e)
                pairs[pair_key] = {
                    'similarity': None,
                    'exception': str(e),
                }
    return pairs
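An illustrative driver for compare_pairwise, assuming content maps each URL to its already-fetched HTML; comparer stands in for an instance of the owning class:

content = {
    "http://a.example/": "<html><body><p>hello</p></body></html>",
    "http://b.example/": "<html><body><p>hello again</p></body></html>",
}
pairs = comparer.compare_pairwise(content)  # `comparer`: hypothetical instance
for pair_key, outcome in pairs.items():
    # Each unordered pair appears once, keyed by the sorted URL pair.
    print(pair_key, outcome['similarity'], outcome['exception'])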
def make_score_similarity_file(structural_weight, similarity_file_output):
    results = []
    html_paths = glob.glob('{}/*.html'.format(HTML_CLUSTER_DATA_DIRECTORY))
    for file_path_1, file_path_2 in combinations(html_paths, 2):
        # TODO: Remove the data directory
        print('Calculating the similarity of {} and {}'.format(file_path_1, file_path_2))
        with open(file_path_1) as file_1, open(file_path_2) as file_2:
            html_1 = file_1.read()
            html_2 = file_2.read()
        similarity_score = similarity(html_1, html_2, k=structural_weight) * 100
        click.echo(
            ' The similarity between them is ' +
            click.style(
                '{0:.2g}%'.format(similarity_score),
                fg=similarity_color(similarity_score)
            )
        )
        results.append({
            'path1': file_path_1,
            'path2': file_path_2,
            'similarity': similarity_score
        })
    with open(similarity_file_output, 'w') as json_out:
        json.dump(results, json_out, indent=4)
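A sketch of how this might be invoked; the paths and weight are illustrative. The k parameter passed to similarity() weights structural similarity against style similarity in the combined score, which is why the argument here is named structural_weight:

# Hypothetical call: compare every pair of .html files in the cluster data
# directory, weighting structure at 0.7, and write the scores to JSON.
make_score_similarity_file(structural_weight=0.7,
                           similarity_file_output='similarity_scores.json')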
def find_origins(domain, candidates):
    original_response = retrieve_original_page(domain)
    host_header_value = original_response.url.replace('https://', '').split('/')[0]
    origins = []
    for host in candidates:
        try:
            url = 'https://' + host
            headers = {'Host': host_header_value}
            response = requests.get(url, timeout=3, headers=headers, verify=False)
        except requests.exceptions.Timeout:
            continue
        except requests.exceptions.RequestException:
            continue
        if response.status_code != 200:
            continue
        if response.text == original_response.text:
            origins.append((host, 'HTML content identical to %s' % domain))
            continue
        page_similarity = similarity(response.text, original_response.text)
        if page_similarity > 0.9:
            origins.append(
                (host, 'HTML content is %.2f%% structurally similar to %s'
                 % (round(100 * page_similarity, 2), domain)))
    return origins
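An illustrative call, with a placeholder domain and candidate hosts; each candidate is fetched with the original site's Host header and kept when its HTML is identical or more than 90% structurally similar:

# Hypothetical inputs: candidates would normally come from DNS history or scanning.
matches = find_origins('example.com', ['203.0.113.10', '203.0.113.11'])
for host, reason in matches:
    print(host, '-', reason)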
def Assess_HTML_SIM(generatedHTML, targetHTML):
    from html_similarity import style_similarity, structural_similarity, similarity
    print("---------------------------")
    print(style_similarity(generatedHTML, targetHTML))
    print("---------------------------")
    print(structural_similarity(generatedHTML, targetHTML))
    print("---------------------------")
    print(similarity(generatedHTML, targetHTML))
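A quick demonstration with two inline documents. style_similarity compares the sets of CSS classes, structural_similarity compares the tag trees, and similarity() blends the two (the same k-weighted blend used with k=0.3 elsewhere in this section):

a = '<div class="card"><p class="text">one</p></div>'
b = '<div class="card"><p class="text">two</p></div>'
# Same tags and same classes, differing only in text content,
# so all three scores should print as 1.0.
Assess_HTML_SIM(a, b)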
def create(self, request, *args, **kwargs):
    serializer = self.get_serializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    from_product: Product = serializer.validated_data['from_product']
    to_product: Product = serializer.validated_data['to_product']
    result = similarity(from_product.html_content, to_product.html_content)
    return Response(data={'similarity': result})
def find_origins(domain, candidates):
    print('\n[*] Testing candidate origin servers')
    original_response = retrieve_original_page(domain)
    host_header_value = original_response.url.replace('https://', '').split('/')[0]
    origins = []
    for host in candidates:
        try:
            print('  - %s' % host)
            url = 'https://' + host
            headers = {
                'Host': host_header_value,  # keep only the hostname, without scheme or path
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
                'Connection': 'close',
                'Accept-Language': 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3',
                'Accept-Encoding': 'gzip, deflate'
            }
            response = requests.get(url, timeout=config['http_timeout_seconds'],
                                    headers=headers, verify=False)
        except requests.exceptions.Timeout:
            print('    timed out after %d seconds' % config['http_timeout_seconds'])
            continue
        except requests.exceptions.RequestException:
            print('    unable to retrieve')
            continue
        if response.status_code != 200:
            print('    responded with an unexpected HTTP status code %d' % response.status_code)
            continue
        if response.text == original_response.text:
            origins.append((host, 'HTML content identical to %s' % domain))
            continue
        page_similarity = similarity(response.text, original_response.text)
        if page_similarity > config['response_similarity_threshold']:
            origins.append(
                (host, 'HTML content is %.2f%% structurally similar to %s'
                 % (round(100 * page_similarity, 2), domain)))
    return origins
def find_origins(domain, candidates):
    print('\n[*] Testing candidate origin servers')
    original_response = retrieve_original_page(domain)
    host_header_value = original_response.url.replace('https://', '').split('/')[0]
    origins = []
    for host in candidates:
        try:
            print('  - %s' % host)
            url = 'https://' + host
            headers = {
                'Host': host_header_value,  # keep only the hostname, without scheme or path
                'User-Agent': get_user_agent()
            }
            response = requests.get(url, timeout=config['http_timeout_seconds'],
                                    headers=headers, verify=False)
        except requests.exceptions.Timeout:
            print('    timed out after %d seconds' % config['http_timeout_seconds'])
            continue
        except requests.exceptions.RequestException:
            print('    unable to retrieve')
            continue
        if response.status_code != 200:
            print('    responded with an unexpected HTTP status code %d' % response.status_code)
            continue
        if response.text == original_response.text:
            origins.append((host, 'HTML content identical to %s' % domain))
            continue
        # Only score non-empty bodies; a bare except is narrowed to Exception,
        # and the threshold check stays inside this branch so page_similarity
        # is never read unbound.
        if len(response.text) > 0:
            try:
                page_similarity = similarity(response.text, original_response.text)
            except Exception:
                page_similarity = 0
            if page_similarity > config['response_similarity_threshold']:
                origins.append(
                    (host, 'HTML content is %.2f%% structurally similar to %s'
                     % (round(100 * page_similarity, 2), domain)))
    return origins
def main(lang1, lang2, url1, url2, file1, file2, enc1, enc2):
    if url1 and url2:
        html1 = requests.get(url1).content.decode(
            enc1, "replace").replace("\n", " ").replace("\t", " ")
        html2 = requests.get(url2).content.decode(
            enc2, "replace").replace("\n", " ").replace("\t", " ")
    else:
        html1 = open(file1).read().replace("\n", " ").replace("\t", " ")
        html2 = open(file2).read().replace("\n", " ").replace("\t", " ")
        url1 = "http://test.com/{:s}".format(lang1)
        url2 = "http://test.com/{:s}".format(lang2)
    print("style_similarity={:f} structural_similarity={:f} similarity={:f}".format(
        style_similarity(html1, html2),
        structural_similarity(html1, html2),
        similarity(html1, html2)), file=sys.stderr)
    print("\t".join(["dummy_key", lang1, url1, html1, lang2, url2, html2]))
def first_scan():
    try:
        print(Fore.YELLOW + Style.BRIGHT + "Cloudflare IP Catcher (Auto DIG)...\n")
        print(Fore.BLUE + "[*]" + Fore.RESET + " Checking if {0} is similar to {1}".format(ns, domain))
        test1 = requests.get('http://' + domain, timeout=config['http_timeout_seconds'])
        test2 = requests.get('http://' + ns, timeout=config['http_timeout_seconds'])
        page_similarity2 = similarity(test1.text, test2.text)
        if page_similarity2 > config['response_similarity_threshold']:
            print(Fore.GREEN + ' [+]' + Fore.RESET +
                  ' HTML content is %.2f%% structurally similar to: %s' % (round(100 * page_similarity2, 2), domain))
        else:
            print(Fore.RED + ' [-]' + Fore.RESET +
                  ' HTML content is only %.2f%% structurally similar to %s' % (round(100 * page_similarity2, 2), domain))
            print("\n - Trying to check with IP... \n")
    except requests.exceptions.Timeout:
        sys.stderr.write(Fore.RED + " [-]" + Fore.RESET +
                         " Connection cannot be established... Try setting a NS manually\n")
        sys.exit(1)
def extract_features_from_website(url, label, predict):
    """Extract all features from a website; if predict is True, a pandas DataFrame is created."""
    try:
        global brand_list
        global phishy_list
        global login_list
        global tld_list

        # save original url for object instance
        url_orig = url

        # get different components of url
        components = get_url_components(url)
        fqdn = components[0]
        scheme = components[1]
        subdomain = components[2]
        domain = components[3]
        suffix = components[4]
        port = components[5]
        path = components[6]
        query = components[7]
        fragment = components[8]

        netloc = fqdn
        url_no_prot = url
        if scheme:
            netloc = scheme + "://" + fqdn
            if port:
                netloc = netloc + ":" + port
            url_no_prot = url.replace(scheme + "://", "", 1)

        # check for redirects of url
        resp_url, num_redirects, protocol, content = get_redirects(url)

        # try again if no connection could be established
        if content == -1:
            time.sleep(3)
            resp_url, num_redirects, protocol, content = get_redirects(url)
            if content == -1:
                return None

        # get content for homepage
        hp_url, hp_num_redirects, hp_protocol, hp_content = get_redirects(
            "{}://www.{}.{}".format(scheme, domain, suffix))
        if hp_content == -1:
            time.sleep(3)
            hp_url, hp_num_redirects, hp_protocol, hp_content = get_redirects(
                "{}://www.{}.{}".format(scheme, domain, suffix))

        # read content in parser
        if not hp_content == -1:
            hp_soup = bs4.BeautifulSoup(hp_content.lower(), 'html.parser')
        soup = bs4.BeautifulSoup(content.lower(), 'html.parser')
        url = resp_url

        # number of redirects done by website
        if num_redirects > 0:
            bool_redirect_website = True
        else:
            bool_redirect_website = False

        # check if website has a favicon
        bool_favicon_website = False
        try:
            icon = favicon.get(url, timeout=3)
            bool_favicon_website = True
        except Exception as e:
            bool_favicon_website = False

        # website has links pointing to extern content
        bool_content_extern_website = False
        # number of links pointing to extern content
        int_links_extern_website = 0
        bool_content_extern_website, int_links_extern_website = find_extern_links(
            content.lower(), domain, suffix, url)

        # check for custom status bar
        bool_custom_statusbar_website = bool(
            str(content).lower().replace(" ", "").__contains__("window.status="))

        # custom right click
        bool_disable_rightclick_website = False
        if str(content).replace(" ", "").lower().__contains__("document.oncontextmenu="):
            bool_disable_rightclick_website = True
        res = soup.findAll("body")
        if res:
            for element in res:
                try:
                    right_click_arg = element['oncontextmenu']
                    if str(right_click_arg) == "return false":
                        # fixed: the original assigned to an undefined name (bool_disable_right_click)
                        bool_disable_rightclick_website = True
                except Exception as e:
                    continue

        # has pop up window
        bool_popup_website = False
        hidden_count = 0
        res = soup.findAll("div")
        if res:
            for tag in res:
                try:
                    arg = tag['class']
                    if "popup" in arg:
                        bool_popup_website = True
                except Exception as e:
                    pass
                try:
                    arg = tag['style']
                    arg = str(arg).replace(" ", "")
                    if arg.__contains__("display:none") or arg.__contains__("visibility:hidden"):
                        hidden_count += 1
                except Exception as e:
                    continue

        # has iframe
        bool_iframe_website = False
        res = soup.findAll("iframe")
        if res:
            bool_iframe_website = True

        # has action tag > custom 2. feature - is action extern?
        bool_action_website = False
        bool_action_extern_website = False
        # has bool form post
        bool_form_post_website = False
        res = soup.findAll("form")
        if res:
            for element in res:
                try:
                    if element["action"]:
                        bool_action_website = True
                        action_url = element["action"]
                        if validate_url(action_url) or validate_url(urljoin(netloc, action_url)):
                            if validate_url(urljoin(netloc, action_url)):
                                action_url = urljoin(netloc, action_url)
                            extracted_action_url = get_url_components(action_url)
                            domain_action_url = extracted_action_url[3]
                            suffix_action_url = extracted_action_url[4]
                            if not suffix == suffix_action_url or not domain == domain_action_url:
                                bool_action_extern_website = True
                                break
                    if element["method"] == "post":
                        bool_form_post_website = True
                except Exception as e:
                    continue

        # has phishy tokens in visible content
        int_phishy_tokens_website = 0
        for text in soup.stripped_strings:
            int_phishy_tokens_website += sum(1 for word in phishy_list if text.__contains__(word))

        # has input tag
        bool_input_website = False
        if get_element_count("input", soup) > 0:
            bool_input_website = True

        # find meta description
        res = soup.find('meta', attrs={'name': 'og:description'})
        if not res:
            res = soup.find('meta', attrs={'property': 'description'})
        if not res:
            res = soup.find('meta', attrs={'name': 'description'})
        if not hp_content == -1:
            hp_res = hp_soup.find('meta', attrs={'name': 'og:description'})
            if not hp_res:
                hp_res = hp_soup.find('meta', attrs={'property': 'description'})
            if not hp_res:
                hp_res = hp_soup.find('meta', attrs={'name': 'description'})

        float_description_sim_website = 0
        if hp_content == -1:
            float_description_sim_website = -1
        if not hp_content == -1:
            if res and hp_res:
                try:
                    hp_desc = hp_res['content']
                    desc = res['content']
                    # compute similarity of description from home and login page
                    float_description_sim_website = string_similarity(desc, hp_desc)
                except Exception:
                    pass

        # bond status login and homepage
        bool_bond_status_website = False
        # most frequent domain is extern > true/false
        bool_freq_domain_extern_website = False
        res = soup.findAll("a")
        domain_list = []
        link_list = []
        href_count = 0
        redirect_object_list = []
        if res:
            for a_tag in res:
                try:
                    href = a_tag.attrs.get("href")
                    href_count += 1
                    if validate_url(href) or validate_url(urljoin(netloc, href)):
                        if validate_url(urljoin(netloc, href)):
                            href = urljoin(netloc, href)
                        if href == hp_url:
                            bool_bond_status_website = True
                        components_href = get_url_components(href)
                        domain_href = components_href[3]
                        suffix_href = components_href[4]
                        if is_IP(domain):
                            continue
                        link_list.append(href)
                        domain_list.append("{},{}".format(domain_href, suffix_href))
                except Exception as e:
                    continue

        link_list = list(set(link_list))
        link_list = link_list[:10]

        if not hp_content == -1:
            try:
                redirect_object_list = get_redirects_list(link_list)
            except Exception as e:
                log(action_logging_enum=ERROR, logging_text=str(e))

        if redirect_object_list:
            for redirect_object in redirect_object_list:
                if not bool_bond_status_website and not hp_content == -1 and redirect_object_list:
                    try:
                        website_sim = html_similarity.similarity(
                            str(hp_content).lower(),
                            str(redirect_object.content).lower(), k=0.3)
                        if website_sim == 1:
                            bool_bond_status_website = True
                    except Exception:
                        continue

        if domain_list:
            occure_count = Counter(domain_list)
            most_freq = occure_count.most_common(1)[0][0]
            most_frq_domain, most_freq_suffix = most_freq.split(",", 1)
            if not str(most_frq_domain) == domain or not str(suffix) == most_freq_suffix:
                bool_freq_domain_extern_website = True

        # jaccard similarity between homepage and login page
        float_login_home_website = 0
        if not hp_content == -1:
            try:
                float_login_home_website = html_similarity.similarity(
                    str(content).lower(), str(hp_content).lower(), k=0.3)
            except Exception:
                pass

        # website has copyright
        bool_copyright_website = False
        # similarity from copyright of login page and home page
        copy = ""
        hp_copy = ""
        # initialised up front so it is defined even when the homepage could not be fetched
        float_copyright_sim_website = 0
        if not hp_content == -1:
            for text in soup.stripped_strings:
                if '©' in text:
                    copy = re.sub(r'\s+', ' ', text)
                    bool_copyright_website = True
            for text in hp_soup.stripped_strings:
                if '©' in text:
                    hp_copy = re.sub(r'\s+', ' ', text)
            if copy and hp_copy:
                float_copyright_sim_website = string_similarity(copy, hp_copy)
            else:
                float_copyright_sim_website = 0

        # similarity from title of login page and home page
        float_title_sim_website = 0
        if not hp_content == -1:
            try:
                title = soup.title.text
                hp_title = hp_soup.title.text
                float_title_sim_website = string_similarity(title, hp_title)
            except Exception:
                float_title_sim_website = 0

        # unique links/all links on page
        float_unique_links_website = 0
        if link_list:
            float_unique_links_website = len(list(set(link_list))) / len(link_list)

        # lexical analysis for all links on website
        bool_link_analysis_website = True
        # dataframe = pd.DataFrame()
        # try:
        #     redirect_object = RedirectEntry(url=url, redirects=num_redirects, content=content, protocol=protocol)
        #     dataframe = pd.DataFrame(extract_features_from_URL(redirect_object, "Predict", brand_list=brand_list,
        #                                                        tld_list=tld_list, phishy_list=phishy_list, predict=True))
        # except Exception as e:
        #     pass
        # if not dataframe.empty:
        #     try:
        #         df = pd.DataFrame(dataframe.iloc[0]).transpose()
        #         prediction = predict_url(df)
        #         if int(prediction) == 0:
        #             bool_link_analysis_website = False
        #     except Exception:
        #         pass

        # number of input elements
        int_input_website = 0
        # find form accompanied by labels with loginwords
        bool_input_login_website = False
        form = soup.find("form")
        try:
            if form:
                inputs = form.find_all("input")
                if inputs:
                    int_input_website = len(inputs)
                    for inp in inputs:
                        try:
                            if inp["type"] == "hidden":
                                hidden_count += 1
                        except Exception:
                            continue
                label_tags = form.findAll("label")
                if label_tags:
                    for label_entry in label_tags:
                        if any(str(label_entry.text).__contains__(word) for word in login_list):
                            bool_input_login_website = True
        except Exception:
            pass

        # website has button
        bool_button_website = False
        button_count = get_element_count("button", soup)
        if button_count > 0:
            bool_button_website = True

        # website has meta information
        bool_meta_website = False
        if soup.find("meta"):
            bool_meta_website = True

        # has hidden elements
        bool_hidden_element_website = False
        if hidden_count > 0:
            bool_hidden_element_website = True

        # number of option tags
        int_option_website = get_element_count("option", soup)
        # number of select tags
        int_select_website = get_element_count("select", soup)
        # number of th tags
        int_th_website = get_element_count("th", soup)
        # number of tr tags
        int_tr_website = get_element_count("tr", soup)
        # number of table tags
        int_table_website = get_element_count("table", soup)
        # number of href in a tag
        int_href_website = href_count
        # number of list item tags
        int_li_website = get_element_count("li", soup)
        # number of unordered list tags
        int_ul_website = get_element_count("ul", soup)
        # number of ordered list tags
        int_ol_website = get_element_count("ol", soup)
        # number of div tags
        int_div_website = get_element_count("div", soup)
        # number of span tags
        int_span_website = get_element_count("span", soup)
        # number of article tags
        int_article_website = get_element_count("article", soup)
        # number of p tags
        int_p_website = get_element_count("p", soup)
        # number of checkbox tags
        int_checkbox_website = get_element_count("input", soup, "type", "checkbox")
        # number of buttons
        int_button_website = button_count
        # number of images
        int_image_website = get_element_count("img", soup)

        if predict == False:
            entry = FeatureEntryContent(
                bool_redirect_website=bool_redirect_website,
                bool_favicon_website=bool_favicon_website,
                bool_content_extern_website=bool_content_extern_website,
                int_links_extern_website=int_links_extern_website,
                bool_custom_statusbar_website=bool_custom_statusbar_website,
                bool_disable_rightclick_website=bool_disable_rightclick_website,
                bool_popup_website=bool_popup_website,
                bool_iframe_website=bool_iframe_website,
                bool_action_website=bool_action_website,
                bool_action_extern_website=bool_action_extern_website,
                bool_form_post_website=bool_form_post_website,
                int_phishy_tokens_website=int_phishy_tokens_website,
                bool_input_website=bool_input_website,
                float_description_sim_website=float_description_sim_website,
                bool_bond_status_website=bool_bond_status_website,
                bool_freq_domain_extern_website=bool_freq_domain_extern_website,
                float_login_home_website=float_login_home_website,
                bool_copyright_website=bool_copyright_website,
                float_copyright_sim_website=float_copyright_sim_website,
                float_title_sim_website=float_title_sim_website,
                float_unique_links_website=float_unique_links_website,
                # bool_link_analysis_website=bool_link_analysis_website,
                int_input_website=int_input_website,
                bool_input_login_website=bool_input_login_website,
                bool_button_website=bool_button_website,
                bool_meta_website=bool_meta_website,
                bool_hidden_element_website=bool_hidden_element_website,
                int_option_website=int_option_website,
                int_select_website=int_select_website,
                int_th_website=int_th_website,
                int_tr_website=int_tr_website,
                int_table_website=int_table_website,
                int_href_website=int_href_website,
                int_li_website=int_li_website,
                int_ul_website=int_ul_website,
                int_ol_website=int_ol_website,
                int_div_website=int_div_website,
                int_span_website=int_span_website,
                int_article_website=int_article_website,
                int_p_website=int_p_website,
                int_checkbox_website=int_checkbox_website,
                int_button_website=int_button_website,
                int_image_website=int_image_website,
                label=label,
                url=url_orig,
                final_url=url)
            log(action_logging_enum=INFO,
                logging_text="Processed datapoint. {}".format(url))
            return entry
        elif predict:
            data = {
                "ID": [0],
                "Has Redirect": [bool_redirect_website],
                "Has Favicon": [bool_favicon_website],
                "Has Extern Content": [bool_content_extern_website],
                "Number Extern Links": [int_links_extern_website],
                "Has Custom StatusBar": [bool_custom_statusbar_website],
                "Has Disabled RightClick": [bool_disable_rightclick_website],
                "Has PopUp": [bool_popup_website],
                "Has iFrame": [bool_iframe_website],
                "Has Action": [bool_action_website],
                "Has Extern Action": [bool_action_extern_website],
                "Has Form with POST": [bool_form_post_website],
                "Number PhishyTokens": [int_phishy_tokens_website],
                "Has Input": [bool_input_website],
                "Ratio Description Sim": [float_description_sim_website],
                "Has Bond Status": [bool_bond_status_website],
                "Has Freq Domain Extern": [bool_freq_domain_extern_website],
                "Ratio Similarity": [float_login_home_website],
                "Has Copyright": [bool_copyright_website],
                "Ratio Copyright Sim": [float_copyright_sim_website],
                "Ratio Title Sim": [float_title_sim_website],
                "Ratio Unique Links": [float_unique_links_website],
                "Number Inputs": [int_input_website],
                "Has Input for Login": [bool_input_login_website],
                "Has Button": [bool_button_website],
                "Has Meta": [bool_meta_website],
                "Has Hidden Element": [bool_hidden_element_website],
                "Number Option": [int_option_website],
                "Number Select": [int_select_website],
                "Number TH": [int_th_website],
                "Number TR": [int_tr_website],
                "Number Table": [int_table_website],
                "Number HREF": [int_href_website],
                "Number LI": [int_li_website],
                "Number UL": [int_ul_website],
                "Number OL": [int_ol_website],
                "Number DIV": [int_div_website],
                "Number Span": [int_span_website],
                "Number Article": [int_article_website],
                "Number Paragr": [int_p_website],
                "Number Checkbox": [int_checkbox_website],
                "Number Button": [int_button_website],  # fixed: the original reused int_checkbox_website here
                "Number Image": [int_image_website],
                "Label": [label],
                "URL": [url_orig],
                "Final URL": [url]
            }
            columns = list(CONTENT_FEATURE_LIST_COLUMN_NAMES)
            df = pd.DataFrame(data, columns=columns)
            return df
    except Exception as e:
        log(action_logging_enum=WARNING, logging_text=str(e))
        log(action_logging_enum=WARNING, logging_text=str(e.__traceback__))
        exc_type, exc_obj, tb = sys.exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        log(ERROR, 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(
            filename, lineno, line.strip(), exc_obj))
        log(action_logging_enum=WARNING,
            logging_text="Could not extract content features for {}".format(url))
        log(action_logging_enum=INFO, logging_text="Failed datapoint. {}".format(url))
        return None
def calculate_similarity(self, html_content1, html_content2):
    return similarity(html_content1, html_content2)
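A usage sketch; SimilarityChecker is a hypothetical name standing in for whatever class hosts this method:

checker = SimilarityChecker()  # hypothetical owning class
score = checker.calculate_similarity('<p>alpha</p>', '<p>beta</p>')
print(score)  # float in [0, 1]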