def getmisp_urls(key, url, timeframe):
    """Fetch URL attributes from a MISP instance and keep IP-hosted ones.

    Queries the MISP REST search endpoint for attributes of type ``url``
    seen within *timeframe*, skips events listed in the module-level
    ``ignore_eventid``, and keeps only entries whose host is a literal
    dotted IPv4 address.

    :param key: MISP API key, sent as the Authorization header.
    :param url: full URL of the MISP search endpoint.
    :param timeframe: MISP "last" expression (e.g. ``"7d"``).
    :return: list of dicts with domain / eventid / category / timestamp.
    """
    response_domains = []
    headers = {
        'Authorization': '{}'.format(key),
        'Content-type': 'application/json',
        'Accept': 'application/json'
    }
    payload = '{ "returnFormat": "json", "type": "url", "last": "%s", "enforceWarninglist": true }' % timeframe
    response = requests.post(url, headers=headers, data=payload, verify=False)
    json_response = json.loads(response.text)
    fp = Faup()
    try:
        for attr in json_response['response']['Attribute']:
            # Do not overwrite the endpoint `url` parameter (the original
            # reused the name for the attribute value).
            attr_url = attr['value']
            eventid = attr['event_id']
            if eventid in ignore_eventid:
                continue  # event explicitly ignored by configuration
            category = attr['category']
            timestamp = datetime.datetime.utcfromtimestamp(
                int(attr['timestamp'])).strftime('%Y-%m-%d')
            fp.decode(attr_url)
            domain = fp.get_domain()
            # Keep only hosts that look like literal IPv4 addresses.
            if re.match(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", domain):
                response_domains.append({
                    'domain': domain,
                    'eventid': eventid,
                    'category': category,
                    'timestamp': timestamp,
                })
    except (KeyError, TypeError):
        # Malformed or empty MISP response (e.g. error payload without the
        # 'response'/'Attribute' structure): return what was collected.
        # The original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt, which is never desirable.
        pass
    return response_domains
def __init__(self, misp_url, misp_key, verifycert, config, offline=False, urlsonly=False):
    """Set up the MISP connection (unless offline) and per-run options.

    :param misp_url: base URL of the MISP instance.
    :param misp_key: API key for the MISP instance.
    :param verifycert: whether to verify the MISP TLS certificate.
    :param config: configuration object; may gain default attributes here.
    :param offline: when True, no MISP connection is created.
    :param urlsonly: restrict processing to URLs only.
    """
    self.offline = offline
    if not self.offline:
        self.misp = ExpandedPyMISP(misp_url, misp_key, verifycert, debug=config.debug)
    self.config = config
    self.urlsonly = urlsonly

    # Default enable_dns to True when the config does not define it,
    # then force it off unless we are in urls-only mode.
    if not hasattr(self.config, 'enable_dns'):
        setattr(self.config, 'enable_dns', True)
    if self.urlsonly is False:
        setattr(self.config, 'enable_dns', False)

    self.debug = self.config.debug
    self.config_from_email_body = {}

    # Default ignore_nullsize_attachments to False when absent.
    if not hasattr(self.config, 'ignore_nullsize_attachments'):
        setattr(self.config, 'ignore_nullsize_attachments', False)
    self.ignore_nullsize_attachments = self.config.ignore_nullsize_attachments

    # Init Faup — one URL parser shared by the instance.
    self.f = Faup()
    self.sightings_to_add = []
def __init__(self):
    """Init the Credential module: regexes, Redis connections and thresholds."""
    super(Credential, self).__init__()

    # URL parser used to split captured sites into host parts.
    self.faup = Faup()

    # Matches an http(s) URL preceding a credential.
    self.regex_web = "((?:https?:\/\/)[\.-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
    # Matches "email <separator> password" pairs.
    self.regex_cred = "[a-zA-Z0-9\\._-]+@[a-zA-Z0-9\\.-]+\.[a-zA-Z]{2,6}[\\rn :\_\-]{1,10}[a-zA-Z0-9\_\-]+"
    # Matches the "@domain:" part, used for per-site statistics.
    self.regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"

    self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name)

    # Database
    config_loader = ConfigLoader.ConfigLoader()
    self.server_cred = config_loader.get_redis_conn("ARDB_TermCred")
    self.server_statistics = config_loader.get_redis_conn("ARDB_Statistics")

    # Config values
    self.minimumLengthThreshold = config_loader.get_config_int("Credential", "minimumLengthThreshold")
    self.criticalNumberToAlert = config_loader.get_config_int("Credential", "criticalNumberToAlert")

    # Cap (seconds) on regex evaluation time.
    self.max_execution_time = 30

    # Waiting time in seconds between two processed messages
    self.pending_seconds = 10

    # Send module state to logs
    self.redis_logger.info(f"Module {self.module_name} initialized")
def run(self):
    """Consume URLs from the Redis 'crawl' list and upsert domain records.

    Pops URLs forever; every 1000 iterations it sleeps 10s so an empty
    queue does not spin the CPU. Each URL's host is recorded in the
    ``new_domaines`` collection, with the URL appended to the domain's list.
    """
    loops = 0
    # One parser for the whole loop — the original re-created Faup()
    # on every iteration for no benefit.
    fex = Faup()
    while True:
        loops += 1
        if loops % 1000 == 0:
            time.sleep(10)
        url = self.r.rpop('crawl')
        if not url:
            continue
        print("url found: " + url)
        fex.decode(url)
        domain = fex.get_host()
        entry = self.db.new_domaines.find_one({'domaine': domain})
        if entry is None:
            # First sighting of this domain: create its record.
            # (Without this else-split, the code went on to read
            # entry['urls'] from None and crashed.)
            print("record: " + domain)
            self.db.new_domaines.save({'domaine': domain, 'urls': [url]})
        else:
            # Known domain: append the URL only if it is new.
            urls_stored = entry['urls']
            if url not in urls_stored:
                urls_stored.append(url)
                entry['urls'] = urls_stored
                self.db.new_domaines.save(entry)
def run(self):
    """Consume URLs from the shared Redis 'crawl' list (DB 1) and upsert domain records.

    The Redis handle is shared between workers, so the DB switch + pop is
    done under ``self.lock``. Parse failures are logged and skipped so the
    worker stays alive.
    """
    loops = 0
    fex = Faup()
    while True:
        loops += 1
        if loops % 1000 == 0:
            time.sleep(10)  # back off periodically
        # Guard the switchDB/rpop pair; try/finally guarantees the lock is
        # released even if Redis raises (the original leaked it then).
        self.lock.acquire()
        try:
            self.r.switchDB(1)
            url = self.r.rpop('crawl')
        finally:
            self.lock.release()
        if not url:
            continue
        print("url found: " + url)
        try:
            fex.decode(url)
            domain = fex.get_host()
            entry = self.db.new_domaines.find_one({'domaine': domain})
            if entry is None:
                # New domain: create its record.
                print("record: " + domain)
                self.db.new_domaines.save({'domaine': domain, 'urls': [url]})
            else:
                # Known domain: append the URL only if it is new.
                urls_stored = entry['urls']
                if url not in urls_stored:
                    urls_stored.append(url)
                    entry['urls'] = urls_stored
                    self.db.new_domaines.save(entry)
        except Exception:
            # Best-effort: a malformed URL must not kill the worker.
            # (Narrowed from a bare `except:`.)
            print("parsing fault " + url)
def dns_resolve(url):
    """Resolve the host of *url* to ``(ipv4_list, ipv6_list)``, with caching.

    Literal IP hosts are validated and returned as-is; hostnames go through
    DNS A/AAAA lookups. Either element of the tuple may be None when no
    address of that family exists.
    """
    cached = _cache_get(url, 'dns')
    if cached is not None:
        return cached
    fex = Faup()
    fex.decode(url)
    host = fex.get_host().lower()
    ipv4 = None
    ipv6 = None
    if is_ip(host):
        if ':' in host:
            try:
                socket.inet_pton(socket.AF_INET6, host)
                ipv6 = [host]
            except OSError:
                pass  # not a valid IPv6 literal — narrowed from bare except
        else:
            try:
                socket.inet_aton(host)
                ipv4 = [host]
            except OSError:
                pass  # not a valid IPv4 literal — narrowed from bare except
    else:
        # DNS lookups; dnspython raises several exception types (NXDOMAIN,
        # NoAnswer, Timeout, ...) — catch Exception instead of a bare
        # except so SystemExit/KeyboardInterrupt still propagate.
        try:
            ipv4 = [str(ip) for ip in dns.resolver.query(host, 'A')]
        except Exception:
            logging.debug("No IPv4 address assigned to: " + host)
        try:
            ipv6 = [str(ip) for ip in dns.resolver.query(host, 'AAAA')]
        except Exception:
            logging.debug("No IPv6 address assigned to: " + host)
    _cache_set(url, (ipv4, ipv6), 'dns')
    return ipv4, ipv6
def whois(server, port, domain, ignorelist, replacelist):
    """Query a WHOIS server for *domain* and return the unique contact e-mails.

    Results are cached per domain. An http(s) URL is reduced to its
    registered domain first. Returns None when no e-mail address is found.
    """
    cached = _cache_get(domain, 'whois')
    if cached is not None:
        return cached
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(15)
    try:
        s.connect((server, port))
    except Exception:
        print("Connection problems - check WHOIS server")
        print(("WHOIS request while problem occurred: ", domain))
        print(("WHOIS server: {}:{}".format(server, port)))
        s.close()  # do not leak the socket on the error exit
        sys.exit(0)
    if domain.startswith('http'):
        fex = Faup()
        fex.decode(domain)
        # Separate name from the recv buffer variable — the original reused
        # `d` for both the query and the received chunks.
        query = fex.get_domain().lower()
    else:
        query = domain
    try:
        # WHOIS is a line protocol over TCP: send "<domain>\r\n", read to EOF.
        # Encode explicitly: socket.send() needs bytes on Python 3.
        s.send('{}\r\n'.format(query).encode())
        response = b''
        while True:
            chunk = s.recv(4096)
            if not chunk:
                break  # server closed the connection
            response += chunk
    finally:
        # Always release the socket, even when send/recv raises
        # (the original only closed it on the happy path).
        s.close()
    match = re.findall(r'[\w\.-]+@[\w\.-]+', response.decode(errors='ignore'))
    emails = process_emails(match, ignorelist, replacelist)
    if len(emails) == 0:
        return None
    list_mail = list(set(emails))
    _cache_set(domain, list_mail, 'whois')
    return list_mail
def tld_extract(domain):
    """Split *domain* (bytes) into ``(subdomain, domain_without_tld, tld)``.

    Each element is bytes; missing parts come back as ``b""``. A single
    Faup instance is cached on the function object, since constructing the
    parser is the expensive part.

    Fixes two defects of the previous version:
    * ``domain.decode("utf-8").strip(b".")`` called ``str.strip`` with a
      bytes argument, which raises TypeError — dots are now stripped while
      the value is still bytes, before decoding.
    * the parser was cached in ``__builtins__``, which is a module (not a
      dict) outside ``__main__``, making ``"_faup" not in __builtins__``
      blow up there; a function attribute is a safe per-process cache.
    """
    if not hasattr(tld_extract, "_faup"):
        tld_extract._faup = Faup()
    _faup = tld_extract._faup
    _faup.decode(domain.strip(b".").decode("utf-8"))
    return (_faup.get_subdomain() or b"",
            _faup.get_domain_without_tld() or b"",
            _faup.get_tld() or b"")
def __init__(self):
    """
    Init Urls
    """
    super(Urls, self).__init__()
    # URL parser shared by the module.
    self.faup = Faup()
    self.redis_cache_key = regex_helper.generate_redis_cache_key(
        self.module_name)

    # Protocol file path
    protocolsfile_path = os.path.join(
        os.environ['AIL_HOME'],
        self.process.config.get("Directories", "protocolsfile"))
    # Get all uri from protocolsfile (Used for Curve)
    uri_scheme = ""
    with open(protocolsfile_path, 'r') as scheme_file:
        for scheme in scheme_file:
            # One scheme per line: drop the newline, join with '|'.
            uri_scheme += scheme[:-1] + "|"
    # Drop the trailing '|'.
    uri_scheme = uri_scheme[:-1]

    # Matches scheme://[user[:pass]@](IPv4|localhost|hostname)[:port][/path]
    self.url_regex = "((?i:"+uri_scheme + \
        ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:[a-zA-Z]{2,15}))(?:\:[0-9]+)*(?:/?(?:[a-zA-Z0-9\.\,\?'\\+&%\$#\=~_\-]+))*)"

    # Send module state to logs
    self.redis_logger.info(f"Module {self.module_name} initialized")
def get_urls(url, depth=1):
    """Yield *url* plus every URL reached via HTTP and meta-refresh redirects.

    HTTP redirects are followed by requests itself and yielded in order;
    HTML ``<meta http-equiv="refresh">`` targets are followed recursively,
    up to 5 levels deep.
    """
    if depth > 5:
        print('Too many redirects.')
        return
    fex = Faup()

    def meta_redirect(content):
        """Return the target of a <meta http-equiv="refresh"> tag, or None."""
        c = content.lower()
        soup = BeautifulSoup(c, "html.parser")
        for result in soup.find_all(attrs={'http-equiv': 'refresh'}):
            if result:
                out = result["content"].split(";")
                if len(out) == 2:
                    wait, text = out
                    # Avoid shadowing the enclosing `url` (the original
                    # overwrote it here).
                    _, target = text.split('=', 1)
                    return target.strip()
        return None

    resolve, reason = try_resolve(fex, url)
    if not resolve:
        # FIXME: inform that the domain does not resolve
        yield url
        return
    logging.debug("Making HTTP connection to " + url)
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0'}
    try:
        response = requests.get(url, allow_redirects=True, headers=headers,
                                timeout=15, verify=False)
    except Exception:
        # That one can fail (DNS for example) — narrowed from a bare except
        # so Ctrl-C still works.
        # FIXME: inform that the get failed
        yield url
        return
    if response.history is not None:
        for h in response.history:
            # Yield the redirect-chain URLs in the order we find them.
            yield h.url
    yield response.url
    meta_redir_url = meta_redirect(response.content)
    if meta_redir_url is not None:
        depth += 1
        if not meta_redir_url.startswith('http'):
            # Relative meta-refresh target: rebuild an absolute URL.
            fex.decode(url)
            base = '{}://{}'.format(fex.get_scheme(), fex.get_host())
            port = fex.get_port()
            if port is not None:
                base += ':{}'.format(port)
            if not meta_redir_url.startswith('/'):
                # relative redirect. resource_path has the initial '/'
                if fex.get_resource_path() is not None:
                    base += fex.get_resource_path()
            if not base.endswith('/'):
                base += '/'
            meta_redir_url = base + meta_redir_url
        # Recurse without re-binding `url` (the original shadowed it in the
        # re-yield loop).
        yield from get_urls(meta_redir_url, depth)
def __post_init__(self):
    """Fill tld/domain/subdomain from the address host when domain is unset."""
    if self.domain is not None:
        return
    parser = Faup()
    # Only the host part after the final '@' is parsed.
    parser.decode(self.address.split("@")[-1])
    self.top_level_domain = parser.get_tld()
    self.domain = parser.get_domain()
    self.subdomain = parser.get_subdomain()
def __init__(self):
    """Set up the LibInjection module: URL parser and statistics backend."""
    super(LibInjection, self).__init__()
    # Parser used to split URLs found in items.
    self.faup = Faup()
    loader = ConfigLoader()
    self.server_statistics = loader.get_redis_conn("ARDB_Statistics")
    self.redis_logger.info(f"Module: {self.module_name} Launched")
def process(self):
    """Print one CSV line per known domain: tld,domain,reversed-subdomains.

    Sub-domain labels are emitted most-significant first and the 'www'
    label is removed; the empty field this leaves behind is collapsed.
    """
    list_domains = self.db['new_domaines'].distinct('domaine')
    fex = Faup()
    for domain in list_domains:
        url = 'http://' + str(domain)
        fex.decode(url, False)
        line = (fex.get_tld() + ',' + fex.get_domain() + ',' +
                ','.join(fex.get_subdomain().split('.')[::-1]).replace('www', ''))
        # The previous code did print(...).replace(',,', ',') — i.e. it
        # called .replace on print()'s return value (None) and raised
        # AttributeError on the first iteration. Clean up, then print.
        print(line.replace(',,', ','))
def __post_init__(self):
    """Populate the URL components by parsing self.url with Faup."""
    parser = Faup()
    parser.decode(self.url)
    self.scheme = parser.get_scheme()
    self.top_level_domain = parser.get_tld()
    self.domain = parser.get_domain()
    self.subdomain = parser.get_subdomain()
    self.path = parser.get_resource_path()
def initialize(self, stormconf, context):
    """Storm bolt setup: URL parser, upstream bolt names, mail buffer, keyword lists."""
    super(Urls, self).initialize(stormconf, context)
    # Shared Faup URL parser.
    self.faup = Faup()
    # Names of the bolts feeding the Phishing bolt.
    self.input_bolts = set(context["source->stream->grouping"].keys())
    # Buffer of mails being aggregated across input bolts.
    self._mails = {}
    # Load keyword lists used for matching.
    self._load_lists()
def test_urls_extractor(self):
    """urls_extractor groups URLs by registered domain (two per domain here);
    the second fixture checks that an IDN (punycode) host is handled."""
    body = """
    bla bla https://tweetdeck.twitter.com/random bla bla
    http://kafka.apache.org/documentation.html
    http://kafka.apache.org/documentation1.html bla bla bla
    https://docs.python.org/2/library/re.html bla bla bla bla bla
    https://docs.python.org/2/library/re_2.html> bla bla
    <p>https://tweetdeck.twitter.com/random</p> bla bla
    <p>https://tweetdeck.twitter.com/random_2</p>
    """

    body_unicode_error = """
    Return-Path: <>
    Delivered-To: [email protected]
    Received: (qmail 15482 invoked from network); 29 Nov 2015 12:28:40 -000
    Received: from unknown (HELO 112.149.154.61) (112.149.154.61)
    by smtp.customers.net with SMTP; 29 Nov 2015 12:28:40 -0000
    Received: from unknown (HELO localhost)
    ([email protected]@110.68.103.81)
    by 112.149.154.61 with ESMTPA; Sun, 29 Nov 2015 21:29:24 +0900
    From: [email protected]
    To: [email protected]
    Subject: Gain your male attrctiveness

    Give satisfaction to your loved one
    http://contents.xn--90afavbplfx2a6a5b2a.xn--p1ai/
    """

    parser = Faup()
    urls = utils.urls_extractor(parser, body)
    self.assertIsInstance(urls, dict)
    self.assertIn("apache.org", urls)
    self.assertIn("python.org", urls)
    self.assertIn("twitter.com", urls)
    for i in ("apache.org", "python.org", "twitter.com"):
        self.assertIsInstance(urls[i], list)
        self.assertEqual(len(urls[i]), 2)

    urls = utils.urls_extractor(parser, body_unicode_error)
    self.assertIsInstance(urls, dict)
    self.assertIn("xn--90afavbplfx2a6a5b2a.xn--p1ai", urls)
    self.assertEqual(len(urls["xn--90afavbplfx2a6a5b2a.xn--p1ai"]), 1)
def is_valid_url(url):
    """Normalize *url* (hxxp -> http, add a scheme when missing) and validate it.

    Every call increments today's submission counter, even on a cache hit.
    Returns a tuple ``(is_valid, normalized_url, reason)``; positive
    verdicts are cached.
    """
    cached = _cache_get(url, 'valid')
    # Count the submission before any early return.
    key = date.today().isoformat() + '_submissions'
    r_cache.zincrby(key, url)
    if cached is not None:
        return cached
    fex = Faup()
    # Defanged scheme -> real scheme; bare host -> assume http.
    if url.startswith('hxxp'):
        url = 'http' + url[4:]
    elif not url.startswith('http'):
        url = 'http://' + url
    logging.debug("Checking validity of URL: " + url)
    fex.decode(url)
    if fex.get_scheme() is None or fex.get_host() is None:
        return False, url, "Not a valid http/https URL/URI"
    _cache_set(url, (True, url, None), 'valid')
    return True, url, None
def __init__(self): super(WebStats, self).__init__() # Send module state to logs self.redis_logger.info("Module %s initialized" % (self.module_name)) # Sent to the logging a description of the module self.redis_logger.info("Makes statistics about valid URL") self.pending_seconds = 5 * 60 # REDIS # self.r_serv_trend = redis.StrictRedis( host=self.process.config.get("ARDB_Trending", "host"), port=self.process.config.get("ARDB_Trending", "port"), db=self.process.config.get("ARDB_Trending", "db"), decode_responses=True) # FILE CURVE SECTION # self.csv_path_proto = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "protocolstrending_csv")) self.protocolsfile_path = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "protocolsfile")) self.csv_path_tld = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "tldstrending_csv")) self.tldsfile_path = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "tldsfile")) self.csv_path_domain = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "domainstrending_csv")) self.faup = Faup() self.generate_new_graph = False
def __init__(self):
    """
    Init Web
    """
    super(Web, self).__init__()

    # REDIS Cache
    self.r_serv2 = redis.StrictRedis(
        host=self.process.config.get("Redis_Cache", "host"),
        port=self.process.config.getint("Redis_Cache", "port"),
        db=self.process.config.getint("Redis_Cache", "db"),
        decode_responses=True)

    # Country to log as critical
    self.cc_critical = self.process.config.get("Url", "cc_critical")

    # FUNCTIONS #
    # Shared URL parser.
    self.faup = Faup()

    # Protocol file path
    protocolsfile_path = os.path.join(
        os.environ['AIL_HOME'],
        self.process.config.get("Directories", "protocolsfile"))
    # Get all uri from protocolsfile (Used for Curve)
    uri_scheme = ""
    with open(protocolsfile_path, 'r') as scheme_file:
        for scheme in scheme_file:
            # One scheme per line: drop the newline, join with '|'.
            uri_scheme += scheme[:-1]+"|"
    # Drop the trailing '|'.
    uri_scheme = uri_scheme[:-1]

    # Matches scheme://[user[:pass]@](IPv4|localhost|hostname.tld)[:port][/path]
    self.url_regex = "((?i:"+uri_scheme + \
        ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"

    # Previously processed filename (dedup across messages).
    self.prec_filename = None

    # Send module state to logs
    self.redis_logger.info("Module %s initialized" % (self.module_name))
def sort(self, elem_links, url):
    """Triage extracted links: filtered ones go to Redis DB 2, crawlable ones to the queue.

    A link is pushed onto the 'crawl' list (DB 1) only when its scheme,
    domain and file extension all pass the Filters checks; otherwise it is
    parked in DB 2 so it is never re-examined.
    """
    fex = Faup()
    f = Filters()
    f.load()
    self.r.switchDB(1)
    try:
        for link in elem_links:
            new_url = link
            # Reset the verdict flags for EVERY link. In the previous
            # version they were set once before the loop, so the first
            # filtered link permanently blocked all later links from
            # being queued.
            extend = True
            domainfilter = True
            schemefilter = True
            self.r.switchDB(2)
            if not self.r.get(new_url) and new_url:
                self.r.switchDB(1)
                if not self.r.get(new_url):
                    fex.decode(new_url)
                    domain = fex.get_host()
                    if f.isfilteredscheme(fex.get_scheme()):
                        self.r.switchDB(2)
                        self.r.put(new_url, new_url)
                        schemefilter = False
                    if f.isfiltereddomains(domain):
                        self.r.switchDB(2)
                        self.r.put(new_url, new_url)
                        domainfilter = False
                    if f.isfilteredextention(fex.get_resource_path()):
                        extend = False
                        self.r.switchDB(2)
                        self.r.put(new_url, new_url)
                    if extend and domainfilter and schemefilter:
                        # All checks passed: queue for crawling.
                        self.r.switchDB(1)
                        self.r.rpush('crawl', new_url)
                        self.queue.append(new_url)
    except TypeError:
        print("TypeError")
REDIS_KEY_NUM_PATH = 'uniqNumForUsername' REDIS_KEY_ALL_CRED_SET = 'AllCredentials' REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev' REDIS_KEY_ALL_PATH_SET = 'AllPath' REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev' REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping' if __name__ == "__main__": publisher.port = 6380 publisher.channel = "Script" config_section = "Credential" module_name = "Credential" p = Process(config_section) publisher.info("Find credentials") faup = Faup() regex_web = "((?:https?:\/\/)[\.-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)" #regex_cred = "[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9\_\-]+" regex_cred = "[a-zA-Z0-9\\._-]+@[a-zA-Z0-9\\.-]+\.[a-zA-Z]{2,6}[\\rn :\_\-]{1,10}[a-zA-Z0-9\_\-]+" regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:" redis_cache_key = regex_helper.generate_redis_cache_key(module_name) while True: message = p.get_from_set() if message is None: publisher.debug("Script Credential is Idling 10s") time.sleep(10) continue
def initialize(self, stormconf, context):
    """Bolt setup: delegate to the base class, load the URL whitelist,
    then create the Faup parser used to dissect URLs."""
    super(AbstractUrlsHandlerBolt, self).initialize(stormconf, context)
    self._load_whitelist()
    self._parser_faup = Faup()
position = new_position email_data = t_email_data # Refang email data email_data = refang(email_data) ## Extract various IOCs urllist = list() urllist += re.findall(urlmarker.WEB_URL_REGEX, email_data) urllist += re.findall(urlmarker.IP_REGEX, email_data) if debug: syslog.syslog(str(urllist)) # Init Faup f = Faup() # Add tags according to configuration for malware in malwaretags: if malware in email_subject.lower(): for tag in malwaretags[malware]: misp.add_tag(new_event, tag) # Extract and add hashes hashlist_md5 = re.findall(hashmarker.MD5_REGEX, email_data) hashlist_sha1 = re.findall(hashmarker.SHA1_REGEX, email_data) hashlist_sha256 = re.findall(hashmarker.SHA256_REGEX, email_data) for h in hashlist_md5: misp.add_hashes(new_event, md5=h) for h in hashlist_sha1:
class TestPhishing(unittest.TestCase):
    """Tests for the spamscope phishing scoring helpers."""

    # Shared Faup URL parser for all tests.
    faup = Faup()

    def setUp(self):
        """Parse the two fixture mails and load keyword configuration."""
        parser = mailparser.parse_from_file(mail_thug)
        self.email = parser.mail
        self.attachments = parser.attachments
        parser = mailparser.parse_from_file(mail_form)
        self.email_form = parser.mail
        body = self.email_form.get("body")
        self.urls = utils.urls_extractor(body, self.faup)
        d = {
            "generic": "conf/keywords/targets.example.yml",
            "custom": "conf/keywords/targets_english.example.yml"
        }
        self.targets = utils.load_keywords_dict(d)
        d = {
            "generic": "conf/keywords/subjects.example.yml",
            "custom": "conf/keywords/subjects_english.example.yml"
        }
        self.subjects = utils.load_keywords_list(d)

    def test_ParserError(self):
        # A mail that trips the parser must not be reported as a form.
        parser = mailparser.parse_from_file(mail_test_6)
        body = parser.mail.get("body")
        flag_form = phishing.check_form(body)
        self.assertFalse(flag_form)

    def test_none_values(self):
        # check_phishing must cope with missing body/subject/from keys.
        email = copy.deepcopy(self.email)
        email.pop("body", None)
        email.pop("subjects", None)
        email.pop("from", None)
        phishing.check_phishing(email=email,
                                attachments=self.attachments,
                                urls_body=self.urls,
                                urls_attachments=self.urls,
                                target_keys=self.targets,
                                subject_keys=self.subjects)

    def test_check_form(self):
        # The form fixture is flagged; the thug fixture is not.
        body = self.email_form.get("body")
        flag_form = phishing.check_form(body)
        self.assertTrue(flag_form)
        body = self.email.get("body")
        flag_form = phishing.check_form(body)
        self.assertFalse(flag_form)

    def test_form_value_error(self):
        parser = mailparser.parse_from_file(mail_test_5)
        body = parser.mail.get("body")
        flag_form = phishing.check_form(body)
        self.assertFalse(flag_form)

    def test_check_urls(self):
        # At least one target keyword set must match the extracted URLs.
        flag = False
        if any(phishing.check_urls(self.urls, i)
               for i in self.targets.values()):
            flag = True
        self.assertTrue(flag)

    def test_check_phishing(self):
        results = phishing.check_phishing(email=self.email,
                                          attachments=self.attachments,
                                          urls_body=self.urls,
                                          urls_attachments=self.urls,
                                          target_keys=self.targets,
                                          subject_keys=self.subjects)
        self.assertIsInstance(results, dict)
        self.assertEqual(results["score"], 123)
        self.assertIn("filename_attachments", results["score_expanded"])
        self.assertIn("mail_subject", results["score_expanded"])
        self.assertIn("mail_body", results["score_expanded"])
        self.assertIn("mail_from", results["score_expanded"])
        self.assertIn("urls_body", results["score_expanded"])
        self.assertIn("urls_attachments", results["score_expanded"])
        self.assertIn("Test", results["targets"])
        self.assertTrue(results["with_phishing"])

    def test_check_phishing_form(self):
        results = phishing.check_phishing(email=self.email_form,
                                          attachments=self.attachments,
                                          urls_body=self.urls,
                                          urls_attachments=self.urls,
                                          target_keys=self.targets,
                                          subject_keys=self.subjects)
        self.assertIn("mail_form", results["score_expanded"])
#!/usr/bin/python
"""Minimal pyfaup smoke script: construct a Faup parser instance."""
from pyfaup.faup import Faup

# One parser instance is enough — the previous version instantiated
# Faup() twice and immediately discarded the first object.
f = Faup()
uuid = None # The following needs fixes for ExpandedPyMisp for attribs in res_search['response']['Attribute']: uuid = attribs['uuid'] if uuid is not None: print("URL is already present.") # add sighting # if MISP allows to sight on add, we should implement it here, too misp.sighting(uuid=uuid, source="URLabuse") sys.exit(0) # This is obsolete #event = misp.get(misp_id) #existing_event = MISPEvent() #existing_event.load(event) redirect_count = 0 fex = Faup() fex.decode(url) hostname = fex.get_host().lower() screenshot = hostname.decode() + '.png' mispObject = MISPObject('phishing') mispObject.add_attribute('hostname', value=hostname.decode()) for key in response['result']: u = list(key.keys())[0] if redirect_count == 0: comment = "initial URL" mispObject.add_attribute('url', value=u, comment=comment) else: comment = "redirect URL: {}" mispObject.add_attribute('url-redirect', value=u, comment=comment.format(redirect_count))
class TestUtils(unittest.TestCase):
    """Tests for the spamscope utils helpers (I/O, keywords, URLs, misc)."""

    # Shared Faup URL parser for all tests.
    faup = Faup()

    def setUp(self):
        """Parse the sample mail and prepare attachments + ES parameters."""
        self.f = utils.reformat_output
        p = mailparser.parse_from_file(mail)
        self.mail_obj = p.mail
        self.mail_obj['analisys_date'] = datetime.datetime.utcnow().isoformat()
        self.attachments = MailAttachments.withhashes(p.attachments)
        self.attachments.run()
        self.parameters = {
            'elastic_index_mail': "spamscope_mails-",
            'elastic_type_mail': "spamscope",
            'elastic_index_attach': "spamscope_attachments-",
            'elastic_type_attach': "spamscope"
        }

    def test_mail_item(self):
        """MailItem stores constructor args and orders by priority."""
        mail = utils.MailItem(filename=text_file,
                              mail_server="test_mail_server",
                              mailbox="test_mailbox",
                              priority=1,
                              trust="test_trust",
                              mail_type=1,
                              headers=["header1", "header2"])
        self.assertEqual(mail.filename, text_file)
        self.assertEqual(mail.mail_server, "test_mail_server")
        self.assertEqual(mail.mailbox, "test_mailbox")
        self.assertEqual(mail.priority, 1)
        self.assertEqual(mail.trust, "test_trust")
        self.assertIsInstance(mail.timestamp, float)
        self.assertEqual(mail.mail_type, 1)
        self.assertIsInstance(mail.headers, list)
        self.assertEqual(mail.headers, ["header1", "header2"])

        mail_1 = utils.MailItem(filename=text_file,
                                mail_server="test_mail_server",
                                mailbox="test_mailbox",
                                priority=1,
                                trust="test_trust")
        mail_2 = utils.MailItem(filename=text_file,
                                mail_server="test_mail_server",
                                mailbox="test_mailbox",
                                priority=2,
                                trust="test_trust")
        mail_3 = utils.MailItem(filename=text_file,
                                mail_server="test_mail_server",
                                mailbox="test_mailbox",
                                priority=1,
                                trust="test_trust")
        self.assertTrue(mail_1 < mail_2)
        self.assertFalse(mail_1 < mail_3)

    def test_load_conf(self):
        """load_config returns a dict and raises on a missing file."""
        c = "conf/spamscope.example.yml"
        conf = utils.load_config(c)
        self.assertIsInstance(conf, dict)
        with self.assertRaises(RuntimeError):
            utils.load_config("conf/fake.yml")

    def test_write_payload(self):
        """write_payload round-trips a base64 payload to disk unchanged."""
        with open(text_file) as f:
            payload = f.read()
        sha1_origin = fingerprints(payload).sha1
        file_path = utils.write_payload(payload.encode("base64"), ".txt")
        self.assertEqual(os.path.splitext(file_path)[-1], ".txt")
        with open(file_path) as f:
            payload = f.read()
        sha1_clone = fingerprints(payload).sha1
        # Same fingerprint => content survived the write/read cycle.
        self.assertEqual(sha1_origin, sha1_clone)
        self.assertTrue(os.path.exists(file_path))
        os.remove(file_path)
        self.assertFalse(os.path.exists(file_path))
        p = mailparser.parse_from_file(mail_test_11)
        attachments = MailAttachments.withhashes(p.attachments)
        attachments.run()
        for i in attachments:
            temp = utils.write_payload(
                i["payload"],
                i["extension"],
                i["content_transfer_encoding"],
            )
            os.remove(temp)

    def test_search_words_in_text(self):
        """search_words_in_text matches whole multi-word keywords, any type."""
        with open(text_file) as f:
            text = f.read()
        keywords_1 = ["nomatch", "nomatch"]
        self.assertEqual(utils.search_words_in_text(text, keywords_1), False)
        keywords_2 = ["nomatch", "nomatch", "theophrastus rationibus"]
        self.assertEqual(utils.search_words_in_text(text, keywords_2), True)
        keywords_3 = ["nomatch", "theophrastus nomatch"]
        self.assertEqual(utils.search_words_in_text(text, keywords_3), False)
        keywords_4 = ["theophrastus quo vidit"]
        self.assertEqual(utils.search_words_in_text(text, keywords_4), True)
        keywords_5 = [12345678]
        self.assertEqual(utils.search_words_in_text(text, keywords_5), True)
        keywords_6 = [11111, 44444]
        self.assertEqual(utils.search_words_in_text(text, keywords_6), True)

    def test_reformat_output_first(self):
        """reformat_output requires a bolt name and the ES parameters."""
        with self.assertRaises(RuntimeError):
            self.f(mail=self.mail_obj)
        with self.assertRaises(KeyError):
            self.f(mail=self.mail_obj, bolt="output-elasticsearch")
        m, a = self.f(mail=self.mail_obj, bolt="output-elasticsearch",
                      **self.parameters)
        # Attachments
        self.assertIsInstance(a, list)
        self.assertEqual(len(a), 1)
        self.assertIsInstance(a[0], dict)
        self.assertIn('@timestamp', m)
        self.assertIn('_index', a[0])
        self.assertIn('_type', a[0])
        self.assertIn('type', a[0])
        # Mail
        self.assertIsInstance(m, dict)
        self.assertIn('@timestamp', m)
        self.assertIn('_index', m)
        self.assertIn('_type', m)
        self.assertIn('type', m)

    def test_reformat_output_second(self):
        """Elasticsearch output: ES metadata is added to mail and attachments."""
        m = copy.deepcopy(self.mail_obj)
        m['attachments'] = list(self.attachments)
        m, a = self.f(mail=m, bolt="output-elasticsearch", **self.parameters)
        # Attachments
        self.assertIsInstance(a, list)
        self.assertEqual(len(a), 2)
        self.assertIsInstance(a[0], dict)
        self.assertIn('@timestamp', a[0])
        self.assertIn('_index', a[0])
        self.assertIn('_type', a[0])
        self.assertIn('type', a[0])
        self.assertIn('payload', a[0])
        self.assertEqual(a[0]['is_archived'], True)
        self.assertIsInstance(a[1], dict)
        self.assertIn('@timestamp', a[1])
        self.assertIn('_index', a[1])
        self.assertIn('_type', a[1])
        self.assertIn('type', a[1])
        self.assertIn('files', a[1])
        self.assertIn('payload', a[1])
        # self.assertIn('tika', a[1])
        self.assertNotIn('payload', a[1]['files'][0])
        self.assertEqual(a[1]['is_archived'], False)
        self.assertEqual(a[1]['is_archive'], True)
        # Mail
        self.assertIsInstance(m, dict)
        self.assertIn('@timestamp', m)

    def test_reformat_output_third(self):
        """Redis output: no ES metadata is added."""
        m = copy.deepcopy(self.mail_obj)
        m['attachments'] = list(self.attachments)
        m, a = self.f(mail=m, bolt="output-redis")
        # Attachments
        self.assertIsInstance(a, list)
        self.assertEqual(len(a), 2)
        self.assertIsInstance(a[0], dict)
        self.assertNotIn('@timestamp', a[0])
        self.assertNotIn('_index', a[0])
        self.assertNotIn('_type', a[0])
        self.assertNotIn('type', a[0])
        self.assertIn('payload', a[0])
        self.assertEqual(a[0]['is_archived'], True)
        self.assertIsInstance(a[1], dict)
        self.assertNotIn('@timestamp', a[1])
        self.assertNotIn('_index', a[1])
        self.assertNotIn('_type', a[1])
        self.assertNotIn('type', a[1])
        self.assertIn('files', a[1])
        self.assertIn('payload', a[1])
        # self.assertIn('tika', a[1])
        self.assertNotIn('payload', a[1]['files'][0])
        self.assertEqual(a[1]['is_archived'], False)
        self.assertEqual(a[1]['is_archive'], True)
        # Mail
        self.assertIsInstance(m, dict)
        self.assertNotIn('@timestamp', m)
        self.assertNotIn('_index', m)
        self.assertNotIn('_type', m)
        self.assertNotIn('type', m)

    def test_load_keywords_list(self):
        """Subjects files load into a flat set of strings."""
        d = {
            "generic": "conf/keywords/subjects.example.yml",
            "custom": "conf/keywords/subjects_english.example.yml"
        }
        results = utils.load_keywords_list(d)
        self.assertIsInstance(results, set)
        self.assertIn("fattura", results)
        self.assertIn("conferma", results)
        self.assertIn("123456", results)
        self.assertNotIn(123456, results)
        with self.assertRaises(RuntimeError):
            # Targets files have dict layout and must be rejected here.
            d = {"generic": "conf/keywords/targets.example.yml"}
            results = utils.load_keywords_list(d)

    def test_load_keywords_dict(self):
        """Targets files load into a dict of keyword sets, keys unchanged."""
        d = {
            "generic": "conf/keywords/targets.example.yml",
            "custom": "conf/keywords/targets_english.example.yml"
        }
        results = utils.load_keywords_dict(d)
        self.assertIsInstance(results, dict)
        self.assertIn("Banca Tizio", results)
        self.assertNotIn("banca tizio", results)
        self.assertIn("tizio", results["Banca Tizio"])
        self.assertIn("caio rossi", results["Banca Tizio"])
        self.assertNotIn(12345, results["Banca Tizio"])
        self.assertIn("12345", results["Banca Tizio"])
        self.assertNotIn("123", results["Banca Tizio"])
        self.assertNotIn(123, results["Banca Tizio"])
        self.assertIn("123 456", results["Banca Tizio"])
        with self.assertRaises(RuntimeError):
            # Subjects files have list layout and must be rejected here.
            d = {"generic": "conf/keywords/subjects.example.yml"}
            results = utils.load_keywords_dict(d)

    def test_urls_extractor(self):
        """urls_extractor groups URLs by registered domain (two per domain)."""
        body = """
        bla bla https://tweetdeck.twitter.com/random bla bla
        http://kafka.apache.org/documentation.html
        http://kafka.apache.org/documentation1.html bla bla bla
        https://docs.python.org/2/library/re.html bla bla bla bla bla
        https://docs.python.org/2/library/re_2.html> bla bla
        <p>https://tweetdeck.twitter.com/random</p> bla bla
        <p>https://tweetdeck.twitter.com/random_2</p>
        """

        body_unicode_error = """
        Return-Path: <>
        Delivered-To: [email protected]
        Received: (qmail 15482 invoked from network); 29 Nov 2015 12:28:40 -000
        Received: from unknown (HELO 112.149.154.61) (112.149.154.61)
        by smtp.customers.net with SMTP; 29 Nov 2015 12:28:40 -0000
        Received: from unknown (HELO localhost)
        ([email protected]@110.68.103.81)
        by 112.149.154.61 with ESMTPA; Sun, 29 Nov 2015 21:29:24 +0900
        From: [email protected]
        To: [email protected]
        Subject: Gain your male attrctiveness

        Give satisfaction to your loved one
        http://contents.xn--90afavbplfx2a6a5b2a.xn--p1ai/
        """

        urls = utils.urls_extractor(body, self.faup)
        self.assertIsInstance(urls, dict)
        self.assertIn("apache.org", urls)
        self.assertIn("python.org", urls)
        self.assertIn("twitter.com", urls)
        for i in ("apache.org", "python.org", "twitter.com"):
            self.assertIsInstance(urls[i], list)
            self.assertEqual(len(urls[i]), 2)

        urls = utils.urls_extractor(body_unicode_error, self.faup)
        self.assertIsInstance(urls, dict)
        self.assertIn("xn--90afavbplfx2a6a5b2a.xn--p1ai", urls)
        self.assertEqual(len(urls["xn--90afavbplfx2a6a5b2a.xn--p1ai"]), 1)

    def test_load_whitelist(self):
        """Whitelists load into a set; an expired expiry empties the set."""
        d = {"generic": {"path": "conf/whitelists/generic.example.yml"}}
        results = utils.load_whitelist(d)
        self.assertIsInstance(results, set)
        self.assertIn("google.com", results)
        self.assertIn("amazon.com", results)
        self.assertIn("facebook.com", results)
        d = {
            "generic": {
                "path": "conf/whitelists/generic.example.yml",
                "expiry": None
            }
        }
        results = utils.load_whitelist(d)
        self.assertIsInstance(results, set)
        self.assertIn("google.com", results)
        self.assertIn("amazon.com", results)
        self.assertIn("facebook.com", results)
        d = {
            "generic": {
                "path": "conf/whitelists/generic.example.yml",
                "expiry": "2016-06-28T12:33:00.000Z"
            }
        }
        results = utils.load_whitelist(d)
        self.assertIsInstance(results, set)
        self.assertEqual(len(results), 0)

    def test_text2urls_whitelisted(self):
        """Whitelisted domains are dropped from the extraction result."""
        body = """
        bla bla https://tweetdeck.twitter.com/random bla bla
        http://kafka.apache.org/documentation.html
        http://kafka.apache.org/documentation1.html bla bla bla
        https://docs.python.org/2/library/re.html bla bla bla bla bla
        https://docs.python.org/2/library/re_2.html> bla bla
        <p>https://tweetdeck.twitter.com/random</p> bla bla
        <p>https://tweetdeck.twitter.com/random_2</p>
        """
        d = {"generic": {"path": "conf/whitelists/generic.example.yml"}}
        whitelist = utils.load_whitelist(d)
        urls = utils.text2urls_whitelisted(body, whitelist, self.faup)
        self.assertIsInstance(urls, dict)
        self.assertNotIn("apache.org", urls)
        self.assertIn("python.org", urls)
        self.assertIsInstance(urls["python.org"], list)
        self.assertIn("twitter.com", urls)
        self.assertIsInstance(urls["twitter.com"], list)

    def test_text2urls_whitelisted_nonetype_error(self):
        """Regression: extraction keys must never be None."""
        p = mailparser.parse_from_file(mail_test_7)
        body = p.body
        urls = utils.urls_extractor(body, self.faup)
        for k in urls:
            self.assertIsNotNone(k)
        d = {"generic": {"path": "conf/whitelists/generic.example.yml"}}
        whitelist = utils.load_whitelist(d)
        utils.text2urls_whitelisted(body, whitelist, self.faup)

    def test_reformat_urls(self):
        """reformat_urls flattens the dict into a list; rejects non-dicts."""
        body = """
        bla bla https://tweetdeck.twitter.com/random bla bla
        http://kafka.apache.org/documentation.html
        http://kafka.apache.org/documentation1.html bla bla bla
        https://docs.python.org/2/library/re.html bla bla bla bla bla
        https://docs.python.org/2/library/re_2.html> bla bla
        <p>https://tweetdeck.twitter.com/random</p> bla bla
        <p>https://tweetdeck.twitter.com/random_2</p>
        """
        d = {"generic": {"path": "conf/whitelists/generic.example.yml"}}
        whitelist = utils.load_whitelist(d)
        urls = utils.text2urls_whitelisted(body, whitelist, self.faup)
        self.assertIsInstance(urls, dict)
        urls = utils.reformat_urls(urls)
        self.assertIsInstance(urls, list)
        with self.assertRaises(TypeError):
            utils.reformat_urls(dict)

    def test_timeout(self):
        with self.assertRaises(utils.TimeoutError):
            sleeping()

    def test_register_order(self):
        """register() records (function, priority); sorting restores order."""
        register = utils.register
        processors = set()

        @register(processors, priority=2)
        def number_two():
            pass

        @register(processors, priority=1)
        def number_one():
            pass

        @register(processors, priority=4)
        def number_four():
            pass

        @register(processors, priority=3)
        def number_three():
            pass

        processors = [i[0] for i in sorted(processors, key=itemgetter(1))]
        self.assertIs(processors[0], number_one)
        self.assertIs(processors[1], number_two)
        self.assertIs(processors[2], number_three)
        self.assertIs(processors[3], number_four)

    def test_is_file_older_than(self):
        r = utils.is_file_older_than(text_file, seconds=20)
        self.assertTrue(r)
        r = utils.is_file_older_than(text_file, seconds=3153600000)
        self.assertFalse(r)

    def test_dump_load(self):
        """dump_obj/load_obj round-trip preserves type and content."""
        path = "/tmp/object.dump"
        d = deque(maxlen=5)
        d.append(1)
        d.append(2)
        self.assertIsInstance(d, deque)
        utils.dump_obj(path, d)
        d_dumped = utils.load_obj(path)
        self.assertIsInstance(d_dumped, deque)
        self.assertEqual(d, d_dumped)
def __init__(self, loglevel: int = logging.DEBUG):
    """Create the helper: logger, Faup parser and Redis cache connection (db 1)."""
    self.__init_logger(loglevel)
    self.fex = Faup()
    self.cache = Redis(unix_socket_path=get_socket_path('cache'),
                       db=1, decode_responses=True)
def __init__(self):
    """Compile the URL-matching regex and create the Faup parser."""
    # Matches http/https/ftp/ftps URLs, case-insensitively.
    self._url_regex = re.compile(
        r'((?:(?:ht|f)tp(?:s?)\:\/\/)'
        r'(?:[!#$&-;=?-\[\]_a-z~]|%[0-9a-f]{2})+)', re.I)
    self._faup = Faup()
def get_port(self):
    """Return the port component of self.url (None when absent)."""
    parser = Faup()
    parser.decode(self.url)
    return parser.get_port()