def get_feat(self, msg):
    """Return the total number of phishing trigger-word occurrences in the message.

    The message is cleaned and lowercased first; counting is by substring,
    so e.g. 'click' also matches inside 'clicking'.
    """
    trigger_words = (
        'link', 'click', 'confirm', 'user', 'customer',
        'client', 'suspend', 'restrict', 'verify', 'protect',
    )
    lowered = utils.get_clean_text(msg).lower()
    return sum(lowered.count(word) for word in trigger_words)
def mboxText2DF(filepath, Phishy, limit=5000):
    """Parse an mbox file into a Spark DataFrame of per-email features.

    Every non-empty email contributes one row: a sequential id, the supplied
    label (Phishy), the cleaned email text, and one column per feature finder.
    Processing stops after `limit` non-empty emails. The text column is then
    tokenized via utils.textDF2setDF and the intermediate columns dropped.
    """
    print(f"Processing file: {filepath}")
    mbox = mailbox.mbox(filepath, factory=mbox_reader)

    # One instance per feature extractor; DataFrame column order follows this list.
    finders = [
        NURLs(), encoding(), nparts(), hasHTML(), attachments(), badwords(),
        ipurls(), diffhref(), forms(), scripts(), ndots(), nports(), nrecs(),
        checkdomains(), subject_badwords(), script_parts(), distinct_words(),
        char_count(), word_count(), richness(), RE_presence(), link_images(),
        named_urls(), year(),
    ]

    rows = []
    row_id = 1
    for message in mbox:
        if utils.is_empty(message):
            print("EMPTY EMAIL - Moving to next email...")
            continue
        cleaned = utils.get_clean_text(message)
        feature_values = [finder.get_feat(message) for finder in finders]
        rows.append([row_id, Phishy, cleaned] + feature_values)
        row_id += 1
        if row_id > limit:
            break

    columns = ['id', 'label', 'emailText'] + [finder.get_name() for finder in finders]
    emailDF = spark.createDataFrame(rows, columns)
    emailDF = utils.textDF2setDF(emailDF, "emailText")
    return emailDF.drop('emailText', 'words', 'stopWremoved')
def spider_full_content(id) -> list:
    """Fetch the full (long) text content of a weibo post.

    Queries the m.weibo.cn "extend" endpoint for the given status id and
    returns a two-element list: [raw full content, cleaned content].
    Returns None when the HTTP request fails (matching the original
    best-effort behavior, which printed a message and fell through).
    """
    weibo_detail_url = f'https://m.weibo.cn/statuses/extend?id={id}'
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        r = s.get(url=weibo_detail_url, headers=kv)
        r.raise_for_status()
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
    # no longer swallowed; failure path (print + None) is unchanged.
    except Exception:
        print('爬取信息失败')
        return None
    # r.json() respects the response encoding, unlike json.loads(r.text).
    weibo_full_content = r.json()['data']['longTextContent']
    clean_content = utils.get_clean_text(weibo_full_content)
    return [weibo_full_content, clean_content]
if (len(msg) > 0): f = open(filepath, "a") f.write(msg) f.write("\n") f.close() for root, dirs, files in os.walk("phishing_datasets"): pass for file in files: mbox = mailbox.mbox(root + "/" + file, factory=SpyderTest.mbox_reader) print("Processing file: " + root + "/" + file) for message in mbox: if (not utils.is_empty(message)): clean_text = utils.get_clean_text(message) # clean_text = clean_text.replace("\n"," ") # clean_text = clean_text.replace("\t"," ") # clean_text = re.sub(' +',' ',clean_text) nltk_tokens = nltk.word_tokenize(clean_text) clean_text = "" for term in nltk_tokens: if ("'" not in term): clean_text += " " clean_text += term # clean_text = " ".join(nltk_tokens) if (SpyderTest.year().get_feat(message) == "2015"): append_file("phishing_date_text/phishing2015_text.txt", clean_text) elif (SpyderTest.year().get_feat(message) == "2016"): append_file("phishing_date_text/phishing2016_text.txt",
def get_feat(self, msg):
    """Return the number of distinct whitespace-separated tokens in the cleaned message text."""
    tokens = utils.get_clean_text(msg).split()
    return len(set(tokens))
def get_feat(self, msg):
    """Return the 'richness' feature: word count divided by character count.

    Characters are counted per token (whitespace excluded); the +1 in the
    denominator guards against division by zero on empty text.
    """
    tokens = utils.get_clean_text(msg).split()
    n_words = len(tokens)
    n_chars = sum(len(tok) for tok in tokens)
    return n_words / (n_chars + 1)
def get_feat(self, msg):
    """Return the total character count of the cleaned message text, whitespace excluded."""
    return sum(len(tok) for tok in utils.get_clean_text(msg).split())
def get_feat(self, msg):
    """Return the number of whitespace-separated tokens in the cleaned message text."""
    tokens = utils.get_clean_text(msg).split()
    return len(tokens)