Beispiel #1
0
 def get_feat(self, msg):
     """Count keyword hits in the lower-cased text of ``msg``.

     The text comes from ``utils.get_clean_text(msg)``. Each keyword is
     counted with ``str.count``, i.e. as a non-overlapping substring, so a
     keyword is also counted when it appears inside a longer word.

     Returns:
         The sum of the per-keyword counts.
     """
     keywords = (
         'link', 'click', 'confirm', 'user', 'customer', 'client',
         'suspend', 'restrict', 'verify', 'protect',
     )
     lowered = utils.get_clean_text(msg).lower()
     return sum(lowered.count(keyword) for keyword in keywords)
def mboxText2DF(filepath, Phishy, limit=5000):
    """Build a feature DataFrame from the non-empty messages of an mbox file.

    Every message for which ``utils.is_empty`` is false contributes one row:
    a 1-based id, the ``Phishy`` label, the text from
    ``utils.get_clean_text`` and one value per feature finder. Empty messages
    are reported on stdout and skipped; they do not count towards ``limit``.

    Args:
        filepath: Path of the mbox file to read.
        Phishy: Label value stored in the ``label`` column of every row.
        limit: Reading stops once this many non-empty messages were collected.
            The check runs after a row is appended, so at least one row is
            collected whenever the mailbox has a non-empty message.

    Returns:
        The DataFrame created by ``spark.createDataFrame``, passed through
        ``utils.textDF2setDF(emailDF, "emailText")`` and with the columns
        ``emailText``, ``words`` and ``stopWremoved`` dropped.
    """
    print("Processing file: " + filepath)

    finders = [
        NURLs(),
        encoding(),
        nparts(),
        hasHTML(),
        attachments(),
        badwords(),
        ipurls(),
        diffhref(),
        forms(),
        scripts(),
        ndots(),
        nports(),
        nrecs(),
        checkdomains(),
        subject_badwords(),
        script_parts(),
        distinct_words(),
        char_count(),
        word_count(),
        richness(),
        RE_presence(),
        link_images(),
        named_urls(),
        year(),
    ]

    email_index = []
    mbox = mailbox.mbox(filepath, factory=mbox_reader)
    try:
        for message in mbox:
            if utils.is_empty(message):
                print("EMPTY EMAIL - Moving to next email...")
                continue

            email_clean_text = utils.get_clean_text(message)
            feats = [finder.get_feat(message) for finder in finders]
            # Ids are 1-based and only advance for non-empty messages.
            row_id = len(email_index) + 1
            email_index.append([row_id, Phishy, email_clean_text] + feats)

            if len(email_index) >= limit:
                break
    finally:
        # Release the underlying file handle even if a finder raises;
        # the rows collected so far no longer need the mailbox.
        mbox.close()

    columns = ['id', 'label', 'emailText'] + [
        finder.get_name() for finder in finders
    ]
    emailDF = spark.createDataFrame(email_index, columns)
    emailDF = utils.textDF2setDF(emailDF, "emailText")
    emailDF = emailDF.drop('emailText', 'words', 'stopWremoved')
    return emailDF
Beispiel #3
0
def spider_full_content(id) -> list:
    """
    GET FULL CONTENT OF THE WEIBO

    Requests ``https://m.weibo.cn/statuses/extend?id=<id>`` through the
    module-level ``s`` (presumably a requests session; confirm against the
    rest of the file) and reads ``data.longTextContent`` from the JSON body.

    Args:
        id: Status identifier interpolated into the request URL.

    Returns:
        ``[weibo_full_content, clean_content]``, where ``clean_content`` is
        ``utils.get_clean_text(weibo_full_content)``. When the request or its
        status check fails, a message is printed and ``None`` is returned
        (despite the ``list`` annotation), so callers must check for it.

    Raises:
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON lacks ``data`` or ``longTextContent``.
    """
    weibo_detail_url = f'https://m.weibo.cn/statuses/extend?id={id}'
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        r = s.get(url=weibo_detail_url, headers=kv)
        r.raise_for_status()
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # SystemExit; only ordinary errors are treated as a failed fetch now.
        print('爬取信息失败')
        return None

    r_json = json.loads(r.text)
    weibo_full_content = r_json['data']['longTextContent']
    clean_content = utils.get_clean_text(weibo_full_content)

    return [weibo_full_content, clean_content]
    if (len(msg) > 0):
        f = open(filepath, "a")
        f.write(msg)
        f.write("\n")
        f.close()


# Walk the dataset tree only to bind names: the loop body is empty, so once
# it finishes `root`, `dirs` and `files` hold the values of the last directory
# os.walk yielded (and remain unbound if it yielded nothing). The loop that
# follows reads `root` and `files`.
for root, dirs, files in os.walk("phishing_datasets"):
    pass

for file in files:
    mbox = mailbox.mbox(root + "/" + file, factory=SpyderTest.mbox_reader)
    print("Processing file: " + root + "/" + file)
    for message in mbox:
        if (not utils.is_empty(message)):
            clean_text = utils.get_clean_text(message)
            #            clean_text = clean_text.replace("\n"," ")
            #            clean_text = clean_text.replace("\t"," ")
            #            clean_text = re.sub(' +',' ',clean_text)
            nltk_tokens = nltk.word_tokenize(clean_text)
            clean_text = ""
            for term in nltk_tokens:
                if ("'" not in term):
                    clean_text += " "
                clean_text += term
#            clean_text = " ".join(nltk_tokens)
            if (SpyderTest.year().get_feat(message) == "2015"):
                append_file("phishing_date_text/phishing2015_text.txt",
                            clean_text)
            elif (SpyderTest.year().get_feat(message) == "2016"):
                append_file("phishing_date_text/phishing2016_text.txt",
 def get_feat(self, msg):
     """Return how many distinct whitespace-separated tokens ``msg`` has.

     Tokens come from splitting ``utils.get_clean_text(msg)`` on whitespace;
     duplicates are collapsed with a set before counting.
     """
     return len(set(utils.get_clean_text(msg).split()))
 def get_feat(self, msg):
     """Return the token count divided by (total token characters + 1).

     Tokens come from splitting ``utils.get_clean_text(msg)`` on whitespace.
     """
     tokens = utils.get_clean_text(msg).split()
     total_chars = sum(map(len, tokens))
     # The +1 keeps the denominator non-zero when there are no tokens.
     return len(tokens) / (total_chars + 1)
 def get_feat(self, msg):
     """Return the total length of all whitespace-separated tokens.

     Tokens come from splitting ``utils.get_clean_text(msg)``, so whitespace
     itself is not counted.
     """
     return sum(len(token) for token in utils.get_clean_text(msg).split())
 def get_feat(self, msg):
     """Return the number of whitespace-separated tokens in the text
     produced by ``utils.get_clean_text(msg)``."""
     cleaned = utils.get_clean_text(msg)
     tokens = cleaned.split()
     return len(tokens)