def parseURLs(metas):
    # Pick URLs and fully qualified domain names out of the text/plain and
    # text/html parts of each page's message, recording them as the page's
    # "SPAM URL" and "SPAM RR" metadata.
    new_metas = copy.deepcopy(metas)
    href = re.compile(r'(href|HREF|src|SRC|title)=(3D)?')
    schema = re.compile(r'xmlns(:\w)?=')
    quote = re.compile(r'[\'\"]')
    tag = re.compile(r'[<>]')

    for cpage in metas:
        msg = metas[cpage]["msg"].single()
        for part in msg.walk():
            ctype = part.get_content_type()
            if (ctype == "text/plain") or (ctype == "text/html"):
                content = part.get_payload(decode=True)
                # get_payload(decode=True) returns bytes on Python 3; decode
                # so the str-based regexes below still apply.
                if isinstance(content, bytes):
                    content = content.decode(
                        part.get_content_charset() or 'latin-1', 'replace')
                for token in content.split():
                    if schema.search(token):
                        # No-op as written; presumably a placeholder for
                        # skipping xmlns attributes.
                        pass
                    if url_all_re.search(token):
                        match = fqdn_re.search(token)
                        rr = match.group()
                        rr_type = getRrType(rr)
                        new_metas[cpage]["SPAM RR"].add('[[%s]]' % rr)
                        new_metas[rr]["TYPE"].add(rr_type)
                        # Strip attribute names, quotes and angle brackets,
                        # then re-split to pick out the bare URLs.
                        token = href.sub(' ', token)
                        token = quote.sub(' ', token)
                        token = tag.sub(' ', token)
                        for i in token.split():
                            if url_all_re.search(i):
                                new_metas[cpage]["SPAM URL"].add(i)

    return new_metas
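
# A minimal usage sketch for parseURLs(), not part of the original module:
# it assumes metas is an auto-vivifying mapping of page names to set-valued
# metadata, that the "msg" value answers single() with an
# email.message.Message, and that url_all_re, fqdn_re and getRrType are
# defined elsewhere in this module. The Single class and sample data below
# are illustrative assumptions only.
def _parseURLs_example():
    import email
    from collections import defaultdict

    class Single(object):
        # Stand-in for whatever object normally provides single().
        def __init__(self, msg):
            self._msg = msg

        def single(self):
            return self._msg

    raw = ("Content-Type: text/plain\r\n\r\n"
           "Buy now at http://spam.example.com/buy today")
    metas = defaultdict(lambda: defaultdict(set))
    metas["SpamPage"]["msg"] = Single(email.message_from_string(raw))

    result = parseURLs(metas)
    print(result["SpamPage"]["SPAM URL"])  # URLs found in the message body
    print(result["SpamPage"]["SPAM RR"])   # e.g. [[spam.example.com]]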
def lexifyTokens(metas):
    # Split each page's text into lowercase lexemes, stripping surrounding
    # quotes, markup characters, trailing punctuation and stray control or
    # guillemet characters; URL tokens are left to parseURLs.
    quotes = re.compile(r'(^[\"\']|[\"\']$)')
    markup = re.compile(r'[\#<>\[\]\(\)\{\}]')
    punct = re.compile(r'[\.,:;]\s?$')
    rest = re.compile(r'[\x12\xab\xbb]')
    new_metas = copy.deepcopy(metas)

    for cpage in metas:
        for text in metas[cpage]["text"]:
            for token in text.split():
                if url_all_re.search(token):
                    continue
                token = quotes.sub('', token)
                token = markup.sub('', token)
                token = punct.sub('', token)
                token = rest.sub('', token)
                token = token.lower()
                new_metas[cpage]["Lexeme"].add("[[%s]]" % token)
                # Scalability issues. :)
                # if token:
                #     new_metas[token]["TYPE"].add("LEXEME")

    return new_metas
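
# A minimal usage sketch for lexifyTokens(), likewise not part of the original
# module: it assumes the same auto-vivifying metas layout with set-valued keys
# and that url_all_re is defined elsewhere in this module; the sample page is
# illustrative only.
def _lexifyTokens_example():
    from collections import defaultdict

    metas = defaultdict(lambda: defaultdict(set))
    metas["SpamPage"]["text"] = [
        'Buy "cheap" <pills> NOW, from http://spam.example.com/']

    result = lexifyTokens(metas)
    # Expected along the lines of: [[buy]], [[cheap]], [[pills]], [[now]],
    # [[from]] -- the URL token is skipped.
    print(result["SpamPage"]["Lexeme"])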