Example #1
import copy
import re

# url_all_re, fqdn_re and getRrType() are assumed to be defined elsewhere in
# the module this example was taken from; they are not part of this snippet.
def parseURLs(metas):
    new_metas = copy.deepcopy(metas)
    # Patterns for stripping HTML attribute prefixes (including the
    # quoted-printable "=3D" escape), quotes and angle brackets from tokens.
    href = re.compile(r'(href|HREF|src|SRC|title)=(3D)?')
    schema = re.compile(r'xmlns(:\w)?=')
    quote = re.compile(r'[\'"]')
    tag = re.compile(r'[<>]')
    for cpage in metas:
        msg = metas[cpage]["msg"].single()
        for part in msg.walk():
            ctype = part.get_content_type()
            if ctype in ("text/plain", "text/html"):
                content = part.get_payload(decode=True)
                for token in content.split():
                    # Skip XML namespace declarations; their URLs are not spam.
                    if schema.search(token):
                        continue
                    if url_all_re.search(token):
                        match = fqdn_re.search(token)
                        if not match:
                            continue
                        rr = match.group()
                        rr_type = getRrType(rr)
                        new_metas[cpage]["SPAM RR"].add('[[%s]]' % rr)
                        new_metas[rr]["TYPE"].add(rr_type)
                        # Strip the markup around the URL and collect what is left.
                        token = href.sub(' ', token)
                        token = quote.sub(' ', token)
                        token = tag.sub(' ', token)
                        for url in token.split():
                            if url_all_re.search(url):
                                new_metas[cpage]["SPAM URL"].add(url)
    return new_metas
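The snippet above references url_all_re, fqdn_re and getRrType(), which live elsewhere in the module this example comes from. The following is only an illustrative sketch of stand-ins so the example can be exercised on its own; the real definitions are not shown on this page and may well differ.

import re

# Hypothetical stand-ins for the module-level names used above; these are NOT
# the original definitions, only rough assumptions for experimentation.
url_all_re = re.compile(r'https?://\S+')          # assumption: any http(s) URL
fqdn_re = re.compile(r'[\w.-]+\.[A-Za-z]{2,}')    # assumption: a bare domain name

def getRrType(rr):
    # Assumption: classify the matched resource record very coarsely.
    if re.match(r'^\d{1,3}(\.\d{1,3}){3}$', rr):
        return "IP"
    return "FQDN"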
Example #2
import copy
import re

# url_all_re is assumed to be defined elsewhere in the module.
def lexifyTokens(metas):
    # Patterns for stripping surrounding quotes, markup characters, trailing
    # punctuation and a few stray control/typographic characters.
    quotes = re.compile(r'(^["\']|["\']$)')
    markup = re.compile(r'[#<>\[\]\(\)\{\}]')
    punct = re.compile(r'[.,:;]\s?$')
    rest = re.compile(r'[\x12\xab\xbb]')
    new_metas = copy.deepcopy(metas)
    for cpage in metas:
        for text in metas[cpage]["text"]:
            for token in text.split():
                # URLs are handled separately (see parseURLs above); skip them.
                if url_all_re.search(token):
                    continue
                token = quotes.sub('', token)
                token = markup.sub('', token)
                token = punct.sub('', token)
                token = rest.sub('', token)
                token = token.lower()
                new_metas[cpage]["Lexeme"].add("[[%s]]" % token)
                # Scalability issues. :)
                # if token:
                #     new_metas[token]["TYPE"].add("LEXEME")
    return new_metas
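The metas argument in both functions behaves like a nested mapping of page name to key to a set of values. Below is a minimal, purely illustrative harness for lexifyTokens, assuming such a structure and the url_all_re stand-in sketched after Example #1; the real metas object in the source project is likely richer.

from collections import defaultdict

# Assumption: metas maps page name -> key -> set of values.
metas = defaultdict(lambda: defaultdict(set))
metas["page1"]["text"].add('Some "quoted" text, with <markup> and http://example.com/x')

lexified = lexifyTokens(metas)
# With the stand-in url_all_re above, this prints:
# ['[[and]]', '[[markup]]', '[[quoted]]', '[[some]]', '[[text]]', '[[with]]']
print(sorted(lexified["page1"]["Lexeme"]))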