Esempio n. 1
0
 def _digestAuthentication(self, login, method):
     """Perform HTTP Digest authentication (RFC 2617).

     login is the content of the Authorization header after the
     "Digest " scheme token; method is the HTTP method of the request.
     Returns True when the client's response digest matches and the
     nonce is valid, False otherwise.
     """
     def stripQuotes(s):
         # Remove one pair of surrounding double quotes, if present.
         # Rewritten from the ``cond and s[1:-1] or s`` idiom, which
         # crashed (IndexError) on an empty string and wrongly returned
         # '""' unchanged because the stripped value '' is falsy.
         if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
             return s[1:-1]
         return s
     options  = dict(self._login_splitter.findall(login))
     userName = stripQuotes(options["username"])
     password = self._server.getPasswordForUser(userName)
     nonce    = stripQuotes(options["nonce"])
     # RFC 2617 digest computation.  hashlib's md5 requires bytes on
     # Python 3, so encode explicitly before hashing.
     A1  = "%s:%s:%s" % (userName, self._server.getRealm(), password)
     HA1 = md5(A1.encode("utf-8")).hexdigest()
     A2  = "%s:%s" % (method, stripQuotes(options["uri"]))
     HA2 = md5(A2.encode("utf-8")).hexdigest()
     if "qop" in options:
         # Some browsers send empty nc/qop values; fall back to the
         # conventional defaults.
         if not options["nc"]:
             options["nc"] = "00000001"
         if not options["qop"]:
             options["qop"] = "auth"
         unhashedDigest = "%s:%s:%s:%s:%s:%s" % \
                         (HA1, nonce,
                          stripQuotes(options["nc"]),
                          stripQuotes(options["cnonce"]),
                          stripQuotes(options["qop"]), HA2)
     else:
         unhashedDigest = "%s:%s:%s" % (HA1, nonce, HA2)
     hashedDigest = md5(unhashedDigest.encode("utf-8")).hexdigest()
     return (stripQuotes(options["response"]) == hashedDigest and
             self._isValidNonce(nonce))
Esempio n. 2
0
 def structure(self, ext=False):
     """Return IMAP BODYSTRUCTURE-style data for this message.

     Walks every MIME part and collects, per part: main type, subtype,
     the charset parameter as a ("charset", value) pair (or None),
     Content-Id, Content-Description, Content-Transfer-Encoding and
     the serialised size.  Text parts also get a line count.  With
     ext=True the extension fields (body MD5 digest,
     Content-Disposition, Content-Language) are appended.  A
     single-part message yields one description list, otherwise a list
     of lists is returned.
     """
     parts = []
     for piece in self.walk():
         cs = piece.get_content_charset()
         charset = None if cs is None else ("charset", cs)
         flat = piece.as_string()
         entry = [
             piece.get_main_type(),
             piece.get_subtype(),
             charset,
             piece.get('Content-Id'),
             piece.get('Content-Description'),
             piece.get('Content-Transfer-Encoding'),
             str(len(flat)),
         ]
         if piece.get_main_type() == "text":
             entry.append(str(flat.count("\n")))
         if ext:
             entry.append(md5(flat).digest())
             entry.append(piece.get('Content-Disposition'))
             entry.append(piece.get('Content-Language'))
         parts.append(entry)
     return parts[0] if len(parts) == 1 else parts
Esempio n. 3
0
def generate_checksum(msg):
    """Return a fuzzy, four-section checksum of msg's flattened text.

    Modelled after Justin Mason's fuzzy checksummer for SpamAssassin:
    the message body is cleaned, broken into lines, the lines split
    into four chunks, and an MD5 hex digest computed per chunk.  The
    digests are joined with '.' so downstream processes can split the
    result and compare the sections separately or in combination.
    """
    data = flatten(msg)

    # Get rid of anything which looks like an HTML tag and downcase it all.
    data = re.sub(r"<[^>]*>", "", data).lower()

    # Drop words that look like URLs or email addresses.
    # Not sure what a pmguid: url is but it occurs frequently in spam.
    schemes = ("ftp:", "mailto:", "http:", "gopher:", "pmguid:")
    words = [w for w in data.split(' ')
             if '@' not in w and not w.startswith(schemes)]

    # Keep only lines that contain whitespace (the original comment
    # said "delete lines which contain white space", but the code has
    # always kept them -- single-word lines are not distinguishing).
    lines = [line for line in " ".join(words).split('\n') if ' ' in line]

    # +1 guarantees we don't miss lines at the end.
    chunksize = len(lines)//4+1
    digests = []
    for i in range(4):
        chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
        # hexdigest() of the encoded chunk is equivalent to
        # binascii.b2a_hex(md5(chunk).digest()) but works on Python 3
        # (md5 needs bytes, str.join needs str) and the local no longer
        # shadows the builtin sum().
        digests.append(md5(chunk.encode("utf-8")).hexdigest())

    return ".".join(digests)
Esempio n. 4
0
 def generate_checksum(msg):
    """Return a fuzzy, four-section checksum of msg.

    The message is flattened, its body cleaned and split into lines,
    the lines divided into four chunks, and an MD5 hex digest computed
    per chunk; the digests are joined with '.'.
    """
    fp = io.StringIO()

    g = email.generator.Generator(fp, mangle_from_=False, maxheaderlen=60)

    g.flatten(msg)

    text = fp.getvalue()

    # The body is everything after the first blank line.
    body = text.split("\n\n", 1)[1]

    lines = clean(body).split("\n")

    # +1 guarantees we don't miss lines at the end.
    chunksize = len(lines)//4+1

    digest = []

    for i in range(4):

        chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])

        # md5() requires bytes on Python 3; encode the chunk first.
        digest.append(md5(chunk.encode("utf-8")).hexdigest())

    return ".".join(digest)
Esempio n. 5
0
 def generate_checksum(msg):
    """Return a fuzzy, four-section checksum of msg's flattened text.

    The flattened message is stripped of HTML tags and URL/address-like
    words, split into lines, the lines divided into four chunks, and an
    MD5 hex digest computed per chunk; the digests are joined with '.'.
    """
    data = flatten(msg)

    # Strip anything that looks like an HTML tag and lowercase.
    data = re.sub(r"<[^>]*>", "", data).lower()

    # Drop words that look like URLs or email addresses.
    schemes = ("ftp:", "mailto:", "http:", "gopher:", "pmguid:")
    words = [w for w in data.split(' ')
             if '@' not in w and not w.startswith(schemes)]

    # Keep only lines that contain whitespace.
    lines = [line for line in " ".join(words).split('\n') if ' ' in line]

    # +1 guarantees we don't miss lines at the end.
    chunksize = len(lines)//4+1

    # Using hexdigest() of the encoded chunk is equivalent to
    # binascii.b2a_hex(md5(chunk).digest()) but works on Python 3
    # (md5 needs bytes; str.join needs str) and avoids shadowing the
    # builtin sum().
    digests = []

    for i in range(4):

        chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])

        digests.append(md5(chunk.encode("utf-8")).hexdigest())

    return ".".join(digests)
Esempio n. 6
0
 def extract_ocr_info(self, pnmfiles):
     # NOTE(review): this copy of the method is truncated in this file --
     # it ends after updating the cache; compare the complete version
     # later in the file, which also appends to textbits/tokens, unlinks
     # the files, and returns.  Python 2 syntax throughout
     # (``except SystemError, msg`` / ``print >>``).
     #
     # Runs OCR over each PNM file, caching the extracted text and
     # derived tokens keyed on the MD5 hash of the file's raw contents.
     assert self.engine, "must have an engine!"
     textbits = []
     tokens = set()
     for pnmfile in pnmfiles:
         preserve = False
         # Cache key: MD5 of the file's contents.
         fhash = md5(open(pnmfile).read()).hexdigest()
         if fhash in self.cache:
             self.hits += 1
             ctext, ctokens = self.cache[fhash]
         else:
             self.misses += 1
             if self.engine.program:
                 try:
                     ctext = self.engine.extract_text(pnmfile).lower()
                 except SystemError, msg:
                     print >> sys.stderr, msg
                     # Keep the file around for post-mortem debugging.
                     preserve = True
                     ctext = ""
             else:
                 print >> sys.stderr, \
                       "No OCR program '%s' available - can't get text!" \
                       % (self.engine.engine_name,)
                 ctext = ""
             ctokens = set()
             if not ctext.strip():
                 # No recognisable text is itself a useful signal.
                 ctokens.add("image-text:no text found")
             else:
                 nlines = len(ctext.strip().split("\n"))
                 if nlines:
                     # Bucket the line count logarithmically.
                     ctokens.add("image-text-lines:%d" % int(log2(nlines)))
             self.cache[fhash] = (ctext, ctokens)
    def _digestAuthentication(self, login, method):
        """Perform HTTP Digest authentication (RFC 2617).

        login is the content of the Authorization header after the
        "Digest " scheme token; method is the HTTP method of the
        request.  Returns True when the client's response digest
        matches and the nonce is valid, False otherwise.
        """

        def stripQuotes(s):
            # Remove one pair of surrounding double quotes, if present.
            # Rewritten from the ``cond and s[1:-1] or s`` idiom, which
            # crashed on an empty string and returned '""' unchanged
            # because the stripped value '' is falsy.
            if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
                return s[1:-1]
            return s

        options = dict(self._login_splitter.findall(login))
        userName = stripQuotes(options["username"])
        password = self._server.getPasswordForUser(userName)
        nonce = stripQuotes(options["nonce"])

        # The following computations are based upon RFC 2617.
        # hashlib's md5 requires bytes on Python 3; encode explicitly.
        A1 = "%s:%s:%s" % (userName, self._server.getRealm(), password)
        HA1 = md5(A1.encode("utf-8")).hexdigest()
        A2 = "%s:%s" % (method, stripQuotes(options["uri"]))
        HA2 = md5(A2.encode("utf-8")).hexdigest()

        # dict.has_key() was removed in Python 3; ``in`` works in both.
        if "qop" in options:
            # IE 6.0 doesn't give nc back correctly?
            if not options["nc"]:
                options["nc"] = "00000001"
            # Firefox 1.0 doesn't give qop back correctly?
            if not options["qop"]:
                options["qop"] = "auth"
            unhashedDigest = "%s:%s:%s:%s:%s:%s" % (
                HA1,
                nonce,
                stripQuotes(options["nc"]),
                stripQuotes(options["cnonce"]),
                stripQuotes(options["qop"]),
                HA2,
            )
        else:
            unhashedDigest = "%s:%s:%s" % (HA1, nonce, HA2)
        hashedDigest = md5(unhashedDigest.encode("utf-8")).hexdigest()

        return stripQuotes(options["response"]) == hashedDigest and self._isValidNonce(nonce)
Esempio n. 8
0
    def _digestAuthentication(self, login, method):
        """Perform HTTP Digest authentication (RFC 2617).

        login is the content of the Authorization header after the
        "Digest " scheme token; method is the HTTP method of the
        request.  Returns True when the client's response digest
        matches and the nonce is valid, False otherwise.
        """
        def stripQuotes(s):
            # Remove one pair of surrounding double quotes, if present.
            # Rewritten from the ``cond and s[1:-1] or s`` idiom, which
            # crashed on an empty string and returned '""' unchanged
            # because the stripped value '' is falsy.
            if len(s) >= 2 and s[0] == '"' and s[-1] == '"':
                return s[1:-1]
            return s

        options = dict(self._login_splitter.findall(login))
        userName = stripQuotes(options["username"])
        password = self._server.getPasswordForUser(userName)
        nonce = stripQuotes(options["nonce"])

        # The following computations are based upon RFC 2617.
        # hashlib's md5 requires bytes on Python 3; encode explicitly.
        A1 = "%s:%s:%s" % (userName, self._server.getRealm(), password)
        HA1 = md5(A1.encode("utf-8")).hexdigest()
        A2 = "%s:%s" % (method, stripQuotes(options["uri"]))
        HA2 = md5(A2.encode("utf-8")).hexdigest()

        # dict.has_key() was removed in Python 3; ``in`` works in both.
        if "qop" in options:
            # IE 6.0 doesn't give nc back correctly?
            if not options["nc"]:
                options["nc"] = "00000001"
            # Firefox 1.0 doesn't give qop back correctly?
            if not options["qop"]:
                options["qop"] = "auth"
            unhashedDigest = "%s:%s:%s:%s:%s:%s" % \
                            (HA1, nonce,
                             stripQuotes(options["nc"]),
                             stripQuotes(options["cnonce"]),
                             stripQuotes(options["qop"]), HA2)
        else:
            unhashedDigest = "%s:%s:%s" % (HA1, nonce, HA2)
        hashedDigest = md5(unhashedDigest.encode("utf-8")).hexdigest()

        return (stripQuotes(options["response"]) == hashedDigest
                and self._isValidNonce(nonce))
Esempio n. 9
0
    def extract_ocr_info(self, pnmfiles):
        """Run OCR over each PNM file and return (text, tokens).

        Extracted text and derived tokens are cached keyed on the MD5
        hash of each file's raw contents; cache hits skip OCR entirely.
        Processed files are deleted unless OCR raised an error, in
        which case the file is preserved for debugging.

        Returns a 2-tuple: the extracted text of all files joined by
        newlines, and the union of the per-file token sets.
        """
        assert self.engine, "must have an engine!"
        textbits = []
        tokens = set()
        for pnmfile in pnmfiles:
            preserve = False
            # Read in binary mode: PNM files are binary data, so a
            # text-mode read could fail to decode on Python 3.  The
            # context manager also fixes the original's leaked file
            # handle (bare open() with no close()).
            with open(pnmfile, "rb") as fobj:
                fhash = md5(fobj.read()).hexdigest()
            if fhash in self.cache:
                self.hits += 1
                ctext, ctokens = self.cache[fhash]
            else:
                self.misses += 1
                if self.engine.program:
                    try:
                        ctext = self.engine.extract_text(pnmfile).lower()
                    except SystemError as msg:
                        print(msg, file=sys.stderr)
                        # Keep the file for post-mortem debugging.
                        preserve = True
                        ctext = ""
                else:
                    # We should not get here if no OCR is enabled.  If it
                    # is enabled and we have no program, its OK to spew lots
                    # of warnings - they should either disable OCR (it is by
                    # default), or fix their config.
                    print("No OCR program '%s' available - can't get text!" \
                          % (self.engine.engine_name,), file=sys.stderr)
                    ctext = ""
                ctokens = set()
                if not ctext.strip():
                    # Lots of spam now contains images in which it is
                    # difficult or impossible (using ocrad) to find any
                    # text.  Make a note of that.
                    ctokens.add("image-text:no text found")
                else:
                    nlines = len(ctext.strip().split("\n"))
                    if nlines:
                        # Bucket the line count logarithmically.
                        ctokens.add("image-text-lines:%d" % int(log2(nlines)))
                self.cache[fhash] = (ctext, ctokens)
            textbits.append(ctext)
            tokens |= ctokens
            if not preserve:
                os.unlink(pnmfile)

        return "\n".join(textbits), tokens
Esempio n. 10
0
def generate_checksum(msg):
    """Compute a fuzzy, four-section checksum for a message.

    Modelled after Justin Mason's fuzzy checksummer for SpamAssassin:
    the cleaned message body is split into lines, the lines into four
    chunks, and one MD5 hex digest is taken per chunk.  The digests
    are joined with '.' so downstream code can split them and compare
    the sections independently or in combination.
    """
    buf = StringIO.StringIO()
    writer = email.generator.Generator(buf, mangle_from_=False,
                                       maxheaderlen=60)
    writer.flatten(msg)
    flat = buf.getvalue()
    # The body is everything after the first blank line.
    body = flat.split("\n\n", 1)[1]
    lines = clean(body).split("\n")
    # +1 so trailing lines are never dropped.
    step = len(lines) // 4 + 1
    pieces = [md5("\n".join(lines[k * step:(k + 1) * step])).hexdigest()
              for k in range(4)]
    return ".".join(pieces)
Esempio n. 11
0
def generate_checksum(msg):
    """Compute a fuzzy, four-section checksum for a message.

    Modelled after Justin Mason's fuzzy checksummer for SpamAssassin.
    The message body is cleaned, broken into lines, the lines split
    into four chunks, and an MD5 hex digest computed per chunk; the
    digests are joined with '.' so downstream processes can consider
    the sections separately or in various combinations.
    """
    # The Python-2-only StringIO module no longer exists in Python 3;
    # use io.StringIO (as the other py3 copy of this function does).
    import io
    fp = io.StringIO()
    g = email.generator.Generator(fp, mangle_from_=False, maxheaderlen=60)
    g.flatten(msg)
    text = fp.getvalue()
    # The body is everything after the first blank line.
    body = text.split("\n\n", 1)[1]
    lines = clean(body).split("\n")
    # +1 guarantees we don't miss lines at the end.
    chunksize = len(lines)//4+1
    digest = []
    for i in range(4):
        chunk = "\n".join(lines[i*chunksize:(i+1)*chunksize])
        # md5() requires bytes on Python 3.
        digest.append(md5(chunk.encode("utf-8")).hexdigest())

    return ".".join(digest)
Esempio n. 12
0
 def structure(self, ext=False):
     """Describe the MIME-IMB body structure of this message.

     For every MIME part, collect: main type, subtype, the charset
     parameter as a ("charset", value) pair (or None), Content-Id,
     Content-Description, Content-Transfer-Encoding, and the size of
     the serialised part.  Text parts additionally get their line
     count.  When ext is true, extension fields follow: the MD5 digest
     of the part, Content-Disposition and Content-Language.  Returns a
     single part description for single-part messages, otherwise a
     list of descriptions.
     """
     described = []
     for item in self.walk():
         cs = item.get_content_charset()
         if cs is None:
             charset_field = None
         else:
             charset_field = ("charset", cs)
         serialised = item.as_string()
         fields = [item.get_main_type(), item.get_subtype(),
                   charset_field,
                   item.get('Content-Id'),
                   item.get('Content-Description'),
                   item.get('Content-Transfer-Encoding'),
                   str(len(serialised))]
         if item.get_main_type() == "text":
             fields.append(str(serialised.count("\n")))
         if ext:
             fields.extend([md5(serialised).digest(),
                            item.get('Content-Disposition'),
                            item.get('Content-Language')])
         described.append(fields)
     if len(described) == 1:
         return described[0]
     return described
Esempio n. 13
0
 def extract_ocr_info(self, pnmfiles):
     # NOTE(review): this copy of the method is truncated in this file --
     # it ends after updating the cache; the complete version elsewhere
     # in the file also appends to textbits/tokens, unlinks the files,
     # and returns.  Python 2 syntax throughout.
     #
     # Runs OCR over each PNM file, caching the extracted text and
     # derived tokens keyed on the MD5 hash of the file's raw contents.
     assert self.engine, "must have an engine!"
     textbits = []
     tokens = set()
     for pnmfile in pnmfiles:
         preserve = False
         # Cache key: MD5 of the file's contents.
         fhash = md5(open(pnmfile).read()).hexdigest()
         if fhash in self.cache:
             self.hits += 1
             ctext, ctokens = self.cache[fhash]
         else:
             self.misses += 1
             if self.engine.program:
                 try:
                     ctext = self.engine.extract_text(pnmfile).lower()
                 except SystemError, msg:
                     print >> sys.stderr, msg
                     # Keep the file around for post-mortem debugging.
                     preserve = True
                     ctext = ""
             else:
                 # We should not get here if no OCR is enabled.  If it
                 # is enabled and we have no program, its OK to spew lots
                 # of warnings - they should either disable OCR (it is by
                 # default), or fix their config.
                 print >> sys.stderr, \
                       "No OCR program '%s' available - can't get text!" \
                       % (self.engine.engine_name,)
                 ctext = ""
             ctokens = set()
             if not ctext.strip():
                 # Lots of spam now contains images in which it is
                 # difficult or impossible (using ocrad) to find any
                 # text.  Make a note of that.
                 ctokens.add("image-text:no text found")
             else:
                 nlines = len(ctext.strip().split("\n"))
                 if nlines:
                     # Bucket the line count logarithmically.
                     ctokens.add("image-text-lines:%d" % int(log2(nlines)))
             self.cache[fhash] = (ctext, ctokens)
Esempio n. 14
0
def main():
    """Split one or more mbox files randomly into n output directories.

    Command line:
        -h/--help   print usage and exit
        -g          treat input paths as glob patterns
        -s seed     seed the RNG for reproducible splits
        -n count    number of output directories (> 1, required)
        -v          print progress dots and a final summary
        -d          skip messages whose MD5 checksum was already seen
    The last positional argument is the output base path; the rest are
    input mbox paths.  Each message is written as its own numbered
    file in a randomly chosen output directory.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'dhgn:s:v', ['help'])
    except getopt.error as msg:
        usage(1, msg)
    doglob = False
    n = None
    verbose = False
    delete_dups = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt == '-g':
            doglob = True
        elif opt == '-s':
            random.seed(int(arg))
        elif opt == '-n':
            n = int(arg)
        elif opt == '-v':
            verbose = True
        elif opt == '-d':
            delete_dups = True
    if n is None or n <= 1:
        usage(1, "an -n value > 1 is required")
    if len(args) < 2:
        usage(1, "input mbox name and output base path are required")
    inputpaths, outputbasepath = args[:-1], args[-1]
    outdirs = [outputbasepath + ("%d" % i) for i in range(1, n+1)]
    for outdir in outdirs:  # renamed from ``dir``: don't shadow the builtin
        if not os.path.isdir(outdir):
            os.makedirs(outdir)
    counter = 0
    cksums = set()
    skipped = 0
    for inputpath in inputpaths:
        inpaths = glob.glob(inputpath) if doglob else [inputpath]
        for inpath in inpaths:
            mbox = mboxutils.getmbox(inpath)
            for msg in mbox:
                astext = str(msg)
                # md5() requires bytes on Python 3; hash the encoded text.
                cksum = md5(astext.encode("utf-8")).hexdigest()
                if delete_dups and cksum in cksums:
                    skipped += 1
                    continue
                cksums.add(cksum)
                i = random.randrange(n)
                counter += 1
                # 'wb' mode needs bytes, not str, on Python 3; the
                # context manager also guarantees the file is closed.
                with open('%s/%d' % (outdirs[i], counter), 'wb') as msgfile:
                    msgfile.write(astext.encode("utf-8"))
                if verbose and counter % 100 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
    if verbose:
        print()
        print(counter, "messages split into", n, "directories")
        if skipped:
            print("skipped", skipped, "duplicate messages")
Esempio n. 15
0
            os.makedirs(dir)

    # NOTE(review): this block is a fragment -- the enclosing function's
    # definition and setup (option parsing, outdirs, doglob, n) are not
    # part of this excerpt; compare the complete main() earlier in the
    # file.  Splits mbox messages randomly across the output dirs.
    counter = 0
    cksums = set()
    skipped = 0
    for inputpath in inputpaths:
        if doglob:
            inpaths = glob.glob(inputpath)
        else:
            inpaths = [inputpath]

        for inpath in inpaths:
            mbox = mboxutils.getmbox(inpath)
            for msg in mbox:
                astext = str(msg)
                # NOTE(review): md5() requires bytes on Python 3 -- this
                # line assumes Python 2 str input; confirm before porting.
                cksum = md5(astext).hexdigest()
                if delete_dups and cksum in cksums:
                    # Duplicate message: count it and move on.
                    skipped += 1
                    continue
                cksums.add(cksum)
                i = random.randrange(n)
                #assert astext.endswith('\n')
                counter += 1
                msgfile = open('%s/%d' % (outdirs[i], counter), 'wb')
                msgfile.write(astext)
                msgfile.close()
                if verbose:
                    # Progress dot every 100 messages.
                    if counter % 100 == 0:
                        sys.stdout.write('.')
                        sys.stdout.flush()
Esempio n. 16
0
            os.makedirs(dir)

    # NOTE(review): duplicate fragment of the previous excerpt -- the
    # enclosing function's definition and setup are not shown here; see
    # the complete main() earlier in the file.
    counter = 0
    cksums = set()
    skipped = 0
    for inputpath in inputpaths:
        if doglob:
            inpaths = glob.glob(inputpath)
        else:
            inpaths = [inputpath]

        for inpath in inpaths:
            mbox = mboxutils.getmbox(inpath)
            for msg in mbox:
                astext = str(msg)
                # NOTE(review): md5() requires bytes on Python 3 -- this
                # line assumes Python 2 str input; confirm before porting.
                cksum = md5(astext).hexdigest()
                if delete_dups and cksum in cksums:
                    # Duplicate message: count it and move on.
                    skipped += 1
                    continue
                cksums.add(cksum)
                i = random.randrange(n)
                #assert astext.endswith('\n')
                counter += 1
                msgfile = open('%s/%d' % (outdirs[i], counter), 'wb')
                msgfile.write(astext)
                msgfile.close()
                if verbose:
                    # Progress dot every 100 messages.
                    if counter % 100 == 0:
                        sys.stdout.write('.')
                        sys.stdout.flush()