Beispiel #1
0
def compile_training_data():
    """Rebuild ``training_words`` and ``classifier_cache`` from ``training_diffs``.

    Each diff row's added and deleted text is lower-cased and split into runs
    of word characters.  Non-blacklisted words are tallied into four counters
    per word, laid out as ``[a_spam, d_spam, a_good, d_good]``; a word's
    starting counters come from ``get_word_status``.  Rows whose ``is_good``
    equals 0 count as spam, all others as good.

    The tallies replace the contents of ``training_words`` and are committed.
    ``classifier_cache`` is then refilled with every counter multiplied by
    1000 and divided by the grand total of all counters, and committed again.

    Uses the module-level ``curr`` cursor and ``conn`` connection.
    """
    curr.execute("SELECT added, deled, is_good FROM training_diffs")
    # word -> [a_spam, d_spam, a_good, d_good]
    tallies = {}
    print("Begin sum words")
    for added_text, deled_text, is_good in curr.fetchall():
        added_text = added_text.lower()
        deled_text = deled_text.lower()
        # Pair each token list with the slot of its spam counter; the matching
        # good counter lives two slots further along.
        token_groups = (
            (re.findall(r'[\w]+', added_text), 0),
            (re.findall(r'[\w]+', deled_text), 1),
        )
        shift = 0 if is_good == 0 else 2
        for tokens, spam_slot in token_groups:
            for token in tokens:
                if is_blacklisted(token):
                    continue
                if token not in tallies:
                    tallies[token] = get_word_status(token)
                tallies[token][spam_slot + shift] += 1

    print("Begin commit word updates")
    curr.execute("DELETE FROM training_words")
    for token, counts in tallies.items():
        params = {
            "word": token,
            "aspam": counts[0],
            "dspam": counts[1],
            "agood": counts[2],
            "dgood": counts[3],
        }
        curr.execute("INSERT INTO training_words (word, add_spam, add_good, del_spam, del_good) \
                    VALUES (%(word)s, %(aspam)s, %(agood)s, %(dspam)s, %(dgood)s)",
                     params)
    conn.commit()

    print("Begin commiting probabilities")
    curr.execute("SELECT sum(add_spam) + sum(add_good) + sum(del_spam) + sum(del_good) FROM training_words LIMIT 1")
    # Grand total of every counter; the denominator for all cached values.
    total = curr.fetchone()[0]

    curr.execute("DELETE FROM classifier_cache")
    curr.execute("SELECT word, add_spam, add_good, del_spam, del_good FROM training_words")
    for token, add_spam, add_good, del_spam, del_good in curr.fetchall():
        params = {
            "word": token,
            "aspam": add_spam * 1000.0,
            "agood": add_good * 1000.0,
            "dspam": del_spam * 1000.0,
            "dgood": del_good * 1000.0,
            "sum": total,
        }
        curr.execute("INSERT INTO classifier_cache (word, p_add_spam, p_add_good, p_del_spam, p_del_good) VALUES \
                (%(word)s, %(aspam)s::float/%(sum)s, %(agood)s::float/%(sum)s, %(dspam)s::float/%(sum)s, %(dgood)s::float/%(sum)s)",
                     params)
    conn.commit()
    print("Done")
 def name_is_non_test(self):
     """Tell whether the file's name marks it as a non-test file.

     The result is truthy when ``self.rel_path`` is a directory, when
     ``self.name_prefix("MANIFEST")`` is truthy, when ``self.filename``
     starts with a dot, or when ``self.url`` is blacklisted.  The checks
     run in that order and stop at the first one that succeeds.
     """
     # Directories are never tests; skip the remaining checks outright.
     if os.path.isdir(self.rel_path):
         return True
     return (self.name_prefix("MANIFEST") or
             self.filename.startswith(".") or
             is_blacklisted(self.url))
Beispiel #3
0
 def name_is_non_test(self):
     """Report whether this file's name rules it out as a test.

     Truthy when ``self.rel_path`` is a directory, ``self.name_prefix``
     accepts ``"MANIFEST"``, ``self.filename`` begins with ``"."``, or
     ``is_blacklisted(self.url)`` is truthy.  Evaluation short-circuits in
     that order.
     """
     # A directory can be rejected without consulting the name at all.
     if os.path.isdir(self.rel_path):
         return True
     return (self.name_prefix("MANIFEST") or
             self.filename.startswith(".") or
             is_blacklisted(self.url))
Beispiel #4
0
    def local_changes(self):
        """Report every file under ``self.tests_root`` as locally modified.

        Files whose name matches any pattern in ``self.ignore``, or whose URL
        (derived from the path relative to ``self.tests_root`` and
        ``self.url_base``) is blacklisted, are left out.

        :returns: dict mapping each remaining relative path to ``"modified"``.
        """
        # Every file is reported; per the original note, Manifest.update is
        # relied on to de-dupe changes already committed at the base rev.
        changes = {}
        for root, _subdirs, files in os.walk(self.tests_root):
            for name in files:
                if not any(fnmatch(name, pattern) for pattern in self.ignore):
                    rel_path = os.path.relpath(os.path.join(root, name),
                                               self.tests_root)
                    url = rel_path_to_url(rel_path, self.url_base)
                    if not is_blacklisted(url):
                        changes[rel_path] = "modified"
        return changes
Beispiel #5
0
    def committed_changes(self, base_rev=None):
        """List the files changed between ``base_rev`` and ``HEAD``.

        :param base_rev: Revision to diff against.  When ``None``, every path
                         from ``self.paths()`` is reported as modified.
        :returns: list of ``(filename, change)`` tuples, where ``change`` is
                  ``"deleted"`` for git status ``D`` and ``"modified"`` for
                  any other status.  Files whose URL is blacklisted are
                  omitted when diffing against a revision.
        """
        if base_rev is None:
            self.logger.debug("Adding all changesets to the manifest")
            return [(path, "modified") for path in self.paths()]

        self.logger.debug(
            "Updating the manifest from %s to %s" % (base_rev, self.current_rev()))
        diff_output = self.git("diff", "-z", "--name-status", base_rev + "..HEAD")

        changes = []
        # The NUL-separated output is consumed as (status, filename) pairs.
        for status, filename in chunks(diff_output.split("\0"), 2):
            if is_blacklisted(rel_path_to_url(filename, self.url_base)):
                continue
            change = "deleted" if status == "D" else "modified"
            changes.append((filename, change))
        return changes
Beispiel #6
0
    def local_changes(self, path=None):
        """Collect uncommitted changes by parsing ``git status -z`` output.

        :param path: Optional path to restrict the status query to; when
                     ``None`` no path filter is passed to git.
        :returns: dict accumulated by merging the result of
                  ``self.local_status(staged, worktree, filenames)`` for each
                  status entry whose first filename does not map to a
                  blacklisted URL.  Empty when git reports nothing.
        :raises AssertionError: if the output is not NUL-terminated or a
                                status code is not followed by a space.
        :raises ValueError: if the output ends where a filename was expected.
        """
        # -z is stable like --porcelain; see the git status documentation for details
        cmd = ["status", "-z", "--ignore-submodules=all"]
        if path is not None:
            cmd.extend(["--", path])

        rv = {}

        data = self.git(*cmd)
        if data == "":
            return rv

        assert data[-1] == "\0"
        f = StringIO(data)

        while f.tell() < len(data):
            # First two bytes are the status in the stage (index) and working tree, respectively
            staged = f.read(1)
            worktree = f.read(1)
            # Consume the separator outside the assert: asserts are stripped
            # under -O, and the read must still advance the stream.
            separator = f.read(1)
            assert separator == " "

            if staged == "R":
                # When a file is renamed, there are two files, the source and the destination
                files = 2
            else:
                files = 1

            filenames = []

            for i in range(files):
                chars = []
                char = f.read(1)
                while char != "\0":
                    if char == "":
                        # read() returns "" at end of data; without this guard
                        # a missing filename would loop forever.
                        raise ValueError("Unexpected end of git status output")
                    chars.append(char)
                    char = f.read(1)
                filenames.append("".join(chars))

            if not is_blacklisted(rel_path_to_url(filenames[0], self.url_base)):
                rv.update(self.local_status(staged, worktree, filenames))

        return rv