def _tally_words(word_list, counts, index):
    """Add one to ``counts[word][index]`` for each non-blacklisted word."""
    for word in word_list:
        if is_blacklisted(word):
            continue
        if word not in counts:
            # First sighting in this run: start from whatever
            # get_word_status reports for the word.
            counts[word] = get_word_status(word)
        counts[word][index] += 1


def _count_training_words(rows):
    """Return {word: [a_spam, d_spam, a_good, d_good]} tallied over ``rows``.

    Each row is ``(added, deled, is_good)``; a row whose ``is_good`` equals 0
    is counted in the spam columns, any other value in the good columns.
    """
    find_words = re.compile(r'[\w]+').findall
    counts = {}
    for added, deled, is_good in rows:
        added_words = find_words(added.lower())
        deled_words = find_words(deled.lower())
        is_spam = is_good == 0
        # Slot layout: 0 = added/spam, 1 = deleted/spam,
        #              2 = added/good, 3 = deleted/good.
        _tally_words(added_words, counts, 0 if is_spam else 2)
        _tally_words(deled_words, counts, 1 if is_spam else 3)
    return counts


def _store_word_counts(counts):
    """Replace the contents of training_words with ``counts`` (no commit)."""
    curr.execute("DELETE FROM training_words")
    for word in counts:
        word_data = counts[word]
        curr.execute(
            "INSERT INTO training_words (word, add_spam, add_good, del_spam, del_good) "
            "VALUES (%(word)s, %(aspam)s, %(agood)s, %(dspam)s, %(dgood)s)",
            {"word": word,
             "aspam": word_data[0],
             "dspam": word_data[1],
             "agood": word_data[2],
             "dgood": word_data[3]})


def _store_word_probabilities():
    """Rebuild classifier_cache from the counts in training_words (no commit)."""
    curr.execute("SELECT sum(add_spam) + sum(add_good) + sum(del_spam) + sum(del_good) FROM training_words LIMIT 1")
    total = curr.fetchone()[0]

    curr.execute("DELETE FROM classifier_cache")
    curr.execute("SELECT word, add_spam, add_good, del_spam, del_good FROM training_words")
    # fetchall() materialises the result set first, so the same cursor can be
    # reused for the INSERTs issued inside the loop.
    for row in curr.fetchall():
        # Every count is scaled by 1000 before being divided by the grand
        # total of all four count columns.
        curr.execute(
            "INSERT INTO classifier_cache (word, p_add_spam, p_add_good, p_del_spam, p_del_good) VALUES "
            "(%(word)s, %(aspam)s::float/%(sum)s, %(agood)s::float/%(sum)s, %(dspam)s::float/%(sum)s, %(dgood)s::float/%(sum)s)",
            {"word": row[0],
             "aspam": row[1] * 1000.0,
             "agood": row[2] * 1000.0,
             "dspam": row[3] * 1000.0,
             "dgood": row[4] * 1000.0,
             "sum": total})


def compile_training_data():
    """Recompute the training_words and classifier_cache tables.

    Reads every row of training_diffs, counts how often each non-blacklisted
    word was added or deleted in spam and in good diffs, replaces the contents
    of training_words with those counts and commits.  It then derives the
    per-word ratios stored in classifier_cache and commits again.

    Uses the module-level ``curr`` cursor and ``conn`` connection and prints
    progress messages to stdout.
    """
    curr.execute("SELECT added, deled, is_good FROM training_diffs")

    print("Begin sum words")
    # The tally must finish before training_words is cleared below.
    # NOTE(review): get_word_status presumably reads the existing
    # training_words rows - confirm.
    words = _count_training_words(curr.fetchall())

    print("Begin commit word updates")
    _store_word_counts(words)
    conn.commit()

    print("Begin commiting probabilities")
    _store_word_probabilities()
    conn.commit()
    print("Done")
def name_is_non_test(self):
    """Tell whether the file's name alone rules it out as a test.

    The result is truthy when ``rel_path`` is a directory, when the name has
    the ``MANIFEST`` prefix, when the filename starts with a dot, or when the
    file's URL is blacklisted.
    """
    if os.path.isdir(self.rel_path):
        return True

    has_manifest_prefix = self.name_prefix("MANIFEST")
    if has_manifest_prefix:
        return has_manifest_prefix

    if self.filename.startswith("."):
        return True

    return is_blacklisted(self.url)
def local_changes(self):
    """Return ``{relative path: "modified"}`` for the files under ``tests_root``.

    Files whose name matches one of the ``self.ignore`` patterns, or whose
    URL is blacklisted, are left out.
    """
    # Every remaining file is reported as modified; Manifest.update is relied
    # upon to de-dupe entries that were in fact committed at the base rev.
    changes = {}
    for dir_path, _subdirs, filenames in os.walk(self.tests_root):
        for filename in filenames:
            ignored = any(fnmatch(filename, pattern) for pattern in self.ignore)
            if ignored:
                continue

            full_path = os.path.join(dir_path, filename)
            rel_path = os.path.relpath(full_path, self.tests_root)
            url = rel_path_to_url(rel_path, self.url_base)
            if not is_blacklisted(url):
                changes[rel_path] = "modified"
    return changes
def committed_changes(self, base_rev=None):
    """Return ``(path, status)`` pairs for changes committed since ``base_rev``.

    With no ``base_rev`` every path from ``self.paths()`` is reported as
    "modified".  Otherwise the output of
    ``git diff -z --name-status <base_rev>..HEAD`` is split on NUL and read
    as status/filename pairs: status "D" becomes "deleted", any other status
    becomes "modified".  Paths whose URL is blacklisted are skipped.
    """
    if base_rev is None:
        self.logger.debug("Adding all changesets to the manifest")
        everything = []
        for path in self.paths():
            everything.append((path, "modified"))
        return everything

    self.logger.debug("Updating the manifest from %s to %s" % (base_rev, self.current_rev()))
    diff_output = self.git("diff", "-z", "--name-status", base_rev + "..HEAD")
    fields = diff_output.split("\0")

    changes = []
    # NOTE(review): assumes chunks() yields exactly (status, filename) pairs
    # from the NUL-separated fields - confirm against its definition.
    for status, filename in chunks(fields, 2):
        url = rel_path_to_url(filename, self.url_base)
        if is_blacklisted(url):
            continue
        kind = "deleted" if status == "D" else "modified"
        changes.append((filename, kind))
    return changes
def local_changes(self, path=None):
    """Return the uncommitted changes reported by ``git status``.

    Runs ``git status -z --ignore-submodules=all`` (restricted to ``path``
    when one is given) and parses its NUL-terminated entries.  An entry whose
    first filename maps to a blacklisted URL is skipped; every other entry is
    handed to ``self.local_status`` and the result merged into the returned
    dict.  The dict is empty when git prints nothing.

    Raises:
        AssertionError: if the output does not end in NUL or an entry lacks
            the space that separates the status from the filename.
        ValueError: if the output ends in the middle of a filename.
    """
    # -z is stable like --porcelain; see the git status documentation for details
    cmd = ["status", "-z", "--ignore-submodules=all"]
    if path is not None:
        cmd.extend(["--", path])

    rv = {}
    data = self.git(*cmd)
    if data == "":
        return rv

    assert data[-1] == "\0"

    f = StringIO(data)
    while f.tell() < len(data):
        # First two bytes are the status in the stage (index) and working
        # tree, respectively
        staged = f.read(1)
        worktree = f.read(1)

        # Consume the separator outside the assert: asserts are stripped
        # under ``python -O``, and a read placed inside one would then be
        # skipped, shifting every later field by one character.
        separator = f.read(1)
        assert separator == " "

        # When a file is renamed, there are two files, the source and the
        # destination
        files = 2 if staged == "R" else 1

        filenames = []
        for _ in range(files):
            chars = []
            char = f.read(1)
            while char != "\0":
                if char == "":
                    # read() returns "" at end of input; without this check
                    # the loop would never terminate.
                    raise ValueError("Unterminated filename in git status output")
                chars.append(char)
                char = f.read(1)
            filenames.append("".join(chars))

        if not is_blacklisted(rel_path_to_url(filenames[0], self.url_base)):
            rv.update(self.local_status(staged, worktree, filenames))

    return rv