def to_gini(results):
    """
    calculate the stats (including gini impurities for each split),
    along with the weighted average of gini impurities
    """
    results1, results2 = results
    stats1 = stats(results1)
    stats2 = stats(results2)
    if stats1[2] is np.inf or stats2[2] is np.inf:
        return np.inf, stats1, stats2
    total = stats1[0] + stats2[0]
    return stats1[0] / total * stats1[2] + stats2[0] / total * stats2[2], stats1, stats2
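# A minimal sketch of the `stats` helper that `to_gini` relies on, inferred from its
# usage elsewhere in this file (`n, probability, gini = stats(label_counts)`, where
# `label_counts` is `[negative_count, positive_count]`). The exact implementation is an
# assumption; in particular, returning `np.inf` for an empty split is inferred from the
# `is np.inf` guard above.
import numpy as np

def stats_sketch(label_counts):
    """Return (total_count, probability_of_positive, binary gini impurity)."""
    neg, pos = label_counts
    n = neg + pos
    if n == 0:
        # an empty split has no defined impurity; use inf so callers can reject it
        return 0, 0.0, np.inf
    p = pos / n
    gini = 1.0 - p ** 2 - (1.0 - p) ** 2   # equivalently 2 * p * (1 - p)
    return n, p, gini

# If `stats` matches this sketch, to_gini([[10, 0], [2, 8]]) would combine a pure left
# split (gini 0.0) with a right split of gini 0.32 into a weighted average of 0.16.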
def main():
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    total = db.data.get_count_pending_songs()
    done = 0
    starttime = time.time()
    thisdone, rem = lookup()
    done += thisdone
    while rem > 0:
        thisdone, rem = lookup()
        done += thisdone
        durdelta, remdelta = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))
def main(inputdir, outputdir, mbidfile):
    mbids = open(mbidfile).read().splitlines()
    done = 0
    total = len(mbids)
    starttime = time.time()
    for m in mbids:
        process_one(inputdir, outputdir, m)
        done += 1
        if done % 1000 == 0:
            durdelta, remdelta = util.stats(done, total, starttime)
            log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))
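# A minimal sketch of what the `util.stats(done, total, starttime)` progress helper used
# in the two loops above presumably computes: elapsed time and an estimated time remaining,
# both as timedeltas. This is an assumption based on how the return values are logged
# ("Done x/y in <elapsed>; <remaining> remaining"); the real implementation may differ.
import datetime
import time

def progress_stats_sketch(done, total, starttime):
    elapsed = time.time() - starttime
    # simple linear extrapolation of the remaining work
    remaining = elapsed / done * (total - done) if done else 0
    return datetime.timedelta(seconds=int(elapsed)), datetime.timedelta(seconds=int(remaining))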
def test(self):
    tprint('Beginning testing.')
    confusion_matrix = np.zeros((7, 7)).astype(np.int)
    last_print = time.time()
    for minibatch, targets in self.dataset.test:
        minibatch = Variable(torch.stack(minibatch), volatile=True)
        targets = Variable(torch.LongTensor(targets), volatile=True)
        if self.cuda:
            minibatch = minibatch.cuda()
            targets = targets.cuda()
        out = self.model.forward(minibatch)
        _, predicted = torch.max(out.data, 1)
        predicted = predicted.cpu().numpy()
        targets = targets.data.cpu().numpy()
        confusion_matrix += sklearn.metrics.confusion_matrix(
            predicted, targets, labels=[0, 1, 2, 3, 4, 5, 6]).astype(np.int)
        if time.time() - last_print > self.log_interval:
            last_print = time.time()
            numer, denom = self.dataset.test.progress()
            tprint('Testing: %s/%s' % (numer, denom))
    tprint('Testing complete.')
    print(confusion_matrix)
    print(tabulate.tabulate(stats(confusion_matrix)))
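# A hedged sketch of the kind of per-class summary that the `stats(confusion_matrix)`
# call above might produce for `tabulate.tabulate`. The metric choice (precision/recall
# per class) and the row layout are assumptions for illustration, not the project's
# actual implementation.
import numpy as np

def confusion_stats_sketch(confusion_matrix):
    cm = np.asarray(confusion_matrix, dtype=float)
    rows = []
    for i in range(cm.shape[0]):
        tp = cm[i, i]
        predicted_as_i = cm[i, :].sum()   # rows index predictions, given the call order above
        actually_i = cm[:, i].sum()
        precision = tp / predicted_as_i if predicted_as_i else 0.0
        recall = tp / actually_i if actually_i else 0.0
        rows.append([i, precision, recall])
    return [['class', 'precision', 'recall']] + rows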
        return fitnesses
    else:
        with open( genome + ".min" ) as fh:
            key = " ".join( fh.readlines() )
            key = " ".join( key.split() )
        d = shelve.open( genome + ".cache" )
        return d[ key ]

if not os.path.exists( config ):
    infomsg( "no configuration found", file = sys.stderr )
    exit( 1 )

fitness = OrderedDict()

with stats() as record:
    ########
    # Compute baseline fitness
    ########

    if not os.path.isdir( get_storage_dir( ".original" ) ):
        os.makedirs( get_storage_dir( ".original" ) )
    original = get_genome_file( get_storage_dir( ".original" ), ".original" )
    with open( original, 'w' ) as out:
        pass
    fitness[ "original" ] = get_minimized_fitness( original )

    ########
    # Compute, search, and minimize partitions
    ########
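# A hedged sketch of the write side of the caching scheme shown above: the cache lookup
# keys a shelve on the whitespace-normalized contents of `<genome>.min`, so the store
# step presumably normalizes the same way before saving. The function name and its
# exact behavior are assumptions for illustration, not the project's actual code.
import shelve

def cache_fitness_sketch(genome, fitnesses):
    with open( genome + ".min" ) as fh:
        key = " ".join( " ".join( fh.readlines() ).split() )
    d = shelve.open( genome + ".cache" )
    try:
        d[ key ] = fitnesses
    finally:
        d.close()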
y_softmaxs, glimpses = predict_fn(x)
# need to have glimpses len match y for the zip so swap
# h0 "marker" for [None]
# OMG such clumsy, much hack, terrible API.
if glimpses[0].__class__ == np.float32:
    glimpses = [None] * len(y)
for n, (y_true_i, y_softmax, glimpse) in enumerate(zip(y, y_softmaxs, glimpses)):
    print "(%s) -> (%s)" % (rb.LABELS[x[n]], rb.LABELS[y_true_i])
    print " y_softmax", filter(lambda (label, prob): prob > 1e-4,
                               sorted(zip(rb.LABELS, y_softmax), key=lambda (label, prob): -prob)[:5])
    if glimpse is not None:
        glimpses_strs = util.float_array_to_str(glimpse)
        print " glimpse (f)", zip(rb.tokens_for(x), glimpses_strs[:len(x)])   # first half is from forward pass
        print " glimpse (b)", zip(rb.tokens_for(x), glimpses_strs[len(x):])   # second half is from backwards pass
    y_true_confidence = float(y_softmax[y_true_i])
    probabilities.append(y_true_confidence)
prob_seqs.append(probabilities)

# dump some stats
stats = OrderedDict()
stats['epoch'] = epoch
stats['costs'] = util.stats(costs)
stats['perplexity'] = util.perplexity_stats(prob_seqs)
stats['3rd_last'] = util.third_last_stats(prob_seqs)
stats['epoch_time'] = time.time() - start_time
stats['sample_gradient_l2_norms'] = sample_gradient_l2_norms
print "STATS\t%s" % json.dumps(stats)
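# A hedged sketch of what the `util.perplexity_stats(prob_seqs)` summary above might
# compute: each element of `prob_seqs` is a sequence of probabilities assigned to the
# true labels, and per-sequence perplexity is the exponent of the mean negative log
# probability. The aggregation into min/mean/max is an assumption for illustration.
import numpy as np

def perplexity_stats_sketch(prob_seqs):
    perplexities = [float(np.exp(-np.mean(np.log(np.clip(seq, 1e-12, 1.0)))))
                    for seq in prob_seqs if len(seq) > 0]
    return {'min': min(perplexities), 'mean': float(np.mean(perplexities)), 'max': max(perplexities)}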
def train(self, dataRDD):
    """
    Trains the model. Takes an RDD of tuples (label, feature_vector).
    The label should be binary, and the feature_vector must be a sequence.
    """
    # a generator for node IDs
    node_id_counter = count()

    # compute some preliminary statistics
    def seq_op(counts, row):
        label = row[0]
        counts[int(label)] += 1
        return counts

    comb_op = lambda counts1, counts2: [counts1[0] + counts2[0], counts1[1] + counts2[1]]
    label_counts = dataRDD.aggregate([0, 0], seq_op, comb_op)
    n, probability, gini = stats(label_counts)

    # sample the training data to find where the bins should be to quantize the numerical features
    fraction = sample_fraction_for_accurate_splits(n, self.max_bins)
    sample = dataRDD.sample(withReplacement=False, fraction=fraction)
    continuous_bins = sample.flatMap(partial(spread_row, self.categorical_features_info)) \
        .groupByKey() \
        .mapValues(partial(to_bins, self.max_bins)) \
        .collectAsMap()
    self.continuous_bins = continuous_bins
    treeRDD = dataRDD.map(lambda pair: (pair[0], discretize(continuous_bins, pair[1]))).persist()

    # initialize the decision tree
    tree_root = TreeNode(next(node_id_counter), n, gini, probability)
    # give every feature the full range
    tree_root.ranges = {
        i: set(range(self.categorical_features_info[i])) if i in self.categorical_features_info
           else (0, self.max_bins)
        for i in range(self.num_features)
    }
    self.tree_root = tree_root

    # grow the tree level-by-level. this holds the nodes to grow on the next iteration
    frontier = [tree_root]
    depth = 0
    while len(frontier) > 0:
        category_stats = {}
        if len(self.categorical_features_info) > 0:
            category_stats = treeRDD.flatMap(partial(spread_categories, tree_root, self.categorical_features_info)) \
                .reduceByKey(np.add) \
                .mapValues(counts_to_prob) \
                .map(shift_key(1)) \
                .groupByKey() \
                .mapValues(dict) \
                .collect()
            category_stats = to_nested_dict(category_stats)

        # have the master determine which splits should be considered
        candidate_splits = {}
        for node in frontier:
            candidate_splits[node.id] = gen_candidate_splits(node, self.num_features,
                                                             self.feature_subset_strategy, category_stats)

        # collect statistics about each of the proposed splits.
        # for each training example, take all the candidate splits and emit a key-value pair indicating
        # how that example would get classified. the key-value pair looks like this:
        #     (node-ID, feature-index, split-value): [[1, 0], [0, 0]]
        # then those classifications are reduced to add them up,
        # so the values become [[negative-examples in left split, positive-examples in left split],
        #                       [neg-ex in right split, pos-ex in right split]]
        # finally, the label counts are mapped to statistics in the form:
        #     (node-ID, feature-index, split-value): (weighted_gini,
        #         (total_n_left, probability_positive_left, gini_left),
        #         (total_n_right, probability_positive_right, gini_right))
        statsRDD = treeRDD.flatMap(partial(split_statistics, tree_root, candidate_splits)) \
            .reduceByKey(result_adder) \
            .mapValues(to_gini)

        # find the best split for each node in the frontier (lowest gini impurity).
        # the key is converted from (node-ID, feature-index, split-value) to just node-ID;
        # this allows us to reduce and get, for each node ID, only the split with the lowest gini impurity
        best_splits = statsRDD.map(shift_key_3_to_1) \
            .reduceByKey(get_purest_split) \
            .collectAsMap()

        # start collecting the new level of children for the next iteration
        new_frontier = []

        # for each node, do the best split we found
        for node in frontier:
            # bail on this iteration if no splits were found
            if node.id not in best_splits:
                continue
            split_data = best_splits[node.id]
            (split_feat, split_val), _, (left_n, left_proba, left_gini), (right_n, right_proba, right_gini) = split_data

            # bail if there aren't enough data points to make the split
            if left_n < self.min_per_node or right_n < self.min_per_node:
                continue

            # create the child nodes, and give them the correct ranges
            parent_range = node.feat_range(split_feat)
            left = TreeNode(next(node_id_counter), left_n, left_gini, left_proba)
            right = TreeNode(next(node_id_counter), right_n, right_gini, right_proba)
            if split_feat not in self.categorical_features_info:
                left.ranges = { split_feat: (parent_range[0], split_val + 1) }
                right.ranges = { split_feat: (split_val + 1, parent_range[1]) }
                node.split(split_feat, split_val, left, right)
            else:
                parent_range_list = sorted(parent_range,
                                           key=lambda el: category_stats[node.id][split_feat].get(el, 0))
                split_val_set = set(parent_range_list[:split_val + 1])
                left.ranges = { split_feat: split_val_set }
                right.ranges = { split_feat: parent_range - split_val_set }
                node.split(split_feat, split_val_set, left, right)

            # only add these to the frontier if they aren't homogeneous
            if 0 < left.probability and left.probability < 1:
                new_frontier.append(left)
            if 0 < right.probability and right.probability < 1:
                new_frontier.append(right)

        depth += 1
        # bail if the depth exceeds the max
        if depth >= self.max_depth:
            frontier = []
        else:
            frontier = new_frontier

    # remove from cache before returning
    treeRDD.unpersist()
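# A hedged sketch of the two reducers referenced in `train` above, to make the key-value
# scheme concrete. `result_adder` presumably sums the per-example
# [[neg_left, pos_left], [neg_right, pos_right]] counts element-wise, and
# `get_purest_split` presumably keeps whichever candidate split has the lower weighted
# gini (the first element after the key shift). Both bodies are assumptions for
# illustration, not the project's actual implementations.
import numpy as np

def result_adder_sketch(counts1, counts2):
    # element-wise sum of the left/right label-count matrices
    return np.add(counts1, counts2)

def get_purest_split_sketch(split1, split2):
    # after shift_key_3_to_1 each value looks like:
    #     ((feature-index, split-value), weighted_gini, stats_left, stats_right)
    # keep the candidate with the lower weighted gini
    return split1 if split1[1] <= split2[1] else split2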