Example #1
def to_gini(results):
    """
    calculate the stats (including gini impurities for each split), along with the weighted average of gini impurities
    """
    results1, results2 = results
    stats1 = stats(results1)
    stats2 = stats(results2)
    if np.isinf(stats1[2]) or np.isinf(stats2[2]):
        return np.inf, stats1, stats2
    total = stats1[0] + stats2[0]
    return stats1[0] / total * stats1[2] + stats2[0] / total * stats2[2], stats1, stats2
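
The stats helper that to_gini relies on is not shown in this snippet. Judging from how its result is unpacked in Example #8 below (n, probability, gini = stats(label_counts)), a minimal sketch might look like the following; the [negative_count, positive_count] input format is an assumption.

import numpy as np

def stats(label_counts):
    # hypothetical sketch: [negative_count, positive_count] ->
    # (total, probability_of_positive, gini_impurity)
    n = sum(label_counts)
    if n == 0:
        # an empty side cannot be split; to_gini treats np.inf as "unusable"
        return 0, 0.0, np.inf
    p = label_counts[1] / float(n)
    gini = 1.0 - p ** 2 - (1.0 - p) ** 2  # binary Gini impurity
    return n, p, gini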
Example #2
def main():
    db.init_db_engine(config.SQLALCHEMY_DATABASE_URI)
    total = db.data.get_count_pending_songs()
    done = 0
    starttime = time.time()
    thisdone, rem = lookup()
    done += thisdone
    while rem > 0:
        thisdone, rem = lookup()
        done += thisdone
        durdelta, remdelta = util.stats(done, total, starttime)
        log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))
Example #3
def main(inputdir, outputdir, mbidfile):
    mbids = open(mbidfile).read().splitlines()

    done = 0
    total = len(mbids)
    starttime = time.time()

    for m in mbids:
        process_one(inputdir, outputdir, m)
        done += 1
        if done % 1000 == 0:
            durdelta, remdelta = util.stats(done, total, starttime)
            log.info("Done %s/%s in %s; %s remaining", done, total, str(durdelta), str(remdelta))
Example #4
def main(inputdir, outputdir, mbidfile):
    mbids = open(mbidfile).read().splitlines()

    done = 0
    total = len(mbids)
    starttime = time.time()

    for m in mbids:
        process_one(inputdir, outputdir, m)
        done += 1
        if done % 1000 == 0:
            durdelta, remdelta = util.stats(done, total, starttime)
            log.info("Done %s/%s in %s; %s remaining", done, total,
                     str(durdelta), str(remdelta))
Example #5
    def test(self):
        tprint('Beginning testing.')
        confusion_matrix = np.zeros((7, 7)).astype(np.int)
        last_print = time.time()
        for minibatch, targets in self.dataset.test:
            minibatch = Variable(torch.stack(minibatch), volatile=True)
            targets = Variable(torch.LongTensor(targets), volatile=True)
            if self.cuda:
                minibatch = minibatch.cuda()
                targets = targets.cuda()
            out = self.model.forward(minibatch)
            _, predicted = torch.max(out.data, 1)
            predicted = predicted.cpu().numpy()
            targets = targets.data.cpu().numpy()
            confusion_matrix += sklearn.metrics.confusion_matrix(
                predicted, targets, labels=[0, 1, 2, 3, 4, 5, 6]).astype(np.int)
            if time.time() - last_print > self.log_interval:
                last_print = time.time()
                numer, denom = self.dataset.test.progress()
                tprint('Testing: %s/%s' % (numer, denom))
        tprint('Testing complete.')
        print(confusion_matrix)
        print(tabulate.tabulate(stats(confusion_matrix)))
Example #6
            return fitnesses
    else:
        with open( genome + ".min" ) as fh:
            key = " ".join( fh.readlines() )
        key = " ".join( key.split() )

        d = shelve.open( genome + ".cache" )
        return d[ key ]

if not os.path.exists( config ):
    infomsg( "no configuration found", file = sys.stderr )
    exit( 1 )

fitness = OrderedDict()

with stats() as record:
    ########
    # Compute baseline fitness
    ########

    if not os.path.isdir( get_storage_dir( ".original" ) ):
        os.makedirs( get_storage_dir( ".original" ) )
    original = get_genome_file( get_storage_dir( ".original" ), ".original" )
    with open( original, 'w' ) as out:
        pass
    fitness[ "original" ] = get_minimized_fitness( original )

    ########
    # Compute, search, and minimize partitions
    ########
Example #7
File: rnn.py Project: vyraun/rnn_lm
        y_softmaxs, glimpses = predict_fn(x)

        # need to have glimpses len match y for the zip so swap
        # h0 "marker" for [None]
        # OMG such clumsy, much hack, terrible API.
        if glimpses[0].__class__ == np.float32:
            glimpses = [None] * len(y)

        for n, (y_true_i, y_softmax, glimpse) in enumerate(zip(y, y_softmaxs, glimpses)):
            print "(%s) -> (%s)" % (rb.LABELS[x[n]], rb.LABELS[y_true_i])
            print "  y_softmax", filter(lambda (label, prob): prob > 1e-4,
                                        sorted(zip(rb.LABELS, y_softmax),
                                               key=lambda (label, prob): -prob)[:5])
            if glimpse is not None:
                glimpses_strs = util.float_array_to_str(glimpse)
                print "  glimpse (f)", zip(rb.tokens_for(x), glimpses_strs[:len(x)])  # first half is from forward pass
                print "  glimpse (b)", zip(rb.tokens_for(x), glimpses_strs[len(x):])  # second half is from backwards pass
            y_true_confidence = float(y_softmax[y_true_i])
            probabilities.append(y_true_confidence)
        prob_seqs.append(probabilities)

    # dump some stats
    stats = OrderedDict()
    stats['epoch'] = epoch
    stats['costs'] = util.stats(costs)
    stats['perplexity'] = util.perplexity_stats(prob_seqs)
    stats['3rd_last'] = util.third_last_stats(prob_seqs)
    stats['epoch_time'] = time.time() - start_time
    stats['sample_gradient_l2_norms'] = sample_gradient_l2_norms
    print "STATS\t%s" % json.dumps(stats)
Example #8
    def train(self, dataRDD):
        """
        trains the model. takes an RDD of tuple (label, feature_vector). The label should be binary,
        and the feature_vector must be a sequence
        """
        # a generator for node IDs
        node_id_counter = count()

        # compute some preliminary statistics
        def seq_op(counts, row):
            label = row[0]
            counts[int(label)] += 1
            return counts
        comb_op = lambda counts1, counts2: [counts1[0] + counts2[0], counts1[1] + counts2[1]]
        label_counts = dataRDD.aggregate([0, 0], seq_op, comb_op)
        n, probability, gini = stats(label_counts)

        # sample the training data to find where the bins should be to quantize the numerical features
        fraction = sample_fraction_for_accurate_splits(n, self.max_bins)
        sample = dataRDD.sample(withReplacement=False, fraction=fraction)
        continuous_bins = sample.flatMap(partial(spread_row, self.categorical_features_info)) \
            .groupByKey() \
            .mapValues(partial(to_bins, self.max_bins)) \
            .collectAsMap()
        self.continuous_bins = continuous_bins
        treeRDD = dataRDD.map(lambda pair: (pair[0], discretize(continuous_bins, pair[1]))).persist()

        # initialize the decision tree
        tree_root = TreeNode(next(node_id_counter), n, gini, probability)
        # give every feature the full range
        tree_root.ranges = {
            i: set(range(self.categorical_features_info[i])) if i in self.categorical_features_info
               else (0, self.max_bins)
            for i in range(self.num_features)
        }
        self.tree_root = tree_root
        # grow the tree level-by-level. this holds the nodes to grow on the next iteration
        frontier = [tree_root]
        depth = 0

        while len(frontier) > 0:
            category_stats = {}
            if len(self.categorical_features_info) > 0:
                category_stats = treeRDD.flatMap(partial(spread_categories, tree_root, self.categorical_features_info)) \
                    .reduceByKey(np.add) \
                    .mapValues(counts_to_prob) \
                    .map(shift_key(1)) \
                    .groupByKey() \
                    .mapValues(dict) \
                    .collect()
                category_stats = to_nested_dict(category_stats)

            # have the master determine which splits should be considered
            candidate_splits = {}
            for node in frontier:
                candidate_splits[node.id] = gen_candidate_splits(node, self.num_features, self.feature_subset_strategy, category_stats)
            # collect statistics about each of the proposed splits.
            # for each training example, take all the candidate splits and emit a key-value pair
            # indicating how that example would get classified. the key-value pair looks like this:
            # (node-ID, feature-index, split-value): [[1, 0], [0, 0]]
            # those classifications are then reduced to add them up, so the values become
            # [[negative-examples in left split, positive-examples in left split],
            #  [negative-examples in right split, positive-examples in right split]]
            # finally, the label counts are mapped to statistics in the form:
            # (node-ID, feature-index, split-value): (weighted_gini,
            #     (total_n_left, probability_positive_left, gini_left),
            #     (total_n_right, probability_positive_right, gini_right))
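            # illustrative (made-up) values: a single reduced record such as
            #   (3, 0, 7): [[12, 4], [1, 9]]
            # would mean that for node 3, splitting feature 0 at value 7 sends
            # 12 negative / 4 positive examples left and 1 negative / 9 positive
            # examples right; to_gini then weights the two Gini impurities by
            # 16/26 and 10/26 respectively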
            statsRDD = treeRDD.flatMap(partial(split_statistics, tree_root, candidate_splits)) \
                .reduceByKey(result_adder) \
                .mapValues(to_gini)

            # find the best split for each node in the frontier (lowest gini impurity)
            # the key is converted from (node-ID, feature-index, split-value) to just node-ID
            # this allows us to reduce and get for each node ID only the split with the lowest gini impurity
            best_splits = statsRDD.map(shift_key_3_to_1) \
                .reduceByKey(get_purest_split) \
                .collectAsMap()
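            # best_splits is keyed by node ID; each value keeps the shape produced above,
            # i.e. ((feature-index, split-value), weighted_gini, left-stats, right-stats),
            # which is exactly how split_data is unpacked below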

            # start collecting the new level of children for the next iteration
            new_frontier = []
            # for each node, do the best split we found
            for node in frontier:
                # bail on this iteration if no splits were found
                if node.id not in best_splits:
                    continue
                split_data = best_splits[node.id]
                (split_feat, split_val), _, (left_n, left_proba, left_gini), (right_n, right_proba, right_gini) = split_data
                # bail if there aren't enough data points to make the split
                if left_n < self.min_per_node or right_n < self.min_per_node:
                    continue

                # create the child nodes, and give them the correct ranges
                parent_range = node.feat_range(split_feat)
                left = TreeNode(next(node_id_counter), left_n, left_gini, left_proba)
                right = TreeNode(next(node_id_counter), right_n, right_gini, right_proba)
                if split_feat not in self.categorical_features_info:
                    left.ranges = { split_feat: (parent_range[0], split_val + 1) }
                    right.ranges = { split_feat: (split_val + 1, parent_range[1]) }
                    node.split(split_feat, split_val, left, right)
                else:
                    parent_range_list = sorted(parent_range, key=lambda el: category_stats[node.id][split_feat].get(el, 0))
                    split_val_set = set(parent_range_list[:split_val + 1])
                    left.ranges = { split_feat: split_val_set }
                    right.ranges = { split_feat: parent_range - split_val_set }
                    node.split(split_feat, split_val_set, left, right)
                # only add these to the frontier if they aren't homogeneous
                if 0 < left.probability < 1:
                    new_frontier.append(left)
                if 0 < right.probability < 1:
                    new_frontier.append(right)

            depth += 1
            # bail if the depth exceeds the max
            if depth >= self.max_depth:
                frontier = []
            else:
                frontier = new_frontier

        # remove from cache before returning
        treeRDD.unpersist()
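
A hedged usage sketch for the train method in Example #8. Only the (label, feature_vector) RDD format comes from the docstring; the DecisionTree class name, its constructor arguments, and the SparkContext setup are assumptions based on the attributes train() reads.

from pyspark import SparkContext

sc = SparkContext(appName="decision-tree-example")

# binary labels with sequence feature vectors, as the docstring requires
rows = [
    (0.0, [1.2, 0.0, 3.4]),
    (1.0, [0.7, 1.0, 2.1]),
    (1.0, [0.9, 1.0, 2.8]),
]
dataRDD = sc.parallelize(rows)

# hypothetical constructor mirroring the attributes train() uses:
# num_features, categorical_features_info, max_bins, max_depth,
# min_per_node, feature_subset_strategy
model = DecisionTree(num_features=3, categorical_features_info={1: 2},
                     max_bins=32, max_depth=5, min_per_node=1,
                     feature_subset_strategy="all")
model.train(dataRDD)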