def train_sample(feature_str, label, pos_train=0.5, neg_train=1000):
    """Train a classifier on a random sample of positive/negative images.

    Positives are the hashes stored under ``label`` in ``cf_labels``;
    negatives are every other hash returned by ``cass.get_image_hashes()``.
    The sampled ``(label, value)`` pairs are stored in the module-level
    ``label_values`` before being passed to ``train_classifier``, so they
    remain available afterwards (e.g. for display).  This is comparable to
    one round of cross validation (see classipy.cross_validation).

    Args:
        feature_str: Feature identifier, passed unchanged to
            ``cass.get_feature_values``.
        label: Row key looked up in ``cf_labels``; the first element of each
            returned item is used as a positive image hash.
        pos_train: If ``0 < pos_train <= 1``, the fraction of the available
            positives to sample (truncated to an int); otherwise the number
            of positives to sample.
        neg_train: Same convention as ``pos_train``, applied to negatives.

    Raises:
        ValueError: If fewer positive or negative examples are available
            than requested.
    """
    global label_values

    known = set(cass.get_image_hashes())
    positives = [row[0] for row in cass.buffered_get_row(cf_labels, label)]
    negatives = list(known.difference(positives))

    # Values in (0, 1] are fractions of the available pool; anything else is
    # taken as an absolute count.
    n_pos = int(pos_train * len(positives)) if 0 < pos_train <= 1 else pos_train
    n_neg = int(neg_train * len(negatives)) if 0 < neg_train <= 1 else neg_train

    if n_pos > len(positives):
        raise ValueError('Not enough positive examples %s(%d)'
                         % (label, len(positives)))
    if n_neg > len(negatives):
        raise ValueError('Not enough negative examples %s(%d)'
                         % (label, len(negatives)))

    # Positives are drawn before negatives.
    chosen_pos = random.sample(positives, n_pos)
    chosen_neg = random.sample(negatives, n_neg)

    # Negatives come first in both the label list and the feature request so
    # that labels and values line up pairwise.
    labels = [-1] * len(chosen_neg) + [1] * len(chosen_pos)
    values = cass.get_feature_values(feature_str, chosen_neg + chosen_pos)

    label_values = list(zip(labels, values))
    print('Training classifier with sample %d' % len(label_values))
    train_classifier(label_values)
def split_train_test(label, pos_train=0.5, neg_train=1000):
    """Build a training sample and a testing sample of image hashes.

    Positives are the hashes stored under ``label`` in ``cf_labels``;
    negatives are every other hash returned by ``cass.get_image_hashes()``.

    Note that the two sets are NOT disjoint: for now the test set contains
    *all* positives (including those chosen for training) plus an
    independently drawn negative sample of the same size.

    Arguments:
        label: Row key looked up in ``cf_labels``; the first element of each
            returned item is used as a positive image hash.
        pos_train:
            0 < pos_train <= 1.0: the portion of the positives to use for
                training (truncated to an int)
            1 < pos_train: the number of positives to use for training
        neg_train: Same convention as ``pos_train``, applied to negatives.

    Returns:
        (train, test): Training and testing set, where
            train: List of tuples for a training set [(-1|1, hash), ...]
            test: List of tuples for a testing set [(-1|1, hash), ...]
        Negatives precede positives in both lists.

    Raises:
        ValueError: If fewer positives or negatives are available than
            requested for training, or if there are fewer negatives than
            positives (the test set needs one negative per positive).
    """
    all_hashes = set(cass.get_image_hashes())
    pos_hashes = [row[0] for row in cass.buffered_get_row(cf_labels, label)]
    # Keep negatives as a list: random.sample requires a sequence (sampling
    # from a set is rejected by Python 3.11+).
    neg_hashes = list(all_hashes - set(pos_hashes))

    if 0 < pos_train <= 1:
        pos_train = int(pos_train * len(pos_hashes))
    if 0 < neg_train <= 1:
        neg_train = int(neg_train * len(neg_hashes))

    # Fail early with a message naming the label instead of letting
    # random.sample raise a generic "sample larger than population" error.
    if len(pos_hashes) < pos_train:
        raise ValueError('Not enough positive examples %s(%d)'
                         % (label, len(pos_hashes)))
    if len(neg_hashes) < neg_train:
        raise ValueError('Not enough negative examples %s(%d)'
                         % (label, len(neg_hashes)))
    if len(neg_hashes) < len(pos_hashes):
        raise ValueError('Not enough negative examples for testing %s(%d)'
                         % (label, len(neg_hashes)))

    # Sample from pos and neg for training.
    pos_sample = random.sample(pos_hashes, pos_train)
    neg_sample = random.sample(neg_hashes, neg_train)
    train = [(-1, k) for k in neg_sample] + [(1, k) for k in pos_sample]

    # Use all of the positive images for testing for now, balanced by an
    # equally sized, freshly drawn negative sample.
    test_neg_sample = random.sample(neg_hashes, len(pos_hashes))
    test = [(-1, k) for k in test_neg_sample] + [(1, k) for k in pos_hashes]

    print('Train: %d Test: %d' % (len(train), len(test)))
    return train, test