Ejemplo n.º 1
0
def train_sample(feature_str, label, pos_train=0.5, neg_train=1000):
    """Train the classifier on a random sample of hashes for `label`.

    Positive hashes are the first element of every entry returned by
    `cass.buffered_get_row(cf_labels, label)`; negative hashes are all the
    remaining hashes from `cass.get_image_hashes()`. This resembles one round
    of cross validation (see classipy.cross_validation), except that the
    sampled (label, value) pairs are stored in the module-level
    `label_values` so they stay around for display. Only training is
    performed here.

    Args:
        feature_str: Feature identifier forwarded to
            `cass.get_feature_values`.
        label: Row key looked up in `cf_labels` to find the positive hashes.
        pos_train: Size of the positive training sample.
            0 < pos_train <= 1: the portion of the positive hashes to use.
            1 < pos_train: the number of positive hashes to use.
        neg_train: Size of the negative training sample, interpreted the
            same way as `pos_train`.

    Raises:
        ValueError: If fewer positive or negative hashes are available than
            were requested.
    """
    all_hashes = set(cass.get_image_hashes())
    pos_hashes = [row[0] for row in cass.buffered_get_row(cf_labels, label)]
    neg_hashes = list(all_hashes - set(pos_hashes))

    # Values in (0, 1] are fractions of the available hashes; anything else
    # is taken as an absolute count.
    if 0 < pos_train <= 1:
        pos_train = int(pos_train * len(pos_hashes))
    if 0 < neg_train <= 1:
        neg_train = int(neg_train * len(neg_hashes))

    # Fail with a descriptive message before random.sample gets the chance
    # to raise its generic "sample larger than population" error.
    if len(pos_hashes) < pos_train:
        raise ValueError('Not enough positive examples %s(%d)' %
                         (label, len(pos_hashes)))
    if len(neg_hashes) < neg_train:
        raise ValueError('Not enough negative examples %s(%d)' %
                         (label, len(neg_hashes)))

    pos_sample = random.sample(pos_hashes, pos_train)
    neg_sample = random.sample(neg_hashes, neg_train)

    # Labels are built in the same order as the hashes handed to
    # get_feature_values: negatives first, then positives.
    labels = [-1] * len(neg_sample) + [1] * len(pos_sample)
    values = cass.get_feature_values(feature_str, neg_sample + pos_sample)

    global label_values
    # Materialize the pairs: on Python 3 zip() is lazy, so len() below would
    # fail and the pairs could only be consumed once.
    label_values = list(zip(labels, values))

    print('Training classifier with sample %d' % len(label_values))
    train_classifier(label_values)
Ejemplo n.º 2
0
def split_train_test(label, pos_train=0.5, neg_train=1000):
    """Splits the data into a training and testing sample.

    Positive hashes are the first element of every entry returned by
    `cass.buffered_get_row(cf_labels, label)`; negative hashes are all the
    remaining hashes from `cass.get_image_hashes()`.

    Arguments:
       label: Row key looked up in `cf_labels` to find the positive hashes.
       pos_train:
          0 < pos_train <= 1.0: the portion of the set to use for training
          1 < pos_train: the number of images to use for training
       neg_train: Size of the negative training sample, interpreted the
          same way as pos_train.

    Returns:
       (train, test): Training and testing set, where

          train: List of tuples for a training set
             [(-1|1, hash), ...]

          test: List of tuples for a testing set
             [(-1|1, hash), ...]
             Holds every positive hash (including those used for training)
             plus an equally sized, independently drawn random sample of
             negatives, so it may overlap with the training set.

    Raises:
       ValueError: If fewer positive or negative hashes are available than
          the training or testing samples require.
    """
    all_hashes = set(cass.get_image_hashes())
    pos_hashes = [row[0] for row in cass.buffered_get_row(cf_labels, label)]
    # random.sample requires a sequence; passing a set is rejected on
    # Python 3.11+, so the negatives are kept as a list.
    neg_hashes = list(all_hashes - set(pos_hashes))

    # Values in (0, 1] are fractions of the available hashes; anything else
    # is taken as an absolute count.
    if 0 < pos_train <= 1:
        pos_train = int(pos_train * len(pos_hashes))
    if 0 < neg_train <= 1:
        neg_train = int(neg_train * len(neg_hashes))

    # Fail with a descriptive message before random.sample gets the chance
    # to raise its generic "sample larger than population" error. The
    # testing set draws len(pos_hashes) negatives, so that many are needed
    # as well.
    if len(pos_hashes) < pos_train:
        raise ValueError('Not enough positive examples %s(%d)' %
                         (label, len(pos_hashes)))
    if len(neg_hashes) < max(neg_train, len(pos_hashes)):
        raise ValueError('Not enough negative examples %s(%d)' %
                         (label, len(neg_hashes)))

    # Sample from pos and neg
    pos_sample = random.sample(pos_hashes, pos_train)
    neg_sample = random.sample(neg_hashes, neg_train)
    train = [(-1, k) for k in neg_sample] + [(1, k) for k in pos_sample]

    # Use all of the positive images for testing for now, balanced by a
    # fresh draw of the same number of negatives.
    test_neg_sample = random.sample(neg_hashes, len(pos_hashes))
    test = [(-1, k) for k in test_neg_sample] + [(1, k) for k in pos_hashes]

    print('Train: %d   Test: %d' % (len(train), len(test)))

    return train, test
Ejemplo n.º 3
0
def build_label_index(buffer_size=1000):
    """Build the label index in `cf_labels` from the image metadata.

    Each entry of the 'image_metadata' row in `cass.cf_images` is decoded as
    JSON. When its 'filename' value begins with `<label>-<digits>.jpg`, the
    entry's key is inserted as a column with an empty value under the row
    `<label>` of `cf_labels`. Entries whose filename does not match are
    skipped. The number of inserted items is printed at the end.

    Args:
        buffer_size: Number of (label, column) pairs pulled from the metadata
            stream at a time before they are inserted.
    """
    # Raw string keeps the regex escapes intact without relying on Python
    # passing unknown string escapes through; compiled once for all entries.
    filename_re = re.compile(r'(\w+)-(?:\d+)\.jpg')

    def reverse(items):
        """Yield (label, {key: ''}) for every entry with a matching filename."""
        for key, raw in items:
            metadata = json.loads(raw)
            match = filename_re.match(metadata['filename'])
            if match:
                yield match.group(1), {key: ''}

    entries = reverse(cass.buffered_get_row(cass.cf_images, 'image_metadata'))

    # Insert into cass by chunks; an empty chunk means the stream is
    # exhausted (a generator object itself is always truthy).
    count = 0
    while True:
        chunk = list(itertools.islice(entries, buffer_size))
        if not chunk:
            break
        for row, column in chunk:
            cf_labels.insert(row, column)
        count += len(chunk)

    print("Built index with %d items" % count)