Example #1
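# Assumed imports for this snippet (not shown in the original):
import random
import sys
from bson.objectid import ObjectId  # assuming MongoDB/pymongo-style ids
# parseArguments, pcoll, icoll and progress_bar are project-local helpers.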
def fetchPool(argv):
    args = parseArguments()
    
    if args.test_mode:
        if args.out:
            with open(args.out, 'w') as f:
                f.write(args.pool)
        return

    prev_poolId = ObjectId(args.pool)  # id of the source pool being resampled
    poolId = pcoll.makePool(args.description)
    sys.stdout.write("Pool created with ID: %s\n" % poolId)
    
    pool_size = pcoll.getPoolSize(args.pool)

    for index, row in enumerate(icoll.getPoolUrlsIterator(args.pool)):
        pool_info = [p for p in row['pools'] if p['poolId'] == prev_poolId][0]
        if random.random() < args.rate:  # keep each image with probability args.rate
            icoll.assignToPool(row, poolId, pool_info['target'])
        progress_bar.printProgress(index + 1, pool_size)

    if args.out:
        with open(args.out, 'w') as f:
            f.write(str(poolId))

    return
Example #2
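# Assumed context (not shown in the original): parseArguments, pcoll, icoll
# and progress_bar are project-local helpers; no extra imports are required.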
def makePool(argv):
    args = parseArguments()

    poolId = pcoll.makePool(args.description)
    print("Getting random images...")
    imageUrls = list(icoll.findRandomImageUrlsDocuments(args.capacity))

    print "Assigning images to the pool"
    imageUrlsSize = len(imageUrls)
    for index, row in enumerate(imageUrls, 1):
        icoll.assignToPool(row, poolId, 1)
        progress_bar.printProgress(index, imageUrlsSize)

    return poolId
Example #3
def fetch_validation(argv):
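    # Assumed context (not shown in the original): Predictor, vcoll and
    # progress_bar are project-local helpers; the iterator yields a
    # pymongo-style cursor, hence cursor.batch_size below.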
    args = parseArguments()

    if args.test_mode:
        print("TEST MODE")

    predictor = Predictor(args.model_id)

    i = 0
    totalCount = vcoll.getSize()
    with vcoll.getImageIterator() as cursor:
        cursor.batch_size(100)
        for row in cursor:
            if 'slices' in row:
                prediction = predictor.predict_on_slices(row['slices'])[0]
                vcoll.updatePrediction(row, args.model_id, prediction)
            i += 1
            progress_bar.printProgress(i, totalCount)

        print()
Example #4
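# Assumed imports for this snippet (not shown in the original):
import cv2
# sliceFactory, vcoll, image_handler and progress_bar are project-local helpers.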
def fetch_validation(argv):
    args = parseArguments()

    if args.test_mode:
        print("TEST MODE")

    i = 0
    totalCount = vcoll.getSize()
    extractors = sliceFactory.getExtractors()
    with vcoll.getImageIterator() as cursor:
        cursor.batch_size(100)
        for row in cursor:
            im = None
            slices = {}
            for extractor in extractors:
                extractor_name = extractor.getName()
                if ('slices' in row and extractor_name in row['slices'] and
                        row['slices'][extractor_name]['version'] == extractor.getVersion()):
                    continue  # no need to go on if features already extracted with the current version
                if im is None:
                    im = image_handler.get_image(row['path'])
                    # scale so the smaller image dimension becomes 500 px
                    ratio = max(500. / im.shape[0], 500. / im.shape[1])
                    im = cv2.resize(im, (0, 0), fx=ratio, fy=ratio)
                features = extractor.extract(im)
                if features[0] is not None:
                    slices[extractor_name] = {
                        'features': features[0].tolist(),
                        'version': extractor.getVersion(),
                    }

            if len(slices) > 0:
                vcoll.updateSlices(row, slices)

            i += 1
            progress_bar.printProgress(i, totalCount)

        print()
Example #5
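# Assumed imports for this snippet (not shown in the original):
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
# config, pcoll, icoll, sliceFactory, image_handler, progress_bar and
# downloadAndUpdateImage are project-local helpers.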
def fetchPool(argv):
    args = parseArguments()

    if args.test_mode:
        print("TEST MODE")
        return

    workingDir = config.getDataPath()
    storePath = os.path.join(workingDir, 'images')

    totalCount = pcoll.getPoolSize(args.pool)
    if not args.ignore_caching:
        i = 0
        futures = []
        with ThreadPoolExecutor(max_workers=args.threads_number) as executor:
            with icoll.getPoolUrlsIterator(args.pool) as cursor:
                cursor.batch_size(100)
                for row in cursor:
                    future = executor.submit(downloadAndUpdateImage, storePath,
                                             row)
                    futures.append(future)
                print()

            print("Executors scheduled")
            for f in as_completed(futures):
                i += 1
                progress_bar.printProgress(i, totalCount)
            print()
        print("Pool cached")
    else:
        print("Ignoring pool caching")
    print("Extracting features...")

    i = 0
    extractors = sliceFactory.getExtractors()
    with icoll.getPoolUrlsIterator(args.pool) as cursor:
        cursor.batch_size(100)
        for row in cursor:
            if row['valid_image']:
                im = None
                slices = {}
                for extractor in extractors:
                    extractor_name = extractor.getName()
                    if ('slices' in row and extractor_name in row['slices'] and
                            row['slices'][extractor_name]['version'] == extractor.getVersion()):
                        continue  # no need to go on if features already extracted with the current version
                    if im is None:
                        im = image_handler.get_image(row['path'])
                    features = extractor.extract(im)
                    if features[0] is not None:
                        slices[extractor_name] = {
                            'features': features[0].tolist(),
                            'version': extractor.getVersion(),
                        }

                if len(slices) > 0:
                    icoll.updateImageSlices(row, slices)

            i += 1
            progress_bar.printProgress(i, totalCount)
        print()

    return
Example #6
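# Assumed imports for this snippet (not shown in the original):
import os
import tempfile
import uuid

import joblib  # shipped as sklearn.externals.joblib in older scikit-learn
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split, cross_val_score
# config, icoll, mcoll, sliceFactory, file_handler and progress_bar are
# project-local helpers.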
def train_classifier(poolId, nId, name, include_test, slices, description,
                     out_model_id):
    slices = sorted(slices)  # keep the slice order deterministic
    slices_set = set(slices)
    if not description:
        description = poolId + ' classification'
    assert nId is not None

    pd.set_option('display.expand_frame_repr', False)

    # check slices and init variables
    extractors = sliceFactory.getExtractors()
    slice_to_version = dict()
    for extractor in extractors:
        extractor_name = extractor.getName()
        if extractor_name not in slices_set:
            continue
        slice_to_version[extractor_name] = extractor.getVersion()
    assert len(slice_to_version) == len(slices)
    slices_descriptor = [{
        'name': extractor_name,
        'version': slice_to_version[extractor_name]
    } for extractor_name in slices]

    targets = []
    features = []
    for r in icoll.getPoolUrlsIterator(poolId, include_test_set=include_test):
        if 'slices' not in r:
            continue
        if not slices_set.issubset(r['slices']):
            continue
        targets.append([
            desc for desc in r['pools'] if str(desc['poolId']) == poolId
        ][0]['target'])
        featureVector = []
        for sliceName in slices:
            if r['slices'][sliceName]['version'] != slice_to_version[sliceName]:
                raise Exception("Slice is outdated")
            featureVector += r['slices'][sliceName]['features']
        features.append(featureVector)

    # note: this split is computed but never used below; model selection
    # relies on cross-validation over the full feature set instead
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        targets,
                                                        test_size=0.33)

    stats = []
    # Try to find best regularization parameters
    iter_values = list(range(0, 10))
    for i, k in enumerate(iter_values):
        C = 2 ** (k / 2.)  # candidate values C = 2**(k/2) for k = 0..9
        clf = svm.SVC(kernel='rbf', C=C)
        accuracy = cross_val_score(clf,
                                   features,
                                   targets,
                                   cv=5,
                                   scoring='accuracy')
        f1 = cross_val_score(clf, features, targets, cv=5, scoring='f1')
        precision = cross_val_score(clf,
                                    features,
                                    targets,
                                    cv=5,
                                    scoring='precision')
        recall = cross_val_score(clf,
                                 features,
                                 targets,
                                 cv=5,
                                 scoring='recall')
        roc_auc = cross_val_score(clf,
                                  features,
                                  targets,
                                  cv=5,
                                  scoring='roc_auc')
        entry = {
            'model': 'SVC',
            'param': C,
            'accuracy': accuracy.mean(),
            'accuracy_interval90': accuracy.std() * 2,
            'f1': f1.mean(),
            'precision': precision.mean(),
            'recall': recall.mean(),
            'roc_auc': roc_auc.mean()
        }
        # print('\t'.join([str(C)] + [str(v) for v in entry.values()]))
        stats.append(entry)
        progress_bar.printProgress(i + 1, len(iter_values))
        print(entry)
    print()

    stats = pd.DataFrame(stats)
    stats.sort_values(by=['f1', 'precision', 'roc_auc'],
                      inplace=True,
                      ascending=False)
    print(stats)

    C = stats.iloc[0]['param']
    estimated_score = dict(stats.iloc[0])

    print("Best regularization parameter so far: %f" % C)
    clf = svm.SVC(kernel='rbf', C=C).fit(features, targets)
    workingDir = config.getDataPath()
    storePath = os.path.join(workingDir, 'models')
    destination = os.path.join(storePath, str(uuid.uuid1()) + '.pkl')

    # tempfile.mktemp() is deprecated and racy; create the file atomically
    fd, tmp_filename = tempfile.mkstemp()
    os.close(fd)
    joblib.dump(clf, tmp_filename, compress=3)
    with open(tmp_filename, 'rb') as fp:
        file_handler.upload_file_stream(destination, fp)
    model_id = mcoll.makeClassificationModel(pool_id=poolId,
                                             description=description,
                                             nId=nId,
                                             slices=slices_descriptor,
                                             estimated_score=estimated_score,
                                             path=destination,
                                             include_test_set=include_test)

    if out_model_id:
        with open(out_model_id, 'w') as f:
            f.write(str(model_id))
Example #7
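# Assumed imports for this snippet (not shown in the original):
import random
import sys
# PREVIEW_COUNT is assumed to be a module-level constant; parseArguments,
# icoll, pcoll and progress_bar are project-local helpers.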
def makePool(argv):
    args = parseArguments()
    if args.test_mode:
        if args.out:
            with open(args.out, 'w') as f:
                f.write('589cb3d60310e95ec7728f63')
        return

    tree = icoll.findChildren(args.target)
    print("Tree of positives:")
    icoll.printTree(tree, depth=1)

    sys.stdout.write("Fetching urls... ")
    sys.stdout.flush()
    image_urls_positives = icoll.getFlattenUrlsTree(tree)
    sys.stdout.write("Done\n")

    description = "%s %s" % (tree['NId'], tree['description'])
    poolId = pcoll.makePool(description)
    sys.stdout.write("Pool created with ID: %s\n" % poolId)

    image_urls_positives_size = len(image_urls_positives)
    preview_prob = float(PREVIEW_COUNT) / image_urls_positives_size
    print("Assigning positives")
    for index, row in enumerate(image_urls_positives):
        icoll.assignToPool(row, poolId, 1, random.random() < preview_prob)
        progress_bar.printProgress(index + 1, image_urls_positives_size)

    parents = icoll.findParents(args.target)

    image_urls_negatives = []
    if len(parents) > 0:
        directParent = parents[0]
        print(("Direct parent node: %s %s" %
               (directParent['NId'], directParent['description'])))
        tree_negatives = icoll.findChildren(directParent['NId'],
                                            maxDepth=1,
                                            excludeNIds=[args.target])
        print("Negatives tree:")
        icoll.printTree(tree_negatives, depth=1)
        image_urls_negatives = icoll.getFlattenUrlsTree(tree_negatives)
        if len(image_urls_negatives) > image_urls_positives_size:
            image_urls_negatives = random.sample(image_urls_negatives,
                                                 image_urls_positives_size)
        print(("Added %d samples from sibling branches" %
               len(image_urls_negatives)))

    # Add extra random negatives
    image_urls_negatives.extend(
        icoll.findRandomImageUrlsDocuments(image_urls_positives_size))

    print("Assigning negatives")
    image_urls_negatives_size = len(image_urls_negatives)
    preview_prob = float(PREVIEW_COUNT) / image_urls_negatives_size
    for index, row in enumerate(image_urls_negatives):
        icoll.assignToPool(row, poolId, 0, random.random() < preview_prob)
        progress_bar.printProgress(index + 1, image_urls_negatives_size)

    if args.out:
        with open(args.out, 'w') as f:
            f.write(str(poolId))