Example #1

from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter

if __name__ == '__main__':
    parser = ArgumentParser(description='Extract features of a dataset.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('dataset',
        help='Path to the directory containing folders for each class that \
        contain the images and metadata')
    parser.add_argument('-i', '--image', action='store_true',
        help='Whether to extract visual features from the image')
    parser.add_argument('-t', '--textual', action='store_true',
        help='Whether to extract textual features; requires JSON metadata \
        files for all the images')
    parser.add_argument('-v', '--vocabulary', default='<dataset>/vocabulary.json',
        help='Filename of the JSON vocabulary used for bag of words')
    parser.add_argument('-o', '--output', default='<dataset>/features.json',
        help='Where to store the extracted features')
    args = parser.parse_args()

    args.output = args.output.replace('<dataset>', args.dataset)
    args.vocabulary = args.vocabulary.replace('<dataset>', args.dataset)

    assert args.image or args.textual, 'Need at least one feature source'
    if args.textual:
        assert args.vocabulary, 'Vocabulary is needed for textual features'
        WordsFeature.load_vocabulary(args.vocabulary)

    dataset = Dataset()
    dataset.read(args.dataset, args.image, args.textual)
    dataset.save(args.output)
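
The same extraction flow can also be driven programmatically. A minimal sketch, assuming Dataset and WordsFeature can be imported from the project's own modules (their import paths are not shown in the snippet) and using a placeholder dataset directory:

# hypothetical import paths; the snippet does not show where these classes live
from dataset import Dataset
from features import WordsFeature

WordsFeature.load_vocabulary('data/commons/vocabulary.json')  # required for textual features
dataset = Dataset()
dataset.read('data/commons', True, True)  # dataset directory, image features, textual features
dataset.save('data/commons/features.json')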

Example #2

def classify_images(self, keywords=None, limit=25):
    """
    Worker task to retrieve a list of automatically categorized images from
    Wikimedia Commons for a given list of keywords.
    """
    if not keywords:
        raise ValueError('At least one keyword is required')

    with app.app_context():

        def supported_extractors():
            extractors = []
            extractors.append(SizeFeature())
            extractors.append(ColorFeature())
            extractors.append(HistogramFeature())
            extractors.append(GradientFeature())
            extractors.append(FaceFeature(app.config['FACE_CLASSIFIER']))
            extractors.append(GeoFeature())
            extractors.append(FormatFeature())
            extractors.append(
                WordsFeature.create_from(app.config['WORDS_CONFIG']))
            return extractors

        def create_response_entry(label, sample):
            return {
                'thumbnail': sample.thumbnail,
                'image': sample.url,
                'label': label,
                'title': sample.url
            }

        def create_response(entries):
            return {'current': 100, 'total': 100, 'result': entries}

        # keep track of progress
        progress_observer = ProgressObserver(self)
        progress_observer.update(5)

        # query DBpedia for related images based on the given keywords
        if limit > app.config['QUERY_LIMIT']:
            limit = app.config['QUERY_LIMIT']
        searchterm = ' '.join(keywords)
        uris = fetch_uris_from_metadata(searchterm, limit, multiple=False)
        progress_observer.update(20)

        # download images and metadata into temp folder with unique task id
        temp_folder = os.path.join(app.config['DOWNLOAD_DIRECTORY'],
                                   classify_images.request.id)
        images_and_metadata(uris,
                            temp_folder,
                            False,
                            observer=progress_observer)
        progress_observer.update(80)

        # load dataset and extract features
        dataset = Dataset(logging=True)
        dataset.read(root=temp_folder,
                     extractors=supported_extractors(),
                     unlabeled_data=True)
        with open(app.config['DATASET_CONFIG']) as config_file:
            dataset_config = json.load(config_file)
        dataset.means = dataset_config['means']
        dataset.stds = dataset_config['stds']
        dataset.normalize()
        progress_observer.update(90)

        # predict labels using the trained classifier
        classifier = joblib.load(app.config['WIKIMEDIA_CLASSIFIER'])
        predictions = classifier.predict(dataset.data)
        progress_observer.update(95)

        # build response
        suggestions = []
        for index, sample in enumerate(dataset.samples):
            label = predictions[index].item()  # convert the NumPy scalar to a plain Python value
            entry = create_response_entry(label, sample)
            suggestions.append(entry)
        result = create_response(suggestions)

        # cleanup temporary directory
        delete_directory(temp_folder)

        progress_observer.update(100)

        return result
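
The self parameter and classify_images.request.id suggest that this function is registered as a bound Celery task. If so, a caller would typically dispatch it asynchronously and poll for the result, roughly as sketched below; the keyword values and timeout are placeholders:

# hedged sketch: assumes classify_images is a bound Celery task, as the self
# argument and classify_images.request.id suggest
async_result = classify_images.delay(keywords=['lighthouse', 'coast'], limit=10)
response = async_result.get(timeout=300)
# response has the shape built by create_response():
# {'current': 100, 'total': 100, 'result': [{'thumbnail': ..., 'image': ..., 'label': ..., 'title': ...}, ...]}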
Example #4
    args = parser.parse_args()

    args.output = args.output.replace('<dataset>', args.dataset)
    assert args.visual or args.textual, 'Need at least one feature source'

    extractors = []
    if args.visual:
        extractors.append(SizeFeature())
        extractors.append(ColorFeature())
        extractors.append(HistogramFeature())
        extractors.append(GradientFeature())
        # extractors.append(BlobFeature())
        # extractors.append(BriefFeature())
        if os.path.isfile(args.trained_faces):
            extractors.append(FaceFeature(args.trained_faces))
        else:
            print('Skipping face feature because the trained face detector was '
                  'not found at {}.'.format(args.trained_faces))
    if args.textual:
        samples = read_samples(args.dataset)
        if os.path.isfile(args.stopwords):
            args.stopwords = open(args.stopwords)
        extractors.append(GeoFeature())
        extractors.append(FormatFeature())
        extractors.append(WordsFeature(samples, args.stopwords))
        # extractors.append(RandomFeature())

    dataset = Dataset(logging=True)
    dataset.read(args.dataset, extractors)
    dataset.save(args.output)
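
This snippet begins after the argument parser has already been built. Judging from the attributes it reads (dataset, visual, textual, output, trained_faces, stopwords), the omitted setup would look roughly like the inferred sketch below; the flag names, help strings, and defaults are assumptions rather than code from the original:

# inferred sketch of the omitted parser setup; flags and defaults are assumptions
parser = ArgumentParser(description='Extract features of a dataset.')
parser.add_argument('dataset', help='Directory with one folder per class')
parser.add_argument('--visual', action='store_true')
parser.add_argument('--textual', action='store_true')
parser.add_argument('-o', '--output', default='<dataset>/features.json')
parser.add_argument('--trained-faces', default='path/to/haarcascade_frontalface_default.xml')
parser.add_argument('--stopwords', default='path/to/stopwords.txt')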