Example No. 1
0
File: main.py Project: kldcr/USN
def train(logger, args, example_idx):
    """Run maximum-likelihood (MLE) pre-training of the GAN generator.

    Args:
        logger: Logger used for progress messages.
        args: Hyper-parameter namespace; embedding/vocabulary sizes are
            filled in here from the loaded data.
        example_idx: Example index forwarded to ``train_generator_MLE``.
    """
    # Load embeddings, vocabulary and the three data splits, then record
    # their dimensions on the shared args namespace.
    embed, vocab, train_data, val_data, test_data = load_data(logger, args)
    args.embed_num = len(embed)
    args.embed_dim = len(embed[0])
    args.user_num = vocab.user_num
    args.product_num = vocab.product_num

    # Wrap the raw splits in batched iterators (training shuffled,
    # validation in fixed order; both drop the last partial batch).
    train_iter = DataLoader(dataset=Dataset(train_data),
                            batch_size=args.batch_size,
                            shuffle=True,
                            drop_last=True)
    val_iter = DataLoader(dataset=Dataset(val_data),
                          batch_size=args.batch_size,
                          shuffle=False,
                          drop_last=True)

    # Build the generator half of the GAN.
    gen = Generator(args, embed)

    # Move the model onto the configured GPU devices when requested.
    if args.use_cuda:
        gen = nn.DataParallel(gen.cuda(), device_ids=args.gpu_id)

    # Pre-train the generator with teacher-forced MLE.
    logger.info('Starting Generator MLE Training...')
    gen_optimizer = optim.Adam(gen.parameters(), lr=args.lr)
    train_generator_MLE(logger, args, gen, gen_optimizer,
                        args.MLE_TRAIN_EPOCHS, vocab, train_data, example_idx,
                        train_iter, val_iter)
Example No. 2
0

if __name__ == '__main__':
    # Command-line entry point: extract visual and/or textual features
    # from a dataset directory and write them to a JSON file.
    parser = ArgumentParser(
        description='Extract features of a dataset.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'dataset',
        help='Path to the directory containing folders for each class that \
        contain the images and metadata')
    parser.add_argument(
        '-i', '--image', action='store_true',
        help='Whether to extract visual features from the image')
    parser.add_argument(
        '-t', '--textual', action='store_true',
        help='Whether to extract textual features; requires JSON metadata \
        files for all the images')
    parser.add_argument(
        '-v', '--vocabulary', default='<dataset>/vocabulary.json',
        help='Filename of the JSON vocabulary used for bag of words')
    parser.add_argument(
        '-o', '--output', default='<dataset>/features.json',
        help='Where to store the extracted features')
    args = parser.parse_args()

    # Expand the <dataset> placeholder in the derived file paths.
    args.output = args.output.replace('<dataset>', args.dataset)
    args.vocabulary = args.vocabulary.replace('<dataset>', args.dataset)

    assert args.image or args.textual, 'Need at least one feature source'
    if args.textual:
        # Bag-of-words extraction needs a prebuilt vocabulary file.
        assert args.vocabulary, 'Vocabulary is needed for textual features'
        WordsFeature.load_vocabulary(args.vocabulary)

    # Read the dataset, extract the requested features, persist the result.
    dataset = Dataset()
    dataset.read(args.dataset, args.image, args.textual)
    dataset.save(args.output)
def iterate_overall_texts(root):
    """Yield the preprocessed text of every image in every class folder
    under *root*."""
    walker = Dataset()._walk_directories(root)
    for subdir in walker:
        for text in iterate_texts(os.path.join(root, subdir)):
            yield text
def iterate_texts(directory):
    """Yield preprocessed text (url, title, description) for each image
    file found in *directory*."""
    for name in Dataset()._walk_images(directory):
        meta = get_metadata(os.path.join(directory, name))
        yield preprocess_text(meta['url'], meta['title'],
                              meta['description'])
        '--limit',
        type=int,
        default=20,
        help='Maximal amount of words to display for each class')
    # Optional output path; the <dataset> placeholder is expanded below.
    parser.add_argument(
        '-o',
        '--output',
        default='<dataset>/vocabulary.json',
        help='Filename of the JSON vocabulary that will be written')
    args = parser.parse_args()

    args.output = args.output.replace('<dataset>', args.dataset)

    # Word frequencies over the whole dataset, used as the background
    # distribution for the per-class TF-IDF below.
    text = iterate_overall_texts(args.dataset)
    overall = get_frequencies(text)

    # For each class folder: rank its words by TF-IDF against the overall
    # frequencies and keep the top `limit` words as that class's entry.
    vocabulary = {}
    for directory in Dataset()._walk_directories(args.dataset):
        print_headline(directory)
        texts = iterate_texts(os.path.join(args.dataset, directory))
        frequencies = get_frequencies(texts)
        frequencies = compute_tfidf(frequencies, overall)
        synonyms = list(get_top_frequencies(frequencies, args.limit).keys())
        vocabulary[directory] = synonyms
        print_frequencies(frequencies, args.limit)

    print('')
    print('Write vocabulary to', args.output)
    # NOTE(review): the file handle from open() is never closed explicitly;
    # consider a with-statement.
    json.dump(vocabulary, open(args.output, 'w'))
    print('Done')
def classify_images(self, keywords=None, limit=25):
    """
    Worker task to retrieve a list of automatically categorized images from
    Wikimedia Commons from a given list of keywords.

    Args:
        self: bound task instance, used for progress reporting via
            ``ProgressObserver`` (presumably a Celery task — the download
            folder is keyed on ``classify_images.request.id``).
        keywords: non-empty list of search terms.  The previous mutable
            default argument ``[]`` was replaced by a ``None`` sentinel;
            callers see identical behaviour.
        limit: maximum number of images to fetch, silently capped at
            ``app.config['QUERY_LIMIT']``.

    Returns:
        dict with 'current', 'total' and 'result' keys; 'result' is a list
        of per-image entries (thumbnail, image URL, predicted label, title).

    Raises:
        AssertionError: if ``keywords`` is empty or missing.
    """
    # Fix: mutable default argument ([]) replaced with the None sentinel.
    if keywords is None:
        keywords = []
    if not len(keywords):
        raise AssertionError

    with app.app_context():

        def supported_extractors():
            # Feature extractors matching those the classifier was
            # trained with.
            extractors = []
            extractors.append(SizeFeature())
            extractors.append(ColorFeature())
            extractors.append(HistogramFeature())
            extractors.append(GradientFeature())
            extractors.append(FaceFeature(app.config['FACE_CLASSIFIER']))
            extractors.append(GeoFeature())
            extractors.append(FormatFeature())
            extractors.append(
                WordsFeature.create_from(app.config['WORDS_CONFIG']))
            return extractors

        def create_response_entry(label, sample):
            # One result row for the client.
            return {
                'thumbnail': sample.thumbnail,
                'image': sample.url,
                'label': label,
                'title': sample.url
            }

        def create_response(entries):
            return {'current': 100, 'total': 100, 'result': entries}

        # keep track of progress
        progress_observer = ProgressObserver(self)
        progress_observer.update(5)

        # query dbpedia for related images based on given keywords
        if limit > app.config['QUERY_LIMIT']:
            limit = app.config['QUERY_LIMIT']
        searchterm = ' '.join(keywords)
        uris = fetch_uris_from_metadata(searchterm, limit, multiple=False)
        progress_observer.update(20)

        # download images and metadata into temp folder with unique task id
        temp_folder = os.path.join(app.config['DOWNLOAD_DIRECTORY'],
                                   classify_images.request.id)
        images_and_metadata(uris,
                            temp_folder,
                            False,
                            observer=progress_observer)
        progress_observer.update(80)

        # load dataset and extract features; normalize with the training-set
        # statistics so the inputs match what the classifier was fitted on
        dataset = Dataset(logging=True)
        dataset.read(root=temp_folder,
                     extractors=supported_extractors(),
                     unlabeled_data=True)
        # Fix: close the config file handle instead of leaking it.
        with open(app.config['DATASET_CONFIG']) as config_file:
            dataset_config = json.load(config_file)
        dataset.means = dataset_config['means']
        dataset.stds = dataset_config['stds']
        dataset.normalize()
        progress_observer.update(90)

        # predict labels using the trained classifier
        classifier = joblib.load(app.config['WIKIMEDIA_CLASSIFIER'])
        predictions = classifier.predict(dataset.data)
        progress_observer.update(95)

        # build response
        suggestions = []
        for index, sample in enumerate(dataset.samples):
            # Fix: np.asscalar() was removed in NumPy 1.23;
            # ndarray.item() is the documented replacement.
            label = predictions[index].item()
            entry = create_response_entry(label, sample)
            suggestions.append(entry)
        result = create_response(suggestions)

        # cleanup temporary directory
        delete_directory(temp_folder)

        progress_observer.update(100)

        return result
Example No. 7
0
def read_samples(root):
    """Load and return the raw samples found under *root* via the
    Dataset helper."""
    return Dataset()._read_samples(root)
Example No. 8
0
    args = parser.parse_args()

    # Expand the <dataset> placeholder; at least one feature family is
    # required for extraction to make sense.
    args.output = args.output.replace('<dataset>', args.dataset)
    assert args.visual or args.textual, 'Need at least one feature source'

    extractors = []
    if args.visual:
        # Image-based feature extractors.
        extractors.append(SizeFeature())
        extractors.append(ColorFeature())
        extractors.append(HistogramFeature())
        extractors.append(GradientFeature())
        # extractors.append(BlobFeature())
        # extractors.append(BriefFeature())
        # The face feature needs a trained detector file; skip gracefully
        # when it is missing rather than crashing later.
        if os.path.isfile(args.trained_faces):
            extractors.append(FaceFeature(args.trained_faces))
        else:
            print('Skip face feature since the trained face detector was not '
                  'found at {}.'.format(args.trained_faces))
    if args.textual:
        # Metadata-based extractors; WordsFeature builds its vocabulary
        # from the dataset's samples (and an optional stopword list).
        samples = read_samples(args.dataset)
        if os.path.isfile(args.stopwords):
            # NOTE(review): args.stopwords is rebound from a path string to
            # an open file handle here — presumably WordsFeature accepts
            # either; the handle is never closed explicitly.
            args.stopwords = open(args.stopwords)
        extractors.append(GeoFeature())
        extractors.append(FormatFeature())
        extractors.append(WordsFeature(samples, args.stopwords))
        # extractors.append(RandomFeature())

    dataset = Dataset(logging=True)
    dataset.read(args.dataset, extractors)
    dataset.save(args.output)