def train(logger, args, example_idx):
    # load data
    embed, vocab, train_data, val_data, test_data = load_data(logger, args)
    args.embed_num = len(embed)
    args.embed_dim = len(embed[0])
    args.user_num = vocab.user_num
    args.product_num = vocab.product_num
    # data iter
    train_dataset = Dataset(train_data)
    val_dataset = Dataset(val_data)
    train_iter = DataLoader(dataset=train_dataset, batch_size=args.batch_size,
                            shuffle=True, drop_last=True)
    val_iter = DataLoader(dataset=val_dataset, batch_size=args.batch_size,
                          shuffle=False, drop_last=True)
    # define generator and discriminator in GAN
    gen = Generator(args, embed)
    # define GPU devices
    if args.use_cuda:
        gen = gen.cuda()
        gen = nn.DataParallel(gen, device_ids=args.gpu_id)

    # GENERATOR MLE TRAINING
    logger.info('Starting Generator MLE Training...')
    gen_optimizer = optim.Adam(gen.parameters(), lr=args.lr)
    train_generator_MLE(logger, args, gen, gen_optimizer, args.MLE_TRAIN_EPOCHS,
                        vocab, train_data, example_idx, train_iter, val_iter)
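
# A minimal sketch of how this entry point might be driven. The logger setup
# and the concrete hyperparameter values below are assumptions for
# illustration; only the attribute names read inside train() come from the
# code above, and load_data() will expect further fields (e.g. dataset paths).
import logging
from argparse import Namespace

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('train')

args = Namespace(
    batch_size=32,           # consumed by both DataLoaders
    use_cuda=False,          # guards the .cuda() / nn.DataParallel branch
    gpu_id=[0],              # device_ids for nn.DataParallel
    lr=1e-3,                 # Adam learning rate
    MLE_TRAIN_EPOCHS=10,     # epochs of generator MLE pre-training
)

train(logger, args, example_idx=0)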
if __name__ == '__main__':
    parser = ArgumentParser(description='Extract features of a dataset.',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('dataset',
        help='Path to the directory containing folders for each class that '
        'contain the images and metadata')
    parser.add_argument('-i', '--image', action='store_true',
        help='Whether to extract visual features from the image')
    parser.add_argument('-t', '--textual', action='store_true',
        help='Whether to extract textual features; requires JSON metadata '
        'files for all the images')
    parser.add_argument('-v', '--vocabulary', default='<dataset>/vocabulary.json',
        help='Filename of the JSON vocabulary used for bag of words')
    parser.add_argument('-o', '--output', default='<dataset>/features.json',
        help='Where to store the extracted features')
    args = parser.parse_args()
    args.output = args.output.replace('<dataset>', args.dataset)
    args.vocabulary = args.vocabulary.replace('<dataset>', args.dataset)
    assert args.image or args.textual, 'Need at least one feature source'
    if args.textual:
        assert args.vocabulary, 'Vocabulary is needed for textual features'
        WordsFeature.load_vocabulary(args.vocabulary)
    dataset = Dataset()
    dataset.read(args.dataset, args.image, args.textual)
    dataset.save(args.output)
    best = max(scores)
    return worst, average, best


if __name__ == '__main__':
    parser = ArgumentParser(description='Learning algorithm used to classify '
        'images.', formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('features',
        help='Path to the JSON file containing extracted features of the '
        'dataset')
    parser.add_argument('-s', '--split', type=float, default=0.25,
        help='Fraction of data used for validation')
    parser.add_argument('-c', '--copy-predicted',
        default='<folder>/../<folder>-predicted/',
        help='Folder to copy predicted images into; sub directories for all '
        'labels are created; <folder> is the directory of the features file')
    args = parser.parse_args()
    if '<folder>' in args.copy_predicted:
        folder = os.path.splitext(args.features)[0]
        args.copy_predicted = args.copy_predicted.replace('<folder>', folder)
    dataset = Dataset()
    dataset.load(args.features)
    classifier = RandomForestClassifier(n_estimators=300)
    prediction = train_and_predict(classifier, dataset, args.split)
    prediction.print_scores()
    prediction.plot_confusion_matrix()
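
# train_and_predict() is referenced above but not shown here. A minimal sketch
# of what a helper with that call shape could look like, assuming the Dataset
# exposes a feature matrix `data` and per-sample class names `labels`; the
# project's real helper returns an object with print_scores() and
# plot_confusion_matrix(), which this illustration omits.
from sklearn.model_selection import train_test_split

def train_and_predict_sketch(classifier, dataset, split):
    # Hold out `split` of the samples for validation, fit on the rest.
    train_x, test_x, train_y, test_y = train_test_split(
        dataset.data, dataset.labels, test_size=split)
    classifier.fit(train_x, train_y)
    predicted = classifier.predict(test_x)
    return test_y, predicted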
    assert len(captions) == len(data)
    assert all(len(x) == len(data[0]) for x in data)
    with open(filename, 'w') as csv:
        csv.write(','.join(captions) + '\n')
        for row in range(len(data[0])):
            csv.write(','.join(str(column[row]) for column in data) + '\n')


if __name__ == '__main__':
    parser = ArgumentParser(description='Measure statistics of features '
        'within the images of the same class to evaluate features.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('features',
        help='Path to the JSON file containing extracted features of the '
        'dataset')
    parser.add_argument('-o', '--output', default='<folder>/evaluation.csv',
        help='Filename of the CSV file where p-values will be written to; '
        '<folder> is the directory of the features file')
    args = parser.parse_args()
    folder = os.path.splitext(os.path.split(args.features)[0])[0]
    args.output = args.output.replace('<folder>', folder)
    dataset = Dataset()
    dataset.load(args.features)
    print_chi(dataset)
    print('Write CSV table to', args.output)
    write_chi(args.output, dataset)
    print('Done')
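
# The writer at the top of this script takes `data` as one list per column and
# transposes it into CSV rows. A tiny self-contained illustration of that
# layout; captions and values here are made up.
captions = ['feature', 'p_value']
data = [['width', 'faces'], [0.03, 0.2]]   # one list per column

lines = [','.join(captions)]
for row in range(len(data[0])):
    lines.append(','.join(str(column[row]) for column in data))
print('\n'.join(lines))
# feature,p_value
# width,0.03
# faces,0.2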
def iterate_overall_texts(root):
    for directory in Dataset()._walk_directories(root):
        yield from iterate_texts(os.path.join(root, directory))
def iterate_texts(directory):
    for filename in Dataset()._walk_images(directory):
        metadata = get_metadata(os.path.join(directory, filename))
        text = preprocess_text(metadata['url'], metadata['title'],
                               metadata['description'])
        yield text
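
# A small usage sketch of the two generators above, assuming the dataset root
# contains one sub-directory per class as the walkers imply; the path is a
# placeholder.
import os

root = 'data/wikimedia'
for directory in Dataset()._walk_directories(root):
    texts = iterate_texts(os.path.join(root, directory))
    print(directory, sum(1 for _ in texts))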
        '--limit', type=int, default=20,
        help='Maximum number of words to display for each class')
    parser.add_argument(
        '-o', '--output', default='<dataset>/vocabulary.json',
        help='Filename of the JSON vocabulary that will be written')
    args = parser.parse_args()
    args.output = args.output.replace('<dataset>', args.dataset)
    text = iterate_overall_texts(args.dataset)
    overall = get_frequencies(text)
    vocabulary = {}
    for directory in Dataset()._walk_directories(args.dataset):
        print_headline(directory)
        texts = iterate_texts(os.path.join(args.dataset, directory))
        frequencies = get_frequencies(texts)
        frequencies = compute_tfidf(frequencies, overall)
        synonyms = list(get_top_frequencies(frequencies, args.limit).keys())
        vocabulary[directory] = synonyms
        print_frequencies(frequencies, args.limit)
        print('')
    print('Write vocabulary to', args.output)
    with open(args.output, 'w') as output_file:
        json.dump(vocabulary, output_file)
    print('Done')
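
# Illustrative shape of the vocabulary.json written above: each class folder
# maps to its `limit` highest-scoring TF-IDF words. The class names and words
# below are made-up placeholders.
vocabulary_example = {
    'castles': ['castle', 'fortress', 'tower', 'medieval'],
    'portraits': ['portrait', 'painting', 'oil', 'canvas'],
}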
def classify_images(self, keywords=[], limit=25):
    """
    Worker task to retrieve a list of automatically categorized images from
    Wikimedia Commons based on a given list of keywords.
    """
    if not len(keywords):
        raise AssertionError
    with app.app_context():

        def supported_extractors():
            extractors = []
            extractors.append(SizeFeature())
            extractors.append(ColorFeature())
            extractors.append(HistogramFeature())
            extractors.append(GradientFeature())
            extractors.append(FaceFeature(app.config['FACE_CLASSIFIER']))
            extractors.append(GeoFeature())
            extractors.append(FormatFeature())
            extractors.append(
                WordsFeature.create_from(app.config['WORDS_CONFIG']))
            return extractors

        def create_response_entry(label, sample):
            return {
                'thumbnail': sample.thumbnail,
                'image': sample.url,
                'label': label,
                'title': sample.url
            }

        def create_response(entries):
            return {'current': 100, 'total': 100, 'result': entries}

        # keep track of progress
        progress_observer = ProgressObserver(self)
        progress_observer.update(5)

        # query DBpedia for related images based on given keywords
        if limit > app.config['QUERY_LIMIT']:
            limit = app.config['QUERY_LIMIT']
        searchterm = ' '.join(keywords)
        uris = fetch_uris_from_metadata(searchterm, limit, multiple=False)
        progress_observer.update(20)

        # download images and metadata into temp folder with unique task id
        temp_folder = os.path.join(app.config['DOWNLOAD_DIRECTORY'],
                                   classify_images.request.id)
        images_and_metadata(uris, temp_folder, False,
                            observer=progress_observer)
        progress_observer.update(80)

        # load dataset and extract features
        dataset = Dataset(logging=True)
        dataset.read(root=temp_folder, extractors=supported_extractors(),
                     unlabeled_data=True)
        with open(app.config['DATASET_CONFIG']) as config_file:
            dataset_config = json.load(config_file)
        dataset.means = dataset_config['means']
        dataset.stds = dataset_config['stds']
        dataset.normalize()
        progress_observer.update(90)

        # predict labels using the trained classifier
        classifier = joblib.load(app.config['WIKIMEDIA_CLASSIFIER'])
        predictions = classifier.predict(dataset.data)
        progress_observer.update(95)

        # build response
        suggestions = []
        for index, sample in enumerate(dataset.samples):
            label = np.asscalar(predictions[index])
            entry = create_response_entry(label, sample)
            suggestions.append(entry)
        result = create_response(suggestions)

        # cleanup temporary directory
        delete_directory(temp_folder)
        progress_observer.update(100)
        return result
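
# The use of `self`, classify_images.request.id and a progress observer
# suggests this is a bound Celery task. A hedged sketch of how a client might
# dispatch it and collect the result; the keyword list and timeout are
# placeholders.
async_result = classify_images.delay(keywords=['castle', 'ruin'], limit=10)
print(async_result.state)                  # e.g. PENDING / STARTED / SUCCESS
response = async_result.get(timeout=300)   # blocks until the worker finishes
for entry in response['result']:
    print(entry['label'], entry['image'])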
def read_samples(root):
    samples = Dataset()._read_samples(root)
    return samples
    args = parser.parse_args()
    args.output = args.output.replace('<dataset>', args.dataset)
    assert args.visual or args.textual, 'Need at least one feature source'
    extractors = []
    if args.visual:
        extractors.append(SizeFeature())
        extractors.append(ColorFeature())
        extractors.append(HistogramFeature())
        extractors.append(GradientFeature())
        # extractors.append(BlobFeature())
        # extractors.append(BriefFeature())
        if os.path.isfile(args.trained_faces):
            extractors.append(FaceFeature(args.trained_faces))
        else:
            print('Skip face feature since the trained face detector was not '
                  'found at {}.'.format(args.trained_faces))
    if args.textual:
        samples = read_samples(args.dataset)
        if os.path.isfile(args.stopwords):
            args.stopwords = open(args.stopwords)
        extractors.append(GeoFeature())
        extractors.append(FormatFeature())
        extractors.append(WordsFeature(samples, args.stopwords))
        # extractors.append(RandomFeature())
    dataset = Dataset(logging=True)
    dataset.read(args.dataset, extractors)
    dataset.save(args.output)
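
# After extraction, the other scripts reload the saved JSON and, in some
# cases, normalize it before use. A minimal sketch of that round trip; the
# filename is a placeholder.
dataset = Dataset()
dataset.load('data/features.json')
dataset.normalize()
print(len(dataset.samples), 'samples with', len(dataset.data[0]), 'features each')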
    assert all(len(x) == len(data[0]) for x in data)
    with open(filename, 'w') as csv:
        csv.write(','.join(captions) + '\n')
        for row in range(len(data[0])):
            csv.write(','.join(str(column[row]) for column in data) + '\n')


if __name__ == '__main__':
    parser = ArgumentParser(description='Measure statistics of features '
        'within the images of the same class to evaluate features.',
        formatter_class=ArgumentDefaultsHelpFormatter)
    parser.add_argument('features',
        help='Path to the JSON file containing extracted features of the '
        'dataset')
    parser.add_argument('-o', '--output', default='<folder>/evaluation.csv',
        help='Filename of the CSV file where p-values will be written to; '
        '<folder> is the directory of the features file')
    args = parser.parse_args()
    folder = os.path.splitext(os.path.split(args.features)[0])[0]
    args.output = args.output.replace('<folder>', folder)
    dataset = Dataset()
    dataset.load(args.features)
    dataset.normalize()
    print_chi(dataset)
    print('Write CSV table to', args.output)
    write_chi(args.output, dataset)
    print('Done')