def main():
    # Modeling photos.
    modeling_photos()

    # Modeling users.
    # modeling_users()

    logger.write('Finished.' + '\n')
def train_model(data: TData, epochs: int, batch_size: int, lr: float) -> TModel:
    l.write('# Setting Up Data')
    l.write(f'Training example count: {len(data)}')

    encoding = BOWEncoding(data, min_word_freq=5)
    encoding.prepare()

    dataset = WordTokenDataset(data, encoding)
    dataset.prepare()

    l.write('# Training')

    data_loader = DataLoader(dataset=dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             collate_fn=data_utils.collate_samples)

    model = Model(vocab_size=encoding.vocab_size, n_classes=encoding.n_classes())
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        epoch_total_loss = 0

        epoch_progress = l.progressbar(key=f'epoch-{epoch}',
                                       name=f'Training Epoch {epoch + 1}')
        epoch_progress.show()

        batch_count = len(data_loader)

        for i, samples in enumerate(data_loader):
            optimizer.zero_grad()
            output = model(samples)
            loss = criterion(output, samples.label)
            loss.backward()
            optimizer.step()

            epoch_progress.set_progress((i + 1) / float(batch_count))
            epoch_total_loss += loss.item()

        # Log the accuracy on predicting the first 10,000 examples.
        samples = dataset[:10000]
        predictions = model.predict(samples)
        labels = samples.label

        total = len(labels)
        correct = torch.sum(labels == predictions)

        l.write(f'Accuracy: {float(correct) / total * 100:.02f}%.')
        l.write(f'Training Loss: {epoch_total_loss}')

    return model
def main():
    if not os.path.exists(CLEAN_DATA_PATH):
        os.makedirs(CLEAN_DATA_PATH)

    build_photo_examples(os.path.join(RAW_DATA_PATH, DATASET_TRAIN_FACE),
                         os.path.join(RAW_DATA_PATH, DATASET_TRAIN_TEXT),
                         os.path.join(CLEAN_DATA_PATH, 'train_photo_examples'))
    build_photo_examples(os.path.join(RAW_DATA_PATH, DATASET_TEST_FACE),
                         os.path.join(RAW_DATA_PATH, DATASET_TEST_TEXT),
                         os.path.join(CLEAN_DATA_PATH, 'test_photo_examples'))

    logger.write('Finished.' + '\n')
def train_multiple(hyperparams_list, train_dataset, valid_dataset, encoding, epochs):
    models = []
    train_losses_list = []
    valid_losses = []

    for i, hyperparams in enumerate(hyperparams_list):
        l.write(f'## Model {i + 1} / {len(hyperparams_list)}...')
        start_time = time.time()

        batch_size = hyperparams['batch_size']
        lr = hyperparams['lr']

        # 1. Setup Data Loader
        data_loader = DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 collate_fn=data_utils.collate_samples)

        # 2. Create the Model
        model = Model(vocab_size=encoding.vocab_size, n_classes=encoding.n_classes())

        # 3. Setup Criterion and Optimizer
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # 4. Train the Model
        train_losses = train(model, i, criterion, optimizer, train_dataset,
                             data_loader, epochs)

        # 5. Calculate Validation Loss
        with torch.no_grad():
            valid_samples = valid_dataset[:]
            outputs = model(valid_samples)
            valid_loss = criterion(outputs, valid_samples.label)
            valid_losses.append(valid_loss)

        end_time = time.time()

        models.append(model)
        train_losses_list.append(train_losses)

        l.write(f'Model completed in {(end_time - start_time) / 60:.02f}m.\n')

    return models, train_losses_list, valid_losses
def train(model, model_idx, criterion, optimizer, dataset, data_loader, epochs,
          should_log=True):
    train_losses = []
    log_every = 1
    train_loss_estimator_size = 10000

    for epoch in range(epochs):
        losses = []

        epoch_progress = l.progressbar(key=f'model_{model_idx}_epoch{epoch}',
                                       name=f'Training Epoch {epoch + 1}')
        epoch_progress.show()

        batch_count = len(data_loader)

        for i, samples in enumerate(data_loader):
            optimizer.zero_grad()
            output = model(samples)
            loss = criterion(output, samples.label)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())
            epoch_progress.set_progress((i + 1) / float(batch_count))

        train_loss = np.mean(losses)
        train_losses.append(train_loss)

        if should_log and (epoch + 1) % log_every == 0:
            # Estimate accuracy on a random contiguous slice of the training set.
            train_loss_estimator_start = max(
                1, len(dataset) - train_loss_estimator_size)
            random_start = torch.randint(high=train_loss_estimator_start,
                                         size=(1, )).item()
            samples = dataset[random_start:(random_start + train_loss_estimator_size)]
            predictions = model.predict(samples)
            labels = samples.label

            total = len(labels)
            correct = torch.sum(labels == predictions)

            l.write(f'Accuracy: {float(correct) / total * 100:.02f}%.')
            l.write(f'Training Loss: {train_loss.item()}\n')

    return train_losses
def pick_best_model(data: TData, models: List[TModel]) -> TModel:
    l.write('# Loading Data')

    encoding = BOWEncoding(data, min_word_freq=5)
    encoding.prepare()

    dataset = WordTokenDataset(data, encoding)
    dataset.prepare()

    valid_accuracies = []

    l.write('# Calculating Accuracies')

    samples = dataset[:]
    labels = samples.label

    for i, model in enumerate(models):
        l.write(f'Calculating accuracy for Model {i + 1}')
        predictions = model.predict(samples)

        total = len(samples)
        correct = torch.sum(predictions == labels).item()
        accuracy = float(correct) / total
        valid_accuracies.append(accuracy)

    highest_accuracy = max(valid_accuracies)
    highest_accuracy_idx = valid_accuracies.index(highest_accuracy)
    best_model = models[highest_accuracy_idx]

    l.write(f'Best accuracy: {highest_accuracy * 100:.02f}%')

    return best_model
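# Illustrative sketch only (not part of the original pipeline): assuming
# `train_data` and `holdout_data` are TData frames prepared like the `data`
# arguments above, train_model and pick_best_model could be combined as below
# to sweep a couple of settings and keep the strongest model. The hyperparameter
# values and epoch count are placeholders, not tuned choices from this repo.
def sketch_train_and_select(train_data: TData, holdout_data: TData) -> TModel:
    candidate_hyperparams = [
        {'batch_size': 100, 'lr': 1e-3},
        {'batch_size': 10, 'lr': 1e-2},
    ]
    candidates = [
        train_model(train_data, epochs=5, batch_size=hp['batch_size'], lr=hp['lr'])
        for hp in candidate_hyperparams
    ]
    # pick_best_model re-encodes holdout_data and returns the most accurate model.
    return pick_best_model(holdout_data, candidates)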
def main():
    K1s = [10, 30, 100, 300, 1000]
    num_process_stats = list()
    for K1 in K1s:
        num_process_stats.append(
            build_pop_examples(
                os.path.join(preprocessing_photos.RAW_DATA_PATH,
                             preprocessing_photos.DATASET_TRAIN_INTERACTION),
                os.path.join(preprocessing_photos.DATA_HOUSE_PATH,
                             'photo-{}.pkl'.format(K1)),
                os.path.join(preprocessing_photos.CLEAN_DATA_PATH,
                             'pop_examples-{}.txt'.format(K1))))

    logger.write('Examples building finished.' + '\n')
    for tup in num_process_stats:
        logger.write(
            'interacts #total: {}, #filtered for missing in trained photos: {}, '
            '#filtered for duration time is 0: {}; #users for preferences: {}'
            .format(tup[0], tup[1], tup[2], tup[3]) + '\n')
def store(example_filename, NUM_TEXT_FEATURE, photos_id, face_info, text_info_photos):
    cnt = 0
    num_unfound_photo = 0
    examples = np.zeros(shape=(len(photos_id), 1 + NUM_FACE_FEATURE + NUM_TEXT_FEATURE),
                        dtype=np.float32)
    examples[:, 0] = list(photos_id)
    for exam_idx, photo_id in enumerate(photos_id):
        if cnt % 10000 == 0:
            print('Generating {}: {}'.format(example_filename, cnt))
        if photo_id in face_info:
            examples[exam_idx, 1: NUM_FACE_FEATURE + 1] = face_info[photo_id]
        if photo_id in text_info_photos:
            topic = photo_topic_map[photo_id]
            if NUM_TEXT_FEATURE == 1:
                examples[exam_idx, NUM_FACE_FEATURE + 1:] = [topic]
            else:
                idx = common_word_idx_map[topic] if topic in common_word_idx_map else 0
                examples[exam_idx, NUM_FACE_FEATURE + 1:] = embeddings[idx]
        else:
            num_unfound_photo += 1
        cnt += 1
    np.save(example_filename, examples)
    logger.write('#Unfound photo: {}'.format(num_unfound_photo) + '\n')
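# Assumed layout of the raw face file parsed in build_photo_examples below; this
# is inferred from the parsing code, not from dataset documentation. Each line is
# "<photo_id> <JSON list of faces>", and each face is a 4-element list whose
# columns are aggregated as face occupancy (summed), gender (averaged),
# age (averaged), and looking/appearance score (averaged). A hypothetical line:
#
#     812345    [[0.12, 1, 24, 0.8], [0.05, 0, 31, 0.6]]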
def build_photo_examples(face_filename, text_filename, example_filename_prefix):
    print()
    photos_id = set()    # photo ids (integers)
    face_info = dict()   # {photo_id (int): [num_face, face_occu, gender_pref, age, looking]}
    cnt = 0
    with open(face_filename, 'r') as face_file:
        for line in face_file:
            cnt += 1
            if cnt % 10000 == 0:
                print('Processing {}: {}'.format(face_filename, cnt))
            line = line.strip()
            segs = line.split(maxsplit=1)
            if len(segs) == 2:
                photo_id = int(segs[0])
                faces_list = json.loads(segs[1])
                if type(faces_list) is list:
                    faces = np.array(faces_list, dtype=np.float32)
                    num_face = faces.shape[0]
                    face_occu = np.sum(faces[:, 0])
                    gender_pref = np.mean(faces[:, 1])
                    age = np.mean(faces[:, 2])
                    looking = np.mean(faces[:, 3])
                    face_info[photo_id] = [num_face, face_occu, gender_pref, age, looking]
                    photos_id.add(photo_id)
    logger.write('#photos with face info = {}'.format(len(face_info)) + '\n')

    text_info_photos = set()  # photo ids (integers)
    if text_filename is not None:
        cnt = 0
        with open(text_filename, 'r') as text_file:
            for line in text_file:
                cnt += 1
                if cnt % 10000 == 0:
                    print('Processing {}: {}'.format(text_filename, cnt))
                line = line.strip()
                segs = line.split(maxsplit=1)
                if len(segs) == 2:
                    photo_id = int(segs[0])
                    text_info_photos.add(photo_id)
                    photos_id.add(photo_id)
    logger.write('#photos with text info = {}'.format(len(text_info_photos)) + '\n')
    logger.write('#photos in total = {}'.format(len(photos_id)) + '\n')

    store(example_filename_prefix + '-topic.npy', 1, photos_id, face_info,
          text_info_photos)
    store(example_filename_prefix + '.npy', embeddings.shape[1], photos_id, face_info,
          text_info_photos)
def recommend(sub_prefix):
    print('Loading models...')
    photo_model_prefix = 'photo-'
    pop_examples_prefix = 'pop_examples-'
    magicians = list()
    for file in os.listdir(preprocessing_photos.DATA_HOUSE_PATH):
        if file.startswith(photo_model_prefix):
            photo_kmeans = joblib.load(os.path.join(preprocessing_photos.DATA_HOUSE_PATH, file))
            photo_kmeans.verbose = 0
            first_sep = file.index('-')
            second_sep = file.rindex('.')
            K1 = int(file[first_sep + 1: second_sep])
            pop_examples = np.loadtxt(
                os.path.join(preprocessing_photos.CLEAN_DATA_PATH,
                             pop_examples_prefix + str(K1) + '.txt'),
                delimiter=',')
            if len(pop_examples.shape) == 1:
                pop_examples = pop_examples.reshape(-1, pop_examples.shape[0])
            magicians.append(Magician(photo_kmeans, K1, pop_examples))
    print('{} models loaded.'.format(len(magicians)))

    # Sort the models by total inertia.
    magicians.sort(key=attrgetter('total_inertia'))
    for magician in magicians:
        print(str(magician))
        print('#photo_cate_map={}\n'.format(len(magician.photo_cate_map)))

    # Normalization
    print('Normalizing dataset...')
    photo_examples = np.load(os.path.join(preprocessing_photos.CLEAN_DATA_PATH,
                                          'test_photo_examples.npy'))
    train_photo_examples = np.load(os.path.join(preprocessing_photos.CLEAN_DATA_PATH,
                                                'train_photo_examples.npy'))
    scaler = MinMaxScaler()
    scaler.fit(train_photo_examples[:, 1:])
    data = scaler.transform(photo_examples[:, 1:])
    photo_idx_map = dict(zip(np.array(photo_examples[:, 0], dtype=int),
                             range(photo_examples.shape[0])))
    del train_photo_examples

    # Inference
    print('Inferring...')
    magician_predicts_map = dict()
    predict_data = pd.read_csv(
        os.path.join(preprocessing_photos.RAW_DATA_PATH,
                     preprocessing_photos.DATASET_TEST_INTERACTION),
        delim_whitespace=True,
        header=None,
        names=['user_id', 'photo_id', 'time', 'duration_time'])
    logger.write('Predict data size: {}'.format(predict_data.shape[0]) + '\n')

    for magician in magicians:
        magician_predicts_map[magician.name] = np.ndarray(shape=(predict_data.shape[0]),
                                                          dtype=np.float32)

    tot_cnt = 0
    cnt_unk_photo = 0
    cnt_existed_photo = 0
    cnt_predict_photo = 0
    cnt_new_user = 0
    for i in range(predict_data.shape[0]):
        user_id = predict_data.loc[i, 'user_id']
        photo_id = predict_data.loc[i, 'photo_id']
        for magician in magicians:
            tot_cnt += 1
            if user_id not in magician.user_matrix_map:
                click_probability = max(magician.fashion)
                cnt_new_user += 1
            else:
                if photo_id in magician.photo_cate_map:
                    cate_id = magician.photo_cate_map[photo_id]
                    cnt_existed_photo += 1
                elif photo_id in photo_idx_map:
                    # Almost all examples should hit here.
                    features = data[photo_idx_map[photo_id]]
                    cate_id = magician.photo_kmeans.predict(np.array([features]))[0]
                    cnt_predict_photo += 1
                else:
                    # No example should hit here.
                    cate_id = None
                    cnt_unk_photo += 1
                if cate_id is None:
                    click_probability = 0.0
                else:
                    matrix_idx = magician.user_matrix_map[user_id]
                    click_probability = magician.matrix[matrix_idx, cate_id]
            magician_predicts_map[magician.name][i] = click_probability
        if i % 10000 == 0:
            print('Predicted examples: {}'.format(i))

    logger.write('#new users={}, #existed={}, #predict={}, '
                 '#new photos beyond train and test dataset={}, #total={}\n'
                 .format(cnt_new_user, cnt_existed_photo, cnt_predict_photo,
                         cnt_unk_photo, tot_cnt))

    print('Saving prediction...')
    for rank, magician in enumerate(magicians):
        predict_data['click_prob'] = magician_predicts_map[magician.name]
        predict_data.to_csv(
            os.path.join(preprocessing_photos.DATA_HOUSE_PATH,
                         sub_prefix + '-' + str(rank) + '_' + magician.name),
            columns=['user_id', 'photo_id', 'click_prob'],
            sep='\t',
            header=False,
            index=False,
            float_format='%.6f')
def main(sub_prefix):
    recommend(sub_prefix)
    print('Finished.')
    logger.write('Finished.' + '\n')
def main():
    l.write('# Loading Data')

    with s3_read('ml/data/news_classifier/train_data.json') as file:
        data = pd.read_json(file, orient='records')

    data = data.sample(frac=1)  # Shuffle the data.

    l.write(f'Training example count: {len(data)}')

    train_test_split = 0.95
    split_idx = math.floor(len(data) * train_test_split)
    train_data = data.iloc[0:split_idx]
    valid_data = data.iloc[split_idx:]

    encoding = BOWEncoding(data, min_word_freq=5)
    encoding.prepare()

    train_dataset = WordTokenDataset(train_data, encoding)
    train_dataset.prepare()

    valid_dataset = WordTokenDataset(valid_data, encoding)
    valid_dataset.prepare()

    l.write('# Training')

    hyperparams_list = [
        {'batch_size': 100, 'lr': 1e-3},
        {'batch_size': 10, 'lr': 1e-3},
        {'batch_size': 100, 'lr': 1e-2},
        {'batch_size': 10, 'lr': 1e-2},
    ]

    models, train_loss_list, valid_losses = train_multiple(hyperparams_list,
                                                           train_dataset,
                                                           valid_dataset,
                                                           encoding,
                                                           epochs=EPOCHS)

    l.write('# Viewing Results')

    best_model_idx = torch.argmin(torch.FloatTensor(valid_losses)).item()
    best_model = models[best_model_idx]

    l.write(f'Best Model: {best_model_idx + 1}')

    valid_samples = valid_dataset[:]
    predictions = best_model.predict(valid_samples)

    total = len(valid_samples.label)
    correct = torch.sum(predictions == valid_samples.label)
    accuracy = float(correct) / total

    l.write(f'Accuracy of Best Model: {accuracy * 100:.02f}%.')

    confusion_matrix, category_encoder = create_confusion_matrix(
        valid_samples.label, predictions)
    category_decoder = {i: c for c, i in category_encoder.items()}
    labeling_errors = top_k_labeling_errors(confusion_matrix, category_decoder, k=5)

    label_decoder = {i: l for l, i in encoding._label_encoder.items()}

    # Looking at the most frequent labeling errors.
    for i, error in enumerate(labeling_errors):
        error_0 = label_decoder[error[0]]
        error_1 = label_decoder[error[1]]
        l.write(f'{i + 1}. "{error_0}" confused for "{error_1}"')

    l.write('# Persisting Model')

    with s3_write('ml/models/news_classifier/bow_model.torch', 'b') as file:
        torch.save(best_model.state_dict(), file)
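# Hypothetical sketch of the two helpers used in main() above, shown only to
# clarify the shapes they are expected to produce; the repo's own
# create_confusion_matrix / top_k_labeling_errors may be implemented differently.
def sketch_create_confusion_matrix(labels, predictions):
    # Row = true label id, column = predicted label id.
    categories = sorted(set(labels.tolist()) | set(predictions.tolist()))
    category_encoder = {c: i for i, c in enumerate(categories)}
    matrix = torch.zeros(len(categories), len(categories), dtype=torch.long)
    for truth, guess in zip(labels.tolist(), predictions.tolist()):
        matrix[category_encoder[truth], category_encoder[guess]] += 1
    return matrix, category_encoder


def sketch_top_k_labeling_errors(confusion_matrix, category_decoder, k=5):
    # Ignore the diagonal (correct predictions) and return the k most frequent
    # (true label id, predicted label id) confusions.
    errors = confusion_matrix.clone()
    errors.fill_diagonal_(0)
    top = torch.topk(errors.flatten(), k).indices
    n = errors.shape[1]
    return [(category_decoder[(idx // n).item()], category_decoder[(idx % n).item()])
            for idx in top]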
def main():
    l.write('# Loading and Setting Up Data')

    l.write('Loading Training Data')
    with s3_read('ml/data/news_classifier/train_data.json') as file:
        data = pd.read_json(file, orient="records")
    data = data[:1000]

    l.write('Loading embeddings')
    with s3_read('ml/glove_embeddings/glove.6B.100d.txt') as file:
        embeddings = data_utils.load_embeddings(file, embedding_dim=100)

    l.write('Preparing data')
    train_test_split = 0.95
    split_idx = math.floor(len(data) * train_test_split)
    train_data = data.iloc[0:split_idx]
    valid_data = data.iloc[split_idx:]

    encoding = WordEmbeddingEncoding(data, embeddings)
    encoding.prepare()

    train_dataset = WordTokenDataset(train_data, encoding)
    train_dataset.prepare()

    valid_dataset = WordTokenDataset(valid_data, encoding)
    valid_dataset.prepare()

    l.write('# Training the Model')

    hyperparams_list = [
        {'weighting': 'uniform', 'lr': 0.001, 'batch_size': 100},
        {'weighting': 'uniform', 'lr': 0.01, 'batch_size': 100},
        {'weighting': 'uniform', 'lr': 0.001, 'batch_size': 50},
        {'weighting': 'uniform', 'lr': 0.01, 'batch_size': 50},
    ]

    models = []
    train_losses_list = []
    valid_losses = []

    accepted_tokens = {t for t in embeddings.index}

    for i, hyperparams in enumerate(hyperparams_list):
        l.write(f'Model {i + 1} / {len(hyperparams_list)}')
        start_time = time()

        batch_size = hyperparams['batch_size']
        lr = hyperparams['lr']
        weighting = hyperparams['weighting']

        # 1. Setup Data Loader
        data_loader = DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 collate_fn=data_utils.collate_samples)

        # 2. Create the Model
        model = Model(embeddings=embeddings,
                      n_classes=encoding.n_classes(),
                      weighting=weighting)

        # 3. Setup Criterion and Optimizer
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # 4. Train the Model
        train_losses = train(model, criterion, optimizer, train_dataset, data_loader,
                             epochs=EPOCHS)

        # 5. Calculate Validation Loss
        with torch.no_grad():
            valid_samples = valid_dataset[:]
            outputs = model(valid_samples)
            valid_loss = criterion(outputs, valid_samples.label)
            valid_losses.append(valid_loss)

        end_time = time()

        models.append(model)
        train_losses_list.append(train_losses)

        l.write(f'Model completed in {(end_time - start_time) / 60:.02f}m.\n')

    l.write('# Results')

    uniform_mask = [hp['weighting'] == 'uniform' for hp in hyperparams_list]
    models = [m for i, m in enumerate(models) if uniform_mask[i]]
    train_losses_list = [
        losses for i, losses in enumerate(train_losses_list) if uniform_mask[i]
    ]
    valid_losses = [
        loss.item() for i, loss in enumerate(valid_losses) if uniform_mask[i]
    ]

    best_model_idx = valid_losses.index(min(valid_losses))
    best_model = models[best_model_idx]

    l.write(f'Best Model: {best_model_idx + 1}')

    l.write('Computing Model Accuracy...')
    samples = valid_dataset[:]
    predictions = best_model.predict(samples)

    total = len(samples.label)
    correct = torch.sum(predictions == samples.label)

    l.write(f'Accuracy of Model: {(float(correct) / total) * 100:.02f}%.')

    l.write('Persisting Models...')
    with s3_write('ml/models/news_classifier/glove_model.torch', 'b') as file:
        torch.save(best_model.state_dict(), file)

    l.write('Done!')
import datetime

import preprocessing_photos
import modeling_k_means
import preprocessing_user_preferences
import recommend_for_each_user
from utils import logger

# logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Started preprocessing_photos'))
# logger.flush()
# preprocessing_photos.main()

# logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Started modeling_k_means'))
# logger.flush()
# modeling_k_means.main()

# logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Started preprocessing_user_preferences'))
# logger.flush()
# preprocessing_user_preferences.main()

# logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Started recommend_for_each_user'))
# logger.flush()
recommend_for_each_user.main('v0.9.0')

logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Finished'))
logger.flush()
logger.close()
    buildoption.append(v[4])
    my_upload[key] = tuple(buildoption)

utils.buildall(None, my_upload)
if logger.errors:
    sys.exit(1)

utils.upload_binaries()
if logger.errors:
    sys.exit(1)

# Apply the specific patch and share libraries when csv_feature is enabled.
if csv_feature == True:
    cmd = ''.join(["hg import ", my_patchlocation])
    p = Popen(cmd, cwd=my_x265_source, stdout=PIPE, stderr=PIPE)
    p.communicate()  # wait for the import so p.returncode is populated
    my_patchrevision = utils.hgversion(my_x265_source)
    if p.returncode:
        logger.write('\nfailed to apply patch\n')
        p = Popen("hg revert --all", cwd=my_x265_source, stdout=PIPE, stderr=PIPE)
        p = Popen("hg clean", cwd=my_x265_source, stdout=PIPE, stderr=PIPE)
        cmd = ''.join(["hg strip ", my_patchrevision])
        p = Popen(cmd, cwd=my_x265_source, stdout=PIPE, stderr=PIPE)
    else:
        utils.buildall(None, my_upload)

extras = [
    '--psnr', '--ssim', '--csv-log-level=3', '--csv=test.csv', '--frames=10'
    def logJson(self):
        self.log['final']['SimulationTime'] = self.getTime()
        logger.write(self.log)