Example no. 1
def main():
    # modeling photos
    modeling_photos()

    # modeling users
    # modeling_users()

    # print('Finished.')
    logger.write('Finished.' + '\n')
Example no. 2
def train_model(data: TData, epochs: int, batch_size: int,
                lr: float) -> TModel:
    l.write('# Setting Up Data')
    l.write(f'Training example count: {len(data)}')

    encoding = BOWEncoding(data, min_word_freq=5)
    encoding.prepare()

    dataset = WordTokenDataset(data, encoding)
    dataset.prepare()

    l.write('# Training')

    data_loader = DataLoader(dataset=dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             collate_fn=data_utils.collate_samples)

    model = Model(vocab_size=encoding.vocab_size,
                  n_classes=encoding.n_classes())

    criterion = torch.nn.CrossEntropyLoss()

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        epoch_total_loss = 0

        epoch_progress = l.progressbar(key=f'epoch-{epoch}',
                                       name=f'Training Epoch {epoch + 1}')

        epoch_progress.show()

        batch_count = len(data_loader)

        for i, samples in enumerate(data_loader):
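            # Standard training step: reset gradients, run the forward pass,
            # compute the loss, backpropagate, and update the parameters.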
            optimizer.zero_grad()
            output = model(samples)
            loss = criterion(output, samples.label)
            loss.backward()
            optimizer.step()

            epoch_progress.set_progress((i + 1) / float(batch_count))
            epoch_total_loss += loss.item()

        # Log the accuracy of predictions on the first 10,000 examples.
        samples = dataset[:10000]
        predictions = model.predict(samples)
        labels = samples.label

        total = len(labels)
        correct = torch.sum(labels == predictions)

        l.write(f'Accuracy: {float(correct)/total*100:.02f}%.')
        l.write(f'Training Loss: {epoch_total_loss}')

    return model
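A minimal, hypothetical usage sketch for train_model (not part of the original sources): it reuses the s3_read helper, the pandas loading call, and the EPOCHS constant that appear in the later examples; the hyperparameter values are illustrative only.

# Hypothetical driver for train_model, mirroring the data loading in Example no. 12.
with s3_read('ml/data/news_classifier/train_data.json') as file:
    data = pd.read_json(file, orient='records')

model = train_model(data, epochs=EPOCHS, batch_size=100, lr=1e-3)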
Example no. 3
def main():
    if not os.path.exists(CLEAN_DATA_PATH):
        os.makedirs(CLEAN_DATA_PATH)
    build_photo_examples(os.path.join(RAW_DATA_PATH, DATASET_TRAIN_FACE),
                         os.path.join(RAW_DATA_PATH, DATASET_TRAIN_TEXT),
                         os.path.join(CLEAN_DATA_PATH, 'train_photo_examples'))
    build_photo_examples(os.path.join(RAW_DATA_PATH, DATASET_TEST_FACE),
                         os.path.join(RAW_DATA_PATH, DATASET_TEST_TEXT),
                         os.path.join(CLEAN_DATA_PATH, 'test_photo_examples'))
    # print('Finished.')
    logger.write('Finished.' + '\n')
Example no. 4
def train_multiple(hyperparams_list, train_dataset, valid_dataset, encoding,
                   epochs):
    models = []
    train_losses_list = []
    valid_losses = []

    for i, hyperparams in enumerate(hyperparams_list):
        l.write(f'## Model {i+1} / {len(hyperparams_list)}...')

        start_time = time.time()

        batch_size = hyperparams['batch_size']
        lr = hyperparams['lr']

        # 1. Setup Data Loader

        data_loader = DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 collate_fn=data_utils.collate_samples)

        # 2. Create the Model

        model = Model(vocab_size=encoding.vocab_size,
                      n_classes=encoding.n_classes())

        # 3. Setup Criterion and Optimizer

        criterion = torch.nn.CrossEntropyLoss()

        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # 4. Train the Model

        train_losses = train(model, i, criterion, optimizer, train_dataset,
                             data_loader, epochs)

        # 5. Calculate Validation Loss

        with torch.no_grad():
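            # Evaluate on the full validation split with gradient tracking disabled.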
            valid_samples = valid_dataset[:]
            outputs = model(valid_samples)
            valid_loss = criterion(outputs, valid_samples.label)
            valid_losses.append(valid_loss)

        end_time = time.time()

        models.append(model)
        train_losses_list.append(train_losses)

        l.write(f'Model completed in {(end_time - start_time)/60:.02f}m.\n')

    return models, train_losses_list, valid_losses
Example no. 5
def train(model,
          model_idx,
          criterion,
          optimizer,
          dataset,
          data_loader,
          epochs,
          should_log=True):
    train_losses = []
    log_every = 1
    train_loss_estimator_size = 10000

    for epoch in range(epochs):
        losses = []

        epoch_progress = l.progressbar(key=f'model_{model_idx}_epoch{epoch}',
                                       name=f'Training Epoch {epoch + 1}')

        epoch_progress.show()

        batch_count = len(data_loader)
        for i, samples in enumerate(data_loader):
            optimizer.zero_grad()
            output = model(samples)
            loss = criterion(output, samples.label)
            loss.backward()
            optimizer.step()

            losses.append(loss.item())
            epoch_progress.set_progress((i + 1) / float(batch_count))

        train_loss = np.mean(losses)
        train_losses.append(train_loss)

        if should_log and (epoch + 1) % log_every == 0:
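            # Estimate accuracy on a random contiguous slice of up to
            # train_loss_estimator_size samples to keep logging cheap.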
            train_loss_estimator_start = max(
                1,
                len(dataset) - train_loss_estimator_size)
            random_start = torch.randint(high=train_loss_estimator_start,
                                         size=(1, )).item()

            samples = dataset[random_start:(random_start +
                                            train_loss_estimator_size)]
            predictions = model.predict(samples)
            labels = samples.label

            total = len(labels)
            correct = torch.sum(labels == predictions)

            l.write(f'Accuracy: {float(correct)/total*100:.02f}%.')
            l.write(f'Training Loss: {train_loss.item()}\n')

    return train_losses
Example no. 6
def pick_best_model(data: TData, models: List[TModel]) -> TModel:
    l.write('# Loading Data')

    encoding = BOWEncoding(data, min_word_freq=5)
    encoding.prepare()

    dataset = WordTokenDataset(data, encoding)
    dataset.prepare()

    valid_accuracies = []

    l.write('# Calculating Accuracies')

    samples = dataset[:]
    labels = samples.label

    for i, model in enumerate(models):
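        # Score each candidate model on the full prepared dataset.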
        l.write(f'Calculating accuracy for Model {i+1}')

        predictions = model.predict(samples)
        total = len(samples)
        correct = torch.sum(predictions == labels).item()

        accuracy = float(correct) / total
        valid_accuracies.append(accuracy)

    highest_accuracy = max(valid_accuracies)
    highest_accuracy_idx = valid_accuracies.index(highest_accuracy)
    best_model = models[highest_accuracy_idx]

    l.write(f'Best accuracy: {highest_accuracy*100:.02f}%')

    return best_model
Example no. 7
def main():
    K1s = [10, 30, 100, 300, 1000]
    num_process_stats = list()
    for K1 in K1s:
        num_process_stats.append(
            build_pop_examples(
                os.path.join(preprocessing_photos.RAW_DATA_PATH,
                             preprocessing_photos.DATASET_TRAIN_INTERACTION),
                os.path.join(preprocessing_photos.DATA_HOUSE_PATH,
                             'photo-{}.pkl'.format(K1)),
                os.path.join(preprocessing_photos.CLEAN_DATA_PATH,
                             'pop_examples-{}.txt'.format(K1))))

    # print('Examples building finished.')
    logger.write('Examples building finished.' + '\n')
    for tup in num_process_stats:
        # print('interacts #total: {}, #filtered for missing: {}, #filtered for pic: {}, #users: {}'.format(tup[0], tup[1], tup[2], tup[3]))
        logger.write(
            'interacts #total: {}, #filtered for missing in trained photos: {}, #filtered for duration time is 0: {}; #users for preferences: {}'
            .format(tup[0], tup[1], tup[2], tup[3]) + '\n')
Example no. 8
def store(example_filename, NUM_TEXT_FEATURE, photos_id, face_info, text_info_photos):
    cnt = 0
    num_unfound_photo = 0
    examples = np.zeros(shape=(len(photos_id), 1 + NUM_FACE_FEATURE + NUM_TEXT_FEATURE), dtype=np.float32)
    examples[:, 0] = list(photos_id)
    for exam_idx, photo_id in enumerate(photos_id):
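        # Face features are filled in when available; the text feature is either
        # the raw topic id or its embedding vector, depending on NUM_TEXT_FEATURE.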
        if cnt % 10000 == 0:
            print('Generating {}: {}'.format(example_filename, cnt))
        if photo_id in face_info:
            examples[exam_idx, 1: NUM_FACE_FEATURE + 1] = face_info[photo_id]
        if photo_id in text_info_photos:
            topic = photo_topic_map[photo_id]
            if NUM_TEXT_FEATURE == 1:
                examples[exam_idx, NUM_FACE_FEATURE + 1:] = [topic]
            else:
                idx = common_word_idx_map.get(topic, 0)
                examples[exam_idx, NUM_FACE_FEATURE + 1:] = embeddings[idx]
        else:
            num_unfound_photo += 1
        cnt += 1
    np.save(example_filename, examples)
    logger.write('#Unfound photo: {}'.format(num_unfound_photo) + '\n')
Example no. 9
def build_photo_examples(face_filename, text_filename, example_filename_prefix):
    print()
    photos_id = set()  # integers
    face_info = dict()  # {photo_id: integer, features: []}
    cnt = 0
    with open(face_filename, 'r') as face_file:
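        # Each line holds a photo id followed by a JSON list of faces; aggregate
        # them into per-photo statistics (face count, summed occupancy, and mean
        # gender/age/looking scores).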
        for line in face_file:
            cnt += 1
            if cnt % 10000 == 0:
                print('Processing {}: {}'.format(face_filename, cnt))
            line = line.strip()
            segs = line.split(maxsplit=1)
            if len(segs) == 2:
                photo_id = int(segs[0])
                faces_list = json.loads(segs[1])
                if isinstance(faces_list, list):
                    faces = np.array(faces_list, dtype=np.float32)
                    num_face = faces.shape[0]
                    face_occu = np.sum(faces[:, 0])
                    gender_pref = np.mean(faces[:, 1])
                    age = np.mean(faces[:, 2])
                    looking = np.mean(faces[:, 3])
                    face_info[photo_id] = [num_face, face_occu, gender_pref, age, looking]
                    photos_id.add(photo_id)
    # print('#photos with face info = {}'.format(len(face_info)))
    logger.write('#photos with face info = {}'.format(len(face_info)) + '\n')
    text_info_photos = set()  # integers
    if text_filename is not None:
        cnt = 0
        with open(text_filename, 'r') as text_file:
            for line in text_file:
                cnt += 1
                if cnt % 10000 == 0:
                    print('Processing {}: {}'.format(text_filename, cnt))
                line = line.strip()
                segs = line.split(maxsplit=1)
                if len(segs) == 2:
                    photo_id = int(segs[0])
                    text_info_photos.add(photo_id)
                    photos_id.add(photo_id)
    # print('#photos with text info = {}'.format(len(text_info_photos)))
    logger.write('#photos with text info = {}'.format(len(text_info_photos)) + '\n')
    # print('#photos in total = {}'.format(len(photos_id)))
    logger.write('#photos in total = {}'.format(len(photos_id)) + '\n')

    store(example_filename_prefix + '-topic.npy', 1, photos_id, face_info, text_info_photos)
    store(example_filename_prefix + '.npy', embeddings.shape[1], photos_id, face_info, text_info_photos)
Example no. 10
def recommend(sub_prefix):
    print('Loading models...')
    photo_model_prefix = 'photo-'
    pop_examples_prefix = 'pop_examples-'
    magicians = list()
    for file in os.listdir(preprocessing_photos.DATA_HOUSE_PATH):
        if file.startswith(photo_model_prefix):
            photo_kmeans = joblib.load(os.path.join(preprocessing_photos.DATA_HOUSE_PATH, file))
            photo_kmeans.verbose = 0
            first_sep = file.index('-')
            second_sep = file.rindex('.')
            K1 = int(file[first_sep + 1: second_sep])
            pop_examples = np.loadtxt(os.path.join(preprocessing_photos.CLEAN_DATA_PATH, pop_examples_prefix + str(K1) + '.txt'),
                                      delimiter=',')
            if len(pop_examples.shape) == 1:
                pop_examples = pop_examples.reshape(-1, pop_examples.shape[0])
            magicians.append(Magician(photo_kmeans, K1, pop_examples))
    print('{} models loaded.'.format(len(magicians)))
    # sort models by total inertia (product of inertias), smallest first
    magicians.sort(key=attrgetter('total_inertia'))
    for magician in magicians:
        print(str(magician))
        print('#photo_cate_map={}\n'.format(len(magician.photo_cate_map)))

    # normalization
    print('Normalizing dataset...')
    photo_examples = np.load(os.path.join(preprocessing_photos.CLEAN_DATA_PATH, 'test_photo_examples.npy'))
    train_photo_examples = np.load(os.path.join(preprocessing_photos.CLEAN_DATA_PATH, 'train_photo_examples.npy'))
    scaler = MinMaxScaler()
    scaler.fit(train_photo_examples[:, 1:])
    data = scaler.transform(photo_examples[:, 1:])
    photo_idx_map = dict(zip(np.array(photo_examples[:, 0], dtype=int), range(photo_examples.shape[0])))
    del train_photo_examples

    # inference
    print('Inferring...')
    magician_predicts_map = dict()

    predict_data = pd.read_csv(os.path.join(preprocessing_photos.RAW_DATA_PATH, preprocessing_photos.DATASET_TEST_INTERACTION), delim_whitespace=True,
                               header=None, names=['user_id', 'photo_id', 'time', 'duration_time'])
    logger.write('Predict data size: {}'.format(predict_data.shape[0]) + '\n')

    # os.path.join(preprocessing_photos.DATA_HOUSE_PATH, sub_prefix + '-' + str(rank) + '_' + magician.name)
    for magician in magicians:
        magician_predicts_map[magician.name] = np.ndarray(shape=(predict_data.shape[0]), dtype=np.float32)

    tot_cnt = 0
    cnt_unk_photo = 0
    cnt_existed_photo = 0
    cnt_predict_photo = 0
    cnt_new_user = 0
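    # For every (user, photo) pair, each model produces a click probability:
    # unknown users fall back to max(fashion); photos use their cached category
    # when known, otherwise k-means assigns one from the photo features.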
    for i in range(predict_data.shape[0]):
        user_id = predict_data.loc[i, 'user_id']
        photo_id = predict_data.loc[i, 'photo_id']
        for magician in magicians:
            tot_cnt += 1
            if user_id not in magician.user_matrix_map:
                click_probability = max(magician.fashion)
                cnt_new_user += 1
            else:
                if photo_id in magician.photo_cate_map:
                    cate_id = magician.photo_cate_map[photo_id]
                    cnt_existed_photo += 1
                elif photo_id in photo_idx_map:  # Almost all examples should hit here.
                    features = data[photo_idx_map[photo_id]]
                    cate_id = magician.photo_kmeans.predict(np.array([features]))[0]
                    cnt_predict_photo += 1
                else:  # No example should hit here.
                    cate_id = None
                    cnt_unk_photo += 1

                if cate_id is None:
                    click_probability = 0.0
                else:
                    matrix_idx = magician.user_matrix_map[user_id]
                    click_probability = magician.matrix[matrix_idx, cate_id]

            magician_predicts_map[magician.name][i] = click_probability
        if i % 10000 == 0:
            print('Predicted examples: {}'.format(i))
    # print('#new users={}, #existed={}, #predict={}, #unknown={}, #total={}\n'
    #       .format(cnt_new_user, cnt_existed_photo, cnt_predict_photo, cnt_unk_photo, tot_cnt)
    #       )
    logger.write('#new users={}, #existed={}, #predict={}, #new photos beyond train and test dataset={}, #total={}\n'
          .format(cnt_new_user, cnt_existed_photo, cnt_predict_photo, cnt_unk_photo, tot_cnt))

    print('Saving prediction...')
    for rank, magician in enumerate(magicians):
        predict_data['click_prob'] = magician_predicts_map[magician.name]
        predict_data.to_csv(os.path.join(preprocessing_photos.DATA_HOUSE_PATH, sub_prefix + '-' + str(rank) + '_' + magician.name),
                            columns=['user_id', 'photo_id', 'click_prob'],
                            sep='\t', header=False, index=False, float_format='%.6f')
Example no. 11
def main(sub_prefix):
    recommend(sub_prefix)
    print('Finished.')
    logger.write('Finished.' + '\n')
Example no. 12
def main():
    l.write('# Loading Data')

    with s3_read('ml/data/news_classifier/train_data.json') as file:
        data = pd.read_json(file, orient='records')

    data = data.sample(frac=1)  # Shuffle the data.

    l.write(f'Training example count: {len(data)}')
    train_test_split = 0.95
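    # Use the first 95% of the shuffled rows for training and the rest for validation.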
    split_idx = math.floor(len(data) * train_test_split)

    train_data = data.iloc[0:split_idx]
    valid_data = data.iloc[split_idx:]

    encoding = BOWEncoding(data, min_word_freq=5)
    encoding.prepare()

    train_dataset = WordTokenDataset(train_data, encoding)
    train_dataset.prepare()

    valid_dataset = WordTokenDataset(valid_data, encoding)
    valid_dataset.prepare()

    l.write('# Training')

    hyperparams_list = [
        {
            'batch_size': 100,
            'lr': 1e-3
        },
        {
            'batch_size': 10,
            'lr': 1e-3
        },
        {
            'batch_size': 100,
            'lr': 1e-2
        },
        {
            'batch_size': 10,
            'lr': 1e-2
        },
    ]

    models, train_loss_list, valid_losses = train_multiple(hyperparams_list,
                                                           train_dataset,
                                                           valid_dataset,
                                                           encoding,
                                                           epochs=EPOCHS)

    l.write('# Viewing Results')
    best_model_idx = torch.argmin(torch.FloatTensor(valid_losses)).item()

    best_model = models[best_model_idx]

    l.write(f'Best Model: {best_model_idx+1}')

    valid_samples = valid_dataset[:]

    predictions = best_model.predict(valid_samples)

    total = len(valid_samples.label)
    correct = torch.sum(predictions == valid_samples.label)
    accuracy = float(correct) / total

    l.write(f'Accuracy of Best Model: {accuracy*100:.02f}%.')

    confusion_matrix, category_encoder = create_confusion_matrix(
        valid_samples.label, predictions)

    category_decoder = {i: c for c, i in category_encoder.items()}

    labeling_errors = top_k_labeling_errors(confusion_matrix,
                                            category_decoder,
                                            k=5)
    label_decoder = {idx: lbl for lbl, idx in encoding._label_encoder.items()}

    # Looking at the most frequent labeling errors.
    for i, error in enumerate(labeling_errors):
        error_0 = label_decoder[error[0]]
        error_1 = label_decoder[error[1]]
        l.write(f'{i+1}. "{error_0}" confused for "{error_1}"')

    l.write('# Persisting Model')
    with s3_write('ml/models/news_classifier/bow_model.torch', 'b') as file:
        torch.save(best_model.state_dict(), file)
Example no. 13
def main():

    l.write('# Loading and Setting Up Data')

    l.write('Loading Training Data')
    with s3_read('ml/data/news_classifier/train_data.json') as file:
        data = pd.read_json(file, orient="records")
        data = data[:1000]

    l.write('Loading embeddings')

    with s3_read('ml/glove_embeddings/glove.6B.100d.txt') as file:
        embeddings = data_utils.load_embeddings(file, embedding_dim=100)

    l.write('Preparing data')

    train_test_split = 0.95
    split_idx = math.floor(len(data) * train_test_split)

    train_data = data.iloc[0:split_idx]
    valid_data = data.iloc[split_idx:]

    encoding = WordEmbeddingEncoding(data, embeddings)
    encoding.prepare()

    train_dataset = WordTokenDataset(train_data, encoding)
    train_dataset.prepare()

    valid_dataset = WordTokenDataset(valid_data, encoding)
    valid_dataset.prepare()

    l.write('# Training the Model')

    hyperparams_list = [
        {
            'weighting': 'uniform',
            'lr': 0.001,
            'batch_size': 100
        },
        {
            'weighting': 'uniform',
            'lr': 0.01,
            'batch_size': 100
        },
        {
            'weighting': 'uniform',
            'lr': 0.001,
            'batch_size': 50
        },
        {
            'weighting': 'uniform',
            'lr': 0.01,
            'batch_size': 50
        },
    ]

    models = []
    train_losses_list = []
    valid_losses = []

    accepted_tokens = {t for t in embeddings.index}
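    # accepted_tokens collects every token that has a pretrained GloVe vector
    # (it is not referenced again in this snippet).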

    for i, hyperparams in enumerate(hyperparams_list):
        l.write(f'Model {i+1} / {len(hyperparams_list)}')

        start_time = time()

        batch_size = hyperparams['batch_size']
        lr = hyperparams['lr']
        weighting = hyperparams['weighting']

        # 1. Setup Data Loader

        data_loader = DataLoader(dataset=train_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 collate_fn=data_utils.collate_samples)

        # 2. Create the Model

        model = Model(embeddings=embeddings,
                      n_classes=encoding.n_classes(),
                      weighting=weighting)

        # 3. Setup Criterion and Optimizer

        criterion = torch.nn.CrossEntropyLoss()

        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # 4. Train the Model

        train_losses = train(model,
                             criterion,
                             optimizer,
                             train_dataset,
                             data_loader,
                             epochs=EPOCHS)

        # 5. Calculate Validation Loss

        with torch.no_grad():
            valid_samples = valid_dataset[:]

            outputs = model(valid_samples)

            valid_loss = criterion(outputs, valid_samples.label)
            valid_losses.append(valid_loss)

        end_time = time()

        models.append(model)
        train_losses_list.append(train_losses)

        l.write(f'Model completed in {(end_time - start_time)/60:.02f}m.\n')

    l.write('# Results')

    uniform_mask = [hp['weighting'] == 'uniform' for hp in hyperparams_list]

    models = [m for i, m in enumerate(models) if uniform_mask[i]]
    train_losses_list = [
        losses for i, losses in enumerate(train_losses_list) if uniform_mask[i]
    ]
    valid_losses = [
        loss.item() for i, loss in enumerate(valid_losses) if uniform_mask[i]
    ]

    best_model_idx = valid_losses.index(min(valid_losses))
    best_model = models[best_model_idx]

    l.write(f'Best Model: {best_model_idx+1}')
    l.write('Computing Model Accuracy...')

    samples = valid_dataset[:]

    predictions = best_model.predict(samples)

    total = len(samples.label)
    correct = torch.sum(predictions == samples.label)

    l.write(f'Accuracy of Model: {(float(correct) / total)*100:.02f}%.')

    l.write('Persisting Models...')

    with s3_write('ml/models/news_classifier/glove_model.torch', 'b') as file:
        torch.save(best_model.state_dict(), file)

    l.write('Done!')
Example no. 14
import preprocessing_photos
import modeling_k_means
import preprocessing_user_preferences
import recommend_for_each_user

import datetime
from utils import logger

#logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Started preprocessing_photos'))
#logger.flush()
#preprocessing_photos.main()
#logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Started modeling_k_means'))
#logger.flush()
#modeling_k_means.main()
#logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Started preprocessing_user_preferences'))
#logger.flush()
#preprocessing_user_preferences.main()
#logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Started recommend_for_each_user'))
#logger.flush()
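# Only the final recommendation step runs here; the earlier pipeline stages are left commented out above.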
recommend_for_each_user.main('v0.9.0')
logger.write('{}: {}\n'.format(datetime.datetime.now(), 'Finished'))
logger.flush()
logger.close()
Example no. 15
            buildoption.append(v[4])
            my_upload[key] = tuple(buildoption)
        utils.buildall(None, my_upload)
        if logger.errors:
            sys.exit(1)
        utils.upload_binaries()
        if logger.errors:
            sys.exit(1)

        # apply the specific patch and share the rebuilt libraries
        if csv_feature:
            cmd = ''.join(["hg import ", my_patchlocation])
            p = Popen(cmd, cwd=my_x265_source, stdout=PIPE, stderr=PIPE)
            p.communicate()  # wait for hg import to finish so returncode is set
            my_patchrevision = utils.hgversion(my_x265_source)
            if p.returncode:
                logger.write('\nfailed to apply patch\n')
                p = Popen("hg revert --all",
                          cwd=my_x265_source,
                          stdout=PIPE,
                          stderr=PIPE)
                p = Popen("hg clean",
                          cwd=my_x265_source,
                          stdout=PIPE,
                          stderr=PIPE)
                cmd = ''.join(["hg strip ", my_patchrevision])
                p = Popen(cmd, cwd=my_x265_source, stdout=PIPE, stderr=PIPE)
            else:
                utils.buildall(None, my_upload)
                extras = [
                    '--psnr', '--ssim', '--csv-log-level=3', '--csv=test.csv',
                    '--frames=10'
Example no. 16
    def logJson(self):
        self.log['final']['SimulationTime'] = self.getTime()
        logger.write(self.log)