Example #1
 def predict(self, fn):
     '''Sample prediction.'''
     pt = Preprocessor()
     tmp = pt.img2vec(fn)
     X_test = tmp.reshape(1, -1)
     ans = self.clf.predict(X_test)
     return ans
Example #2
def main(args):

    if args.dataset == 'Dolphin18k':
        from utils import Preprocessor
    elif args.dataset == 'Math23k':
        from utils import Math23kPreprocessor as Preprocessor
    else:
        logging.error('Incompatible dataset!')
        return

    if args.index is not None:
        with open(args.index) as f:
            shuffled_index = json.load(f)
    else:
        shuffled_index = None

    preprocessor = Preprocessor(args.embedding_path)

    train, valid = preprocessor.get_train_valid_dataset(args.data_path,
                                                        args.valid_ratio,
                                                        index=shuffled_index,
                                                        char_based=args.char_based)

    with open(args.output, 'wb') as f:
        pickle.dump({'train': train,
                     'valid': valid,
                     'preprocessor': preprocessor}, f)
Example #3
def eval_db_agent(env, params):
    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'], params['use_luminance'],
                                    params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = VAE(params['state_dim'], params['action_dim'])
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name'])))
    else:
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name']), map_location='cpu'))
    agent.eval()

    agent_steps = 0
    episode_rewards = []
    start = time.time()
    for episode in range(1, params['num_episodes'] + 1):
        env_state = env.reset()
        episode_reward = 0.0
        for t in range(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()

            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state

            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, state_val = agent.sample_action_eval(var_state)

            reward = 0.0
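            # action-repeat loop (repeat factor 1 here): step the env and accumulate reward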
            for _ in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break

            episode_reward += reward

            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t

        if preprocessor:
            preprocessor.reset()

        print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3} | Total Time {4}'
              .format(episode, agent_steps, episode_reward, sum(episode_rewards[-100:]) / 100,
                      timeSince(start, episode / params['num_episodes'])))
Example #4
class TestFeatureAdder(unittest2.TestCase):
    def setUp(self):
        self.stats_calc = StatsCalculator()
        self.preprocessor = Preprocessor()
        self.feature_adder = FeatureAdder()
        self.col_names = [f'feature_{i}' for i in range(FEATURES)]

    def tearDown(self):
        self.stats_calc = None
        self.preprocessor = None
        self.feature_adder = None

    def _get_df(self):
        df = pd.read_csv('data/train.tsv', sep='\t')
        df = self.preprocessor.split_features(df)
        df = self.preprocessor.f_to_int(df)
        return df

    def test_max_index_feature(self):
        """
        Test that new feature 'max_feature_2_index' lies in proper range and has dtype 'int64'
        """
        df = self._get_df()
        new_feature = 'max_feature_2_index'

        df = self.feature_adder.max_index_feature(df)
        valid_range, valid_dtype = (0, 255), 'int64'

        assert df[new_feature].between(*valid_range).all() and df[new_feature].dtype == valid_dtype, \
            "max_feature_2_index feature not in range OR has wrong dtype"

    def test_abs_mean_diff_feature(self):
        """
        Test that new feature 'max_feature_2_abs_mean_diff' is valid
        """
        df = self._get_df()
        df = self.feature_adder.max_index_feature(df)
        new_feature = 'max_feature_2_abs_mean_diff'
        cols = np.array(self.col_names)[df['max_feature_2_index'].values]
        train_stats = find_train_stats('data/train.tsv', chunksize=10000)
        df = self.feature_adder.abs_mean_diff_feature(
            df.loc[:, df.columns != 'id_job'], train_stats)
        results = []

        for i, col in enumerate(cols):
            # keep in mind outliers in test data
            lower_bound, upper_bound = 0, train_stats[col]['std']
            results.append(lower_bound <= df[new_feature][i] <= upper_bound)

        self.assertTrue(
            np.all(results),
            "max_feature_2_index feature not in expected range OR has wrong dtype"
        )
Example #5
def main(test_csv, test_target_csv, prediction_csv, model_dir):
    start_time = time.time()

    # load model
    model_config_filename = os.path.join(model_dir, 'model_config.pkl')
    metric_file = os.path.join(model_dir, 'rating.txt')

    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)

    # read dataset
    df = pd.read_csv(test_csv)
    print('Dataset read, shape {}'.format(df.shape))

    line_id = df['line_id']

    preprocessor = Preprocessor(model_config['features'])
    df_X = preprocessor.transform(df)

    model = model_config['model']

    if model_config['mode'] == 'regression':
        df['prediction'] = model.predict(df_X.values)

    elif model_config['mode'] == 'classification':
        df['prediction'] = model.predict_proba(df_X.values)[:, 1]

    df['line_id'] = line_id
    df[['line_id', 'prediction']].to_csv(prediction_csv, index=False)

    print('Prediction time: {}'.format(time.time() - start_time))

    if test_target_csv:
        def save_metric(metric):
            with open(metric_file, 'a') as f:
                f.write('{}\n'.format(metric))

        # read targets
        test = pd.read_csv(test_target_csv)

        print('Read targets, shape {}'.format(test.shape))

        if model_config['mode'] == 'regression':
            pred = preprocessor.target_inverse_transform(df['prediction'])
            mse = np.mean((test.target - pred)**2)
            r_2 = 1 - mse/np.std(test.target)**2
            print('MSE: {}'.format(mse))
            print('R^2: {}'.format(r_2))
            save_metric(r_2)
        elif model_config['mode'] == 'classification':
            auc = roc_auc_score(test.target, df['prediction'])
            print('AUC: {}'.format(auc))
            save_metric(auc)
Example #6
def run():
    pt = Preprocessor()
    tr = Trainer()
    X_train, y_train = pt.load_data()
    X_test, y_test = pt.load_data("mnist_test_data.npz")

    x1 = X_train.reshape((-1, 28, 28, 1))
    x2 = X_test.reshape((-1, 28, 28, 1))

    y1 = keras.utils.to_categorical(y_train, len(np.unique(y_train)))
    y2 = keras.utils.to_categorical(y_test, len(np.unique(y_test)))

    clf = tr.cnn(x1, y1, x2, y2)
    tr.save(clf, "cnn_mnist_keras.h5")
    return clf
Example #7
def train_agent(cmdl):
    step_cnt = 0
    ep_cnt = 0
    preprocess = Preprocessor(cmdl.env_class).transform

    env = utils.get_new_env(cmdl.env_name)
    agent = get_agent(cmdl.agent.name)(env.action_space, cmdl.agent)
    display_setup(env, cmdl)

    start_time = time.time()
    while step_cnt < cmdl.training.step_no:

        ep_cnt += 1
        o, r, done = env.reset(), 0, False
        s = preprocess(o)

        while not done:
            a = agent.evaluate_policy(s)
            o, r, done, _ = env.step(a)
            _s, _a = s, a
            s = preprocess(o)
            agent.improve_policy(_s, _a, r, s, done)

            step_cnt += 1
            agent.gather_stats(r, done)

        if ep_cnt % cmdl.report_freq == 0:
            agent.display_stats(start_time)
            agent.display_model_stats()

    end_time = time.time()
    display_stats(ep_cnt, step_cnt, end_time - start_time)
    """
Example #8
def main():
    # create prediction dataframe and score dataframe list
    prediction_df = pd.DataFrame()
    score_df_list = []

    # transform, predict, score for each combination
    for scaler, (model_class, params) in product(config.scalers, config.models):

        # assign preprocessor and model
        preprocessor = Preprocessor(scaler)
        model = model_class(**params)

        # process train, validation, test data
        train_score, validation_score = process_train_data(preprocessor, model)
        test_prediction, test_score = process_test_data(preprocessor, model)

        # add predicted data to prediction dataframe
        prediction_df[
            f"{scaler.__name__}-{model}"
        ] = test_prediction

        # add score dataframes to the list
        score_df_list.extend([train_score, validation_score, test_score])

    # concatenate score dataframes
    report_df = pd.concat(score_df_list)

    # save prediction and report to csv files
    prediction_df.to_csv(config.prediction_file_path)
    report_df.to_csv(config.report_file_path)
Example #9
 def __init__(self, config_path):
     self.stream_api = StreamAPI.StreamAPI(config_path)
     self.sorter = TweetSorter.TweetSorter()
     self.preprocessor = Preprocessor.Preprocessor()
     self.selector = AttributeSelector.AttributeSelector()
     self.saver = ImageSaver.ImageSaver()
     self.vader = SentimentAnalyzer.SentimentAnalyzer()
     self.db = []
     self.tweets_num = 0
Example #10
def run_train():
    #    t0 = time.time()
    pt = Preprocessor()
    tr = Trainer_nn()

    X_train, y_train = pt.get_data_labels()
    X_test, y_test = pt.get_data_labels("test")

    #    X_train, y_train = pt.load_data()
    #    X_test, y_test = pt.load_data("mnist_test_data.npz")

    clf = tr.mlp(X_train, y_train)
    tr.save_model(clf, "mlp_mnist_Hu300x300ReluSgdIter100Acc96Sample60000.m")

    tester = Tester("mlp_mnist_Hu300x300ReluSgdIter100Acc96Sample60000.m")
    mt, score, repo = tester.clf_quality(X_test, y_test)
    print(mt, score, repo)
    return clf
Example #11
def run_train():
    t0 = time.time()
    pt = Preprocessor()
    tr = Trainer()

    X_train, y_train = pt.get_data_labels()
    X_test, y_test = pt.get_data_labels("test")

    t1 = time.time()
    print(t1 - t0)
    clf = tr.svc(X_train, y_train)
    print(time.time() - t1)

    tr.save_model(clf, "mnist_svm.m")

    tester = Tester("mnist_svm.m")
    mt, score, repo = tester.clf_quality(X_test, y_test)
    print(mt, score, repo)
    return clf
Example #12
def run():
    pt = Preprocessor()
    tr = Trainer()
    ts = Tester()
    t0 = time.time()
    X_train, y_train = pt.load_data()
    X_test, y_test = pt.load_data("mnist_test_data.npz")

    X_train, y_train = make_shuffle(X_train, y_train)
    X_test, y_test = make_shuffle(X_test, y_test)

    X_train = X_train.reshape((-1, 1, 28, 28))
    X_test = X_test.reshape((-1, 1, 28, 28))
    print(time.time() - t0)
    t1 = time.time()
    clf = tr.net(X_train, y_train)
    print(time.time() - t1)
    acc = ts.get_acc(clf, X_test, y_test)  #acc=97.8%

    return clf, acc
Example #13
def main():
    emotionals, rationals = emotional_rational()

    preprocessor = Preprocessor()
    emotionals = preprocessor.parse_sentences(emotionals)
    rationals = preprocessor.parse_sentences(rationals)

    train_pos = emotionals[:len(emotionals) // 2]
    train_neg = rationals[:len(rationals) // 2]

    test_pos = emotionals[len(emotionals) // 2:]
    test_neg = rationals[len(rationals) // 2:]

    vectorizer = CountVectorizer()

    X_train = vectorizer.fit_transform(train_pos + train_neg)
    y_train = np.array([1] * len(train_pos) + [0] * len(train_neg))

    X_test = vectorizer.transform(test_pos + test_neg)
    y_test = np.array([1] * len(test_pos) + [0] * len(test_neg))

    print('Vocabulary size : {}'.format(len(vectorizer.vocabulary_)))

    nbsvm = NBSVM()
    nbsvm.fit(X_train, y_train)

    print('Test accuracy : {}'.format(nbsvm.score(X_test, y_test)))

    y_pred = nbsvm.predict(X_test)
    print('F1 score : {}'.format(f1_score(y_test, y_pred, average='macro')))

    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
    roc_auc = auc(fpr, tpr)
    print('AUC of emotionals : {}'.format(roc_auc))
    plot_roc_curve(fpr, tpr, roc_auc, 'nbsvm_emotional_roc.png')

    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=0)
    roc_auc = auc(fpr, tpr)
    print('AUC of rationals : {}'.format(roc_auc))
    plot_roc_curve(fpr, tpr, roc_auc, 'nbsvm_rational_roc.png')
Example #14
def evaluate_agent(crt_training_step, eval_env, eval_agent, policy, cmdl):
    print("[Evaluator]  Initializing at %d training steps:" %
          crt_training_step)
    agent = eval_agent

    eval_env.get_crt_step(crt_training_step)
    agent.policy_evaluation.policy.load_state_dict(policy.state_dict())
    preprocess = Preprocessor(cmdl.env_class).transform

    step_cnt = 0
    o, r, done = eval_env.reset(), 0, False
    while step_cnt < cmdl.evaluator.eval_steps:
        s = preprocess(o)
        a = agent.evaluate_policy(s)
        o, r, done, _ = eval_env.step(a)
        step_cnt += 1
        if done:
            o, r, done = eval_env.reset(), 0, False
Example #15
def train_agent(cmdl):
    step_cnt = 0
    ep_cnt = 0
    start_time = time.time()

    env = utils.get_new_env(cmdl.env_name, cmdl)
    eval_env = EvaluationMonitor(gym.make(cmdl.env_name), cmdl)

    name = cmdl.agent.name
    agent = get_agent(name)(env.action_space, cmdl.agent)
    eval_agent = get_agent(name)(eval_env.action_space, cmdl.agent, False)

    preprocess = Preprocessor(cmdl.env_class).transform
    agent.display_setup(env, cmdl)

    while step_cnt < cmdl.training.step_no:

        ep_cnt += 1
        o, r, done = env.reset(), 0, False
        s = preprocess(o)

        while not done:
            a = agent.evaluate_policy(s)
            o, r, done, _ = env.step(a)
            _s, _a = s, a
            s = preprocess(o)
            agent.improve_policy(_s, _a, r, s, done)

            step_cnt += 1
            agent.gather_stats(r, done)

        if step_cnt % cmdl.report_freq == 0:
            agent.display_stats(start_time)
            agent.display_model_stats()
            gc.collect()

        if step_cnt % cmdl.eval_freq == 0:
            evaluate_agent(step_cnt, eval_env, eval_agent, agent.policy, cmdl)

    end_time = time.time()
    agent.display_final_report(ep_cnt, step_cnt, end_time - start_time)
Example #16
def main():
    emotionals, rationals = emotional_rational()

    preprocessor = Preprocessor()
    emotionals = preprocessor.parse_sentences(emotionals)
    rationals = preprocessor.parse_sentences(rationals)

    emotionals = emotionals[:len(emotionals)]
    rationals = rationals[:len(emotionals)]

    sentences = emotionals + rationals
    Y = np.array([[0, 1]] * len(emotionals) + [[1, 0]] * len(rationals))

    max_features = 200
    tokenizer = Tokenizer(num_words=max_features, split=' ')
    tokenizer.fit_on_texts(sentences)

    X = tokenizer.texts_to_sequences(sentences)
    X = pad_sequences(X, maxlen=MAX_LEN)

    epochs = 15

    # --- Add Features ---
    dict_loader = EmotionalDict('dataset/nouns', 'dataset/verbs')
    emotional_dict = dict_loader.load()

    features_loader = AdditionalFeatures(emotionals+rationals, emotional_dict)
    add_features = features_loader.emotional_features()
    ######################

    x_aux_train = add_features[:848]
    x_aux_test = add_features[848:]

    model = build_model(x_aux_train.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    print(X_train.shape, Y_train.shape)
    print(X_test.shape, Y_test.shape)

    batch_size = 32
    model.fit({'main_input': X_train, 'add_input': x_aux_train}, Y_train, epochs=epochs, batch_size=batch_size, verbose=2)

    score, acc = model.evaluate({'main_input': X_test, 'add_input': x_aux_test}, Y_test, verbose=2, batch_size=batch_size)

    print('score: {}'.format(score))
    print('acc: {}'.format(acc))

    Y_pred = model.predict({'main_input': X_test, 'add_input': x_aux_test}, batch_size=1, verbose=2)

    print(classification_report(Y_test[:, 1], np.round(Y_pred[:, 1]), target_names=['rationals', 'emotionals']))

    fpr, tpr, _ = roc_curve(Y_test[:, 1], Y_pred[:, 1])
    roc_auc = auc(fpr, tpr)
    plot_roc_curve(fpr, tpr, roc_auc, 'roc.png')

    cnf_matrix = confusion_matrix(Y_test[:, 1], np.round(Y_pred[:, 1]))
    plot_confusion_matrix(cnf_matrix, ['rationals', 'emotionals'], 'cnf.png')

    attention_vector = np.mean(get_activations(model, X_test, True, 'attention_vec')[0], axis=2).squeeze()
    attention_vector = np.mean(attention_vector, axis=0)

    import matplotlib.pyplot as plt
    import pandas as pd
    pd.DataFrame(attention_vector, columns=['attention (%)']).plot(kind='bar', title='Attention')
    plt.savefig('attention_vec.png')

    attention_vector_indices = np.argsort(attention_vector)[::-1]

    word_index = tokenizer.word_index
    word_index_inv = {v: k for k, v in word_index.items()}

    with open('attention_word.txt', 'w') as f:
        for i, attention_index in enumerate(attention_vector_indices, start=1):
            try:
                print('No.{} : {}'.format(i, word_index_inv[attention_index]), file=f)
            except KeyError:
                continue
Example #17
from utils import Preprocessor

if __name__ == '__main__':
    print('Initializing preprocessor')
    preprocessor = Preprocessor()
    print('Running preprocessor')
    preprocessor.run()
    print('Saving trie and inverted index')
    preprocessor.save()
    print('Preprocessor stats')
    max_key_length = max(map(len, preprocessor.stats.keys()))
    for k, v in preprocessor.stats.items():
        print(f"{k.ljust(max_key_length)}: {v}")

    print('Most common tokens')
    max_key_length = max(map(len, preprocessor.most_common.keys()))
    for token, token_count in preprocessor.most_common.items():
        print(f"{repr(token).ljust(max_key_length+2)} appeared at least ONCE in {str(token_count).ljust(5)} documents")
    print('Done')
Example #18
def train(args):
    vocab = Vocab.load(args.vocab, max_size=args.vocab_size)
    data_reader = DataReader(data_dir=args.data_dir, shuffle=True)
    preprocessor = Preprocessor(
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next,
        vocab=vocab, max_length=args.max_length, gpu=args.gpu)
    model = SkipThought(
        rnn_type=args.rnn_type, num_words=len(vocab),
        word_dim=args.word_dim, hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next)
    print(model)

    if args.pretrained is not None:
        print(f'Loading pretrained model from {args.pretrained}')
        model.load_state_dict(
            torch.load(args.pretrained,
                       map_location=lambda storage, loc: storage))
    if args.gpu > -1:
        model.cuda(args.gpu)
    optimizer = optim.Adam(model.parameters())

    summary_writer = SummaryWriter(os.path.join(args.save_dir, 'log'))

    def add_scalar_summary(name, value, step):
        summary_writer.add_scalar(tag=name, scalar_value=value,
                                  global_step=step)

    def add_text_summary(name, value, step):
        summary_writer.add_text(tag=name, text_string=value,
                                global_step=step)

    def variable(tensor, volatile=False):
        return Variable(tensor, volatile=volatile)

    def run_train_iter(batch):
        if not model.training:
            model.train()
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0]), tgt[k][1])
        logits = model.forward(src=src, tgt=tgt)
        loss = 0
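        # sum sequence cross-entropy over every enabled decoder head (prev/cur/next)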
        for k in tgt:
            logits_k = logits[k]
            tgt_k = tgt[k]
            loss = loss + basic.sequence_cross_entropy(
                logits=logits_k[:-1], targets=tgt_k[0][1:],
                length=tgt_k[1] - 1)
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), max_norm=10)
        optimizer.step()
        return loss.data[0]

    def ids_to_words(ids):
        words = []
        eos_id = vocab.stoi(vocab.eos)
        for id_ in ids:
            words.append(vocab.itos(id_))
            if id_ == eos_id:
                break
        return words

    def generate_using_decoder(name, src, max_length):
        _, encoder_state = model.encoder(words=src[0], length=src[1])
        if isinstance(encoder_state, tuple):  # LSTM
            encoder_state = encoder_state[0]
        context = (encoder_state.transpose(0, 1).contiguous()
                   .view(-1, args.hidden_dim))
        batch_size = src[1].size(0)

        bos_id = vocab.stoi(vocab.bos)
        bos = Variable(src[1].new(1, batch_size).fill_(bos_id))
        decoder = model.get_decoder(name)
        prev_pred = bos
        done = torch.zeros(batch_size).byte()
        hyps = []
        prev_state = context.unsqueeze(0)
        for t in range(max_length):
            if done.all():
                break
            decoder_input = prev_pred
            logit, prev_state = decoder(words=decoder_input,
                                        prev_state=prev_state)
            pred = logit.max(2)[1]
            prev_pred = pred
            hyps.append(pred.data)
        hyps = torch.cat(hyps, dim=0).transpose(0, 1).tolist()
        return hyps

    def generate(batch):
        # Greedy search
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0], volatile=True), tgt[k][1])
        batch_size = src[0].size(1)
        max_length = src[0].size(0) * 2
        generated = {}
        for k in tgt:
            generated[k] = generate_using_decoder(
                name=k, src=src, max_length=max_length)
        results = []
        for i in range(batch_size):
            res = {'src': ' '.join(ids_to_words(src[0][:src[1][i], i].data)),
                   'tgt': {},
                   'out': {}}
            for k in tgt:
                res['tgt'][k] = ' '.join(ids_to_words(tgt[k][0][1:, i].data))
                res['out'][k] = ' '.join(ids_to_words(generated[k][i]))
            results.append(res)
        return results

    def generate_synthetic_batch(real_batch):
        def sort_by_length(tgt_of_key):
            sorted_length, sort_inds = tgt_of_key[1].sort(
                dim=0, descending=True)
            return tgt_of_key[0][:, sort_inds], sorted_length

        # Forward: given prev, generate cur'
        _, tgt = preprocessor(real_batch)
        tgt_prev, tgt_prev_length = sort_by_length(tgt['prev'])
        syn_src_fw = generate_using_decoder(
            name='next',
            src=(variable(tgt_prev[1:], volatile=True),
                 tgt_prev_length - 1),
            max_length=args.max_length)
        # Backward: given next, generate cur''
        tgt_next, tgt_next_length = sort_by_length(tgt['next'])
        syn_src_bw = generate_using_decoder(
            name='prev',
            src=(variable(tgt_next[1:], volatile=True),
                 tgt_next_length - 1),
            max_length=args.max_length)
        syn_batch_fw = []
        syn_batch_bw = []
        for i in range(len(real_batch)):
            syn_src_fw_str = ' '.join(ids_to_words(syn_src_fw[i]))
            syn_src_bw_str = ' '.join(ids_to_words(syn_src_bw[i]))
            syn_batch_fw.append(
                (real_batch[i][0], syn_src_fw_str, real_batch[i][2]))
            syn_batch_bw.append(
                (real_batch[i][0], syn_src_bw_str, real_batch[i][2]))
        return syn_batch_fw, syn_batch_bw

    global_step = 0

    def print_samples():
        model.eval()
        num_samples = 2
        samples = data_reader.next_batch(size=num_samples, peek=True)
        syn_samples_fw, syn_samples_bw = generate_synthetic_batch(samples)
        gen_results = generate(samples)
        syn_gen_results_fw = generate(syn_samples_fw)
        syn_gen_results_bw = generate(syn_samples_bw)
        text_val = ''
        for i, res in enumerate(gen_results):
            text_val += f'* sample (real) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_fw):
            text_val += f'* sample (syn_fw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_bw):
            text_val += f'* sample (syn_bw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        add_text_summary('Sample', value=text_val, step=global_step)

    for epoch in range(args.max_epoch):
        data_reader.start_epoch()
        for batch in tqdm(data_reader.iterator(args.batch_size),
                          desc=f'Epoch {epoch}'):
            # Train on real batch
            real_loss = run_train_iter(batch)
            # Train on synthetic batches
            syn_batch_fw, syn_batch_bw = generate_synthetic_batch(batch)
            syn_loss_fw = run_train_iter(syn_batch_fw)
            syn_loss_bw = run_train_iter(syn_batch_bw)
            global_step += 1
            add_scalar_summary(name='real_loss', value=real_loss,
                               step=global_step)
            add_scalar_summary(name='syn_loss_fw', value=syn_loss_fw,
                               step=global_step)
            add_scalar_summary(name='syn_loss_bw', value=syn_loss_bw,
                               step=global_step)
            if global_step % args.print_every == 0:
                print_samples()
            if global_step % args.save_every == 0:
                model_filename = f'model-{global_step}.pt'
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                print(f'\nIter #{global_step}: '
                      f'Saved checkpoint to {model_path}')
Example #19
if config["encoder"] == "BERT":
    tokenizer = BertTokenizerFast.from_pretrained(config["bert_path"],
                                                  add_special_tokens=False,
                                                  do_lower_case=False)
    tokenize = tokenizer.tokenize
    get_tok2char_span_map = lambda text: tokenizer.encode_plus(
        text, return_offsets_mapping=True, add_special_tokens=False)[
            "offset_mapping"]
elif config["encoder"] == "BiLSTM":
    tokenize = lambda text: text.split(" ")

    def get_tok2char_span_map(text):
        tokens = tokenize(text)
        tok2char_span = []
        char_num = 0
        for tok in tokens:
            tok2char_span.append((char_num, char_num + len(tok)))
            char_num += len(tok) + 1  # +1: whitespace
        return tok2char_span


preprocessor = Preprocessor(tokenize_func=tokenize,
                            get_tok2char_span_map_func=get_tok2char_span_map)
ori_format = config["ori_data_format"]
if ori_format != "tplinker":  # if tplinker, skip transforming
    for file_name, data in file_name2data.items():
        if "train" in file_name:
            data_type = "train"
        if "valid" in file_name:
            data_type = "valid"
        if "test" in file_name:
            data_type = "test"
        data = preprocessor.transform_data(data,
                                           ori_format=ori_format,
                                           dataset_type=data_type,
                                           add_id=True)
        file_name2data[file_name] = data
Example #20
from configuration import Config
from utils import CharaterTable, Preprocessor
from CaptionModel import CaptionModel
from tensorflow import flags
FLAGS = flags.FLAGS
flags.DEFINE_integer("caption_len", 25, "The length of caption")
flags.DEFINE_string(
    "model_weights",
    "/home/suxin/ImageCaption/ImageCaption_coding_test/checkpoint/weights.031-0.776.hdf5",
    "The weights file of test model")

config = Config()
data = Preprocessor(config)
ctable = CharaterTable(data.train_captions + data.val_captions)

caption_len = FLAGS.caption_len

caption_model = CaptionModel(image_len=data.image_len,
                             caption_len=caption_len,
                             vocab_size=ctable.vocab_size,
                             ifpool=config.ifpool)

caption_model.build_inference_model(FLAGS.model_weights, beam_search=False)
result = caption_model.inference(data.val_set)
num = result.shape[0]

captions = [ctable.decode(result[i], calc_argmax=False) for i in range(num)]

for i, caption in enumerate(captions):
    print(i + 8001, end=' ')
    for word in caption:
        print(word, end=' ')
    print()
Example #21
from utils import save_model
from utils import loader
from featurizer import Featurize
from sentiment_analyzer import NaiveBayes
from sklearn.model_selection import train_test_split
from evaluator import evaluate_accuracy
import pickle

from preprocessor import Preprocessor  # assumed module; the original snippet uses Preprocessor without importing it

X, y = loader()
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
print(X_train)

token = Preprocessor()
with open('preprocessor.obj', 'wb') as file1:
    pickle.dump(token, file1)

train_preprocessed = [token.tweet_cleaner(i) for i in X_train]
f = Featurize()
train_features = f.vectorize_train(train_preprocessed)
with open('featurizer.obj', 'wb') as file2:
    pickle.dump(f, file2)

model = NaiveBayes()
clf = model.train(train_features, labels=y_train)
save_model(clf)
print('Model is trained')
Example #22
def cache_abstraction(env, params):
    if os.path.exists('./out/{0}'.format(params['env_name'])):
        shutil.rmtree('./out/{0}'.format(params['env_name']))

    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'], params['use_luminance'],
                                    params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = VAE(params['state_dim'], params['action_dim'])
    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name'])))
    else:
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'], params['env_name']), map_location='cpu'))
    agent.eval()

    agent_steps = 0
    episode_rewards = []
    start = time.time()
    for episode in range(1):
        env_state = env.reset()
        episode_reward = 0.0
        for t in range(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()

            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state

            var_state = createVariable(state, use_cuda=params['use_cuda'])
            # action, state_val = agent.sample_action_eval(var_state)
            action, state_val, code = agent.sample_action_eval_code(var_state)

            if not os.path.exists('./out/{0}/{1}'.format(params['env_name'], code)):
                os.makedirs('./out/{0}/{1}'.format(params['env_name'], code))
            preprocessor.get_img_state().save('./out/{0}/{1}/{2}.png'.format(params['env_name'], code, t))

            reward = 0.0
            for _ in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break

            episode_reward += reward

            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t

        if preprocessor:
            preprocessor.reset()

        print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3}'
              .format(episode, agent_steps, episode_reward, sum(episode_rewards[-100:]) / 100))
Example #23
def eval_agent_parallel(envs, params):
    preprocessors = []
    for _ in range(params['num_envs']):
        if params['use_preproc']:
            preprocessor = Preprocessor(params['state_dim'], params['history'],
                                        params['use_luminance'],
                                        params['resize_shape'])
            params['state_dim'] = preprocessor.state_shape
        else:
            preprocessor = None
        preprocessors.append(preprocessor)

    agent = agent_lookup(params)

    restore_model(agent, params['restore'], params['use_cuda'])
    if params['use_cuda']:
        agent.cuda()

    agent.eval()

    episode_rewards = []
    start = time.time()
    for episode in range(1, params['num_episodes'] + 1):
        env_states = [env.reset() for env in envs]
        states = [
            preprocessors[i].process_state(env_states[i])
            if preprocessors[i] else env_states[i] for i in range(len(envs))
        ]
        env_status = [False for _ in envs]
        episode_reward = [0.0 for _ in envs]
        for t in range(1, params['max_steps'] + 1):

            if all(env_status):
                break

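            # step each environment in turn, skipping any that have already terminated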
            for i, env in enumerate(envs):

                if params['env_render']:
                    env.render()

                if env_status[i]:
                    continue

                var_state = createVariable(states[i],
                                           use_cuda=params['use_cuda'])
                action, state_val = agent.sample_action_eval(var_state)

                reward = 0.0
                for _ in range(1):
                    env_states[i], r, terminal, _ = env.step(action)
                    reward += r
                    if terminal:
                        env_status[i] = True
                        break
                episode_reward[i] += reward
                states[i] = preprocessors[i].process_state(
                    env_states[i]) if preprocessors[i] else env_states[i]

        for p in preprocessors:
            if p:
                p.reset()

        episode_rewards.extend(episode_reward)

        if episode % params['print_every'] == 0:
            print('Episode {0} | Total Reward {1} | Mean Reward {2} | Total Time {3}'
                  .format(episode, episode_reward, sum(episode_rewards[-100:]) / 100,
                          timeSince(start, episode / params['num_episodes'])))
Example #24
def cache_eval_episode(env, params):
    cache_states, cache_distros = [], []

    if params['use_preproc']:
        preprocessor = Preprocessor(params['state_dim'], params['history'],
                                    params['use_luminance'],
                                    params['resize_shape'])
        params['state_dim'] = preprocessor.state_shape
    else:
        preprocessor = None

    agent = agent_lookup(params)

    if params['use_cuda']:
        agent = agent.cuda()
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'],
                                                 params['env_name'])))
    else:
        agent.load_state_dict(
            torch.load('./agents/{0}_{1}'.format(params['arch'],
                                                 params['env_name']),
                       map_location='cpu'))

    agent_steps = 0
    episode_rewards = []
    start = time.time()
    for episode in range(1):
        env_state = env.reset()
        episode_reward = 0.0
        for t in range(1, params['max_steps'] + 1):
            if params['env_render']:
                env.render()

            if preprocessor:
                state = preprocessor.process_state(env_state)
            else:
                state = env_state

            var_state = createVariable(state, use_cuda=params['use_cuda'])
            action, state_val, distro = agent.sample_action_distro(var_state)
            cache_states.append(state)
            cache_distros.append(distro.cpu().numpy())

            reward = 0.0
            for _ in range(1):
                env_state, r, terminal, _ = env.step(action)
                reward += r
                if terminal:
                    break

            episode_reward += reward

            if terminal:
                break

        episode_rewards.append(episode_reward)
        agent_steps += t

        if preprocessor:
            preprocessor.reset()

        if episode % params['print_every'] == 0:
            print('Episode {0} | Total Steps {1} | Total Reward {2} | Mean Reward {3}'
                  .format(episode, agent_steps, episode_reward, sum(episode_rewards[-100:]) / 100))

    cache_states, cache_distros = np.array(cache_states), np.array(
        cache_distros)
    with open('./out/{0}_{1}_episode.pkl'.format(params['arch'],
                                                 params['env_name']), 'wb') as f:
        pickle.dump((cache_states, cache_distros), f, -1)
Example #25
class TestStatsCalculator(unittest2.TestCase):
    def setUp(self):
        self.stats_calc = StatsCalculator()
        self.preprocessor = Preprocessor()
        self.col_names = [f'feature_{i}' for i in range(FEATURES)]

    def tearDown(self):
        self.stats_calc = None
        self.preprocessor = None

    def _get_df(self):
        df = pd.read_csv('data/train.tsv', sep='\t')
        df = self.preprocessor.split_features(df)
        df = self.preprocessor.f_to_int(df)
        return df

    def test_mean_calc(self):
        df = self._get_df()
        col = random.choice(self.col_names)

        res = self.stats_calc.calc_mean(df, col)
        valid_res = np.mean(df[col])

        self.assertEqual(res, valid_res, "Wrong mean calculation")

    def test_std_calc(self):
        df = self._get_df()
        col = random.choice(self.col_names)

        res = self.stats_calc.calc_std(df, col)
        valid_res = np.std(df[col])

        self.assertEqual(res, valid_res, "Wrong std calculation")

    def test_speed(self):
        """
        Test parallelized mean calculation
        """
        df = self._get_df()
        col = random.choice(self.col_names)

        def wrapper(func):
            def inner(df, col, multiproc=False):
                start = time.time()
                result = func(df, col)
                end = time.time()
                print(f'\nResult of calculation: {result}')
                if multiproc:
                    print(f'Timing of calc in parallel: {end - start}')
                else:
                    print(f'Timing of sequential calc: {end - start}')
                return result

            return inner

        seq_calc = wrapper(self.stats_calc.calc_mean)
        res = seq_calc(df, col)
        parallel_calc = wrapper(self.stats_calc.calc_mean)
        parallel_calc(df, col, multiproc=True)

        true_value = np.mean(df[col])
        self.assertEqual(res, true_value, "Wrong mean calculation")
Example #27
def train(**kwargs):
    kwargs["real_batch_size"] = kwargs["batch_size"] * kwargs["gradient_accumulation"]
    kwargs["train_dataset"] = kwargs["train_dataset"].split("-")
    kwargs["validation_dataset"] = kwargs["validation_dataset"].split("-")
    args = SimpleNamespace(**kwargs)
    if args.dryrun:
        os.environ['WANDB_MODE'] = 'dryrun'
    run = wandb.init(project="SAPAUT-PAUSES", name=args.run_name)
    if args.load:
        if not os.path.exists(f"models/{args.load}"):
            artifact = run.use_artifact(args.load + ":latest")
            artifact.download(root=f"models/{args.load}")
        args.model_name = f"models/{args.load}"
    special_tokens = ["<punct>"]
    if args.include_pauses:
        if not args.replace_pause:
            special_tokens.append("<pause>")
        ds_type = "ref-pauses"
    else:
        ds_type = "ref"
    if args.teacher_forcing:
        ds_type += "-tf"
    if args.tagging:
        ds_type += "-tag"
    if args.pause_threshold != 0.2:
        download_mode = "reuse_cache_if_exists"
    else:
        download_mode = "reuse_dataset_if_exists"
    ds_train = load_dataset(
        f"punctuation-iwslt2011/{args.train_dataset[0]}.py",
        ds_type,
        download_mode=download_mode,
        splits=[args.train_dataset[1]],
        ignore_verifications=True,
        lookahead_range=args.lookahead,
        pause_threshold=args.pause_threshold,
    )
    print("len", len(ds_train["validation"]))
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name, fast=True, additional_special_tokens=special_tokens, add_prefix_space=args.tagging
    )
    if not args.tagging:
        label_names = ds_train[args.train_dataset[1]].features["label"].names
    else:
        label_names = ds_train[args.train_dataset[1]].features["label"].feature.names
    preprocessor = Preprocessor(
        tokenizer,
        args,
        label_names,
        args.replace_pause,
        args.tagging,
    )
    ds_train = ds_train.map(
        preprocessor.preprocess, batched=False, num_proc=args.num_proc
    )
    ds_train.rename_column_("label", "labels")
    ds_valid = load_dataset(
        f"punctuation-iwslt2011/{args.validation_dataset[0]}.py",
        ds_type,
        download_mode=download_mode,
        splits=[args.validation_dataset[1]],
        ignore_verifications=True,
        lookahead_range=args.lookahead,
        pause_threshold=args.pause_threshold,
    )
    ds_valid = ds_valid.map(
        preprocessor.preprocess, batched=False, num_proc=args.num_proc
    )
    ds_valid.rename_column_("label", "labels")
    train = ds_train[args.train_dataset[1]]
    valid = ds_valid[args.validation_dataset[1]]
    train.shuffle(42)
    valid.shuffle(42)
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        weight_decay=args.weight_decay,
        logging_dir="./logs",
        logging_steps=args.log_steps,
        evaluation_strategy="steps",
        gradient_accumulation_steps=args.gradient_accumulation,
        eval_steps=args.log_steps,
    )

    config = AutoConfig.from_pretrained(
        args.model_name,
        num_labels=4,
    )

    if not args.tagging:
        model = AutoModelForSequenceClassification.from_pretrained(
            args.model_name, config=config
        )
    else:
        if args.bilstm:
            model = RobertaBiLSTMForTokenClassification.from_pretrained(
                args.model_name, config=config
            )
        else:
            model = AutoModelForTokenClassification.from_pretrained(
                args.model_name, config=config
            )
    model.resize_token_embeddings(len(tokenizer))

    optimizer = AdamW(
        [
            {"params": model.base_model.parameters()},
            {"params": model.classifier.parameters()},
        ],
        lr=args.lr,
        weight_decay=args.weight_decay,
    )

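    # optional rebalancing: cap every class at the max/mean/median of the smaller class counts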
    if args.resample != "None":
        function_dict = {
            "Max": np.max,
            "Mean": np.mean,
            "Median": np.median,
        }
        np_function = function_dict[args.resample]
        mean_samples_excl_none = int(
            np_function(sorted(np.unique(train["labels"], return_counts=True)[1])[:-1])
        )
        per_class_samples = mean_samples_excl_none
        balanced_filter = np.concatenate(
            [
                np.where(np.array(train["labels"]) == i)[0][:per_class_samples]
                for i in range(4)
            ],
            axis=0,
        )
        train = train.select(balanced_filter)

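    # linear LR warmup over the first half of training, then linear decay for the rest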
    total_steps = len(train) // args.real_batch_size
    total_steps = total_steps * args.epochs
    schedule = get_linear_schedule_with_warmup(optimizer, total_steps // 2, total_steps)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=valid,
        compute_metrics=preprocessor.compute_metrics,
        optimizers=(optimizer, schedule),
    )

    wandb.config.update(args.__dict__)

    if not args.no_train:
        trainer.train()

    if args.tagging:
        return
    
    lookahead_test = []
    for i in range(5):
        lookahead_test.append(
            valid.select(np.where(np.array(valid["lookahead"]) == i)[0])
        )
    la_metrics = []
    for l_test in lookahead_test:
        la_metrics.append(trainer.predict(l_test).metrics)

    for k in la_metrics[0].keys():
        data = [[i, m[k]] for i, m in enumerate(la_metrics)]
        table = wandb.Table(data=data, columns=["lookahead", k])
        wandb.log(
            {
                f"{k}_lookahead": wandb.plot.line(
                    table, "lookahead", k, title=f"{k} vs. lookahead"
                )
            }
        )

    if args.save:
        trainer.save_model(f"models/{args.save}")
        tokenizer.save_pretrained(f"models/{args.save}")
        model_artifact = wandb.Artifact(args.save, type="model")
        for path in glob.glob(f"models/{args.save}/**/*.*", recursive=True):
            model_artifact.add_file(path)
        wandb.run.log_artifact(model_artifact)

    for i in range(5):
        res_dict = {key: round(val * 100, 1) for key, val in la_metrics[i].items()}
        print(f"------- {i} ----------")
        print(
            "COMMA",
            res_dict["eval_precision_<comma>"],
            res_dict["eval_recall_<comma>"],
            res_dict["eval_f1_<comma>"],
        )
        print(
            "PERIOD",
            res_dict["eval_precision_<period>"],
            res_dict["eval_recall_<period>"],
            res_dict["eval_f1_<period>"],
        )
        print(
            "QUESTION",
            res_dict["eval_precision_<question>"],
            res_dict["eval_recall_<question>"],
            res_dict["eval_f1_<question>"],
        )
        print(
            "OVERALL",
            res_dict["eval_precision"],
            res_dict["eval_recall"],
            res_dict["eval_f1"],
        )
        print()
Example #28
CSV_FILES = ["sentiment140_labeled_done.csv"]

# first read in the data
data = dict()
for file in TRAINING_FILES:
    with open(os.path.join(TRAINING_DATA_PATH, file), "rb") as f:
        data[file] = pickle.load(f, encoding='latin1')

for file in CSV_FILES:
    data[file] = pd.read_csv(os.path.join(TRAINING_DATA_PATH, file),
                             encoding="ISO-8859-1",
                             header=0,
                             names=["id", "text", "labels"])

# second, preprocess the data for model ingestion
pp = Preprocessor(data, debug=False)
X_base_train, y_base_train, vocab_processor_base, X_base_test, y_base_test = pp.preprocess(
    datasource="s140")
X1_train, y1_train, vocab_processor1, X1_test, y1_test = pp.preprocess(
    datasource="scv1")
X2_train, y2_train, vocab_processor2, X2_test, y2_test = pp.preprocess(
    datasource="scv2")
X_val, y_val = pp.preprocess(datasource="s140", split=False)

# third, run the models
# baseline
cnn_model_base = SarcasmCNN(data=((X_base_train, y_base_train), (X_base_test,
                                                                 y_base_test)),
                            vocab_processor=vocab_processor_base)
print("Baseline Performance")
cnn_model_base.run()
Example #29
def main(train_csv, model_dir, mode):
    start_time = time.time()

    #df = pd.read_csv(args.train_csv, low_memory = False)
    df = pd.read_csv(train_csv)
    is_big = df.memory_usage().sum() > BIG_DATASET_SIZE

    # dict with data necessary to make predictions
    model_config = {}
    model_config['is_big'] = is_big

    preprocessor = Preprocessor()
    df_X, df_y = preprocessor.fit_transform(df)

    model_config['features'] = preprocessor.features

    print('Dataset read, shape {}'.format(df_X.shape))

    # fitting
    model_config['mode'] = mode
    if mode == 'regression':
        ridge_model = Ridge()

        cb_model = cb.CatBoostRegressor(
            iterations=300,
            boosting_type=('Ordered' if len(df_X) < 1000 else 'Plain'),
            od_type="IncToDec",
            depth=6,
            od_pval=0.0001,
            #learning_rate=0.03,
            loss_function='RMSE')
        models = [ridge_model, cb_model]
    else:
        log_reg_model = LogisticRegression()

        cb_model = cb.CatBoostClassifier(
            iterations=300,
            boosting_type=('Ordered' if len(df_X) < 1000 else 'Plain'),
            od_type="IncToDec",
            depth=6,
            od_pval=0.0001,
            #learning_rate=0.03,
            loss_function='Logloss',
            logging_level='Verbose')
        models = [log_reg_model, cb_model]

    for model in models:
        model.fit(df_X, df_y)

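    # inverse-variance weighting: models with smaller residual variance get larger blend coefficients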
    D = [1 / np.std(model.predict(df_X) - df_y)**2 for model in models]
    s = sum(D)
    coef = [d / s for d in D]

    model = Model(models, coef)

    model_config['model'] = model

    model_config_filename = os.path.join(model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))
Example #30
import cv2
import numpy as np
from utils.Face_Detector import *
from utils.Preprocessor import *
from utils.Model_Loader import *

haar_face_detector = Haar_Face_Detector('./models/front_face_cascade.xml')
preprocessor = Preprocessor(96, 96)
model_loader = Model_Loader('./models/model_2.h5')
model_loader.load_model()

video_capture = cv2.VideoCapture(0)

video_capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
video_capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

while True:
    _, frame = video_capture.read()

    rois = haar_face_detector.detect(frame)

    try:
        x, y, w, h = rois[0]
        roi = frame[y:y + h, x:x + w]
        preprocessed_roi = preprocessor.process(roi)

        eye_x, eye_y = model_loader.get_coordinates(preprocessed_roi,
                                                    preprocessed_roi.shape[0],
                                                    preprocessed_roi.shape[1])

        # cv2.circle(preprocessed_roi, (int(eye_x[0]), int(eye_y[0])), 5, (0, 255, 0), -1)