def load_goals(self, task, model):
        print 'Reading in raw data from the task.'
        read_task_data = DataReader_Task(task, model)
        raw_data, raw_num, raw_reference, self.raw_reference_options = read_task_data.reset_goals()
        read_data = DataReader(subject=subject, data_start=data_start, reference_options=self.raw_reference_options,
                               data_finish=data_finish, model=model, task=task)

        # raw_data = read_data.get_raw_data()
        print 'Raw data is ready!'
        self.goal_data = read_data.generate_output_goals(test_goals=raw_data, test_number=raw_num, test_reference=raw_reference)
        # print 'Setting up openrave'
        # self.setup_openrave()
        # print 'I will now pick base locations to evaluate. They will share the same reachability score, but will have' \
        #       ' differing manipulability scores.'
        # print 'before sorting:'
        # for i in xrange(10):
        #     print self.scores[i]
        self.scores = np.array(sorted(self.scores, key=lambda t: (t[1][1], t[1][2]), reverse=True))
        # print 'after sorting:'
        # for i in xrange(10):
        #     print self.scores[i]
        self.best_base = self.scores[0]
        if self.best_base[1][1] == 0:
            print 'There are no base locations with reachable goals. Something went wrong in the scoring or the setup'
        print 'The best base location is: \n', self.best_base
        visualize_best = True
        if visualize_best:
            self.visualize_base_config(self.best_base, self.goal_data, self.raw_reference_options)
Example #2
def train_lm(testing=False):
    data = DataReader(token_to_id_path, segment_sepparator)

    # Create model nodes for the source and target inputs
    input_sequence, label_sequence = create_inputs(data.vocab_dim)

    # Create the model. It has three output nodes:
    # z: the input to the softmax that provides the latent representation of the next token
    # cross_entropy: used as the training criterion
    # error: a binary indicator of whether the model predicts the correct token
    z, cross_entropy, error = create_model(input_sequence, label_sequence, data.vocab_dim, hidden_dim)

    # For measurement we use the (built-in) full softmax.
    full_ce = C.cross_entropy_with_softmax(z, label_sequence)

    # print out some useful training information
    log_number_of_parameters(z) ; print()
    
    # Run the training loop
    num_trained_samples = 0
    num_trained_samples_since_last_report = 0

    # Instantiate the trainer object to drive the model training
    lr_schedule = C.learning_parameter_schedule_per_sample(learning_rate)
    momentum_schedule = C.momentum_schedule_per_sample(momentum_per_sample)
    gradient_clipping_with_truncation = True
    learner = momentum_sgd(z.parameters, lr_schedule, momentum_schedule,
                            gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
                            gradient_clipping_with_truncation=gradient_clipping_with_truncation)
    trainer = Trainer(z, (cross_entropy, error), learner)

    last_avg_ce = 0
    for epoch_count in range(num_epochs):
        for features, labels, token_count in data.minibatch_generator(train_file_path, sequence_length, sequences_per_batch):
            arguments = ({input_sequence : features, label_sequence : labels})

            t_start = timeit.default_timer()
            trainer.train_minibatch(arguments)
            t_end =  timeit.default_timer()

            samples_per_second = token_count / (t_end - t_start)

            # Print progress report every num_samples_between_progress_report samples

            if num_trained_samples_since_last_report >= num_samples_between_progress_report or num_trained_samples == 0:
                av_ce = average_cross_entropy(full_ce, input_sequence, label_sequence, data)
                print_progress(samples_per_second, av_ce, num_trained_samples, t_start)
                num_trained_samples_since_last_report = 0
                last_avg_ce = av_ce

            num_trained_samples += token_count
            num_trained_samples_since_last_report += token_count

        if not testing:
            # after each epoch save the model
            model_filename = "models/lm_epoch%d.dnn" % epoch_count
            z.save(model_filename)
            print("Saved model to '%s'" % model_filename)

    return last_avg_ce
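
For context, a minimal sketch of how train_lm might be invoked, assuming the module-level settings it references (hidden_dim, learning_rate, train_file_path, and so on) are already defined:

if __name__ == '__main__':
    # Run the full training loop and report the last measured average cross entropy.
    final_ce = train_lm(testing=False)
    print("Final average cross entropy: %f" % final_ce)
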
 def __init__(self):
     data_obj = DataReader()
     self.df = data_obj.get_pandas_df()
     self.pk = "NAGcode_1"
     self.selected_columns = ['statesup', 'defacto']
     self.write_columns = [
         "{}_dep_score".format(column) for column in self.selected_columns
     ]
def eval_ae():
    from Models.logistic_regression import MultiClassLogisticRegression
    from Models.random_forest import RandomForest
    from Models.naive_bayes import NaiveBayes
    from Models.svm import SVM
    # load data
    data_reader = DataReader()
    df = data_reader.get_all_data()

    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)
    train_x, train_y, val_x, val_y, test_x, test_y = bag_of_words_full_no_empty_val_no_num_no_short_no_repeat(
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw)
    # Train an auto encoder of size 4096
    encoder = get_encoder(train_x, test_x, 4096)
    # use auto encoder to encode the train, validate and test sets
    encoded_train = encoder.predict(train_x)
    encoded_test = encoder.predict(test_x)
    encoded_val = encoder.predict(val_x)

    # train the neural network model and calculate the precision, recall, f1 score, and accuracy
    print('neural net ae')
    model = _get_nn_model_bag_of_words_simple_scratch(
        encoded_train,
        train_y,
        encoded_val,
        val_y,
        data_reader.get_region_labels()['Code'],
        epochs=100,
        batch_size=256)
    eval_nn(model, encoded_test, test_y)
    evaluate_model_nn(model, encoded_test, test_y)
    # train the logistic regression model and calculate the precision, recall, f1 score, and accuracy
    print('logistic regression ae')
    model = MultiClassLogisticRegression()
    model.train(encoded_train, train_y)
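    # (The bare lambda below is used only as a lightweight attribute holder, so the
    # trained model can be passed to eval_model as an object exposing a .model attribute.)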
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)

    # train the random forest model and calculate the precision, recall, f1 score, and accuracy
    print('random forest ae')
    model = RandomForest()
    model.train(encoded_train, train_y)
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)

    # train the naive bayes model and calculate the precision, recall, f1 score, and accuracy
    print('naive bayes ae')
    model = NaiveBayes()
    model.train(encoded_train, train_y)
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)
Example #5
def main(args):
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
    coord = tf.train.Coordinator()

    if args.mode == "train":
        with tf.compat.v1.name_scope('create_inputs'):
            data_reader = DataReader(
                data_dir=args.train_dir,
                data_list=args.train_list,
                mask_window=0.4,
                queue_size=args.batch_size * 3,
                coord=coord)
            if args.valid_list is not None:
                data_reader_valid = DataReader(
                    data_dir=args.valid_dir,
                    data_list=args.valid_list,
                    mask_window=0.4,
                    queue_size=args.batch_size * 2,
                    coord=coord)
                logging.info(
                    "Dataset size: train {}, valid {}".format(data_reader.num_data, data_reader_valid.num_data))
            else:
                data_reader_valid = None
                logging.info("Dataset size: train {}".format(data_reader.num_data))
        train_fn(args, data_reader, data_reader_valid)

    elif args.mode == "valid" or args.mode == "test":
        with tf.compat.v1.name_scope('create_inputs'):
            data_reader = DataReader_test(
                data_dir=args.data_dir,
                data_list=args.data_list,
                mask_window=0.4,
                queue_size=args.batch_size * 10,
                coord=coord)
        valid_fn(args, data_reader)

    elif args.mode == "pred":
        with tf.compat.v1.name_scope('create_inputs'):
            if args.input_mseed:
                data_reader = DataReader_mseed(
                    data_dir=args.data_dir,
                    data_list=args.data_list,
                    queue_size=args.batch_size * 10,
                    coord=coord,
                    input_length=args.input_length)
            else:
                data_reader = DataReader_pred(
                    data_dir=args.data_dir,
                    data_list=args.data_list,
                    queue_size=args.batch_size * 10,
                    coord=coord,
                    input_length=args.input_length)
        pred_fn(args, data_reader, log_dir=args.output_dir)

    else:
        print("mode should be: train, valid, test, pred or debug")

    return
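
A minimal argument parser covering only the attributes that main() reads above might look like the following sketch; the flag names come from the attributes used in the function, while the defaults are illustrative assumptions rather than the project's real ones.

import argparse

def read_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", default="train", help="train / valid / test / pred")
    parser.add_argument("--train_dir", default=None)
    parser.add_argument("--train_list", default=None)
    parser.add_argument("--valid_dir", default=None)
    parser.add_argument("--valid_list", default=None)
    parser.add_argument("--data_dir", default=None)
    parser.add_argument("--data_list", default=None)
    parser.add_argument("--output_dir", default="output")
    parser.add_argument("--batch_size", type=int, default=20)
    parser.add_argument("--input_length", type=int, default=None)
    parser.add_argument("--input_mseed", action="store_true")
    return parser.parse_args()

if __name__ == "__main__":
    main(read_args())
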
Example #6
class LyricGenRunner:
    def __init__(self, model_load_path, artist_name, test, prime_text):

        self.sess = tf.Session()
        self.artist_name = artist_name

        print 'Process data...'
        self.data_reader = DataReader(self.artist_name)
        self.vocab = self.data_reader.get_vocab()

        print 'Init model...'
        self.model = LSTMModel(self.sess,
                               self.vocab,
                               c.BATCH_SIZE,
                               c.SEQ_LEN,
                               c.CELL_SIZE,
                               c.NUM_LAYERS,
                               test=test)

        print 'Init variables...'
        self.saver = tf.train.Saver(max_to_keep=None)
        self.sess.run(tf.initialize_all_variables())

        if model_load_path is not None:
            self.saver.restore(self.sess, model_load_path)
            print 'Model restored from ' + model_load_path

        if test:
            self.test(prime_text)
        else:
            self.train()

    def train(self):
        while True:
            inputs, targets = self.data_reader.get_train_batch(
                c.BATCH_SIZE, c.SEQ_LEN)
            print 'Training model...'

            feed_dict = {
                self.model.inputs: inputs,
                self.model.targets: targets
            }
            global_step, loss, _ = self.sess.run(
                [self.model.global_step, self.model.loss, self.model.train_op],
                feed_dict=feed_dict)

            print 'Step: %d | loss: %f' % (global_step, loss)
            if global_step % c.MODEL_SAVE_FREQ == 0:
                print 'Saving model...'
                self.saver.save(self.sess,
                                join(c.MODEL_SAVE_DIR,
                                     self.artist_name + '.ckpt'),
                                global_step=global_step)

    def test(self, prime_text):
        sample = self.model.generate(prime=prime_text)

        print sample
Example #7
def dimensionality_RF(instruction, dataset, target="", y="", n_features=10):
    global currLog
    global counter

    dataReader = DataReader("./data/" + get_last_file()[0])

    if target == "":
        data = dataReader.data_generator()
        data.fillna(0, inplace=True)
        remove = get_similar_column(get_value_instruction(instruction), data)
        data = structured_preprocesser(data)

        y = data[remove]
        del data[remove]
        le = preprocessing.LabelEncoder()
        y = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(dataset,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=49)
    first_classifier = tree.DecisionTreeClassifier()
    first_classifier.fit(X_train, y_train)

    first_classifier_acc = accuracy_score(first_classifier.predict(X_test),
                                          y_test)

    accuracy_scores = [first_classifier_acc]
    columns = []
    datas = []
    datas.append(dataset)
    columns.append([])

    for i, x in product(range(3, 10), range(4, len(dataset.columns))):
        feature_model = RandomForestRegressor(random_state=1, max_depth=i)
        feature_model.fit(X_train, y_train)

        importances = feature_model.feature_importances_
        indices = np.argsort(importances)[-x:]
        columns.append(dataset.columns[indices])

        X_temp_train = X_train[dataset.columns[indices]]
        X_temp_test = X_test[dataset.columns[indices]]

        val = pd.DataFrame(np.r_[X_temp_train, X_temp_test])
        val[target] = np.r_[y_train, y_test]
        datas.append(val)

        vr = tree.DecisionTreeClassifier()
        vr.fit(X_temp_train, y_train)

        accuracy_scores.append(accuracy_score(vr.predict(X_temp_test), y_test))

    the_index = accuracy_scores.index(max(accuracy_scores))

    return datas[the_index], accuracy_scores[0], max(accuracy_scores), list(
        columns[the_index])
Example #8
def main(_):
    config = tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement)
    with tf.Session(config=config) as sess:
        print('\n{} Model initializing'.format(datetime.now()))

        model = VistaNet(FLAGS.hidden_dim, FLAGS.att_dim, FLAGS.emb_size,
                         FLAGS.num_images, FLAGS.num_classes)
        loss = loss_fn(model.labels, model.logits)
        train_op = train_fn(loss, model.global_step)
        accuracy = eval_fn(model.labels, model.logits)
        summary_op = tf.summary.merge_all()

        sess.run(tf.global_variables_initializer())
        train_summary_writer.add_graph(sess.graph)
        saver = tf.train.Saver(max_to_keep=FLAGS.num_checkpoints)
        data_reader = DataReader(num_images=FLAGS.num_images,
                                 train_shuffle=True)

        print('\n{} Start training'.format(datetime.now()))

        epoch = 0
        best_loss = float('inf')
        while epoch < FLAGS.num_epochs:
            epoch += 1
            print('\n=> Epoch: {}'.format(epoch))

            train(sess, data_reader, model, train_op, loss, accuracy,
                  summary_op)

            print('=> Evaluation')
            print('best_loss={:.4f}'.format(best_loss))
            valid_loss, valid_acc = evaluate(
                sess, data_reader.read_valid_set(batch_size=FLAGS.batch_size),
                model, loss, accuracy, summary_op)
            print('valid_loss={:.4f}, valid_acc={:.4f}'.format(
                valid_loss, valid_acc))

            if valid_loss < best_loss:
                best_loss = valid_loss
                save_path = os.path.join(
                    FLAGS.checkpoint_dir,
                    'epoch={}-loss={:.4f}-acc={:.4f}'.format(
                        epoch, valid_loss, valid_acc))
                saver.save(sess, save_path)
                print('Best model saved @ {}'.format(save_path))

                print('=> Testing')
                result_file = open(
                    os.path.join(
                        FLAGS.log_dir,
                        'loss={:.4f},acc={:.4f},epoch={}'.format(
                            valid_loss, valid_acc, epoch)), 'w')
                test(sess, data_reader, model, loss, accuracy, epoch,
                     result_file)

    print("{} Optimization Finished!".format(datetime.now()))
    def build_cnv_training_data(self, data_dir, outcome_file):
        excel_obj = ExcelReader()
        data_reader_obj = DataReader()

        outcome_dict = excel_obj.get_cyto_cnv_result(outcome_file)
        cnv_df = data_reader_obj.cnv_data_reader_pipeline(data_dir)

        data_df = data_reader_obj.combine_outcome_data(cnv_df, outcome_dict)

        return data_df
 def __init__(self, folder, type, num, batchSize, steps, numFeatures):
     self.master_filepath = folder  # the master filepath in which all of the data is located
     self.len = int(np.ceil(num / float(batchSize)))
     self.num = num
     self.numFeatures = numFeatures
     self.batchSize = batchSize
     self.steps = steps
     reader = DataReader(self.master_filepath)
     self.data = reader.get_data(True, True)
     return
Example #11
 def __init__(self):
     # Get the dictionary
     self.dealer = DataDealer(ANSWERS_DICT_PATH)
     # Get the sample-set information
     self.reader = DataReader(TRAIN_DATA_TYPE)
     self.reader.set_pos()
     self.weight_vgg = None
     self.biase_vgg = None
     self.model_word2vec = gensim.models.KeyedVectors.load_word2vec_format(
         GLOVE_WIKI_GENSIM_DATA_PATH)
Example #12
def createQuestionsDict():
    """
    Create the question dictionary (including the answer dictionary).
    """
    reader = DataReader()
    reader.set_pos()
    dealer = DataDealer(ANSWERS_DICT_PATH)
    start_id = reader.get_next_pic_id()
    qa = reader.get_pic_qa(start_id)
    for q in qa:
        question = q['question']
        dealer.deal(question)
    now_id = reader.get_next_pic_id()
    i = 0
    while now_id != start_id:
        qa = reader.get_pic_qa(now_id)
        for q in qa:
            question = q['question']
            dealer.deal(question)
        now_id = reader.get_next_pic_id()
        i = i + 1
        if i % 1000 == 0:
            print('*', end='')
    dealer.saveData(QUESTIONS_DICT_PATH)
    print('over!')
Example #13
def data_training():
    """
    Train using only the sample set.
    """
    sentences = []
    reader = DataReader(TRAIN_DATA_TYPE)
    reader.set_pos()
    start_id = reader.get_next_pic_id()
    qa = reader.get_pic_qa(start_id)
    for q in qa:
        question = q['question']
        question = question.replace('?', ' ?')
        question = question.replace(',', ' ,')
        question = question.replace('.', ' .')
        sentence = question.split(' ')
        sentences.append(sentence)
    now_id = reader.get_next_pic_id()
    i = 0
    while now_id != start_id:
        qa = reader.get_pic_qa(now_id)
        for q in qa:
            question = q['question']
            question = question.replace('?', ' ?')
            question = question.replace(',', ' ,')
            question = question.replace('.', ' .')
            sentence = question.split(' ')
            sentences.append(sentence)
        now_id = reader.get_next_pic_id()
        i = i + 1
        if i % 1000 == 0:
            print('*', end='')
    print('load data over!')
    model = gensim.models.Word2Vec(sentences, size=300, min_count=1)
    model.save(GENSIM_DATA_PATH)
Example #14
 def __init__(self):
     data_obj = DataReader()
     self.pk = "nagcode_1"
     self.df = data_obj.get_pandas_df()
     self.mode_map = {
         "active": "statesup",
         "defacto": "defacto"
     }
     self.memory = {}
     centralities = ["in-degree", "betweenness", "closeness"]
     self.selected_columns = ["{}_{}_centrality".format(mode, c) for c in centralities for mode in self.mode_map ]
Example #15
def nearest_neighbors(instruction=None,
                      dataset=None,
                      mca_threshold=None,
                      preprocess=True,
                      drop=None,
                      min_neighbors=3,
                      max_neighbors=10):
    logger("Reading in dataset....")
    # Reads in dataset
    # data = pd.read_csv(self.dataset)
    dataReader = DataReader(dataset)
    data = dataReader.data_generator()
    if drop is not None:
        data.drop(drop, axis=1, inplace=True)
    data, y, remove, full_pipeline = initial_preprocesser(
        data, instruction, preprocess, mca_threshold)
    logger("->", "Target Column Found: {}".format(remove))
    X_train = data['train']
    y_train = y['train']
    X_test = data['test']
    y_test = y['test']
    # classification_column = get_similar_column(getLabelwithInstruction(instruction), data)
    num_classes = len(np.unique(y))
    # encodes the label dataset into 0's and 1's
    y_vals = np.unique(pd.concat([y['train'], y['test']], axis=0))
    label_mappings = {}
    for i in range(len(y_vals)):
        label_mappings[y_vals[i]] = i
    y_train = y_train.apply(lambda x: label_mappings[x]).values
    y_test = y_test.apply(lambda x: label_mappings[x]).values
    models = []
    scores = []
    logger("Fitting Nearest Neighbor...")
    logger("Identifying optimal number of neighbors...")
    # Tries all neighbor possibilities, based on either defaults or user
    # specified values
    for x in range(min_neighbors, max_neighbors):
        knn = KNeighborsClassifier(n_neighbors=x)
        knn.fit(X_train, y_train)
        models.append(knn)
        scores.append(accuracy_score(knn.predict(X_test), y_test))
    logger("Stored model under 'nearest_neighbors' key")
    knn = models[scores.index(max(scores))]
    clearLog()
    return {
        'id': generate_id(),
        "model": knn,
        "accuracy_score": max(scores),
        "preprocesser": full_pipeline,
        "interpreter": label_mappings,
        "target": remove,
        "cross_val_score": cross_val_score(knn, X_train, y_train, cv=3)
    }
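
A hedged usage sketch for nearest_neighbors, assuming (as in the other snippets here) that DataReader accepts a path to a CSV file and that the instruction names the target column:

# Hypothetical call; the file name and instruction text are placeholders.
result = nearest_neighbors(instruction="predict survival", dataset="titanic.csv")
print(result["accuracy_score"])
print(result["cross_val_score"].mean())
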
Example #16
 def get_aggregate_columns(self, columns):
     data_obj = DataReader()
     df = data_obj.get_pandas_df()
     df = df.groupby(self.pk).agg(set).reset_index()
     df = df[columns]
     df['number_of_supporters'] = df['supporter'].apply(
         lambda x: len([i for i in x if not pd.isna(i)]))
     df['number_of_targets'] = df['target'].apply(
         lambda x: len([i for i in x if not pd.isna(i)]))
     df['support_target_ratio'] = df['number_of_supporters'] / df[
         'number_of_targets']
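     # Note: a group with zero targets makes this ratio inf (pandas does not raise
     # on division by zero here), so callers may want to guard against that case.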
     return df
Example #17
def prepareData():
    """Prepare your dataset here."""
    # with open('all.data', 'rb') as f:
    #     df = pickle.load(f)
    # df = df[~pd.isnull(df.is_trade)]

    reader = DataReader() 
    # features = ["average_day_active_time","average_login_interval", "average_spin_interval", "average_bonus_win", "spin_per_active_day", "bonus_per_active_day","average_bet", "bonus_ratio", "free_spin_ratio", "coin"]

    df = reader.read("slot_purchase_profile_2017")
    
    return df
Example #18
    def load_exp_data(self, path, filename):
        """Load the experimental values from the file `filename` located in
        the directory `path`. If `filename` is a list or a tuple, only its
        first element is considered.

        Parameters
        ----------
        path : str
            Path of the directory containing the file to load.
        filename : str or list-like of str
            Name of the file to load.

        Returns
        -------
        Returns None if the read succeeded, otherwise returns the error.
        """
        try:
            reader = DataReader(os.path.join(path, filename))
        except FileNotFoundError as err:
            print(err)
            return err
        except ValueError as err:
            print("ValueError: ", err)
            return err
        except Exception as err:
            print(err)
            return err

        self.exptRaw = reader.get_t()
        self.expIRaw = reader.get_I()
        self.expt = self.exptRaw
        self.expI = self.expIRaw

        # For adjusting the interval
        self.valIntervalMin = (min(self.expt))
        self.valIntervalMax = (max(self.expt))

        self.mainGraph.set_experimental_data(self.expt, self.expI)

        # Recompute the theoretical values to match the range of the
        # experimental values
        self.t = cm.create_t(0, max(self.expt), 1000)
        self.I = cm.cottrell_curve_gen(self.valN, self.valS, self.valC,
                                       self.valDth, self.t)
        self.mainGraph.set_theoric_data(self.t, self.I)

        self.mainGraph.set_limit_interval()
        self.mainGraph.update()

        self.expDataLoaded = True

        return None
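
For reference, the theoretical curve recomputed above appears to be a Cottrell curve; assuming cm.cottrell_curve_gen(n, S, C, D, t) evaluates the standard Cottrell equation with valN electrons transferred, electrode area valS, concentration valC, and diffusion coefficient valDth, the computed current is

$$ I(t) = \frac{n F A C \sqrt{D}}{\sqrt{\pi t}} $$

where F is the Faraday constant, A the electrode area, and C the bulk concentration.
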
    def test(self):

        batch_size = 4
        num_unroll_steps = 3
        char_vocab_size = 51
        max_word_length = 11
        char_embed_size = 3

        _, _, word_data, char_data, _ = load_data('data/', max_word_length)
        dataset = char_data['train']
        self.assertEqual(dataset.shape, (929589, max_word_length))

        reader = DataReader(word_data['train'],
                            char_data['train'],
                            batch_size=batch_size,
                            num_unroll_steps=num_unroll_steps)
        for x, y in reader.iter():
            assert x.shape == (batch_size, num_unroll_steps, max_word_length)
            break

        self.assertAllClose(X, x)
        self.assertAllClose(Y, y)

        with self.test_session() as session:
            input_ = tf.placeholder(
                tf.int32,
                shape=[batch_size, num_unroll_steps, max_word_length],
                name="input")
            ''' First, embed characters '''
            with tf.variable_scope('Embedding'):
                char_embedding = tf.get_variable(
                    'char_embedding', [char_vocab_size, char_embed_size])

                # [batch_size * num_unroll_steps, max_word_length, char_embed_size]
                input_embedded = tf.nn.embedding_lookup(char_embedding, input_)

                input_embedded = tf.reshape(
                    input_embedded, [-1, max_word_length, char_embed_size])

            session.run(tf.assign(char_embedding, EMBEDDING))
            ie = session.run(input_embedded, {input_: x})

            #print(x.shape)
            #print(np.transpose(x, (1, 0, 2)))
            #print(ie.shape)
            ie = ie.reshape([
                batch_size, num_unroll_steps, max_word_length, char_embed_size
            ])
            ie = np.transpose(ie, (1, 0, 2, 3))
            #print(ie[0,:,:,:])

            self.assertAllClose(IE3, ie[0, :, :, :])
 def __init__(self, model_name='model', test=False):
     self.session = tf.Session()
     print('Process data...')
     self.data_reader = DataReader()
     self.vocab = self.data_reader.get_vocab()
     print('Init model...')
     self.model = Model(self.session, self.vocab, c.BATCH_SIZE, c.SEQ_LEN,
                        c.CELL_SIZE, c.NUM_LAYERS, test)
     print('Init variables...')
     self.test = test
     self.saver = tf.train.Saver(max_to_keep=None)
     self.session.run(tf.global_variables_initializer())
     self.model_name = model_name
Example #21
 def __init__(self,
              root_dir,
              up_level,
              save_dir='captions',
              bad_words_dict=set()):
     encoder = VQAMaskRCNNBenchmark()
     captioner = PythiaCaptioner(use_constrained=True)
     self.model = PythiaBUTD(encoder=encoder, captioner=captioner)
     self.model.to(device)
     self.data_iterator = DataReader(root_dir)
     self.bad_words_dict = bad_words_dict
     self.up_level = up_level
     self.captions = {}
     self.save_dir = save_dir
Example #22
    def __init__(self, fmt, filepath, sampling_interval):
        self.interval = datetime.timedelta(minutes=sampling_interval)
        reader = DataReader(fmt, filepath, self.interval)
        self.raw_data = reader.read()
        self.data = list(self.raw_data)
        print(f"Reading {len(self.data)} segments")

        self.sampling_horizon, self.prediction_horizon = 0, 0
        self.scale, self.train_test_ratio = 0, 0
        self.n, self.set_cutpoint = len(self.data), False
        self.train_x, self.train_y, self.train_weights = None, None, None
        self.test_x, self.test_y = None, None
        self.train_n, self.test_n = 0, 0
        self.train_idx = None
def data_gen(train_or_test='train') -> Tuple:
    data_reader = DataReader(dataset=scene_name,
                             context_size=CONTEXT_SIZE,
                             root=root_path,
                             mode=train_or_test)
    while True:
        data = data_reader.read(batch_size=12)
        query: Query = data[0]
        target_img_batch: np.ndarray = data[1]
        context: Context = query[0]
        query_camera_batch: np.ndarray = query[1]
        context_images: np.ndarray = context[0]
        context_cameras: np.ndarray = context[1]
        yield target_img_batch, target_img_batch
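
One possible way to consume this generator is a Keras-style fit call; the model below is a hypothetical compiled model that takes image batches as both input and target, matching what the generator yields.

# Hypothetical usage; steps_per_epoch and epochs are illustrative values.
train_generator = data_gen('train')
model.fit(train_generator, steps_per_epoch=100, epochs=5)
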
Example #24
def main():
	#Prepare dataset from csv to npz files
	#DatasetPreparation.prepare('train_preprocessed.csv','test_preprocessed.csv')
	
	#Read the dataset, create batches, and one hot encode the targets
	batch_size = 100
	train_data = DataReader('train.npz',batch_size)
	validation_data = DataReader('validation.npz')
	
	test_data = np.load('test.npz')

	m = Model(train_data,validation_data)
	m.train()
	
	m.test(test_data)	
    def test(self):
        # Find the TESTDATA_FILE in the same directory as this script file.
        dir_path = os.path.dirname(os.path.realpath(__file__))
        testdata_path = os.path.join(dir_path, self.TESTDATA_FILE)

        # Read each test case (data chunk) and verify the expected schema.
        with open(testdata_path) as testdatafile:
            data_reader = DataReader(testdatafile)
            chunk_count = 0
            while True:
                chunk = data_reader.read_chunk()
                if chunk is None:
                    break
                chunk_count += 1
                self.verify_data_chunk(chunk_count, chunk)
Example #26
def indexView():
    '''
    Renders the template for the index.
    '''
    #     if 'pond_pic_visible' not in session:
    #         session['pond_pic_visible']='visible'

    #http://runnable.com/UiPcaBXaxGNYAAAL/how-to-upload-a-uploaded_file-to-the-server-in-flask-for-python
    if request.method == 'POST':  #true if the button "upload" is clicked
        # Get the name of the uploaded uploaded_file
        uploaded_file = request.files['uploaded_file']

        # Check if the uploaded_file is one of the allowed types/extensions
        if uploaded_file and allowed_file(uploaded_file.filename):

            pond_file = request.files['uploaded_file']

            try:
                reader = DataReader(
                    "")  #I don't plan on using this filename, thanks
                pond_list = reader.readFile(
                    pond_file.read()
                )  #read method is http://werkzeug.pocoo.org/docs/0.10/datastructures/#werkzeug.datastructures.FileStorage,
            except Exception as e:
                print "error in getPondList"
                print str(e)
                return render_template(INTERNAL_SERVER_ERROR_TEMPLATE_ROUTE,
                                       error=str(e))

            ##################################################################
            #let's try something. AARDVARK <--easy to search for this
            #(this might be more work than making Pond objects serializable)
            ##################################################################
            ##trying http://jsonpickle.github.io/
            pickle_pond_list(pond_list)

            return redirect(url_for("primary_production"))

        else:
            error_message = "Apologies, that file extension is not allowed. Please try one of the allowed extensions."
            return render_template('home_with_error.html',
                                   template_file_route=TEMPLATE_FILE_ROUTE,
                                   example_file_route=EXAMPLE_FILE_ROUTE,
                                   error_message=error_message)

    return render_template('home.html',
                           template_file_route=TEMPLATE_FILE_ROUTE,
                           example_file_route=EXAMPLE_FILE_ROUTE)
Example #27
    def __init__(self, input_file, vocabulary_file, img_data_file,
                 char2ix_file, output_dir, maxwordlength, emb_dimension,
                 line_batch_size, sample_batch_size, neg_num, window_size,
                 discard, epochs, initial_lr, seed):

        torch.manual_seed(seed)
        self.img_data = np.load(img_data_file)
        self.data = DataReader(input_file, vocabulary_file, char2ix_file,
                               maxwordlength, discard, seed)
        dataset = Word2vecDataset(self.data, window_size, sample_batch_size,
                                  neg_num)
        self.dataloader = DataLoader(dataset,
                                     batch_size=line_batch_size,
                                     shuffle=True,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_dir = output_dir
        self.emb_size = len(self.data.word2id)
        self.char_size = len(self.data.char2id) + 1  #5031
        self.emb_dimension = emb_dimension
        self.line_batch_size = line_batch_size
        self.epochs = epochs
        self.initial_lr = initial_lr
        self.VCWE_model = VCWEModel(self.emb_size, self.emb_dimension,
                                    self.data.wordid2charid, self.char_size)
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        self.num_train_steps = int(len(self.dataloader) * self.epochs)
        if self.use_cuda:
            self.VCWE_model.cuda()
    def __init__(self,
                 input_file,
                 antonym_file,
                 output_file,
                 emb_dimension=100,
                 batch_size=32,
                 window_size=5,
                 iterations=3,
                 initial_lr=0.001,
                 min_count=12):

        print("Reading input file...")
        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        print("Creating data batches")
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)
        self.antonym_file = open(antonym_file, 'r')

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.skip_gram_model.cuda()
Example #29
    def readData(self, path_to_data, path_to_energy):
        """
        Reads in weather data from a file and stores it
        """

        if path_to_data is None:
            weather_reader = RandomReader(365 * 24)
        else:
            weather_reader = DataReader(path_to_data, path_to_energy)

        while weather_reader.canGetForecast():
            forecast = weather_reader.getForecast(
            )  #forecast = list of 24 tuples of (windSpeed, sunlight, energy_needed)
            # store raw numbers
            self.raw_data.append(copy.deepcopy(forecast[0]))
            self.energy_needed.append(forecast[0].ERCOT)
            self.energy_gained.append(
                (self.calculate_wind_power(forecast[0].windSpeed),
                 self.calculate_solar_power(forecast[0].sunlight),
                 self.calculate_hydro_power()))
            # calculate features
            wind_power = 0.0
            solar_power = 0.0
            hydro_power = 0.0
            for weather_tuple in forecast:
                #convert weather to power
                wind_power += self.calculate_wind_power(
                    weather_tuple.windSpeed)
                solar_power += self.calculate_solar_power(
                    weather_tuple.sunlight)
                hydro_power += self.calculate_hydro_power()
            self.features.append((wind_power, solar_power, hydro_power))
            weather_reader.advanceTime()
Example #30
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=300,
                 batch_size=64,
                 window_size=5,
                 iterations=5,
                 initial_lr=1.0,
                 min_count=5):

        self.data = DataReader(input_file, min_count)
        dataset = Word2vecDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            print("USING CUDA")
            self.skip_gram_model.cuda()
        else:
            print("CUDA FAIL")
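
Only __init__ is shown above. The sketch below is a typical train method for this kind of skip-gram trainer; it assumes that the dataloader yields (pos_u, pos_v, neg_v) index batches and that SkipGramModel.forward returns the negative-sampling loss, which may not match this project's exact API.

    def train(self):
        for iteration in range(self.iterations):
            # Plain SGD on the model parameters; one optimizer per pass for simplicity.
            optimizer = torch.optim.SGD(self.skip_gram_model.parameters(),
                                        lr=self.initial_lr)
            for pos_u, pos_v, neg_v in self.dataloader:
                # Move the batch to the selected device and take one step on the loss.
                pos_u = pos_u.to(self.device)
                pos_v = pos_v.to(self.device)
                neg_v = neg_v.to(self.device)

                optimizer.zero_grad()
                loss = self.skip_gram_model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                optimizer.step()
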
Example #31
    def __init__(self,
                 input_file,
                 output_file,
                 emb_dimension=500,
                 batch_size=32,
                 window_size=5,
                 iterations=5,
                 initial_lr=0.001,
                 min_count=12):

        self.data = DataReader(input_file, min_count)
        dataset = PennDataset(self.data, window_size)
        self.dataloader = DataLoader(dataset,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=0,
                                     collate_fn=dataset.collate)

        self.output_file_name = output_file
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.iterations = iterations
        self.initial_lr = initial_lr
        self.penn_skip_gram_model = PennSkipGramModel(self.emb_size,
                                                      self.emb_dimension)

        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        if self.use_cuda:
            self.penn_skip_gram_model.cuda()
    def read_data(self):

        self._logger.info("Reading meta data...")

        self._reader = DataReader(self._logger)

        (
            self._vocab,
            self._vocab_size,
            self._dictionary,
            self._reverse_dictionary,
            self._unigrams,
            self._arts_srcs,
            self._srcs_ents,
            self._ents_srcs,
        ) = self._reader.read_meta_files(self._args.data)

        with open(self._args.output + "-labels-dict.pkl", "wb") as f:
            cPickle.dump(self._reverse_dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL)

        with open(self._args.output + "-vocab-dict.pkl", "wb") as f:
            cPickle.dump(self._dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL)

        self._number_of_srcs = len(set(self._srcs_ents.keys()))

        self._sample_dist()
Example #33
    def run(self, generation):
        runned_generation = list()
        data_reader = DataReader.getInstance()
        X, Y, X_test, X_output = data_reader.read_data()

        #for each gene of this generation
        for i in range(0, len(generation)):

            this_gene = generation[i]
            # runner is which algorithm will I use:
            # 0 is XGBoost Classifier
            # 1 is XGBoost regressor
            # 2 is SVC
            # 3 is DecisionTreeClassifier
            # 4 is AdaBoost applied to DecisionTreeClassifier
            # 5 is GradientBoosting
            # 6 is KNeighbors
            # 7 is RandomForest
            # 8 is RandomForest but simplified (more defaults and less configuration)
            runner = None
            if (this_gene.way == 0):
                runner = TitanicBoostClassifier()
            else:
                if (this_gene.way == 1):
                    runner = TitanicBoostRegressor()
                else:
                    runner = VariousForests()

            runner.set_datasets(X, Y, X_test, X_output)
            runner.set_gene_to_model(this_gene)  #here we configure the model
            this_gene.set_fitness_level(runner.run())
            runned_generation.append(this_gene)

        return runned_generation
Example #34
    def __init__(self):

        self.data_reader = DataReader('data/training_data/training.data', 'data/stopwords/stopwords.txt', True, 1000)
        self.perceptron = Perceptron()
        self.softmax = Softmax()
        # Let's create 5 classifiers
        universe_size = len(self.data_reader.universe)
        self.perceptron_classifiers = [np.zeros((universe_size + 1)) for i in range(5)]
        self.softmax_classifier = np.ones((5, universe_size + 1))
Example #35
def run_test2(session, m, reader):
    state = session.run(m.initial_rnn_state)
    tokenNum = 0
    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)
    train_reader = DataReader(word_tensors['train'], char_tensors['train'],1, 1)

    i = 1
    for x, y in train_reader.iter():
        state = session.run([m.final_rnn_state], {
            m.input: x,
            m.targets: y,
            m.initial_rnn_state: state
        })

        # constructs the word_embedding (which is the input node to the LSTM)
        # NOTE: each element is an index to a character_embedding.  thus, it's
        # actually a matrix
        word_embedding = x[0][0]
        output = ""
        for w in word_embedding:
            output = output + str(w) + " "
        output = output.rstrip() + ","

        #print ("char_embedding[1]:" + str(session.run(m.char_embedding[1])))

        i = i + 1
        layer1 = state[0][0]
        layer2 = state[0][1]
        layer1_hiddens = layer1[1][0]
        layer2_hiddens = layer2[1][0]
        for x in layer1_hiddens:
            output = output + str(x) + " "
        output = output.rstrip() + ","

        for x in layer2_hiddens:
            output = output + str(x) + " "
        output = output.rstrip() + "\n"

        print (output)
Example #36
def run():
    lines = DataReader.read('car.data.txt')
    training_inputs = DataReader.parse_data(lines)

    print "Initializing Network..."
    my_network = Network(number_of_centers=NUMBER_OF_CENTERS,
                         training=TRAINING_ITERATIONS)
    print "Done."

    print "Starting training. {} centers / {} iterations".\
        format(NUMBER_OF_CENTERS, TRAINING_ITERATIONS)
    my_network.train(training_inputs)
    print "Done."

    # TODO(Accuracy): Test accuracy with non training data.
    right = 0
    total_tests = 100
    for i in range(total_tests):
        chosen = random.choice(training_inputs)
        response = my_network.classify(chosen['inputs'])
        if response == chosen['expected']:
            right += 1
    print "Accuracy => {}/{}".format(right, total_tests)
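
Regarding the TODO above, a minimal sketch of a held-out split so that the accuracy check runs on inputs the network was not trained on (a hypothetical helper, not part of the original module):

import random

def split_data(inputs, test_fraction=0.2):
    # Shuffle a copy and hold out the last test_fraction of items as the test set.
    shuffled = list(inputs)
    random.shuffle(shuffled)
    cut = int(len(shuffled) * (1 - test_fraction))
    return shuffled[:cut], shuffled[cut:]
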
    def generate_score(self):

        # for item in data:
        #     print Bmat_to_pos_quat(item)

        num = np.ones([len(self.goals), 1])
        reference_options = ['head']
        reference = np.zeros([len(self.goals), 1])

        print 'Starting to convert data!'
        run_data = DataReader(subject=self.subject, model=self.model, task=self.task)
        run_data.receive_input_data(self.goals, num, reference_options, reference)
        run_data.generate_output_goals()
        run_data.generate_score(viz_rviz=True, visualize=self.visualize, plot=False)
    def __init__(self, visualize=False, subject='any_subject', task='yogurt', model='chair', tf_listener=None):

        self.model = model
        self.task = task
        self.subject = subject

        baselink_B_liftlink = createBMatrix([-0.05, 0.0, 0.8897], [0, 0, 0, 1])

        goals = [[0.301033944729, 0.461276517595, 0.196885866571,
                  0.553557277528, 0.336724229346, -0.075691681684, 0.757932650828],
                 [0.377839595079, 0.11569018662, 0.0419789999723,
                  0.66106069088, 0.337429642677, -0.519856214523, 0.422953367233],
                 [0.2741387011303321, 0.005522571699560719, -0.011919598309888757,
                 -0.023580897114171894, 0.7483633417869068, 0.662774596931439, 0.011228696415565394],
                 [0.13608632401364894, 0.003540318703608347, 0.00607600258150498,
                  -0.015224467044577382, 0.7345761465214938, 0.6783020152473445, -0.008513323454022942]]
        liftlink_B_goal = createBMatrix([0.5309877259429142, 0.4976163448816489, 0.16719537682372823],
                                        [0.7765742993649133, -0.37100605554316285, -0.27784851903166524,
                                         0.42671660945891])
        data = np.array([baselink_B_liftlink*createBMatrix(goals[0][0:3], goals[0][3:]),  # In reference to base link
                         baselink_B_liftlink*createBMatrix(goals[1][0:3], goals[1][3:]),  # In reference to base link
                         createBMatrix(goals[2][0:3], goals[2][3:]),
                         createBMatrix(goals[3][0:3], goals[3][3:])])  # This one is in reference to the head

        for item in data:
            print Bmat_to_pos_quat(item)

        # For my information, these are the [xyz] and quaternion [x,y,z,w] for the PoseStamped messages for the goal
        # positions. The first two have parent tf /base_link. The third has parent link /head
        # (array([ 0.48098773,  0.49761634,  0.91837238]), array([ 0.7765743 , -0.37100606, -0.27784852,  0.42671661]))
        # (array([ 0.4598544 ,  0.8806009 ,  0.65371782]), array([ 0.45253993,  0.53399713, -0.17283745,  0.69295158]))
        # (array([ 0.2741387 ,  0.05522572, -0.0119196 ]), array([-0.0235809 ,  0.74836334,  0.6627746 ,  0.0112287 ]))

        num = np.ones([len(data), 1])
        reference_options = ['head', 'base_link']
        reference = np.array([[1], [1], [0], [0]])



        print 'Starting to convert data!'
        runData = DataReader(subject=self.subject, model=self.model, task=self.task)

        runData.receive_input_data(data, num, reference_options, reference)
        runData.generate_output_goals()
        runData.generate_score(viz_rviz=True, visualize=False, plot=False)
    def test(self):

        batch_size = 4
        num_unroll_steps = 3
        char_vocab_size = 51
        max_word_length = 11
        char_embed_size = 3

        _, _, word_data, char_data, _ = load_data('data/', max_word_length)
        dataset = char_data['train']
        self.assertEqual(dataset.shape, (929589, max_word_length))

        reader = DataReader(word_data['train'], char_data['train'], batch_size=batch_size, num_unroll_steps=num_unroll_steps)
        for x, y in reader.iter():
            assert x.shape == (batch_size, num_unroll_steps, max_word_length)
            break

        self.assertAllClose(X, x)

        with self.test_session() as session:
            input_ = tf.placeholder(tf.int32, shape=[batch_size, num_unroll_steps, max_word_length], name="input")

            ''' First, embed characters '''
            with tf.variable_scope('Embedding'):
                char_embedding = tf.get_variable('char_embedding', [char_vocab_size, char_embed_size])

                # [batch_size * num_unroll_steps, max_word_length, char_embed_size]
                input_embedded = tf.nn.embedding_lookup(char_embedding, input_)

                input_embedded = tf.reshape(input_embedded, [-1, max_word_length, char_embed_size])

            session.run(tf.assign(char_embedding, EMBEDDING))
            ie = session.run(input_embedded, {
                input_: x
            })

            output = tdnn(input_embedded, [2], [2], scope='TDNN')

            out = session.run(output, {
                    input_embedded: ie,
                    'TDNN/kernel_2/w:0': np.reshape(np.transpose(KERNEL_2_W), [1, 2, num_unroll_steps, 2]),
                    'TDNN/kernel_2/b:0': KERNEL_2_B
            })

            out = out.reshape([batch_size, num_unroll_steps, 2])
            out = out.transpose([1, 0, 2])  # torch uses time-major order

            self.assertAllClose(out, np.array([
 [[-0.04201929,  0.02275813],
  [-0.04060676,  0.02283999],
  [-0.04333816,  0.02333505],
  [-0.04131923,  0.02480407]],

 [[-0.04124087,  0.02429205],
  [-0.04117644,  0.02419558],
  [-0.04282973,  0.02318067],
  [-0.04131923,  0.02480407]],

 [[-0.03877186,  0.0243939 ],
  [-0.04173752,  0.02552123],
  [-0.04168687,  0.02385954],
  [-0.04201929,  0.02454825]]]))

            print(out.shape)
            print(out)
            assert False
class MyWord2Vec:
    def __init__(self, args):

        logging.basicConfig(level=logging.DEBUG, format="%(asctime)s : %(levelname)s : %(message)s")
        self._logger = logging.getLogger(__name__)

        self._logger.info("Initializing Model...")
        self._logger.info("Reading Args...")

        self._args = args
        self._lr = self._args.lr
        self._data_index = 0

        self._context_tensor_size = 0
        self._sampled_tensor_size = 0

    def read_data(self):

        self._logger.info("Reading meta data...")

        self._reader = DataReader(self._logger)

        (
            self._vocab,
            self._vocab_size,
            self._dictionary,
            self._reverse_dictionary,
            self._unigrams,
            self._arts_srcs,
            self._srcs_ents,
            self._ents_srcs,
        ) = self._reader.read_meta_files(self._args.data)

        with open(self._args.output + "-labels-dict.pkl", "wb") as f:
            cPickle.dump(self._reverse_dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL)

        with open(self._args.output + "-vocab-dict.pkl", "wb") as f:
            cPickle.dump(self._dictionary, f, protocol=cPickle.HIGHEST_PROTOCOL)

        self._number_of_srcs = len(set(self._srcs_ents.keys()))

        self._sample_dist()

    def _load_model(self, file_path):
        with open(file_path, "rb") as f:
            embeddings = cPickle.load(f)
        return embeddings

    def _save_model(self, file_path, embeddings):
        with open(file_path, "wb") as f:
            cPickle.dump(embeddings, f, protocol=cPickle.HIGHEST_PROTOCOL)

    def _sample_dist(self):
        freq = np.power(self._unigrams / np.sum(self._unigrams), 0.75)  # unigrams ^ 3/4
        self._dist = freq * (1 / np.sum(freq))  # normalize probabs

    def _get_samples(self, size):
        samples = np.random.choice(range(self._vocab_size), size, p=self._dist)
        return samples

    def _plot(self, title, embeddings):

        self._logger.debug("Plotting...")

        pca = PCA(n_components=2)
        pca.fit(embeddings)
        low_dim_embs = pca.transform(embeddings)
        labels = [self._reverse_dictionary[key] for key in xrange(self._vocab_size)]

        for label, x, y in zip(labels, low_dim_embs[:, 0], low_dim_embs[:, 1]):
            plt.plot(x, y, "x")

            if title != "final":
                plt.annotate(label, xy=(x, y), fontsize="xx-small")
            else:
                plt.annotate(label, xy=(x, y))

        if title == "final":
            plt.show()
        else:
            file = "fig-%s.eps" % title
            plt.savefig(file, format="eps", dpi=1200)

        plt.clf()

    def _build_graph(self):

        self._logger.info("Building tf graph...")

        self.graph = tf.Graph()
        with self.graph.as_default():

            self.make_vars()
            self.build_expr()
            self.optimize()

    def make_vars(self):
        init_width = 0.5 / self._args.emb_size

        # Shared variables holding input and output embeddings
        self.inp_embeddings = tf.Variable(
            tf.random_uniform([self._vocab_size, self._args.emb_size], -init_width, init_width)
        )

        self.out_embeddings = tf.Variable(
            tf.random_uniform([self._vocab_size, self._args.emb_size], -init_width, init_width)
        )

    def build_expr(self):

        self.inp_ctx = tf.placeholder(tf.int32, shape=(None))
        self.out_ctx = tf.placeholder(tf.int32, shape=(None))

        self.inp_neg = tf.placeholder(tf.int32, shape=(None))
        self.out_neg = tf.placeholder(tf.int32, shape=(None))

        self.out_ents = tf.placeholder(tf.int32, shape=(None))
        self.other_ents = tf.placeholder(tf.int32, shape=(None))

        ctx_batch_size = tf.shape(self.inp_ctx)[0]
        neg_batch_size = tf.shape(self.out_ctx)[0]
        ents_constant = tf.shape(self.out_ents)[0]

        src_constnt = tf.constant(self._number_of_srcs, dtype=tf.float32)

        # embedding lookups to get vectors of specified indices (by placeholders)
        embed_inp_ctx = tf.nn.embedding_lookup(self.inp_embeddings, self.inp_ctx)
        embed_out_ctx = tf.nn.embedding_lookup(self.out_embeddings, self.out_ctx)

        embed_inp_neg = tf.nn.embedding_lookup(self.inp_embeddings, self.inp_neg)
        embed_out_neg = tf.nn.embedding_lookup(self.out_embeddings, self.out_neg)

        embed_entities = tf.nn.embedding_lookup(self.out_embeddings, self.out_ents)
        embed_other_entities = tf.nn.embedding_lookup(self.out_embeddings, self.other_ents)

        dot_ctx = tf.mul(embed_inp_ctx, embed_out_ctx)
        sum_ctx = tf.reduce_sum(dot_ctx, 1)
        ctx_expr = tf.log(tf.sigmoid(sum_ctx)) / tf.cast(ctx_batch_size, tf.float32)

        dot_neg = tf.mul(embed_inp_neg, embed_out_neg)
        sum_neg = tf.reduce_sum(dot_neg, 1)
        neg_expr = tf.log(tf.sigmoid(-sum_neg)) / tf.cast(neg_batch_size, tf.float32)

        avg_ents = tf.div(tf.reduce_sum(embed_other_entities, 1), src_constnt)
        ents_diff = tf.square(tf.sub(embed_entities, avg_ents))
        reg_expr = self._args.regularizer * tf.reduce_sum(ents_diff) / tf.cast(ents_constant, tf.float32)

        self.loss = tf.reduce_sum(ctx_expr) + tf.reduce_sum(neg_expr) - reg_expr
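        # Reading off the expressions above (a summary added here, not from the source):
        #   loss = mean over context pairs of log sigma(u_w . v_c)
        #        + mean over negative samples of log sigma(-u_w . v_n)
        #        - regularizer * mean over entities of || v_e - (1/num_srcs) * sum_s v_e_s ||^2
        # optimize() below minimizes -loss, i.e. maximizes this objective.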

    def optimize(self):
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self.train = optimizer.minimize(-self.loss, gate_gradients=optimizer.GATE_NONE)

    def lr_decay(self):
        decay_factor = 10.0 * (5.0 / float(self._args.epochs))
        lr = np.maximum(0.0001, self._lr / decay_factor)
        self._lr = round(lr, 4)

    def _ents_matrices(self):

        self._logger.info("Preparing named entites for this source")

        # get political entities
        source_entities = np.array(self._srcs_ents[self._current_source])

        corresponding_ents = list()
        padding_index = self._dictionary["UNK"]
        # get corresponding entities and replace tokens by ids
        for ent in source_entities:

            base_ent = ent.split("_", -1)[0]
            temp = np.array(self._ents_srcs[base_ent])
            """
            TODO:
            Remove the entity from its corresponding entities list
            """
            for curr_ent in temp:
                temp[temp == curr_ent] = self._dictionary[curr_ent]

            temp = temp.astype(int).tolist()
            temp += [padding_index] * (self._number_of_srcs - len(temp))
            corresponding_ents.append(temp)

            # replace entities' tokens by ids
        source_ents_ids = source_entities
        for ent in source_ents_ids:
            source_ents_ids[source_ents_ids == ent] = self._dictionary[ent]

        source_ents_ids = source_ents_ids.astype(int)

        self._current_entities = source_ents_ids
        self._corresponding_ents = corresponding_ents

    def generate_batch(self):

        context_words = []
        sampled_words = []

        # get current batch, curr_index: curr_index + batch_size
        current_data_batch = self._data[self._data_index : self._data_index + self._args.batch_size]
        self._data_index += self._args.batch_size % self._data_size

        # add extra UNKs for padding context windows
        padding_index = self._dictionary["UNK"]
        lpadded = (
            self._args.window // 2 * [padding_index] + current_data_batch + self._args.window // 2 * [padding_index]
        )

        for idx, word in enumerate(current_data_batch):

            context = lpadded[idx : (idx + self._args.window)]
            samples = self._get_samples(self._args.samples)

            context_words += zip([word] * len(context), context)
            sampled_words += zip([word] * len(samples), samples)

        inp_ctx, out_ctx = zip(*context_words)
        inp_neg, out_neg = zip(*sampled_words)

        feed_dict = {
            self.out_ents: self._current_entities,
            self.other_ents: self._corresponding_ents,
            self.inp_ctx: inp_ctx,
            self.out_ctx: out_ctx,
            self.inp_neg: inp_neg,
            self.out_neg: out_neg,
        }

        return feed_dict

    def _prepare_file(self, file_path):
        data = np.array(self._reader.read_file(file_path, self._dictionary))

        self._data = data.astype(int).tolist()

        self._data_size = len(data)

    def train(self):

        if os.path.exists(self._args.output):
            embeddings = self._load_model(self._args.output)
            self._plot("final", embeddings)
            return

        self._build_graph()
        self._logger.info("Starting training ...")

        with tf.Session(graph=self.graph) as sess:

            tf.initialize_all_variables().run()
            first_start = time.time()
            start = time.time()

            for epoch in xrange(1, self._args.epochs + 1):

                self._logger.info(
                    "[*] training, epoch num: %d, out of %d with learning rate: %f"
                    % (epoch, self._args.epochs, self._lr)
                )

                total_batches = 0
                batches_so_far = 0

                avg = 0

                for file_path in self._arts_srcs:

                    self._current_source = self._arts_srcs[file_path]
                    self._ents_matrices()

                    self._logger.info("Reading file %s" % file_path)
                    self._prepare_file(file_path)

                    file_batches = self._data_size / self._args.batch_size
                    check_point = file_batches / 4
                    total_batches += file_batches

                    for batch in xrange(file_batches):
                        batches_so_far += 1
                        feed_dict = self.generate_batch()
                        cost, _ = sess.run([self.loss, self.train], feed_dict=feed_dict)

                        # if math.isnan(cost) or math.isinf(cost):
                        # 	self._logger.info('[*] Encountered NaN or Inf, stopping training')
                        # 	final_embeddings = prev_emb.eval()
                        # 	break

                        avg += cost

                        # if batch % check_point == 0 and batch != 0:
                        self._logger.info(
                            "\t[*][*] batch %s out of %s, avg cost=%s, time so far: %ds"
                            % (batch, file_batches, avg / batches_so_far, int(time.time() - start))
                        )

                    self._data_index = 0
                    self._logger.info(
                        "[*] Done file %s, avg cost=%s, time taken: %ds "
                        % (file_path, avg / file_batches, int(time.time() - start))
                    )

                avg /= total_batches
                self._logger.info(
                    "[*] Done epoch %s out of %s, avg cost=%s, time taken: %ds "
                    % (epoch, self._args.epochs, avg, int(time.time() - start))
                )

                avg = 0
                self.lr_decay()
                print "________________________________________________\n"

            self._logger.info("[*] Total training time: %ds" % int(time.time() - first_start))
            final_embeddings = self.out_embeddings.eval()
        self._save_model(self._args.output, final_embeddings)
        self._plot("final", final_embeddings)
    def __init__(self, visualize_best=False, train_subj=6, test_subj=6):
        output_raw_scores = False

        # compare = True
        self.visualize_best = visualize_best

        self.tf_listener = tf.TransformListener()
        self.train_subj = train_subj
        self.test_subj = test_subj
        print 'I will use data that was trained on subject ', self.train_subj
        print 'I will test on data from subject ', self.test_subj

        self.task = 'shaving' # options are: bathing, brushing, feeding, shaving, scratching_upper_arm/forearm/thigh/chest/knee
        self.model = 'chair'  # options are: 'chair', 'bed', 'autobed'

        pos_clust = 2
        ori_clust = 2
        self.mc_simulation_number = None

        self.visualize = False
        data_start = 0
        data_finish = 'end'  # 2000  # 4000  # 'end'

        rospack = rospkg.RosPack()
        self.pkg_path = rospack.get_path('hrl_base_selection')
        print 'Loading scores.'
        self.loaded_scores = self.load_task(self.task, self.model, self.train_subj)
        if self.loaded_scores is None:
            print 'The scores do not exist. Must generate scores! This may take a long time...'
            self.generate_scores(data_start, data_finish, pos_clust, ori_clust)
            print 'Scores generated. I will now continue.'
            print 'Now loading the scores I just generated'
            self.loaded_scores = self.load_task(self.task, self.model, self.train_subj)
        if self.loaded_scores is None:
            print 'The scores still do not exist. This is bad. Fixes needed in code.'
            return
        headx = 0
        heady = 0
        self.scores = self.loaded_scores[headx, heady]
        if output_raw_scores:
            self.output_scores()
        subject = ''.join(['sub', str(self.test_subj), '_shaver'])
        print 'Reading in raw data from the task.'
        read_task_data = DataReader_Task(self.task, self.model)
        raw_data, raw_num, raw_reference, self.raw_reference_options = read_task_data.reset_goals()
        read_data = DataReader(subject=subject, data_start=data_start, reference_options=self.raw_reference_options,
                               data_finish=data_finish, model=self.model, task=self.task, tf_listener=self.tf_listener)

        # raw_data = read_data.get_raw_data()
        print 'Raw data is ready!'
        self.goal_data = read_data.generate_output_goals(test_goals=raw_data, test_number=raw_num, test_reference=raw_reference)
        # print 'Setting up openrave'
        # self.setup_openrave()
        # print 'I will now pick base locations to evaluate. They will share the same reachability score, but will have' \
        #       ' differing manipulability scores.'
        # print 'before sorting:'
        # for i in xrange(10):
        #     print self.scores[i]
        self.scores = np.array(sorted(self.scores, key=lambda t: (t[1][1], t[1][2]), reverse=True))
        # print 'after sorting:'
        # for i in xrange(10):
        #     print self.scores[i]
        self.best_base = self.scores[0]
        if self.best_base[1][1] == 0:
            print 'There are no base locations with reachable goals. Something went wrong in the scoring or the setup'
        print 'The best base location is: \n', self.best_base

        if visualize_best:
            read_data.pub_rviz()
            self.visualize_base_config(self.best_base, self.goal_data, self.raw_reference_options)
Beispiel #42
0
class APAProject(object):

    def __init__(self):

        self.data_reader = DataReader('data/training_data/training.data', 'data/stopwords/stopwords.txt', True, 1000)
        self.perceptron = Perceptron()
        self.softmax = Softmax()
        # Let's create 5 classifiers
        universe_size = len(self.data_reader.universe)
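        # one binary perceptron per class (one-vs-rest); the extra +1 dimension
        # presumably holds a bias component of the sentence vector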
        self.perceptron_classifiers = [np.zeros((universe_size + 1)) for i in range(5)]
        self.softmax_classifier = np.ones((5, universe_size + 1))

    def file_to_data_set(self, file):
        data_set = []
        with open(file) as data:
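            # each line is assumed to look like '<something>|<score>|<sentence>';
            # the first field is discarded here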

            for line in data:
                _, score, sentence = line.split('|')
                score = float(score)

                # Calculate the training target:
                # 0 if 0 <= score < 0.2, 1 if 0.2 <= score < 0.4, etc.
                class_number = int(math.floor(score * 5))
                sentence_vector = self.data_reader.get_sentence_coordinates(sentence)
                data_set.append((sentence_vector, class_number))
        return data_set

    def train_perceptron(self):
        start_time = time.time()

        print "Starting training session ..."

        # Read the training data and train the perceptron classifiers
        training_data_set = self.file_to_data_set('data/training_data/training.data')

        PERIODS = 5

        for i in range(PERIODS):
            # For each period, reshuffle
            random.shuffle(training_data_set)
            # We train every classifier
            for (classifier_index, classifier) in enumerate(self.perceptron_classifiers):
                self.perceptron_classifiers[classifier_index], updates = self.perceptron.train_epoch(training_data_set, classifier_index, classifier)
            self.test_perceptron_multiclass()

        training_end_time = time.time()
        training_duration = training_end_time - start_time
        print "Training session finished: duration %s seconds" % training_duration

    def test_perceptron(self):
        print "Starting testing session..."

        test_data_set = self.file_to_data_set('data/test_data/test.data')

        for (classifier_index, classifier) in enumerate(self.perceptron_classifiers):
            error_count, success_count = self.perceptron.test_classifier(test_data_set, classifier, classifier_index)
            print "Classifier %s just finished. %s%% results are good" % ((classifier_index + 1), success_count * 100 / (success_count + error_count))

    def test_perceptron_multiclass(self):
        print "Starting testing session..."

        test_data_set = self.file_to_data_set('data/test_data/test.data')

        success_count = 0
        error_count = 0

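        # predict the class whose binary perceptron scores highest (argmax over
        # the five classifiers) and compare it with the gold class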
        for (sentence_vector, class_number) in test_data_set:
            results_classifiers = []
            test_class = -1
            for (classifier_index, classifier) in enumerate(self.perceptron_classifiers):
                results_classifiers.append(np.dot(classifier, sentence_vector))
            if results_classifiers.index(max(results_classifiers)) == class_number:
                success_count += 1
            else:
                error_count += 1

        print "Classifier just finished. %s/%s ~= %s%% results are good" % (success_count, (error_count + success_count), success_count * 100 / (success_count + error_count))

    def train_softmax(self):
        start_time = time.time()
        print "Starting softmax training session..."

        # Read the training data and train the softmax classifier
        training_data_set = self.file_to_data_set('data/training_data/training.data')

        PERIODS = 10

        for i in range(PERIODS):
            random.shuffle(training_data_set)
            # We train for PERIODS passes and test the classifier after each pass to follow its progress
            # Reminder: self.softmax_classifier = np.ones((5, universe_size + 1))
            self.softmax_classifier = self.softmax.train_epoch(self.softmax_classifier, training_data_set)
            self.test_softmax()

        training_end_time = time.time()
        training_duration = training_end_time - start_time
        print "Training session finished: duration %s seconds" % training_duration

    def test_softmax(self):
        print "Starting softmax testing session..."

        test_data_set = self.file_to_data_set('data/test_data/test.data')
        #test_data_set = self.file_to_data_set('data/training_data/training.data')

        error_count, success_count = self.softmax.test_classifier(self.softmax_classifier, test_data_set)
        print "Classifier just finished. %s/%s ~= %s%% results are good" % (success_count, (error_count + success_count), success_count * 100 / (success_count + error_count))
Beispiel #43
0
def main(_):
    ''' Trains model from data '''
    print("we in main")
    print(sys.argv[2])
    print(FLAGS)
    if not os.path.exists(FLAGS.train_dir):
        os.mkdir(FLAGS.train_dir)
        print('Created training directory', FLAGS.train_dir)
    
    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = \
        load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)
    
    train_reader = DataReader(word_tensors['train'], char_tensors['train'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)

    valid_reader = DataReader(word_tensors['valid'], char_tensors['valid'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)

    test_reader = DataReader(word_tensors['test'], char_tensors['test'],
                              FLAGS.batch_size, FLAGS.num_unroll_steps)
    
    print('initialized all dataset readers')
    
    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph        
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        ''' build training graph '''
        initializer = tf.random_uniform_initializer(-FLAGS.param_init, FLAGS.param_init)
        with tf.variable_scope("Model", initializer=initializer):
            train_model = model.inference_graph(
                    char_vocab_size=char_vocab.size,
                    word_vocab_size=word_vocab.size,
                    char_embed_size=FLAGS.char_embed_size,
                    batch_size=FLAGS.batch_size,
                    num_highway_layers=FLAGS.highway_layers,
                    num_rnn_layers=FLAGS.rnn_layers,
                    rnn_size=FLAGS.rnn_size,
                    max_word_length=max_word_length,
                    kernels=eval(FLAGS.kernels),
                    kernel_features=eval(FLAGS.kernel_features),
                    num_unroll_steps=FLAGS.num_unroll_steps,
                    dropout=FLAGS.dropout)
            train_model.update(model.loss_graph(train_model.logits, FLAGS.batch_size, FLAGS.num_unroll_steps))
            
            # scaling loss by FLAGS.num_unroll_steps effectively scales gradients by the same factor.
            # we need it to reproduce how the original Torch code optimizes. Without this, our gradients will be
            # much smaller (i.e. 35 times smaller) and to get system to learn we'd have to scale learning rate and max_grad_norm appropriately.
            # Thus, scaling gradients so that this trainer is exactly compatible with the original
            train_model.update(model.training_graph(train_model.loss * FLAGS.num_unroll_steps, FLAGS.learning_rate, FLAGS.max_grad_norm))

        # create saver before creating more graph nodes, so that we do not save any vars defined below      
        saver = tf.train.Saver(max_to_keep=50)

        ''' build graph for validation and testing (shares parameters with the training graph!) '''
        with tf.variable_scope("Model", reuse=True):
            valid_model = model.inference_graph(
                    char_vocab_size=char_vocab.size,
                    word_vocab_size=word_vocab.size,
                    char_embed_size=FLAGS.char_embed_size,
                    batch_size=FLAGS.batch_size,
                    num_highway_layers=FLAGS.highway_layers,
                    num_rnn_layers=FLAGS.rnn_layers,
                    rnn_size=FLAGS.rnn_size,
                    max_word_length=max_word_length,
                    kernels=eval(FLAGS.kernels),
                    kernel_features=eval(FLAGS.kernel_features),
                    num_unroll_steps=FLAGS.num_unroll_steps,
                    dropout=0.0)
            valid_model.update(model.loss_graph(valid_model.logits, FLAGS.batch_size, FLAGS.num_unroll_steps))

        with tf.variable_scope("Model", reuse=True):
            test_model = model.inference_graph(
                    char_vocab_size=char_vocab.size,
                    word_vocab_size=word_vocab.size,
                    char_embed_size=FLAGS.char_embed_size,
                    batch_size=1,
                    num_highway_layers=FLAGS.highway_layers,
                    num_rnn_layers=FLAGS.rnn_layers,
                    rnn_size=FLAGS.rnn_size,
                    max_word_length=max_word_length,
                    kernels=eval(FLAGS.kernels),
                    kernel_features=eval(FLAGS.kernel_features),
                    num_unroll_steps=1,
                    dropout=0.0)
            test_model.update(model.loss_graph(test_model.logits, 1, 1))

        if FLAGS.load_model:
            saver.restore(session, FLAGS.load_model)
            print('Loaded model from', FLAGS.load_model, 'saved at global step', train_model.global_step.eval())
        else:
            tf.initialize_all_variables().run()
            print('Created and initialized fresh model. Size:', model.model_size())
        
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, graph=session.graph)

        ''' take learning rate from CLI, not from saved graph '''
        session.run(
            tf.assign(train_model.learning_rate, FLAGS.learning_rate),
        )
        
        def clear_char_embedding_padding():
            char_embedding = session.run(train_model.char_embedding)
            char_embedding[0,:] = 0.0
            session.run(tf.assign(train_model.char_embedding, char_embedding))
            char_embedding = session.run(train_model.char_embedding)
        
        clear_char_embedding_padding()

        run_test2(session, test_model, train_reader)
        #exit(1)

        ''' training starts here '''
        best_valid_loss = None
        rnn_state = session.run(train_model.initial_rnn_state)
        for epoch in range(FLAGS.max_epochs):

            avg_train_loss = 0.0
            count = 0
            for x, y in train_reader.iter():
                count += 1        
                start_time = time.time()
                # debug leftovers disabled: printing the batch and exiting here
                # would abort training after the very first minibatch
                # print(x)
                # exit(1)
                loss, _, rnn_state, gradient_norm, step = session.run([
                    train_model.loss,
                    train_model.train_op, 
                    train_model.final_rnn_state,
                    train_model.global_norm, 
                    train_model.global_step,
                ], {
                    train_model.input  : x,
                    train_model.targets: y,
                    train_model.initial_rnn_state: rnn_state
                })

                clear_char_embedding_padding()
                
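                # running exponential moving average of the training loss (decay factor 0.95)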
                avg_train_loss += 0.05 * (loss - avg_train_loss)
        
                time_elapsed = time.time() - start_time
                
                if count % FLAGS.print_every == 0:
                    print('%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f, secs/batch = %.4fs, grad.norm=%6.8f' % (step,
                                                            epoch, count,
                                                            train_reader.length,
                                                            loss, np.exp(loss),
                                                            time_elapsed,
                                                            gradient_norm))

            # epoch done: time to evaluate  
            avg_valid_loss = 0.0
            count = 0 
            rnn_state = session.run(valid_model.initial_rnn_state)
            for x, y in valid_reader.iter():
                count += 1        
                start_time = time.time()
        
                loss, rnn_state = session.run([
                    valid_model.loss, 
                    valid_model.final_rnn_state
                ], {
                    valid_model.input  : x,
                    valid_model.targets: y,
                    valid_model.initial_rnn_state: rnn_state,
                })
                
                if count % FLAGS.print_every == 0:
                    print("\t> validation loss = %6.8f, perplexity = %6.8f" % (loss, np.exp(loss)))
                avg_valid_loss += loss / valid_reader.length

            print("at the end of epoch:", epoch)            
            print("train loss = %6.8f, perplexity = %6.8f" % (avg_train_loss, np.exp(avg_train_loss)))
            print("validation loss = %6.8f, perplexity = %6.8f" % (avg_valid_loss, np.exp(avg_valid_loss)))

            save_as = '%s/epoch%03d_%.4f.model' % (FLAGS.train_dir, epoch, avg_valid_loss)
            saver.save(session, save_as)
            print('Saved model', save_as)

            ''' write out summary events '''
            summary = tf.Summary(value=[
                tf.Summary.Value(tag="train_loss", simple_value=avg_train_loss),
                tf.Summary.Value(tag="valid_loss", simple_value=avg_valid_loss)
            ])
            summary_writer.add_summary(summary, step)
            
            ''' decide if need to decay learning rate '''
            if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(best_valid_loss) - FLAGS.decay_when:
                print('** validation perplexity did not improve enough, decay learning rate')
                current_learning_rate = session.run(train_model.learning_rate)
                print('learning rate was:', current_learning_rate)
                current_learning_rate *= FLAGS.learning_rate_decay
                if current_learning_rate < 1.e-5:
                    print('learning rate too small - stopping now')
                    break

                session.run(train_model.learning_rate.assign(current_learning_rate))
                print('new learning rate is:', current_learning_rate)
            else:
                best_valid_loss = avg_valid_loss

        run_test2(session, test_model, train_reader)
        print ("AGAIN")
        run_test2(session, test_model, train_reader)

def setUp(self):
    DataReader.createHostsFromFile()
    DataReader.createInstancesFromFile()

def main(_):
    ''' Loads trained model and evaluates it on test split '''

    if FLAGS.load_model is None:
        print('Please specify checkpoint file to load model from')
        return -1
    
    if not os.path.exists(FLAGS.load_model):
        print('Checkpoint file not found', FLAGS.load_model)
        return -1
    
    word_vocab, char_vocab, word_tensors, char_tensors, max_word_length = load_data(FLAGS.data_dir, FLAGS.max_word_length, eos=FLAGS.EOS)

    test_reader = DataReader(word_tensors['test'], char_tensors['test'], FLAGS.batch_size, FLAGS.num_unroll_steps)
    
    print('initialized test dataset reader')
    
    with tf.Graph().as_default(), tf.Session() as session:

        # tensorflow seed must be inside graph        
        tf.set_random_seed(FLAGS.seed)
        np.random.seed(seed=FLAGS.seed)

        ''' build inference graph '''
        with tf.variable_scope("Model"):
            m = model.inference_graph(
                    char_vocab_size=char_vocab.size,
                    word_vocab_size=word_vocab.size,
                    char_embed_size=FLAGS.char_embed_size,
                    batch_size=FLAGS.batch_size,
                    num_highway_layers=FLAGS.highway_layers,
                    num_rnn_layers=FLAGS.rnn_layers,
                    rnn_size=FLAGS.rnn_size,
                    max_word_length=max_word_length,
                    kernels=eval(FLAGS.kernels),
                    kernel_features=eval(FLAGS.kernel_features),
                    num_unroll_steps=FLAGS.num_unroll_steps,
                    dropout=0)
            m.update(model.loss_graph(m.logits, FLAGS.batch_size, FLAGS.num_unroll_steps))

            global_step = tf.Variable(0, dtype=tf.int32, name='global_step')

        saver = tf.train.Saver()
        saver.restore(session, FLAGS.load_model)
        print('Loaded model from', FLAGS.load_model, 'saved at global step', global_step.eval())

        ''' evaluation starts here '''
        rnn_state = session.run(m.initial_rnn_state)
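        # carry the RNN state across batches so the test split is scored as one
        # continuous sequence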
        count = 0
        avg_loss = 0
        start_time = time.time()
        for x, y in test_reader.iter():
            count += 1
            loss, rnn_state = session.run([
                m.loss,
                m.final_rnn_state
            ], {
                m.input  : x,
                m.targets: y,
                m.initial_rnn_state: rnn_state
            })
            
            avg_loss += loss
        
        avg_loss /= count
        time_elapsed = time.time() - start_time

        print("test loss = %6.8f, perplexity = %6.8f" % (avg_loss, np.exp(avg_loss)))
        print("test samples:", count*FLAGS.batch_size, "time elapsed:", time_elapsed, "time per one batch:", time_elapsed/count)

def testCreateHostsFromFile(self):
    DataReader.createHostsFromFile(TestDataReader.TEST_HOSTS_FILE)
    self.assertEquals(len(DataReader.hosts), 8)

def testCreateHostsFileError(self):
    DataReader.createHostsFromFile(TestDataReader.TEST_HOSTS_FILE)
    self.assertRaises(Exception)

def testFindHostByID(self):
    host = DataReader.findHostByID(2)
    self.assertIsNotNone(host)

    host = DataReader.findHostByID(-1)
    self.assertIsNone(host)

def initData(hfile, ifile):
    setup()
    logging.info('INITIALIZING DATA: Reading host and instance files')
    DataReader.createHostsFromFile(hfile)
    DataReader.createInstancesFromFile(ifile)

def testCreateInstancesFromFile(self):
    DataReader.createInstancesFromFile(TestDataReader.TEST_INSTANCES_FILE)
    self.assertEquals(len(DataReader.instances), 15)
    for i in DataReader.instances:
        self.assertIsNotNone(i.host)

    # params for SVC
    kernel = 'rbf'
    C = 1
    gamma = 10

    # Choose a classifier
    alg = RandomForestClassifier(n_estimators=number_of_trees, criterion='entropy', max_features='log2')
    # alg = SVC(kernel=kernel, C=C, gamma=gamma)
    # alg = SVR()
    # alg = LinearSVR()

    visualize_xyz_example = False
    visualize_interpolated = False
    training_enabled = True

    data_reader = DataReader(xyz)
    print "Parsing data..."
    data, labels = data_reader.parse(fname)
    labels = np.array(labels)
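    # binarise the labels: anything below 1 becomes the negative class (-1)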
    labels[labels < 1] = -1

    if (visualize_xyz_example):
        timestamps = np.arange(train_frame_start,
            train_frame_end + train_sparseness, train_sparseness)
        visualize_count = 3
        visualized = 0
        for i in xrange(0, len(labels)):
            if (labels[i] and visualized < visualize_count):
                plt.figure(figsize=(20, 10))
                plt.plot(timestamps, data[i][1], 'r')
                plt.plot(timestamps, data[i][2], 'g')