Code Example #1
import pandas as pd

import helpers  # project-local preprocessing utilities (replace_missing_mode, one_hot_encode, normalize)


def get_iris_data(file_in):
    """
    Fetch the UCI data set on physical characteristics of Iris species.
    """

    with open(file_in, 'r') as f:
        data_text = f.read()
    data_rows = data_text.split('\n')
    data_rows = data_rows[0:-2]  # last two lines are blank

    x_headers = [
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'
    ]

    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    output = helpers.replace_missing_mode(output)

    # Convert every non-categorical column to float
    for col in output:
        if col not in cat_variables:
            output[col] = [float(x) for x in output[col]]

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output
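
The five loaders in this section all rely on a project-local helpers module that is not shown. As a rough guide only, here is a minimal pandas-based sketch of what those utilities might look like, assuming replace_missing_mode does per-column mode imputation of '?' markers, one_hot_encode wraps pd.get_dummies, and normalize does min-max scaling; the real module is presumably more general (for instance, Code Example #6 calls one_hot_encode directly on a label array).

import pandas as pd


def replace_missing_mode(df, missing_marker='?'):
    """Replace missing_marker entries in each column with that column's mode (assumed behavior)."""
    df = df.replace(missing_marker, pd.NA)
    for col in df.columns:
        df[col] = df[col].fillna(df[col].mode()[0])
    return df


def one_hot_encode(df, exclude=()):
    """One-hot encode every non-numeric column not listed in exclude (assumed behavior)."""
    cols = [c for c in df.columns
            if c not in exclude and not pd.api.types.is_numeric_dtype(df[c])]
    return pd.get_dummies(df, columns=cols)


def normalize(df):
    """Min-max scale every numeric column into [0, 1] (assumed behavior)."""
    df = df.copy()
    for col in df.select_dtypes('number').columns:
        span = df[col].max() - df[col].min()
        if span:
            df[col] = (df[col] - df[col].min()) / span
    return df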
Code Example #2
def get_vote_data(file_in):
    """
    Fetch and clean the UCI data set on US Representative vote records.
    """

    with open(file_in, 'r') as f:
        data_text = f.read()
    data_rows = data_text.split('\n')
    data_rows = data_rows[0:-1]  # last line is blank

    x_headers = [
        'target', 'handicapped-infants', 'water-project-cost-sharing',
        'adoption-of-the-budget-resolution', 'physician-fee-freeze',
        'el-salvador-aid', 'religious-groups-in-schools',
        'anti-satellite-test-ban', 'aid-to-nicaraguan-contras', 'mx-missile',
        'immigration', 'synfuels-corporation-cutback', 'education-spending',
        'superfund-right-to-sue', 'crime', 'duty-free-exports',
        'export-administration-act-south-africa'
    ]

    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    output = helpers.replace_missing_mode(output)

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output
Code Example #3
def get_glass_data(file_in):
    """
    Fetch the UCI data set on the chemical characteristics of glass samples.
    """

    with open(file_in, 'r') as f:
        data_text = f.read()
    data_rows = data_text.split('\n')
    data_rows = data_rows[0:-1]  # last line is blank

    x_headers = [
        'id', 'RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'target'
    ]

    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    output = helpers.replace_missing_mode(output)

    # Convert every non-categorical column to float
    for col in output:
        if col not in cat_variables:
            output[col] = [float(x) for x in output[col]]

    output = output.drop(['id'], axis=1)

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output
Code Example #4
def get_cancer_data(file_in):
    """
    Fetch the UCI data set on breast cancer characteristics.
    """

    with open(file_in, 'r') as f:
        data_text = f.read()
    data_rows = data_text.split('\n')
    data_rows = data_rows[0:-1]  # last line is blank

    x_headers = [
        'id', 'clump_thickness', 'unif_cell_size', 'unif_cell_shape',
        'marginal_adhesion', 'single_epithelial_cell_size', 'bare_nuclei',
        'bland_chrmatin', 'normal_nucleoli', 'mitoses', 'target'
    ]

    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    output = output.drop(['id'], axis=1)

    output = helpers.replace_missing_mode(output)

    # Convert every non-categorical column to float
    for col in output:
        if col not in cat_variables:
            output[col] = [float(x) for x in output[col]]

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output
Code Example #5
def get_soy_data(file_in):
    """
    Fetch the UCI data set on diseases of soybean samples.
    """

    with open(file_in, 'r') as f:
        data_text = f.read()
    data_rows = data_text.split('\n')
    data_rows = data_rows[0:-1]  # last line is blank

    x_headers = [
        'date', 'plant-stand', 'precip', 'temp', 'hail', 'crop-hist',
        'area-damaged', 'severity', 'seed-tmt', 'germination', 'plant-growth',
        'leaves', 'leafspots-halo', 'leafspots-marg', 'leafspot-size',
        'leaf-shread', 'leaf-malf', 'leaf-mild', 'stem', 'lodging',
        'stem-cankers', 'canker-lesion', 'fruiting-bodies', 'external decay',
        'mycelium', 'int-discolor', 'sclerotia', 'fruit-pods', 'fruit spots',
        'seed', 'mold-growth', 'seed-discolor', 'seed-size', 'shriveling',
        'roots', 'target'
    ]

    cat_variables = ['target']

    data_all = [row.split(',') for row in data_rows]

    output = pd.DataFrame(data_all, columns=x_headers)

    output = helpers.replace_missing_mode(output)

    output = helpers.one_hot_encode(output, exclude=[])
    output = helpers.normalize(output)

    return output
Code Example #6
from sklearn.datasets import load_iris


def learn_and_score(neurons, learn_rate, k, epochs, batch_size):
    # helpers (one_hot_encode, cross_val_score) and models (MultiLayerPerceptron)
    # are project-local modules assumed to be importable here
    database = load_iris()

    x = database.data
    y = database.target

    inputs = x
    target = helpers.one_hot_encode(y)

    mlp = models.MultiLayerPerceptron(inputs.shape[1],
                                      target.shape[1],
                                      neurons,
                                      learn_rate=learn_rate)

    return helpers.cross_val_score(mlp, inputs, target, k, epochs, batch_size)
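
helpers.cross_val_score is likewise not shown. A minimal sketch of the k-fold routine it presumably implements, assuming inputs and targets are NumPy arrays and the model object exposes fit(x, y, epochs, batch_size) and score(x, y) methods (hypothetical names):

import numpy as np


def cross_val_score(model, inputs, targets, k, epochs, batch_size):
    """Average validation score over k folds (sketch of the assumed helper)."""
    indices = np.random.permutation(len(inputs))
    folds = np.array_split(indices, k)
    scores = []
    for i in range(k):
        val_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        # NOTE: the caller above passes a single model instance; re-initializing
        # the model per fold would be cleaner but would need a different interface
        model.fit(inputs[train_idx], targets[train_idx],
                  epochs=epochs, batch_size=batch_size)
        scores.append(model.score(inputs[val_idx], targets[val_idx]))
    return float(np.mean(scores))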
Code Example #7
import datetime
import os
import time

import numpy as np
import tensorflow as tf

# read_data, split_data, batch_iterator, one_hot_encode, confusion_matrix, and RNN
# are project-local helpers assumed to be importable in this module.


def train(
        input_file="clean_train.csv",
        text_col="question_text",
        label_col="target",
        valid_ratio=0.2,
        max_sentence_length=91,
        sample_percent=1,
        class_weights=None,
        cell_type="gru",
        embedding="word2vec",
        embedding_path="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
        embedding_dim=300,
        rnn_layers=3,
        hidden_size=128,
        one_minus_dropout=0.5,
        l2_reg=3.0,
        batch_size=32,
        epochs=5,
        learning_rate=1e-3,
        allow_soft_placement=True,
        log_device_placement=False,
        display_every=10,
        evaluate_every=100,
        checkpoint_every=100,
        num_checkpoints=5):
    # Load and split data
    print("Loading data..")
    X, Y = read_data(input_file,
                     text_col,
                     label_col,
                     sample_percent=sample_percent)

    # Create a vocabulary processor.
    # Its job is to assign each unique word an integer and then replace each word
    # in our sentences with its corresponding integer. These mappings are later
    # used again to substitute each word with its embedding. It also trims or
    # zero-pads each sentence to a fixed length.
    print("Setting up vocabulary..")
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        max_sentence_length)
    X = np.array(list(vocab_processor.fit_transform(X)))
    print("Vocabulary Size: ", len(vocab_processor.vocabulary_))
    num_classes = len(Y[0])

    # Split into train and validation sets
    X, Y, x_val, y_val = split_data(X, Y, valid_ratio)

    # Initialize the TensorFlow session configuration
    print("Initializing tensorflow session..")
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            print("Initializing our RNN:")
            print("\nseq_length : ", X.shape[1], "\nnum_classes : ",
                  Y.shape[1], "\nvocab_size : ",
                  len(vocab_processor.vocabulary_), "\nembedding_size : ",
                  embedding_dim, "\ncell_type : ", cell_type,
                  "\nhidden_size : ", hidden_size, "\nl2 : ", l2_reg,
                  "\nclass_weights :  ", class_weights, "\nbatch_size : ",
                  batch_size, "\nrnn_layers :  ", rnn_layers)
            # Initialize our RNN
            rnn = RNN(seq_length=X.shape[1],
                      num_classes=Y.shape[1],
                      vocab_size=len(vocab_processor.vocabulary_),
                      embedding_size=embedding_dim,
                      cell_type=cell_type,
                      hidden_size=hidden_size,
                      l2=l2_reg,
                      class_weights=class_weights,
                      batch_size=batch_size,
                      rnn_layers=rnn_layers)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            train_op = tf.train.AdamOptimizer(learning_rate).minimize(
                rnn.loss, global_step=global_step)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", rnn.loss)
            acc_summary = tf.summary.scalar("accuracy", rnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Validation summaries
            val_summary_op = tf.summary.merge([loss_summary, acc_summary])
            val_summary_dir = os.path.join(out_dir, "summaries", "val")
            val_summary_writer = tf.summary.FileWriter(val_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")

            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "text_vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Initialize pretrained embeddings if an embedding type was specified
            if embedding:
                # Start from a random uniform matrix; known words are overwritten below
                initW = np.random.uniform(
                    -0.25, 0.25,
                    (len(vocab_processor.vocabulary_), embedding_dim))

                # For GloVe, loading embeddings is straightforward: each line
                # starts with the word, and everything else on the line is the
                # embedding vector for that word
                if "glove" in embedding:
                    with open(embedding_path, "r", encoding="utf8") as f:
                        for line in f:
                            first_word = line.partition(' ')[0]
                            rest = line[line.index(' ') + 1:]
                            # Check whether the word is in our vocabulary
                            idx = vocab_processor.vocabulary_.get(first_word)
                            if idx != 0:
                                # If so, substitute the GloVe embedding for the random one
                                initW[idx] = np.fromstring(rest,
                                                           dtype='float32',
                                                           sep=" ")
                # For word2vec, we are given a binary .bin file
                elif "word2vec" in embedding:
                    with open(embedding_path, "rb") as f:
                        # The first line is a header with the number of records and the size of each record
                        header = f.readline()
                        vocab_size, layer1_size = map(int, header.split())
                        # Number of bytes per record = (size of a float32) * record size
                        binary_len = np.dtype('float32').itemsize * layer1_size
                        # For each record:
                        for line in range(vocab_size):
                            word = []
                            while True:
                                # Keep reading one character at a time
                                ch = f.read(1).decode('latin-1')
                                if ch == ' ':
                                    # A space means the word is complete
                                    word = ''.join(word)
                                    break
                                if ch != '\n':
                                    word.append(ch)
                            # Look the word up in our vocabulary
                            idx = vocab_processor.vocabulary_.get(word)
                            if idx != 0:
                                # If found, substitute the corresponding embedding vector for the random one
                                initW[idx] = np.fromstring(f.read(binary_len),
                                                           dtype='float32')
                            else:
                                f.read(binary_len)

                sess.run(rnn.W_text.assign(initW))
                print("Successful to load ", embedding, "!\n")

            # Once we are done with the embeddings and basic tensorflow settings
            # We now start with actual training routine

            # Generate batches
            itr = batch_iterator(X, Y, batch_size, epochs)
            # For each batch
            for x_batch, y_batch, start, end in itr:
                # Train
                feed_dict = {
                    rnn.input_text: x_batch,
                    rnn.input_label: y_batch,
                    rnn.keep_prob: one_minus_dropout
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, rnn.loss,
                    rnn.accuracy
                ], feed_dict)
                train_summary_writer.add_summary(summaries, step)

                # Training log display
                if step % display_every == 0:
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: step {}, loss {:g}, acc {:g}".format(
                        time_str, step, loss, accuracy))

                # Evaluation
                if step % evaluate_every == 0:
                    print("\nEvaluation:")
                    total_preds = np.zeros(y_val.shape)
                    itr2 = batch_iterator(x_val,
                                          y_val,
                                          batch_size,
                                          1,
                                          shuffle=False)
                    avg_acc = 0
                    avg_loss = 0
                    steps = 0
                    for x_eval_batch, y_eval_batch, s, e in itr2:
                        feed_dict_val = {
                            rnn.input_text: x_eval_batch,
                            rnn.input_label: y_eval_batch,
                            rnn.keep_prob: 1.0
                        }
                        summaries_val, loss, accuracy, preds = sess.run([
                            val_summary_op, rnn.loss, rnn.accuracy,
                            rnn.predictions
                        ], feed_dict_val)
                        val_summary_writer.add_summary(summaries_val, step)
                        k = np.array([
                            one_hot_encode(num_classes, label)
                            for label in preds
                        ])
                        avg_acc += accuracy
                        avg_loss += loss
                        steps += 1
                        total_preds[s:e] = k
                    cf, f_score = confusion_matrix(y_val, total_preds, num_classes)
                    avg_acc /= steps
                    avg_loss /= steps
                    time_str = datetime.datetime.now().isoformat()
                    print("{}: loss {:g}, acc {:g}, fscore {:g}\n".format(
                        time_str, avg_loss, avg_acc, f_score))
                    print("Confusion Matrix")
                    print(cf)
                # Model checkpoint
                if step % checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=step)
                    print("Saved model checkpoint to {}\n".format(path))
Code Example #8
def evaluate(X,
             colname,
             batch_size,
             checkpoint_dir,
             labels=None,
             allow_soft_placement=True,
             log_device_placement=False):
    text_path = os.path.join(checkpoint_dir, "..", "text_vocab")
    text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
        text_path)
    X = [str(x) for x in X]
    x_eval = np.array(list(text_vocab_processor.transform(X)))
    if labels is not None:
        classes = len(labels[0])
        y_eval = np.argmax(labels, axis=1)
    else:
        y_eval = None
        classes = None

    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=allow_soft_placement,
            log_device_placement=log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_text = graph.get_operation_by_name("input_text").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/logits").outputs[0]
            # Generate batches for one epoch
            iterator = batch_iterator(x_eval,
                                      y_eval,
                                      batch_size,
                                      1,
                                      shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for item in iterator:
                x = item[0]
                batch_predictions = sess.run(predictions, {
                    input_text: x,
                    dropout_keep_prob: 1.0
                })
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

            all_predictions = [
                one_hot_encode(classes, int(pred)) for pred in all_predictions
            ]
            print("predictions\n", all_predictions)
            if labels is not None:
                c, f = confusion_matrix(labels, all_predictions, classes)
                print("fscore ", f)
                print("confusion_matrix:")
                print(c)
                return all_predictions, c, f
            return all_predictions
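
The evaluation paths in Code Examples #7 and #8 also assume one_hot_encode(num_classes, label) and confusion_matrix(y_true, y_pred, num_classes) helpers. A minimal sketch, assuming labels and predictions are one-hot rows and that the reported F-score is the F1 of class 1 (which matches the two-class call in train()):

import numpy as np


def one_hot_encode(num_classes, label):
    """Return a one-hot vector of length num_classes for an integer label (sketch)."""
    vec = np.zeros(num_classes)
    vec[int(label)] = 1.0
    return vec


def confusion_matrix(y_true, y_pred, num_classes):
    """Confusion matrix plus the F1 score of class 1, computed from one-hot rows (sketch)."""
    t = np.argmax(np.asarray(y_true), axis=1)
    p = np.argmax(np.asarray(y_pred), axis=1)
    cm = np.zeros((num_classes, num_classes), dtype=int)
    for ti, pi in zip(t, p):
        cm[ti, pi] += 1
    tp = cm[1, 1]
    fp = cm[:, 1].sum() - tp
    fn = cm[1, :].sum() - tp
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return cm, f1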