Example #1
def main():
    file_path = 'spam.csv'
    data = load_data(file_path)
    preprocess_data(data)

    X_pyth = []
    y_pyth = []
    for d in data:
        text = d['text']
        entry = (
            features.currency_count(text),
            features.url_count(text),
            features.word_count(text),
            features.longest_numerical_string(text),
            features.average_word_length(text),
            features.num_win_occurences(text),
            features.num_free_occurences(text)
        )

        X_pyth.append(entry)

        if d['category'] == 'spam':
            y_pyth.append(0)
        else:
            y_pyth.append(1)

    X = np.array(X_pyth)
    y = np.array(y_pyth)


    # Randomly shuffle data
    p = np.random.permutation(len(y_pyth))
    X = X[p]
    y = y[p]

    # Split into training and testing datasets
    X_train = X[0:4000]
    y_train = y[0:4000]

    X_test = X[4000:]
    y_test = y[4000:]

    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)

    prediction = model.predict(X_test)
    accuracy = accuracy_score(y_test, prediction)

    matrix = confusion_matrix(y_test, prediction)
    binary = np.array(matrix)

    fig, ax = plot_confusion_matrix(conf_mat=binary)
    plt.show()

    print(accuracy)
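Note: Example #1 assumes a separate `features` module that is not shown here. A minimal sketch of what a few of those helpers might look like (hypothetical implementations, not the original code):

import re

def currency_count(text):
    # Hypothetical: count currency symbols in the message
    return len(re.findall(r'[$£€]', text))

def url_count(text):
    # Hypothetical: count http(s) URLs in the message
    return len(re.findall(r'https?://\S+', text))

def average_word_length(text):
    # Hypothetical: mean word length, 0 for an empty message
    words = text.split()
    return sum(len(w) for w in words) / len(words) if words else 0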
Example #2
def main():
    file_path = 'spam.csv'
    data = load_data(file_path)
    preprocess_data(data)

    spams = []
    hams = []
    for d in data:
        if d['category'] == 'spam':
            spams.append(d['text'])
        else:
            hams.append(d['text'])

    plot_function_to_test('Number of "Free" Occurrences', hams, spams)
Example #3
def logistic_regression():
    """build logistic regressor to predict survival label"""
    df, y, X = preprocessing.preprocess_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)
    return y_test, y_pred
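logistic_regression() returns the held-out labels and predictions rather than a score. A short usage sketch, assuming scikit-learn's accuracy_score (not part of the original code):

from sklearn.metrics import accuracy_score

y_test, y_pred = logistic_regression()
print("Accuracy:", accuracy_score(y_test, y_pred))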
Example #4
def runSerial(fname, box_number, box_port, parameters):
    ##    fname = file location
    ##    box = which box you are using, e.g. 2 or 4
    ##    highprob = e.g. 45 or 90
    ##    flippinggamma = e.g. 15
    ##    rewardsize = 2 or 4
    ##    stimulation = 0 or 1
    ##    protocol = highprob, flippinggamma, rewardsize, stimulation
    ##    parameters = protocol with the box number appended at the end

    #print(fname)
    baud = '115200'  #communication baud rate; must match the Arduino sketch
    fmode = 'ab'  #open the raw-data txt file for appending in binary mode

    port = serial.Serial(box_port, baud)

    # open(fname, fmode): the file Python uses to log the Arduino's messages
    outf = open(fname, fmode)

    #open port serial.Serial(addr,baud):
    if not port.isOpen():
        port.open()

    print("WAIT FOR IT!!!!\n\n")

    time.sleep(2.5)

    #send parameters to start script:
    for parameter in parameters:
        time.sleep(0.5)
        signal = str(chr(int(parameter))).encode('ascii')
        port.write(signal)
    global stopper  #list of per-box Boolean flags;
    #when a flag becomes True, we stop writing the txt file and close the Arduino connection
    stopper[box_number] = False
    #Main loop: as long as the stopper flag is False, keep writing the Arduino's messages
    while port.isOpen() and not stopper[box_number]:
        if port.inWaiting() > 0:
            try:
                x = port.readline()
                if b'-666' in x:
                    print("All is well in box %d!!!\n" % box_number)
                outf.write(x)
                outf.flush()

            except Exception:
                print("Error in box %d!!!\n" % box_number)
                outf.write(b"Error")  #the file is opened in binary mode, so write bytes
                outf.flush()
    #When stopper is True, Python exits the main loop and proceeds to preprocessing
    # Call preprocessing module to get csv version of dataframe
    dataframe = preprocessing.preprocess_data(
        fname)  #fname is the txt file where we are writing
    csv_fname = fname[:-4] + '.csv'  #change extension to create the name for the csv
    dataframe.to_csv(csv_fname)  #save the preprocessed data in a csv file
    print("I'm done in box %d!!!\n" % box_number)
Example #5
    def predict(self, X_test, verbose=0):

        if not self._ensemble:
            print("You must train the net first")
            return

        X_test, _, _ = preprocess_data(X_test, [],
                                       self._models[0]._nb_classes,
                                       img_rows=self._models[0]._img_rows,
                                       img_cols=self._models[0]._img_cols,
                                       verbose=verbose)
        return self._ensemble.predict_classes([np.asarray(X_test)] *
                                              len(self._models))
Example #6
def render_visual(input):
    workweek,weekend = preprocess_data(DATA)

    if input == 'workweek':
        data = workweek
    else:
        data = weekend
    fig = px.scatter_mapbox(data,lat='lat',lon='long',
                        color='Proportion Of Bikes Available',animation_frame='Hour',
                        hover_name='name',color_continuous_scale=px.colors.sequential.Blues,
                        zoom=12)

    fig.update_traces(marker=dict(size=15),
                      selector=dict(mode='markers'))
    return fig
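render_visual() reads like the body of a Plotly Dash callback. A hypothetical wiring sketch, assuming Dash 2.x and a 'workweek'/'weekend' dropdown (not the original app code):

from dash import Dash, dcc, html
from dash.dependencies import Input, Output

app = Dash(__name__)
app.layout = html.Div([
    dcc.Dropdown(id='period',
                 options=[{'label': s, 'value': s} for s in ['workweek', 'weekend']],
                 value='workweek'),
    dcc.Graph(id='bike-map'),
])

@app.callback(Output('bike-map', 'figure'), Input('period', 'value'))
def update_map(period):
    # Delegate to render_visual from Example #6
    return render_visual(period)

# app.run_server(debug=True)  # start the dev server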
Example #7
    def evaluate(self, X_test, y_test, verbose=0):

        X_test, y_test, _ = preprocess_data(X_test,
                                            y_test,
                                            self._models[0]._nb_classes,
                                            img_rows=self._models[0]._img_rows,
                                            img_cols=self._models[0]._img_cols,
                                            verbose=verbose)

        print('Evaluating ensemble')

        score = self._ensemble.evaluate([np.asarray(X_test)] *
                                        len(self._models),
                                        y_test,
                                        verbose=verbose)

        print('Test accuracy:', score[1] * 100, '%')
        print('Test error:', (1 - score[2]) * 100, '%')
Example #8
def create_model(file_path=FINAL_MLKNN_MODEL_FILE_PATH):
    """
    Creates and trains an MLkNN classifier using the optimized parameters found
    Saves this trained model to disk

    :param string file_path: specifies where the model should be saved
    :return: a trained sklearn MLkNN classifier
    """

    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']

    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(
        question_data, music_data)
    clf = MLkNN(k=hyperparameters['k'], s=hyperparameters['s'])
    clf.fit(question_data.values, music_data.values)
    pickle.dump(clf, open(file_path, 'wb'))
    return clf
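A hypothetical usage sketch for the model saved above, reusing the same loading and preprocessing calls as the example (not part of the original code):

import pickle

with open(FINAL_MLKNN_MODEL_FILE_PATH, 'rb') as f:
    clf = pickle.load(f)

question_data, music_data = preprocessing.load_data()
question_data, music_data = preprocessing.preprocess_data(question_data, music_data)
predictions = clf.predict(question_data.values)  # MLkNN returns a sparse binary label matrix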
Example #9
def create_model(file_path=FINAL_XGBOOST_MODEL_FILE_PATH):
    """
    Creates and trains a OneVsRestClassifier(XGBClassifier()) using the optimized parameters found
    Saves this trained model to disk

    :param string file_path: specifies where the model should be saved
    :return: a trained OneVsRestClassifier
    """

    with open(OPTIMIZED_MODEL_PARAMETERS_FILE_PATH) as file:
        hyperparameters = json.load(file)['hyperparameters']

    question_data, music_data = preprocessing.load_data()
    question_data, music_data = preprocessing.preprocess_data(
        question_data, music_data)
    xgb_model = XGBClassifier(**hyperparameters)
    xgb_clf = OneVsRestClassifier(xgb_model, n_jobs=-1)
    xgb_clf.fit(question_data, music_data)
    pickle.dump(xgb_clf, open(file_path, 'wb'))
    return xgb_clf
Example #10
    def load_data(self, debug=False):
        """Loads starter word-vectors and train/dev/test-split the data."""

        # Load the training set
        X, y, self.word_to_num, self.tag_to_num = preprocess_data(
            dir_path='NKJP_1.2_nltk_POS')

        self.num_to_word = invert_dict(self.word_to_num)
        self.num_to_tag = invert_dict(self.tag_to_num)
        self.tagset_size = len(self.tag_to_num)

        self.X_train, self.X_dev, self.y_train, self.y_dev = train_test_split(
            X, y, test_size=0.2)
        # A hacky way to get a 3-way split out of a 2-way splitting function
        self.X_dev, self.X_test, self.y_dev, self.y_test = train_test_split(
            self.X_dev, self.y_dev, test_size=0.5)

        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]
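The two chained train_test_split calls above yield roughly an 80/10/10 train/dev/test split. A standalone sketch of the same idea (illustrative only, with dummy data):

from sklearn.model_selection import train_test_split

X, y = list(range(100)), list(range(100))
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.2)
X_dev, X_test, y_dev, y_test = train_test_split(X_rest, y_rest, test_size=0.5)
print(len(X_train), len(X_dev), len(X_test))  # 80 10 10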
Example #11
import feature_engineering as feat
from preprocessing import preprocess_data
from decision_function import validate_model, train_model

# MAIN PROGRAM ***************************************************************
SEPERATOR = '=========================================='

acc_knn, acc_svm, acc_dt, acc_df, acc_mlp = 0, 0, 0, 0, 0
ppv_knn, ppv_svm, ppv_dt, ppv_rf, ppv_mlp = 0, 0, 0, 0, 0

# LOAD THE DATA **************************************************************
ticker = input("Enter a ticker symbol: ")
data = csv_to_df(ticker)

# PREPROCESSING & FEATURE ENGINEERING ****************************************
preprocess_data(data)
feat.moving_average(data, 10, 'Close')
feat.moving_average(data, 30, 'Close')
feat.moving_average(data, 20, 'Volume')

# DETERMINING THE BEST DECISION FUNCTION *************************************

# KNN Classifier -------------------------------------------------------------
print(SEPERATOR)
print('KNN Classifier:')
# Test the model
acc_knn, ppv_knn, k = validate_model(data, 'KNN')
# Store the model for making predictions
knn_model = train_model(data, 'KNN', k)
# Identify the Test Population
X = data.iloc[0:, 1:-2]
Example #12
from data.utils import load_data
from preprocessing import preprocess_data
from visualization import plot_learning_curves, get_errors_input
from metrics import custom_map_at_k
from feature_selection import get_features_extractor
from data_augmentation import augment_data

print('Augmenting training data set')
augment_data('train.csv', 'train_augmented.csv')

print('Loading training and testing set')
train_data = load_data('train_augmented.csv')
test_data = load_data('test.csv')

print('Preprocessing')
X_train, Y_train = preprocess_data(train_data)
X_test, Y_test = preprocess_data(test_data)

model_name = 'lr'

# print('Loading model')
# model = joblib.load('./models/' + model_name + '_classifier.pkl')
print('Fitting model')
model = Pipeline([
	('features', get_features_extractor()),
	('LogisticRegression', LogisticRegression())
])
model.fit(X_train, Y_train)
print('Saving model')
joblib.dump(model, './models/' + model_name + '_classifier.pkl')
Example #13
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--path", help="address of file", type=str)

    parser.add_argument("--batch_size",
                        help="batch_size",
                        type=int,
                        default=12)
    parser.add_argument("--embedding_size",
                        help="dimension of vectors",
                        default=300,
                        type=int)
    parser.add_argument("--lr", type=float, help="learning rate", default=1e-5)
    parser.add_argument("--decay", help="L2 loss", type=float, default=1e-2)
    parser.add_argument("--iterator",
                        type=int,
                        help="number of iteration",
                        default=10)

    args = parser.parse_args()

    data = load_pickle(args.path)

    context = data["context"]
    question = data["question"]
    answer = data["answer"]

    cxt = []
    query = []
    ans = []

    for c, q, a in zip(context, question, answer):
        cxt.append(c.lower())
        query.append(q.lower())
        ans.append(a.lower())

    cxt = tokenize(cxt)
    query = tokenize(query)
    ans = tokenize(ans)

    word2idx, idx2word = make_dictionary(cxt, query, ans)

    query_ix = convert2idx(query, word2idx)
    context_ix = convert2idx(cxt, word2idx)
    answer_ix = convert2idx(ans, word2idx)

    ##preprocess data
    q_data, c_data, a_data, start_index, end_index = preprocess_data(
        query_ix, context_ix, answer_ix)

    train_data = makeBatch(q_data, c_data, start_index, end_index)

    train_loader = DataLoader(train_data,
                              collate_fn=pad_sequence,
                              batch_size=args.batch_size)
    ################################################################################################

    ## train
    model = BIDAF(
        embedder=WordEmbedder(args.embedding_size, len(word2idx)),
        encoder=Encoder(args.embedding_size, args.embedding_size),
        attention_flow=AttentionFlow(),
        modeling_layer=ModelingLayer(d_vector=args.embedding_size,
                                     bidirectional=True),
        output_layer=OutputLayer(d_vector=args.embedding_size)).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.decay)

    train(model, args.iterator, optimizer, criterion, train_loader)
Example #14
def mainFunc(argv):
    def printUsage():
        print('main.py -n <num_cores> -x <experiment>')
        print(
            'num_cores = Number of cores requested from the cluster. Set to -1 to leave unset'
        )
        print(
            'experiment = experiment setup that should be executed. Set to A, B or C'
        )

    num_cores = -1
    num_epochs = NUM_EPOCHS
    experiment = ""
    # Command line argument handling
    try:
        opts, args = getopt.getopt(argv, "n:x:", ["num_cores=", "experiment="])
    except getopt.GetoptError:
        printUsage()
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            printUsage()
            sys.exit()
        elif opt in ("-n", "--num_cores"):
            num_cores = int(arg)
        elif opt in ("-x", "--experiment"):
            if arg in ("A", "B", "C"):
                experiment = arg
            else:
                printUsage()
                sys.exit(2)

    print("Executing experiment {} with {} CPU cores".format(
        experiment, num_cores))
    if num_cores != -1:
        # We set the op_parallelism_threads in the ConfigProto and pass it to the TensorFlow session
        configProto = tf.ConfigProto(log_device_placement=False,
                                     inter_op_parallelism_threads=num_cores,
                                     intra_op_parallelism_threads=num_cores)
    else:
        configProto = tf.ConfigProto(log_device_placement=False)

    print("Building graph")
    graph = None
    # Experiment C required double the default hidden state size
    state_size = CELL_SIZE
    if experiment == "C":
        state_size = 2 * CELL_SIZE

    graph = build_training_graph(state_size=state_size,
                                 downproject_cellsize=CELL_SIZE)

    sentences, index_2_word, word_2_index, _ = preprocessing.preprocess_data(
        TRAINING_DATA_PATH,
        max_sentence_length=MAX_SENTENCE_LENGTH,
        vocabulary_size=VOCABULARY_SIZE)
    sentences_array = np.array(sentences)
    print("Sentences shape is {}".format(sentences_array.shape))

    print("Training network")
    # Use word2vec only for experiment B and C
    useWord2Vec = False
    if experiment in ("B", "C"):
        useWord2Vec = True
    t = time.time()
    train_network(
        graph,
        sentences_array,
        checkpoint_filename="exp{}".format(experiment),
        num_epochs=num_epochs,
        configProto=configProto,
        state_size=state_size,
        vocabulary=word_2_index,  # used in load_embeddings method
        useWord2Vec=useWord2Vec)  # if True, uses word2vec embedding

    print("It took {} seconds to train for {} epochs.".format(
        time.time() - t, num_epochs))
Example #15
# -*- coding: utf-8 -*-
"""
Created on Fri May  5 14:06:14 2017

@author: Serotonin
"""
import preprocessing

fname = 'C:\\Users\\Serotonin\\Google Drive\\Flipping\\run_task_photo\\raw_data\\DN3_170818a.txt'
dataframe = preprocessing.preprocess_data(fname)
csv_fname = fname[:-4] + '.csv'
dataframe.to_csv(csv_fname)
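The fname[:-4] slice assumes a four-character '.txt' suffix. An extension-agnostic variant (a small sketch, not the original code) would use os.path.splitext:

import os

csv_fname = os.path.splitext(fname)[0] + '.csv'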
Example #16
                                          solver="saga",
                                          n_jobs=-1),
                       parameters,
                       scoring="accuracy",
                       n_jobs=-1,
                       cv=5)
else:
    print("classifier should be svm or logreg")

print("loading data...")
train_tweets, train_labels = load_data()
test_tweets = load_test()

if args.prepro:
    print("preprocess data...")
    test_prepro, train_prepro = preprocess_data(test_tweets, train_tweets)
else:
    test_prepro, train_prepro = test_tweets, train_tweets

print(f"load {args.embedding} embeddings")
try:
    embeddings = load_pickle(file)
except FileNotFoundError:
    print(
        "The specified embedding cannot be found, run build_embeddings.py first"
    )
    exit()

print("embedd data...")
test_embedded, train_embedded = embed_data(test_prepro, train_prepro,
                                           embeddings)
Example #17
def main():
    parser = create_parser()
    args = parser.parse_args()

    if args.setup:
        create_directories()

    if args.debug:
        dataset = DATASETS['debug']
        args.dataset = "debug"
        features, _, labels, _ = preprocess_data(args.patch_size,
                                                 args.distribution,
                                                 dataset=dataset)
        #print(features, 'debug')
        #print("length of features: ",type(features), len(features),'element.shape: ',features[0][0])
        features_train, features_test = features[:100], features[100:120]
        labels_train, labels_test = labels[:100], labels[100:120]
    elif args.train_model or args.evaluate_model or args.preprocess_data:
        dataset = DATASETS[args.dataset]
        #print(dataset.values())
        load_from_cache = not args.preprocess_data
        try:
            features_train, features_test, labels_train, labels_test = preprocess_data(
                args.patch_size,
                args.distribution,
                dataset=dataset,
                only_cache=load_from_cache)
            #print(features_train, 'train_model or evaluate_model or preprocess_data')
            print("Length of features_train: ", len(features_train))
        except IOError:
            print("Cache file does not exist. Please run again with -p flag.")
            sys.exit(1)

        if args.visualise:
            visualise_labels(labels_train, args.patch_size, LABELS_DIR)
            visualise_labels(labels_test, args.patch_size, LABELS_DIR)

    if not args.model_id:
        timestamp = time.strftime("%d_%m_%Y_%H%M")
        model_id = "{}_{}_{}".format(timestamp, args.dataset,
                                     args.architecture)
    else:
        model_id = args.model_id

    if args.init_model or args.train_model or args.evaluate_model:
        model_dir = os.path.join(OUTPUT_DIR, model_id)
        save_makedirs(model_dir)

    # Hyperparameters for the model. Since there are so many of them it is
    # more convenient to set them in the source code as opposed to passing
    # them as arguments to the Command Line Interface. We use a list of tuples instead of a
    # dict since we want to print the hyperparameters and for that purpose
    # keep them in the predefined order.
    hyperparameters = [
        ("architecture", args.architecture),
        # Hyperparameters for the first convolutional layer.
        ("nb_filters_1", 64),
        ("filter_size_1", 9),
        ("stride_1", (2, 2)),
        # Hyperparameter for the first pooling layer.
        ("pool_size_1", (2, 2)),
        # Hyperparameter for the second convolutional layer (when
        # two layer architecture is used).
        ("nb_filters_2", 128),
        ("filter_size_2", 5),
        ("stride_2", (1, 1)),
        # Hyperparameters for Stochastic Gradient Descent.
        ("learning_rate", 0.05),
        ("momentum", 0.9),
        ("decay", 0.0)
    ]

    hyperparameters_mnih = [
        ("architecture", args.architecture),
        # Hyperparameters for the first convolutional layer.
        ("nb_filters_1", 64),
        ("filter_size_1", 16),
        ("stride_1", (4, 4)),
        # Hyperparameter for the first pooling layer.
        ("pool_size_1", (2, 2)),
        ("pool_stride", 1),
        # Hyperparameter for the second convolutional layer.
        ("nb_filters_2", 112),
        ("filter_size_2", 4),
        ("stride_2", (1, 1)),
        # Hyperparameter for the third convolutional layer.
        ("nb_filters_3", 80),
        ("filter_size_3", 3),
        ("stride_3", (1, 1)),

        # Hyperparameters for Stochastic Gradient Descent.
        ("learning_rate", 0.05),
        ("momentum", 0.9),
        ("decay", 0.0)
    ]

    if args.init_model:
        model = init_model(args.patch_size, model_id,
                           **dict(hyperparameters_mnih))
        save_model_summary(hyperparameters_mnih, model, model_dir)
    elif args.train_model or args.evaluate_model:
        hyperparameters = dict(hyperparameters_mnih)
        model = load_model(model_id)
        model = compile_model(model, hyperparameters["learning_rate"],
                              hyperparameters['momentum'],
                              hyperparameters["decay"])

    if args.train_model:
        model = train_model(model,
                            features_train,
                            labels_train,
                            args.patch_size,
                            model_id,
                            model_dir,
                            nb_epoch=args.epochs,
                            checkpoints=args.checkpoints,
                            tensorboard=args.tensorboard,
                            earlystop=args.earlystop)

    if args.evaluate_model:
        evaluate_model(model,
                       features_test,
                       labels_test,
                       args.patch_size,
                       model_dir,
                       out_format=args.out_format)
Example #18
outlet_dim_output = (base_path / "../data/destination/outlet_data.csv")
test_output = (base_path / "../data/algo_files/test.csv")

df_raw = pd.read_csv(file_input)
print(df_raw.head())

# create connection

if __name__ == "__main__":

    #####################################
    #       staging/transform area      #
    #####################################

    # preprocess the data
    df_preprocessed = preprocessing.preprocess_data(df_raw)

    #####################################
    #        transform & load           #
    #####################################

    # Create star schema and load to destination
    star_schema.create_star_schema(
        df_preprocessed, **{
            "fact_output": fact_output,
            "item_dim_output": item_dim_output,
            "outlet_dim_output": outlet_dim_output
        })

    #####################################
    #         Analysis                  #
Example #19
            train_data = pd.read_csv(path_train,
                                     sep='|',
                                     usecols=['orth',
                                              'translation']).values.tolist()
            val_data = pd.read_csv(path_dev,
                                   sep='|',
                                   usecols=['orth',
                                            'translation']).values.tolist()
            test_data = pd.read_csv(path_test,
                                    sep='|',
                                    usecols=['orth',
                                             'translation']).values.tolist()

            # training data
            src_lang, trg_lang, pairs_train, len_train, trainDF = preprocess_data(
                train_data, lang1, lang2, 'train', False)

            w2i_in = src_lang.word2index
            i2w_in = src_lang.index2word
            w2c_in = src_lang.word2count
            w2i_out = trg_lang.word2index
            i2w_out = trg_lang.index2word
            w2c_out = trg_lang.word2count

            unk_value = False
            pairs_train, src_lang, trg_lang = fix_vocabulary(pairs_train,
                                                             src_lang,
                                                             trg_lang,
                                                             'train',
                                                             unk=unk_value)
Example #20
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("-path", help="address of file", type=str)

    parser.add_argument("-batch_size", help="batch_size", type=int, default=12)
    parser.add_argument("-embedding_size",
                        help="dimension of vectors",
                        default=300,
                        type=int)
    parser.add_argument("-lr", type=float, help="learning rate", default=1e-5)
    parser.add_argument("-decay", help="L2 loss", type=float, default=1e-4)
    parser.add_argument("-iterator",
                        type=int,
                        help="number of iteration",
                        default=10)
    parser.add_argument("-num_iters",
                        type=int,
                        help="decoder iteration",
                        default=4)
    args = parser.parse_args()

    data = load_pickle(args.path)

    context = data["context"][:100]
    question = data["question"][:100]
    answer = data["answer"][:100]

    cxt = []
    query = []
    ans = []

    for c, q, a in zip(context, question, answer):
        cxt.append(c.lower())
        query.append(q.lower())
        ans.append(a.lower())

    cxt = tokenize(cxt)
    query = tokenize(query)
    ans = tokenize(ans)

    word2idx, idx2word = make_dictionary(cxt, query, ans)

    query_ix = convert2idx(query, word2idx)
    context_ix = convert2idx(cxt, word2idx)
    answer_ix = convert2idx(ans, word2idx)

    ##preprocess data
    q_data, c_data, a_data, start_index, end_index = preprocess_data(
        query_ix, context_ix, answer_ix)

    train_data = makeBatch(q_data, c_data, start_index, end_index)

    train_loader = DataLoader(train_data,
                              collate_fn=pad_sequence,
                              batch_size=args.batch_size)
    ################################################################################################

    ## train
    dynamicN = DynamicCN(d_model=args.embedding_size,
                         vocab_size=len(word2idx),
                         iters=args.num_iters)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(dynamicN.parameters())

    train(model=dynamicN,
          iterator=2,
          optimizer=optimizer,
          criterion=criterion,
          train_loader=train_loader)
Example #21
# Load Hyperparameters
epochs = params['epochs']
batch_size = params['batch_size']
rnn_size = params['rnn_size']
num_layers = params['num_layers']
encoding_embedding_size = params['encoding_embedding_size']
decoding_embedding_size = params['decoding_embedding_size']
learning_rate = params['learning_rate']
learning_rate_decay = params['learning_rate_decay']
min_learning_rate = params['min_learning_rate']
keep_probability = params['keep_probability']

# Preprocess data, get the vocabularies
questions, answers = get_data()
sorted_questions, sorted_answers, questionswords2int, answerswords2int = preprocess_data(
    questions, answers)

# Splitting the questions and answers into training and validation sets
training_validation_split = int(len(sorted_questions) * 0.15)
training_questions = sorted_questions[training_validation_split:]
training_answers = sorted_answers[training_validation_split:]
validation_questions = sorted_questions[:training_validation_split]
validation_answers = sorted_answers[:training_validation_split]

# Training
batch_index_check_validation_loss = (
    (len(training_questions)) // batch_size // 2) - 1
total_training_loss_error = 0
list_validation_loss_error = []
early_stopping_check = 0
Example #22
from pandas import read_csv
from preprocessing import preprocess_data, balance_data
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from tpot import TPOTClassifier

X_train = read_csv('input/aps_failure_training_set.csv',na_values='na')
X_test = read_csv('input/aps_failure_test_set.csv',na_values='na')

# deal with missing values and constant features and normalize
X_train, X_test, y_train, y_test = preprocess_data(X_train, X_test)
print(f'Data loaded: {len(X_train)} training observations, {len(X_test)} testing observations')

X_train, y_train = balance_data(X_train, y_train, n_samples = 2500)
print(f'Balanced training data ({2500/1000}/1): {len(X_train)} training observations, {len(X_test)} testing observations')

# A custom scorer function is created to reflect the asymmetric cost of misclassification (a false negative costs far more than a false positive)
def scania_scorer(y_true,y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()  
    total_cost = 10*fp + 500*fn
    return total_cost

custom_scania_scorer = make_scorer(scania_scorer, greater_is_better=False)

tpot = TPOTClassifier(generations=100, population_size=100, verbosity=3, random_state=42, use_dask=True, n_jobs=-1, memory='auto', early_stop=10, scoring=custom_scania_scorer)
tpot.fit(X_train, y_train)
y_pred = tpot.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Total cost: " + str(scania_scorer(y_test, y_pred)))
Example #23
#This file is for running preprocessing only

import logging
import sys
from preprocessing import preprocess_data
import yaml

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging.getLogger()

#paths
video_root_path = '/content/drive/MyDrive/Grad Project/data/UCSD'
dataset = 'UCSDped1'

#time_length
with open('config.yml', 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
    t = cfg['time_length']
#run preprocessing
preprocess_data(logger, dataset, t, video_root_path)
Example #24
    intent = file.read().strip().split("\n")
intent_dict = {}
for i, word in enumerate(intent):
    intent_dict[word] = i


# read data from datafile
df = pd.read_csv("datafile.csv", header=0, delimiter="\t", quoting=3)


# load word2vec model
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)


# preprocess data_X
data_x = preprocess_data(df,model)
print("*************")


# onehot encode data_y
data_y = np.array(df["intent"])
for i, word in enumerate(data_y):
    data_y[i] = intent_dict[word]
data_y = np.array(data_y, dtype=np.int8)
nb_classes = len(intent_dict)
data_y = np.eye(nb_classes)[data_y]


# split into train and test
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y, test_size=0.3, random_state=42)
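The preprocess_data(df, model) helper is not shown. A common approach in this kind of word2vec setup is to average the vectors of the words in each sentence; a hypothetical sketch, assuming a text column name (the real column is not visible in the excerpt):

import numpy as np

def preprocess_data(df, model, text_column="text"):
    # Hypothetical: represent each sentence as the mean of its word2vec vectors
    vectors = []
    for sentence in df[text_column].astype(str):
        words = [w for w in sentence.lower().split() if w in model]
        if words:
            vectors.append(np.mean([model[w] for w in words], axis=0))
        else:
            vectors.append(np.zeros(model.vector_size))
    return np.array(vectors)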
Example #25
from learning import QLearning
from preprocessing import preprocess_data

data = preprocess_data()

learner = QLearning(data)
learner.learn()
Example #26
from preprocessing import preprocess_data
from model import define_discriminator, define_generator, define_gan
from training import train
from evaluation import evaluate, get_fsl_metrics, resp_vec_correlation, plot_corr
from util.tf_session import setup_tf_session


#%% Setup data and models
# Setup the tf session for possible gpu usage
setup_tf_session()

dataDir = "data"

# Preprocess data
print("Step 0: Preprocessing data...\t", end="", flush=True)
preprocess_data(dataDir)
print("Completed!\n")

# Load data
print("Step 1: Loading and extracting data...\n")

print("Dataset - TRAIN")
dataset_train, train_subjects = data_prep(os.path.join(dataDir, "preprocessed"), True, "train")
print("Dataset - TEST")
dataset_test, test_subjects = data_prep(os.path.join(dataDir, "preprocessed"), True, "test")

image_shape = dataset_train[0].shape[1:]
image_shape = (image_shape[0], image_shape[1], 1)

print("Completed data loading!\n")
Example #27
# Load features
x_tr, x_te, y_tr = load_data(
    features_folder=features_folder,
    data_folder=data_folder
)


# Pre-processing
if use_preprocessing:
    preprocessing_steps = [LowVarianceFeaturesRemover(), CenterScaler()]
else:
    preprocessing_steps = None
x_tr, x_te, groups_tr, _ = preprocess_data(
    x_tr,
    x_te,
    preprocessing_steps=preprocessing_steps
)


# Classification
clf = classify(
    est=est_list[est_name],
    x_tr=x_tr.values,
    y_tr=y_tr.values.ravel(),
    groups_tr=groups_tr.values,
    x_te=x_te.values,
    test_index=x_te.index,
    perform_evaluation=perform_evaluation,
    perform_cross_validation=perform_cross_validation,
    cv_params=cv_params[est_name],
Example #28
                                 '/',
                                 data_folder=data_folder)
    est_list[est_name].set_params(metric='precomputed')
elif est_name[:3] == 'CDF':
    x_tr, x_te, y_tr = load_data(features_folder=cdf_folder,
                                 data_folder=data_folder)
else:
    x_tr, x_te, y_tr = load_data(features_folder=isi_folder,
                                 data_folder=data_folder)

# Pre-process
preprocessing_steps = []
resampling_steps = [RandomUnderSampler()]
x_tr, x_te, groups_tr, y_tr = preprocess_data(
    x_tr,
    x_te,
    y_tr=y_tr,
    preprocessing_steps=preprocessing_steps,
    resampling_steps=resampling_steps)

# Pre-sort the values to speed-up distance computation
if not use_precomputed:
    if est_name in ['KS']:
        x_tr.iloc[:, :] = np.sort(x_tr.values, axis=1)
        x_te.iloc[:, :] = np.sort(x_te.values, axis=1)

# Classification
clf = classify(est=est_list[est_name],
               x_tr=x_tr.values,
               y_tr=y_tr.values.ravel(),
               groups_tr=groups_tr.values,
               x_te=x_te.values,
Example #29
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 29 10:12:01 2019

@author: tothp
"""

import ml
import numpy as np
import preprocessing

# prepare frequency data
data = preprocessing.preprocess_data("C:\\projects\\gwas")
frequencies = data["genotype"].count_alleles(data["allele_names"],
                                             standardize_alleles=True)
x = frequencies.loc[:, frequencies.columns != "UME_name"].to_numpy()

#%% plot spread (standard deviation) along positions
var = np.std(data["genotype"], axis=0)

# train frequency network

# prepare snp data

# train snp network
Example #30
    else:
        ax2.annotate('READY', xy=(0.1, 0.5))


if __name__ == '__main__':
    ordered_keys = sorted(preprocessing.boxes.keys())
    n_boxes = len(ordered_keys)
    f, axarr = plt.subplots(n_boxes, 2, gridspec_kw={'width_ratios': [3, 1]})
    fnames = ['x', 'x', 'x', 'x', 'x']
    while True:
        try:
            df = pd.read_csv(preprocessing.csv_address)
            fnames = df.name
        except:
            pass
        for i in range(n_boxes):
            box_number = ordered_keys[i]
            if fnames[box_number] != 'x':
                try:
                    data = preprocessing.preprocess_data(fnames[box_number])
                    if n_boxes > 1:
                        plotter(axarr[i, 0], data, 60)
                        water(axarr[i, 1], data)
                    else:
                        plotter(axarr[0], data, 60)
                        water(axarr[1], data)
                except:
                    pass
        plt.show()
        plt.pause(0.05)