Example #1
def train_model(modelBuilder):
    train_df = load_dataframe('train')
    test_df = load_dataframe('test')

    X_train = process(transform_dataset(train_df), isolate)
    X_test = process(transform_dataset(test_df), isolate)

    target_train = train_df['is_iceberg']
    X_train_cv, X_valid, y_train_cv, y_valid = train_test_split(
        X_train, target_train, random_state=1, train_size=0.75)

    model = modelBuilder()
    optimizer = Adam(lr=LEARNING_RATE,
                     beta_1=BETA_1,
                     beta_2=BETA_2,
                     epsilon=EPSILON,
                     decay=DECAY)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.summary()

    callbacks = build_save_callbacks(filepath=MODEL_PATH, patience=5)

    datagen = ImageDataGenerator(
        #         featurewise_center=True,
        #         featurewise_std_normalization=True,
        #         rotation_range=20,
        #         width_shift_range=0.2,
        #         height_shift_range=0.2,
        #         horizontal_flip=True
    )
    datagen.fit(X_train)

    empty = ImageDataGenerator()
    empty.fit(X_valid)

    steps_per_epoch = len(X_train_cv) // BATCH_SIZE
    hist = model.fit_generator(datagen.flow(X_train_cv,
                                            y_train_cv,
                                            batch_size=BATCH_SIZE),
                               epochs=EPOCHS,
                               verbose=VERBOSE,
                               validation_data=empty.flow(X_valid, y_valid),
                               steps_per_epoch=steps_per_epoch,
                               callbacks=callbacks)

    model.load_weights(filepath=MODEL_PATH)
    score = model.evaluate(X_valid, y_valid, verbose=1)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    predicted_test = model.predict_proba(X_test)

    save_submission(test_df, predicted_test, filename='sub.csv')
    save_history(hist.history, model_name=MODEL_NAME)
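The build_save_callbacks helper used above is not shown in this example; a minimal sketch, assuming it simply bundles the standard Keras ModelCheckpoint and EarlyStopping callbacks (the monitored metric is an assumption), which would be consistent with the later model.load_weights(filepath=MODEL_PATH) call:

from keras.callbacks import ModelCheckpoint, EarlyStopping

def build_save_callbacks(filepath, patience):
    # keep only the best weights on disk and stop once val_loss stalls
    checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss',
                                 save_best_only=True, save_weights_only=True)
    early_stop = EarlyStopping(monitor='val_loss', patience=patience)
    return [checkpoint, early_stop]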
Example #2
    def load(self, *args, **kwargs):
        self.execute()

        dct = {'fmt': self.fmt}
        dct.update(kwargs)

        return load_dataframe(self.path, *args, **dct)
Example #3
def main():
    """
        Main application handler
    """
    # in debug mode, use the cached dataframe
    if len(sys.argv) >= 2:
        app.dataframe = utils.load_dataframe("df.pickle")
        app.run(host='0.0.0.0', port=3000, debug=True)
    else:
        app.dataframe = retrieve_stocks()
        utils.save_dataframe(app.dataframe, "df.pickle")
        app.run(host='0.0.0.0', port=3000)
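In the example above, utils.save_dataframe and utils.load_dataframe act as a pickle-backed cache for the stock dataframe; a minimal sketch of such helpers, assuming they wrap pandas pickling (the project's actual implementation may differ):

import pandas as pd

def save_dataframe(df, path):
    # cache the retrieved dataframe so debug runs can skip retrieve_stocks()
    df.to_pickle(path)

def load_dataframe(path):
    # restore the cached dataframe from disk
    return pd.read_pickle(path)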
Example #4
def get_covid_images(covid_dataset_path: str = "",
                     covid_metadata_path: str = "",
                     shape: tuple = (),
                     save_to: str = "",
                     label: int = 0) -> int:
    metadata = load_dataframe(file_=covid_metadata_path)

    count = 0
    for (i, row) in tqdm(metadata.iterrows(), ncols=150):
        if row["finding"] == "COVID-19" and row["view"] == "PA":
            count += 1

            covid_image_file = covid_dataset_path + row["filename"].split(os.path.sep)[-1]
            new_image_file = (save_to + str(label) + "_"
                              + hashlib.sha256(str(time.time()).encode("utf-8")).hexdigest()[:16]
                              + "." + covid_image_file.split(".")[-1])
            
            image = Image.open(covid_image_file)
            image = image.resize(shape)
            image.save(new_image_file)
    
    return count
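A hypothetical invocation of get_covid_images; the paths, target shape, and label below are placeholders, not values from the source:

n_images = get_covid_images(covid_dataset_path="covid-chestxray-dataset/images/",
                            covid_metadata_path="covid-chestxray-dataset/metadata.csv",
                            shape=(224, 224),
                            save_to="data/train/",
                            label=1)
print("extracted %d COVID-19 PA images" % n_images)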
Example #5
def main(json_replay: str = "", data_frame_replay: str = ""):
    # load the carball game analysis json file and data frame
    analysis = load_json(json_replay)
    data_frame = load_dataframe(data_frame_replay)

    # plot the game play stats
    game_stats = get_stats(analysis)
    plot_stats(game_stats)

    # plot the game play history
    timeline = Timeline(analysis)
    timeline.plot(show=["goals", "demos"])

    # plot possession
    plot_possession(analysis)

    # ball-hit heatmap
    heatmap = BallHitHeatmap(analysis)
    heatmap.create_map(down_scale=700)

    # player/ball-coordinate heatmap and live tracemap (player="ball" possible)
    playerHeatmap = PositionHeatmap(data_frame, analysis)
    playerHeatmap.create_heatmap(down_scale=500)
    playerHeatmap.animate_tracemap(down_scale=150)

    showReplay = ShowReplay(data_frame, analysis)
    showReplay.animate()
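A hypothetical call to the function above, assuming the analysis JSON and frame data were exported beforehand with carball (file names are placeholders):

main(json_replay="my_replay.json", data_frame_replay="my_replay.gzip")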
Example #6
NCOMMENTS = 10000
nmax = 45000
max_features = 7500

FOLDER = './save/'
INPUT_PANDAS = 'comments_extreme.jl'
OUTPUT_MODEL = 'model_sentiment_tfid_logreg_ncomments' + str(
    NCOMMENTS) + '_nfeatures' + str(max_features) + '.sav'
INPUT_VZER = 'vzer_ncomments' + str(nmax) + '_nfeatures' + str(
    max_features) + '.sav'
#OUTPUT_PANDAS = 'comments_extreme_nlength25.jl'
OUTPUT_REPORT = 'report_sentiment_tfid_logreg_ncomments' + str(
    NCOMMENTS) + '_nfeatures' + str(max_features) + '.csv'

df = utils.load_dataframe(FOLDER, INPUT_PANDAS)
df = df.dropna()

#model = RandomForestClassifier(n_estimators=200,
#                               bootstrap = True,
#                               max_features = 'sqrt')
model = RandomForestClassifier()
#model = MultinomialNB()

comments_reduced = df.iloc[:NCOMMENTS]

#model = LogisticRegression()

#Tfmer = TfidfVectorizer(sublinear_tf=True, max_features=max_features, ngram_range=(1, 4), preprocessor=preprocessor)
Tfmer = utils.load_model(FOLDER, INPUT_VZER)
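Example #6 stops right after loading the fitted vectorizer; a plausible continuation under the usual TF-IDF workflow is sketched below (the 'body' and 'sentiment' column names and the utils.save_model helper are assumptions, not taken from the source):

X = Tfmer.transform(comments_reduced['body'])   # TF-IDF features, column name assumed
y = comments_reduced['sentiment']               # target labels, column name assumed
model.fit(X, y)
utils.save_model(model, FOLDER, OUTPUT_MODEL)   # hypothetical counterpart of utils.load_model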
Example #7
from utils import load_dataframe, select_features, delete_first_day, handle_categorical
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model

df = load_dataframe('betai/ML/input/FRANCE_ligue1.csv')
df = delete_first_day(df)
print(df.head())
categorical_features = ['FTR', 'HTR']
df = handle_categorical(df, categorical_features)

df = df.dropna()

features_to_keep = ['journee', 'cl_hometeam', 'cl_awayteam', 'points_h', 'gagnes_h',
                    'nuls_h', 'perdus_h', 'buts_h', 'contre_h', 'points_a', 'gagnes_a',
                    'nuls_a', 'perdus_a', 'buts_a', 'contre_a', 'forme_h_win',
                    'forme_h_draw', 'forme_h_lose', 'forme_a_win', 'forme_a_draw',
                    'forme_a_lose', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR',
                    'HST', 'AST', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
                    'B365H', 'B365D', 'B365A']
#df = select_features(df, features_to_keep)

x = df[['cl_hometeam', 'cl_awayteam']]
y = df['FTR']

x_train, x_test, y_train, y_test = train_test_split(x, y)
modele_regLog = linear_model.LogisticRegression(random_state=0,
                                                solver='sag',
                                                multi_class='auto')

modele_regLog.fit(x_train, y_train)
Example #8
    parser.add_argument('--train-labels', help='training labels', required=True)
    parser.add_argument('--perturb-data', help='perturb data (samples x genes)', required=True)
    parser.add_argument('--perturb-labels', help='perturb labels', required=True)
    parser.add_argument('--gene-sets', help='list of curated gene sets')
    parser.add_argument('--set', help='specific gene set to run')
    parser.add_argument('--tsne', help='plot t-SNE of samples', action='store_true')
    parser.add_argument('--heatmap', help='plot heatmaps of sample perturbations', action='store_true')
    parser.add_argument('--target', help='target class')
    parser.add_argument('--output-dir', help='output directory', default='.')

    args = parser.parse_args()

    # load input data
    print('loading train/perturb data...')

    df_train = utils.load_dataframe(args.train_data)
    df_perturb = utils.load_dataframe(args.perturb_data)

    y_train, classes = utils.load_labels(args.train_labels)
    y_perturb, _ = utils.load_labels(args.perturb_labels, classes)

    print('loaded train data (%s genes, %s samples)' % (df_train.shape[1], df_train.shape[0]))
    print('loaded perturb data (%s genes, %s samples)' % (df_perturb.shape[1], df_perturb.shape[0]))

    # impute missing values
    min_value = df_train.min().min()

    df_train.fillna(value=min_value, inplace=True)
    df_perturb.fillna(value=min_value, inplace=True)

    # sanitize class names
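Examples #8, #11, and #12 are all cut off at the top of their argument parsers; a minimal sketch of the presumed missing preamble, assuming argparse and a --train-data option analogous to the ones shown (names inferred, not from the source):

import argparse
import utils

if __name__ == '__main__':
    # parse command-line arguments (the remaining options appear in the fragments)
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-data', help='training data (samples x genes)', required=True)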
Example #9
import argparse
import numpy as np
import pandas as pd

import utils

if __name__ == "__main__":
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("emx_true", help="true expression matrix")
    parser.add_argument("emx_test", help="test expression matrix")

    args = parser.parse_args()

    # load input dataframes
    emx_true = utils.load_dataframe(args.emx_true)
    emx_test = utils.load_dataframe(args.emx_test)

    print("Loaded %s %s" % (args.emx_true, str(emx_true.shape)))
    print("Loaded %s %s" % (args.emx_test, str(emx_test.shape)))

    # extract data matrix from each dataframe
    X_true = emx_true.values
    X_test = emx_test.values

    # print warnings for various mismatches
    if emx_true.shape != emx_test.shape:
        print("warning: shape does not match")

    if (emx_true.index != emx_test.index).any():
        print("warning: row names do not match")
Example #10
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
python file that once executed generate
the indexes of the Search engines
"""
import utils
import pandas as pd
import json
import re
import index_utils
from math import log, sqrt
import heapq

# Dataframe
dataframe = utils.load_dataframe().reset_index(drop=True)

# Dataframe (Intro + Plot) Documents
try:
    dataframe_df = pd.read_json(r'Json\dataframe_format_intro_plot.json',
                                orient='table')
except ValueError:
    # Generate (Intro + Plot) Documents
    dataframe_df = index_utils.generate_format_intro_plot_df(dataframe)

# Vocabulary
try:
    vocabulary = pd.read_json(r'Json\vocabulary.json', orient='table')
except ValueError:
    vocabulary = index_utils.generate_vocabulary_df(dataframe_df)
vocabulary_dict = dict(vocabulary['Word'])
Example #11
                        required=True)
    parser.add_argument("--test-data",
                        help="test data (samples x genes)",
                        required=True)
    parser.add_argument("--test-labels", help="test labels", required=True)
    parser.add_argument("--gene-sets", help="list of curated gene sets")
    parser.add_argument("--set", help="specific gene set to run")
    parser.add_argument("--target", help="target class")
    parser.add_argument("--output-dir", help="Output directory", default=".")

    args = parser.parse_args()

    # load input data
    print("loading train/test data...")

    df_train = utils.load_dataframe(args.train_data)
    df_test = utils.load_dataframe(args.test_data)

    y_train, classes = utils.load_labels(args.train_labels)
    y_test, _ = utils.load_labels(args.test_labels, classes)

    print("loaded train data (%s genes, %s samples)" %
          (df_train.shape[1], df_train.shape[0]))
    print("loaded test data (%s genes, %s samples)" %
          (df_test.shape[1], df_test.shape[0]))

    # impute missing values
    min_value = df_train.min().min()

    df_train.fillna(value=min_value, inplace=True)
    df_test.fillna(value=min_value, inplace=True)
Example #12
                        help='perturb data (samples x genes)',
                        required=True)
    parser.add_argument('--perturb-labels',
                        help='perturb labels',
                        required=True)
    parser.add_argument('--gene-sets', help='list of curated gene sets')
    parser.add_argument('--set', help='specific gene set to run')
    parser.add_argument('--target', help='target class')
    parser.add_argument('--output-dir', help='Output directory', default='.')

    args = parser.parse_args()

    # load input data
    print('loading train/perturb data...')

    df_train = utils.load_dataframe(args.train_data)
    df_perturb = utils.load_dataframe(args.perturb_data)

    y_train, classes = utils.load_labels(args.train_labels)
    y_perturb, _ = utils.load_labels(args.perturb_labels, classes)

    print('loaded train data (%s genes, %s samples)' %
          (df_train.shape[1], df_train.shape[0]))
    print('loaded perturb data (%s genes, %s samples)' %
          (df_perturb.shape[1], df_perturb.shape[0]))

    # impute missing values
    min_value = df_train.min().min()

    df_train.fillna(value=min_value, inplace=True)
    df_perturb.fillna(value=min_value, inplace=True)
Example #13
    def setUp(self):
        self.flow_df = load_dataframe(STORE_PATH, 'test_flow')
        # set_index returns a new dataframe, so keep the result
        self.flow_df = self.flow_df.set_index('timestamp')
Example #14
    def setUp(self):
        flow_df = load_dataframe(STORE_PATH, 'test_flow')
        # set_index returns a new dataframe, so keep the result
        flow_df = flow_df.set_index('timestamp')
        self.rtt_df = rtts_from_timestamps(('test_flow', flow_df))
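Examples #13 and #14 are setUp methods lifted out of their test classes; a minimal sketch of how such a method typically sits inside a unittest.TestCase (the class name and assertion are hypothetical):

import unittest

class FlowTestCase(unittest.TestCase):
    def setUp(self):
        # index by timestamp so time-based lookups work in the tests
        self.flow_df = load_dataframe(STORE_PATH, 'test_flow').set_index('timestamp')

    def test_flow_not_empty(self):
        self.assertFalse(self.flow_df.empty)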
Example #15
                        help="upper bound of x-axis",
                        type=float)
    parser.add_argument("--tsne", help="save t-SNE plot to the given filename")
    parser.add_argument("--tsne-na",
                        help="numerical value to use for missing values",
                        type=float,
                        default=-1e3)
    parser.add_argument(
        "--tsne-npca",
        help="number of principal components to take before t-SNE",
        type=int)

    args = parser.parse_args()

    # load input expression matrix
    emx = utils.load_dataframe(args.infile)

    print("Loaded %s %s" % (args.infile, str(emx.shape)))

    # load label file or generate empty labels
    if args.labels is not None:
        print("Loading label file...")

        labels = np.loadtxt(args.labels, dtype=str)
    else:
        labels = np.zeros(len(emx.columns), dtype=str)

    # plot sample distributions
    if args.density is not None:
        print("Plotting sample distributions...")
Example #16
if __name__ == "__main__":
	# parse command-line arguments
	parser = argparse.ArgumentParser()
	parser.add_argument("--dataset", help="input dataset (samples x genes)", required=True)
	parser.add_argument("--labels", help="list of sample labels", required=True)
	parser.add_argument("--gene-sets", help="list of curated gene sets")
	parser.add_argument("--target", help="target class")
	parser.add_argument("--set", help="gene set to run", type=str, default="HALLMARK_ALL")
	parser.add_argument("--output-dir", help="Output directory", default=".")

	args = parser.parse_args()

	# load input data
	print("loading input dataset...")

	df = utils.load_dataframe(args.dataset)
	df_samples = df.index
	df_genes = df.columns

	labels, classes = utils.load_labels(args.labels)

	print("loaded input dataset (%s genes, %s samples)" % (df.shape[1], df.shape[0]))

	# impute missing values
	df.fillna(value=df.min().min(), inplace=True)

	# determine target class
	try:
		if args.target is None:
			args.target = -1
		else:
Example #17
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt

from utils import load_numpy, load_dataframe

data, labels = load_dataframe(r"processed.cleveland.data")

# Normalize data
scaler = StandardScaler()
data = scaler.fit_transform(data)

# Visualize using PCA
data_after_pca = PCA(n_components=2).fit_transform(data)
fig, ax = plt.subplots()
ax.scatter(data_after_pca[labels == 0, 0],
           data_after_pca[labels == 0, 1],
           c='red',
           label='Class 1 (No disease)')
ax.scatter(data_after_pca[labels > 0, 0],
           data_after_pca[labels > 0, 1],
           c='blue',
           label='Class 2 (Some kind of disease)')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
ax.legend()
plt.show()
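In Example #17, load_dataframe returns a (data, labels) pair rather than a DataFrame; a minimal sketch of such a variant, assuming the standard Cleveland heart-disease file layout (no header row, '?' for missing values, 0-4 target in the last column):

import pandas as pd

def load_dataframe(path):
    # 13 feature columns followed by the disease label (0 = none, 1-4 = disease)
    df = pd.read_csv(path, header=None, na_values='?').dropna()
    data = df.iloc[:, :-1].values
    labels = df.iloc[:, -1].values
    return data, labels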