def train_model(modelBuilder):
    # load and preprocess the train/test sets
    train_df = load_dataframe('train')
    test_df = load_dataframe('test')
    X_train = process(transform_dataset(train_df), isolate)
    X_test = process(transform_dataset(test_df), isolate)
    target_train = train_df['is_iceberg']

    # hold out 25% of the training data for validation
    X_train_cv, X_valid, y_train_cv, y_valid = train_test_split(
        X_train, target_train, random_state=1, train_size=0.75)

    # build and compile the model
    model = modelBuilder()
    optimizer = Adam(lr=LEARNING_RATE, beta_1=BETA_1, beta_2=BETA_2,
                     epsilon=EPSILON, decay=DECAY)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.summary()

    callbacks = build_save_callbacks(filepath=MODEL_PATH, patience=5)

    # augmentation is disabled for now; re-enable options as needed
    datagen = ImageDataGenerator(
        # featurewise_center=True,
        # featurewise_std_normalization=True,
        # rotation_range=20,
        # width_shift_range=0.2,
        # height_shift_range=0.2,
        # horizontal_flip=True
    )
    # fit() only matters once the featurewise options above are enabled
    datagen.fit(X_train)
    empty = ImageDataGenerator()
    empty.fit(X_valid)

    steps_per_epoch = len(X_train_cv) // BATCH_SIZE
    hist = model.fit_generator(
        datagen.flow(X_train_cv, y_train_cv, batch_size=BATCH_SIZE),
        epochs=EPOCHS,
        verbose=VERBOSE,
        validation_data=empty.flow(X_valid, y_valid),
        steps_per_epoch=steps_per_epoch,
        callbacks=callbacks)

    # restore the best checkpoint before evaluating
    model.load_weights(filepath=MODEL_PATH)
    score = model.evaluate(X_valid, y_valid, verbose=1)
    print('Validation loss:', score[0])
    print('Validation accuracy:', score[1])

    predicted_test = model.predict_proba(X_test)
    save_submission(test_df, predicted_test, filename='sub.csv')
    save_history(hist.history, model_name=MODEL_NAME)
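# build_save_callbacks is not defined in this snippet; a minimal sketch of
# what it might look like (checkpointing plus early stopping via the Keras
# callbacks API) -- an assumption, not the project's actual helper:
def build_save_callbacks(filepath, patience):
    from keras.callbacks import ModelCheckpoint, EarlyStopping
    checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss',
                                 save_best_only=True, save_weights_only=True)
    early_stop = EarlyStopping(monitor='val_loss', patience=patience)
    return [checkpoint, early_stop]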
def load(self, *args, **kwargs):
    # run the task first, then load its output; an explicit fmt in kwargs
    # overrides the task's own format
    self.execute()
    dct = {'fmt': self.fmt}
    dct.update(kwargs)
    return load_dataframe(self.path, *args, **dct)
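# load_dataframe itself is not shown; a minimal sketch, assuming it
# dispatches on a format string (the reader table is an assumption):
import pandas as pd

def load_dataframe(path, fmt='csv', **kwargs):
    readers = {'csv': pd.read_csv,
               'json': pd.read_json,
               'pickle': pd.read_pickle}
    return readers[fmt](path, **kwargs)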
def main(): """ Main application handler """ # in debug mode, use the cached dataframe if len(sys.argv) >= 2: app.dataframe = utils.load_dataframe("df.pickle") app.run(host='0.0.0.0', port=3000, debug=True) else: app.dataframe = retrieve_stocks() utils.save_dataframe(app.dataframe, "df.pickle") app.run(host='0.0.0.0', port=3000)
import hashlib
import os
import time

from PIL import Image
from tqdm import tqdm

def get_covid_images(covid_dataset_path: str = "",
                     covid_metadata_path: str = "",
                     shape: tuple = (),
                     save_to: str = "",
                     label: int = 0) -> int:
    metadata = load_dataframe(file_=covid_metadata_path)
    count = 0
    for (i, row) in tqdm(metadata.iterrows(), ncols=150):
        # keep only posteroanterior (PA) views with a COVID-19 finding
        if row["finding"] == "COVID-19" and row["view"] == "PA":
            count += 1
            covid_image_file = covid_dataset_path + row["filename"].split(os.path.sep)[-1]
            # hash the current time to build a unique output filename,
            # keeping the label as a prefix and the original extension
            new_image_file = (save_to + str(label) + "_"
                              + hashlib.sha256(str(time.time()).encode("utf-8")).hexdigest()[:16]
                              + "." + covid_image_file.split(".")[-1])
            image = Image.open(covid_image_file)
            image = image.resize(shape)
            image.save(new_image_file)
    return count
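# example call; the paths and target shape below are illustrative
# assumptions, not values from the original project:
n = get_covid_images(covid_dataset_path="covid-chestxray-dataset/images/",
                     covid_metadata_path="covid-chestxray-dataset/metadata.csv",
                     shape=(224, 224),
                     save_to="dataset/",
                     label=1)
print("extracted %d images" % n)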
def main(json_replay: str = "", data_frame_replay: str = ""):
    """Load the carball game analysis JSON file and data frame."""
    analysis = load_json(json_replay)
    data_frame = load_dataframe(data_frame_replay)

    # plot the game play stats
    game_stats = get_stats(analysis)
    plot_stats(game_stats)

    # plot the game play history
    timeline = Timeline(analysis)
    timeline.plot(show=["goals", "demos"])

    # plot possession
    plot_possession(analysis)

    # ball-hit heatmap
    heatmap = BallHitHeatmap(analysis)
    heatmap.create_map(down_scale=700)

    # player/ball-coordinate heatmap and live tracemap (player="ball" possible)
    playerHeatmap = PositionHeatmap(data_frame, analysis)
    playerHeatmap.create_heatmap(down_scale=500)
    playerHeatmap.animate_tracemap(down_scale=150)

    showReplay = ShowReplay(data_frame, analysis)
    showReplay.animate()
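# example invocation; the file names are assumptions, standing in for the
# outputs of a prior carball analysis run:
if __name__ == "__main__":
    main(json_replay="replay_analysis.json",
         data_frame_replay="replay_frames.gzip")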
NCOMMENTS = 10000
nmax = 45000
max_features = 7500

FOLDER = './save/'
INPUT_PANDAS = 'comments_extreme.jl'
# note: the output names still say "logreg" even though the active model
# below is a RandomForestClassifier
OUTPUT_MODEL = ('model_sentiment_tfid_logreg_ncomments' + str(NCOMMENTS)
                + '_nfeatures' + str(max_features) + '.sav')
INPUT_VZER = ('vzer_ncomments' + str(nmax)
              + '_nfeatures' + str(max_features) + '.sav')
#OUTPUT_PANDAS = 'comments_extreme_nlength25.jl'
OUTPUT_REPORT = ('report_sentiment_tfid_logreg_ncomments' + str(NCOMMENTS)
                 + '_nfeatures' + str(max_features) + '.csv')

df = utils.load_dataframe(FOLDER, INPUT_PANDAS)
df = df.dropna()

# alternative models kept for reference
#model = RandomForestClassifier(n_estimators=200,
#                               bootstrap=True,
#                               max_features='sqrt')
model = RandomForestClassifier()
#model = MultinomialNB()

comments_reduced = df.iloc[:NCOMMENTS]

#model = LogisticRegression()
# load the pre-fitted vectorizer rather than refitting it
#Tfmer = TfidfVectorizer(sublinear_tf=True, max_features=max_features,
#                        ngram_range=(1, 4), preprocessor=preprocessor)
Tfmer = utils.load_model(FOLDER, INPUT_VZER)
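# the snippet stops before training; a minimal continuation sketch, assuming
# Tfmer is an already-fitted TfidfVectorizer and that the dataframe holds
# 'comment' and 'sentiment' columns (the column names are assumptions):
X = Tfmer.transform(comments_reduced['comment'])
y = comments_reduced['sentiment']
model.fit(X, y)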
from utils import load_dataframe, select_features, delete_first_day, handle_categorical
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import linear_model

df = load_dataframe('betai/ML/input/FRANCE_ligue1.csv')
df = delete_first_day(df)
print(df.head())

categorical_features = ['FTR', 'HTR']
df = handle_categorical(df, categorical_features)
df = df.dropna()

features_to_keep = ['journee', 'cl_hometeam', 'cl_awayteam', 'points_h', 'gagnes_h', 'nuls_h',
                    'perdus_h', 'buts_h', 'contre_h', 'points_a', 'gagnes_a', 'nuls_a', 'perdus_a',
                    'buts_a', 'contre_a', 'forme_h_win', 'forme_h_draw', 'forme_h_lose',
                    'forme_a_win', 'forme_a_draw', 'forme_a_lose', 'FTHG', 'FTAG', 'FTR',
                    'HTHG', 'HTAG', 'HTR', 'HST', 'AST', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
                    'B365H', 'B365D', 'B365A']
#df = select_features(df, features_to_keep)

x = df[['cl_hometeam', 'cl_awayteam']]
y = df['FTR']
x_train, x_test, y_train, y_test = train_test_split(x, y)

modele_regLog = linear_model.LogisticRegression(random_state=0,
                                                solver='sag',
                                                multi_class='auto')
modele_regLog.fit(x_train, y_train)
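# the script fits the model but never evaluates it; a quick check on the
# held-out split:
accuracy = modele_regLog.score(x_test, y_test)
print('test accuracy:', accuracy)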
parser.add_argument('--train-labels', help='training labels', required=True)
parser.add_argument('--perturb-data', help='perturb data (samples x genes)', required=True)
parser.add_argument('--perturb-labels', help='perturb labels', required=True)
parser.add_argument('--gene-sets', help='list of curated gene sets')
parser.add_argument('--set', help='specific gene set to run')
parser.add_argument('--tsne', help='plot t-SNE of samples', action='store_true')
parser.add_argument('--heatmap', help='plot heatmaps of sample perturbations', action='store_true')
parser.add_argument('--target', help='target class')
parser.add_argument('--output-dir', help='output directory', default='.')

args = parser.parse_args()

# load input data
print('loading train/perturb data...')
df_train = utils.load_dataframe(args.train_data)
df_perturb = utils.load_dataframe(args.perturb_data)
y_train, classes = utils.load_labels(args.train_labels)
y_perturb, _ = utils.load_labels(args.perturb_labels, classes)

print('loaded train data (%s genes, %s samples)' % (df_train.shape[1], df_train.shape[0]))
print('loaded perturb data (%s genes, %s samples)' % (df_perturb.shape[1], df_perturb.shape[0]))

# impute missing values with the global minimum of the training data
min_value = df_train.min().min()
df_train.fillna(value=min_value, inplace=True)
df_perturb.fillna(value=min_value, inplace=True)

# sanitize class names
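# utils.load_labels is not shown; a minimal sketch of the implied contract
# (integer labels plus the class list, reusing a class list when one is
# passed in) -- an assumption, not the project's actual helper:
import numpy as np

def load_labels(filename, classes=None):
    names = np.loadtxt(filename, dtype=str)
    if classes is None:
        classes = sorted(set(names))
    y = np.array([classes.index(name) for name in names])
    return y, classes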
import argparse
import numpy as np
import pandas as pd

import utils

if __name__ == "__main__":
    # parse command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("emx_true", help="true expression matrix")
    parser.add_argument("emx_test", help="test expression matrix")
    args = parser.parse_args()

    # load input dataframes
    emx_true = utils.load_dataframe(args.emx_true)
    emx_test = utils.load_dataframe(args.emx_test)

    print("Loaded %s %s" % (args.emx_true, str(emx_true.shape)))
    print("Loaded %s %s" % (args.emx_test, str(emx_test.shape)))

    # extract data matrix from each dataframe
    X_true = emx_true.values
    X_test = emx_test.values

    # print warnings for various mismatches
    if emx_true.shape != emx_test.shape:
        print("warning: shape does not match")
    if (emx_true.index != emx_test.index).any():
        print("warning: row names do not match")
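    # a natural next step (not shown in the snippet): quantify the
    # difference, ignoring entries that are missing in either matrix
    mask = ~np.isnan(X_true) & ~np.isnan(X_test)
    if mask.any():
        mae = np.mean(np.abs(X_true[mask] - X_test[mask]))
        print("mean absolute error: %g" % mae)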
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Python file that, once executed, generates the indexes of the search engines.
"""

import utils
import pandas as pd
import json
import re
import index_utils
from math import log, sqrt
import heapq

# Dataframe
dataframe = utils.load_dataframe().reset_index(drop=True)

# Dataframe (Intro + Plot) Documents
try:
    dataframe_df = pd.read_json(r'Json\dataframe_format_intro_plot.json', orient='table')
except ValueError:
    # Generate (Intro + Plot) Documents
    dataframe_df = index_utils.generate_format_intro_plot_df(dataframe)

# Vocabulary
try:
    vocabulary = pd.read_json(r'Json\vocabulary.json', orient='table')
except ValueError:
    vocabulary = index_utils.generate_vocabulary_df(dataframe_df)

vocabulary_dict = dict(vocabulary['Word'])
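# the imports (log, sqrt, heapq) suggest tf-idf scoring with top-k
# retrieval; a minimal sketch of the idea (the names here are assumptions,
# not index_utils' actual API):
def tfidf(term_count, doc_length, n_docs, n_docs_with_term):
    tf = term_count / doc_length
    idf = log(n_docs / n_docs_with_term)
    return tf * idf

# top_k = heapq.nlargest(k, scored_docs)  # scored_docs: (score, doc_id) pairs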
# (the first add_argument call is truncated in the source; reconstructed
# here by analogy with the companion perturbation script)
parser.add_argument("--train-labels", help="training labels", required=True)
parser.add_argument("--test-data", help="test data (samples x genes)", required=True)
parser.add_argument("--test-labels", help="test labels", required=True)
parser.add_argument("--gene-sets", help="list of curated gene sets")
parser.add_argument("--set", help="specific gene set to run")
parser.add_argument("--target", help="target class")
parser.add_argument("--output-dir", help="output directory", default=".")

args = parser.parse_args()

# load input data
print("loading train/test data...")
df_train = utils.load_dataframe(args.train_data)
df_test = utils.load_dataframe(args.test_data)
y_train, classes = utils.load_labels(args.train_labels)
y_test, _ = utils.load_labels(args.test_labels, classes)

print("loaded train data (%s genes, %s samples)" % (df_train.shape[1], df_train.shape[0]))
print("loaded test data (%s genes, %s samples)" % (df_test.shape[1], df_test.shape[0]))

# impute missing values
min_value = df_train.min().min()
df_train.fillna(value=min_value, inplace=True)
df_test.fillna(value=min_value, inplace=True)
parser.add_argument('--perturb-data', help='perturb data (samples x genes)', required=True)
parser.add_argument('--perturb-labels', help='perturb labels', required=True)
parser.add_argument('--gene-sets', help='list of curated gene sets')
parser.add_argument('--set', help='specific gene set to run')
parser.add_argument('--target', help='target class')
parser.add_argument('--output-dir', help='output directory', default='.')

args = parser.parse_args()

# load input data
print('loading train/perturb data...')
df_train = utils.load_dataframe(args.train_data)
df_perturb = utils.load_dataframe(args.perturb_data)
y_train, classes = utils.load_labels(args.train_labels)
y_perturb, _ = utils.load_labels(args.perturb_labels, classes)

print('loaded train data (%s genes, %s samples)' % (df_train.shape[1], df_train.shape[0]))
print('loaded perturb data (%s genes, %s samples)' % (df_perturb.shape[1], df_perturb.shape[0]))

# impute missing values
min_value = df_train.min().min()
df_train.fillna(value=min_value, inplace=True)
df_perturb.fillna(value=min_value, inplace=True)
def setUp(self):
    self.flow_df = load_dataframe(STORE_PATH, 'test_flow')
    # set_index returns a new DataFrame, so the result must be kept
    self.flow_df = self.flow_df.set_index('timestamp')
def setUp(self):
    flow_df = load_dataframe(STORE_PATH, 'test_flow')
    # set_index returns a new DataFrame, so the result must be kept
    flow_df = flow_df.set_index('timestamp')
    self.rtt_df = rtts_from_timestamps(('test_flow', flow_df))
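# a minimal test method using the fixture above; the assertion is an
# illustrative assumption about rtts_from_timestamps' output:
def test_rtt_df_not_empty(self):
    self.assertFalse(self.rtt_df.empty)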
help="upper bound of x-axis", type=float) parser.add_argument("--tsne", help="save t-SNE plot to the given filename") parser.add_argument("--tsne-na", help="numerical value to use for missing values", type=float, default=-1e3) parser.add_argument( "--tsne-npca", help="number of principal components to take before t-SNE", type=int) args = parser.parse_args() # load input expression matrix emx = utils.load_dataframe(args.infile) print("Loaded %s %s" % (args.infile, str(emx.shape))) # load label file or generate empty labels if args.labels != None: print("Loading label file...") labels = np.loadtxt(args.labels, dtype=str) else: labels = np.zeros(len(emx.columns), dtype=str) # plot sample distributions if args.density != None: print("Plotting sample distributions...")
if __name__ == "__main__": # parse command-line arguments parser = argparse.ArgumentParser() parser.add_argument("--dataset", help="input dataset (samples x genes)", required=True) parser.add_argument("--labels", help="list of sample labels", required=True) parser.add_argument("--gene-sets", help="list of curated gene sets") parser.add_argument("--target", help="target class") parser.add_argument("--set", help="gene set to run", type=str, default="HALLMARK_ALL") parser.add_argument("--output-dir", help="Output directory", default=".") args = parser.parse_args() # load input data print("loading input dataset...") df = utils.load_dataframe(args.dataset) df_samples = df.index df_genes = df.columns labels, classes = utils.load_labels(args.labels) print("loaded input dataset (%s genes, %s samples)" % (df.shape[1], df.shape[0])) # impute missing values df.fillna(value=df.min().min(), inplace=True) # determine target class try: if args.target == None: args.target = -1 else:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
from utils import load_numpy, load_dataframe

data, labels = load_dataframe(r"processed.cleveland.data")

# Normalize data
scaler = StandardScaler()
data = scaler.fit_transform(data)

# Visualize using PCA
data_after_pca = PCA(n_components=2).fit_transform(data)

fig, ax = plt.subplots()
ax.scatter(data_after_pca[labels == 0, 0], data_after_pca[labels == 0, 1],
           c='red', label='Class 1 (No disease)')
ax.scatter(data_after_pca[labels > 0, 0], data_after_pca[labels > 0, 1],
           c='blue', label='Class 2 (Some kind of disease)')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
ax.legend()
plt.show()
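# this load_dataframe variant returns (features, labels); a sketch for the
# UCI Cleveland heart-disease file, where missing values appear as "?" and
# the last column holds the 0-4 diagnosis (assumed, not the project's code):
import pandas as pd

def load_dataframe(path):
    df = pd.read_csv(path, header=None, na_values="?").dropna()
    data = df.iloc[:, :-1].values
    labels = df.iloc[:, -1].values
    return data, labels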