def get_attention_metric(file_name: str):
    """Calculate the attention metric for the given file or folder.

    Inputs:
        file_name: a string. It can be either a file name or the name of a
            folder, in which case all the data inside that folder is vstacked.
    Outputs:
        Attention metric
    """
    path = Path(file_name)
    if os.path.isfile(path):
        data = d.load_pickle_file(path)
    elif os.path.isdir(path):
        data = d.load_all_pickle_files(path, vstack=True)
    else:
        print(f"The path {path} does not exist")
        return None

    clean_data = d.clean_data(data, channels=range(8), threshold_fn=fr.max_amplitude)
    print(f"Clean data shape: {clean_data.shape}")
    _bandpower = d.epoch_bandpower(clean_data, per_epoch=1, channels=range(8), relative=False)
    _metrics = d.metric_1(_bandpower)
    return _metrics
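# A minimal usage sketch (the paths below are hypothetical; any pickle file or
# folder of pickle files produced by this project should work the same way):
#
#   single_metric = get_attention_metric("recordings/session_01.pkl")
#   folder_metric = get_attention_metric("recordings")  # vstacks all pickles in the folder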
def main():
    query = get_query(start_date='20160801', end_date='20170830')
    print(query)

    private_key = load_private_key()
    df = load_data(query=query,
                   project_id=private_key['project_id'],
                   private_key=private_key)
    df = clean_data(df)
    print(df.info())

    X = df.drop(columns=[
        'fullVisitorId', 'visitId', 'visitStartTime', 'country', 'medium',
        'lifetime_total_revenue'
    ]).values
    y = df['lifetime_total_revenue'].values

    store_training_data(clean_data(df))

    validation_df = model.validation(X, y)
    print(validation_df.head())
    return df
def perform_pca(dataframe, num_dimensions):
    """
    Perform PCA analysis on the given dataframe
    :param dataframe: The dataframe of source data, first column is the label column
    :param num_dimensions: the number of dimensions to reduce to
    :return: A dataframe of the data projected onto the top num_dimensions components
    """
    # Clean/Standardize the Data
    print "Standardizing Data"
    std_df = data.clean_data(dataframe)

    # Compute the covariance matrix
    print "Computing Covariance Matrix"
    cov_matrix = std_df.cov()

    # Compute Eigenvectors + Eigenvalues of Covariance Matrix
    print "Finding Eigenvalues and Eigenvectors"
    evalues, evectors = np.linalg.eig(cov_matrix)

    # Since evalues are not guaranteed to be sorted, keep a mapping back to the
    # eigenvectors and sort the values
    index = {evalues[i]: i for i in xrange(len(evalues))}

    # Sort the evalues highest to lowest
    evalues.sort()
    evalues[:] = evalues[::-1]

    # Pick m < d eigenvectors with the highest eigenvalues
    selected_indices = [index[val] for val in evalues[:num_dimensions]]

    # Build projection matrix
    print "Projecting Data to {0} Dimensions".format(num_dimensions)
    projection_matrix = evectors[:, selected_indices]

    # Project original dataframe using projection matrix
    new_data = np.dot(std_df, projection_matrix)

    # Preserve the labels
    labels = dataframe.index

    # Return new dataframe
    projected_df = pd.DataFrame(new_data, index=labels)
    projected_df.columns = [
        "Feature #{0}".format(i) for i in projected_df.columns
    ]
    return projected_df
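# A minimal usage sketch (assumes a pandas DataFrame `measurements` whose index
# holds the instance labels; the name is hypothetical):
#
#   projected = perform_pca(measurements, num_dimensions=2)
#   print projected.head()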
        self.pipeline = pipe_cols

    def run(self):
        """set and train the pipeline"""
        self.set_pipeline()
        self.pipeline.fit(self.X, self.y)

    def evaluate(self, X_test, y_test):
        """evaluates the pipeline on the test set and returns the RMSE"""
        y_pred = self.pipeline.predict(X_test)
        rmse = compute_rmse(y_pred, y_test)
        return rmse


if __name__ == "__main__":
    # get data
    df = get_data(nrows=10_000)
    # clean data
    df = clean_data(df, test=False)
    # set X and y
    X = df.drop(columns='fare_amount')
    y = df['fare_amount']
    # hold out
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # train
    trainer = Trainer(X_train, y_train)
    trainer.run()
    # evaluate
    rmse = trainer.evaluate(X_test, y_test)
    print(rmse)
    print('ok')
def kmeans(dataframe, col_1_index, col_2_index, k, output_path, columns,
           cluster_colors=['r', 'b']):
    """
    Perform Clustering using K-Means
    :param dataframe: The dataframe of source data.
    :param col_1_index: The index of the first feature to plot
    :param col_2_index: The index of the second feature to plot
    :param k: The k value for k-means
    :param output_path: The path to save the graphs to.
    :param columns: The list of columns to use when computing distance.
    :param cluster_colors: A list of colors to use when displaying clusters in graphs.
    :return:
    """
    # Standardize Data
    print "Standardizing Data"
    std_df = data.clean_data(dataframe)

    # Seed Random Number Generator with '0'
    random.seed(0)

    # Randomly select k data instances and use them as the initial means
    print "Selecting Initial Centers"
    cluster_centers = [
        std_df.iloc[random.randrange(0, len(std_df) - 1)] for i in xrange(k)
    ]

    # Plot the initial setup
    plot_setup(cluster_centers, col_1_index, col_2_index, output_path, std_df)
    plt.clf()

    # Do the K-Means algorithm, keeping count of iterations
    membership = {}
    iteration = 0
    while True:
        plt_name = "InitialAssignment"
        plt_title = "Initial Assignment"
        if iteration > 0:
            plt_name = "Iteration{0}".format(iteration)
            plt_title = "Iteration #{0}".format(iteration)

        # Assign Membership
        print "Assigning Membership"
        membership = assign_membership(std_df, cluster_centers, columns)

        # Plot Cluster Assignments
        plt.clf()
        print "Plotting Assignments"
        plot_iteration(cluster_centers, cluster_colors, col_1_index, col_2_index,
                       k, membership, output_path, std_df, plt_name, plt_title)

        # Update Centers: for each 'cluster' compute the mean point
        new_cluster_centers = []
        print "Computing new Cluster Centers"
        for cluster_i in xrange(len(cluster_centers)):
            indices = membership[cluster_i]
            new_cluster_centers.append(std_df.iloc[indices].mean())

        # Compute the largest difference between the prior centers and the new
        # centers; if it is below eps, exit
        difference = 0
        for cluster_i in xrange(len(cluster_centers)):
            old_point = cluster_centers[cluster_i]
            new_point = new_cluster_centers[cluster_i]
            difference = max(difference, (old_point - new_point).max())

        print "Iteration #{0}, Difference: {1}".format(iteration, difference)
        if difference <= np.spacing(1):  # Matlab's eps == np.spacing(1)
            break

        cluster_centers = new_cluster_centers
        iteration += 1

    # Plot final cluster assignments
    plot_iteration(cluster_centers, cluster_colors, col_1_index, col_2_index, k,
                   membership, output_path, std_df, "FinalAssignment",
                   "Final Cluster Assignments")
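# A minimal usage sketch (the dataframe, feature indices, and output folder are
# hypothetical; `columns` names the features used for the distance computation):
#
#   kmeans(df, col_1_index=0, col_2_index=1, k=2,
#          output_path="output/kmeans", columns=df.columns,
#          cluster_colors=['r', 'b'])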
with open(CONFIGFILE, "r") as f:
    config = yaml.safe_load(f)

ap = ArgumentParser()
ap.add_argument('--inspect_data', action='store_true', default=False,
                help="plot training data for inspection")
ap.add_argument('--train', action='store_true', default=False,
                help="Run training")
ap.add_argument('--test', action='store_true', default=False,
                help="Run test")
args = ap.parse_args()

df = load_data()
milk = clean_data(df)
train, test = train_test_split(milk)

if args.inspect_data:
    print("RAW DATA")
    print(df.head())
    milk.plot()
    plt.show()
elif args.train:
    model = LSTMPredictor(config)
    model.fit(train['Milk Production'].values.reshape(1, -1))
    model.close()
elif args.test:
    model = LSTMPredictor(config)
    y_pred = model.infer(train['Milk Production'].values.reshape(1, -1), 12)
    y_pred = list(y_pred)
from data import import_data, clean_data, prepare_model_data
from models import model_statistics
from model_diagnostics import ensemble_prediction
from model_diagnostics import partial_dependence, partial_dependence_loop, plot_top_partial_dependences
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score, f1_score
from tensorflow import keras

# Data
df = import_data()
clean_df = clean_data(df)
X_train, X_test, y_train, y_test, scalers, column_names = prepare_model_data(
    df=clean_df, y_col='Bankrupt?')

# Load Models
rf_clf = pickle.load(open('Models/rf.sav', 'rb'))
xgb_clf = pickle.load(open('Models/xgb.sav', 'rb'))
nn_clf = keras.models.load_model('Models/nn.h5')
knn_clf = pickle.load(open('Models/knn.sav', 'rb'))
lr_clf = pickle.load(open('Models/lr.sav', 'rb'))

model_stats_10 = pd.read_csv('Models/model_stats_10.csv')
model_stats_25 = pd.read_csv('Models/model_stats_25.csv')
model_stats_50 = pd.read_csv('Models/model_stats_50.csv')
model_stats_75 = pd.read_csv('Models/model_stats_75.csv')

# Ensemble Prediction
# train_ens_votes, train_ens_pred = ensemble_prediction(
def main():
    # output directory
    output_dir = Path('../output/')

    # set up logging
    output_dir.mkdir(parents=True, exist_ok=True)
    logfile_path = Path(output_dir / "output.log")
    setup_logging(logfile=logfile_path)

    # read the config file
    config_file = Path('../config.ini')
    reading_config(config_file)

    # dataset paths
    rworldnews_path = Path(Config.get("rworldnews"))
    millionnews_path = Path(Config.get("millionnews"))

    # load the dataset
    raw_data = load_data(rworldnews_path)
    #raw_data = load_data(millionnews_path)

    dates, labels, news = clean_data(raw_data)
    id_to_word, word_to_id = dictionary(news, threshold=5)
    training_loader, validation_loader, testing_loader = data_loaders(rworldnews_path)

    # tensorboard

    # load pretrained embeddings
    pretrained_emb_file = Path(Config.get("pretrained_emb_path"))
    pretrained_embeddings, emb_dim = load_pretrained_embeddings(pretrained_emb_file, id_to_word)

    # text classification model
    num_classes = 2
    model = TextClassifier(pretrained_embeddings, emb_dim, num_classes)

    # load the optimizer
    learning_rate = Config.get("learning_rate")
    optimizer = adam_optimizer(model, learning_rate)

    # load the loss function
    criterion = cross_entropy

    # load checkpoint
    checkpoint_file = Path(output_dir / Config.get("checkpoint_file"))
    checkpoint_stocks = load_checkpoint(checkpoint_file)

    # use the available device (gpu/cpu)
    model = model.to(Config.get("device"))
    pretrained_embeddings = pretrained_embeddings.to(Config.get("device"))

    # initialize the model and optimizer from the saved checkpoint
    start_epoch = 1
    if checkpoint_stocks is not None:
        start_epoch = checkpoint_stocks['epoch'] + 1
        model.load_state_dict(checkpoint_stocks['model'])
        optimizer.load_state_dict(checkpoint_stocks['optimizer'])
        logger.info('Initialized model and the optimizer from loaded checkpoint.')
    del checkpoint_stocks

    # stock prediction model
    model = StockPrediction(model, optimizer, criterion, training_loader,
                            validation_loader, testing_loader, output_dir)

    # train and test the model
    epochs = Config.get("epochs")
    validate_every = Config.get("validate_every")
    model.train(epochs, validate_every, start_epoch)
import data
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn import metrics
from sklearn import preprocessing

df = data.load_data('C:\\Users\\catic\\Documents\\EECE 2300\\python\\crime_term_project\\data\\raw\\communities.data.txt')
df2 = data.summarize_data(df)
df_attributes = data.label_data('C:\\Users\\catic\\Documents\\EECE 2300\\python\\crime_term_project\\data\\raw\\communities.attributes.txt', df2)

cleaned_df = data.clean_data(df_attributes)
cleaned_df = cleaned_df.replace('?', np.NaN)
cleaned_df = cleaned_df.dropna(axis=0)

x = cleaned_df.drop(['communityname', 'ViolentCrimesPerPop'], axis=1)
y = cleaned_df['ViolentCrimesPerPop']
x_labels = x.columns
x_as_array = x.values

min_max_scale = preprocessing.MinMaxScaler()
x_scaled = min_max_scale.fit_transform(x_as_array)
x = pd.DataFrame(x_scaled)
x.columns = x_labels

train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=.3, random_state=1)


def linreg(x, y):
    """
    Function for linear regression
args = parser.parse_args()

if (not args.plot_raw_data and not args.plot_standardized_data
        and not args.perform_pca and not args.perform_kmeans):
    parser.print_help()

plt.style.use(args.style)
df = data.read_data(args.data_filepath)

if args.plot_raw_data:
    plotting.plot_all_data(df, "Raw Data", "Raw",
                           os.path.join(args.output_folderpath, "raw"),
                           data.column_names)

if args.plot_standardized_data:
    clean_df = data.clean_data(df)
    plotting.plot_all_data(clean_df, "Standardized Data", "Clean",
                           os.path.join(args.output_folderpath, "clean"),
                           data.column_names)

if args.perform_pca:
    num_dimensions = args.num_dimensions
    projected_df = pca.perform_pca(df, num_dimensions)
    output_path = os.path.join(args.output_folderpath, "PCA")
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    print "Saving Graphs"
    plotting.plot_all_data(projected_df, "PCA {0}-D".format(num_dimensions),
                           "PCA", output_path, projected_df.columns)

if args.perform_kmeans:
# Creating a data frame from scraped content using pandas
dataset = pd.DataFrame(data_content)

# Creating column headings
headers = rows[0].find_all('th')
headers = [header.get_text().strip('\n') for header in headers]
headers += [
    'Total Area', 'Percentage Water', 'Total Nominal GDP', 'Per Capita GDP'
]
dataset.columns = headers

# Dropping columns from dataset that we don't want to use
drop_columns = ['Rank', 'Date', 'Source']
dataset.drop(drop_columns, axis=1, inplace=True)
dataset.sample(3)

dataset.to_csv("dataset.csv", index=False)

# Reading the dataset using pandas
dataset = pd.read_csv("dataset.csv")

# Renaming headings
dataset.rename(columns={'Country(or dependent territory)': 'Country'},
               inplace=True)
dataset.rename(
    columns={'% of worldpopulation': 'Percentage of World Population'},
    inplace=True)
dataset.rename(columns={'Total Area': 'Total Area (km2)'}, inplace=True)

# Formatting data
data.clean_data(dataset)