import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans

import functions as f

# import data
df = f.openfile("data.h5")
periods = df.year.unique()

fig = plt.figure(figsize=(20, 20))
for c, num in zip(periods, range(1, 12)):
    # subset every period
    features = df[df['year'] == c].drop(columns=['year', 'country'])
    ax = fig.add_subplot(5, 3, num)
    # perform elbow method: fit k-means for k = 1..14 and record the inertia
    Error = []
    for i in range(1, 15):
        kmeans = KMeans(n_clusters=i).fit(features)
        Error.append(kmeans.inertia_)
    ax.plot(range(1, 15), Error)
    # label each subplot's own axes (plt.title and friends would only hit
    # the current axes)
    ax.set_title('Elbow Method')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')
plt.tight_layout()
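# A possible follow-up (a sketch, not part of the original script): pick the
# elbow automatically as the k where the inertia curve bends most sharply,
# i.e. at the largest second difference of the recorded inertias.
# import numpy as np
# second_diff = np.diff(Error, n=2)
# elbow_k = int(np.argmax(second_diff)) + 2  # +2 compensates for double differencing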
import matplotlib.pyplot as plt
import numpy as np
import time as t

import functions as f

t0 = t.perf_counter()  # time.clock() was removed in Python 3.8

kplr_id = '008191672'
kplr_file = 'kplr008191672-2013011073258_llc.fits'

jdadj, obsobject, lightdata = f.openfile(kplr_id, kplr_file)
time, flux, flux_err = f.fix_data(lightdata)
flux, variance = f.rescale(flux, flux_err)
time -= np.median(time)

depth = 0.00650010001
width = 0.177046694669
period_interval = np.arange(2.00, 8.0, 0.01)
offset_intervals = np.arange(0.00, 7.2, 0.01)

# Change to numpy arrays to optimize.
# z = [[f.sum_chi_squared(flux, f.box(p, o, depth, width, time), variance)
#       for o in offset_intervals] for p in period_interval]
z = []
for p in period_interval:
    line = []
    for o in offset_intervals:
        if o < p:
            line.append(f.sum_chi_squared(flux, f.box(p, o, depth, width, time),
                                          variance))
        else:
            # the original file was cut off here; padding with NaN (an
            # assumption) keeps every row the same length
            line.append(np.nan)
    z.append(line)
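# A vectorised variant of the grid above (a sketch only, assuming f.box and
# f.sum_chi_squared take scalar period/offset exactly as in the loop; the
# saving comes from preallocating a NumPy array instead of growing lists):
# z = np.full((period_interval.size, offset_intervals.size), np.nan)
# for i, p in enumerate(period_interval):
#     mask = offset_intervals < p
#     z[i, mask] = [f.sum_chi_squared(flux, f.box(p, o, depth, width, time), variance)
#                   for o in offset_intervals[mask]]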
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import functions as f

data = f.openfile('data.h5')

'''Read in the data, setting the target values as y and the variables as x.
Years are converted to strings (they are categorical rather than numerical
in this instance) before dummy variables are created, and both the
independent variables x and the dependent (target) variable y are
standardised.'''
y = np.array(data['Net migration'])
x = data.drop('Net migration', axis=1)
x['year'] = x['year'].apply(lambda d: str(d.year))
x = pd.get_dummies(x)
features_list = list(x.columns)
x = StandardScaler().fit_transform(x)
y = y.reshape(-1, 1)
y = StandardScaler().fit_transform(y)

'''Next we define the PCA parameters and apply them to the data. Initially
the two main principal components are kept so they can be plotted. They are
stored in the principals dataframe, to which the target variable is added to
obtain the result dataframe used for plotting.'''
pca_init = PCA(n_components=2)
pca = pca_init.fit_transform(x)
principals = pd.DataFrame(
    data=pca, columns=['Principal component 1', 'Principal component 2'])
target = pd.DataFrame(data=y, columns=['Target'])
result = pd.concat([principals, target], axis=1)

'''Below we visualise the result dataframe to get a better understanding of
our components.'''
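# A minimal plotting sketch (an assumption: the original visualisation code
# was cut off after the docstring above):
plt.scatter(result['Principal component 1'], result['Principal component 2'],
            c=result['Target'], cmap='viridis', s=10)
plt.xlabel('Principal component 1')
plt.ylabel('Principal component 2')
plt.colorbar(label='Standardised net migration')
plt.show()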
from numpy import set_printoptions
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression

from functions import openfile
from functions import savefile

# Names of the columns in the database:
# arrival_date_month, meal, country, market_segment, distribution_channel,
# reserved_room_type, assigned_room_type, customer_type

# Open database saved with preprocessing alterations
db = openfile('data/step03.csv')
print(db.shape)

# Get the features
array = db.values
X = array[:, 0:8]
Y = array[:, 8]

########################## PCA Method #####################################
# PCA feature extraction
pca = PCA(n_components=4)
# Select the most important features
fit = pca.fit(X)
# summarize components
print("\nExplained Variance Ratio : %s" % fit.explained_variance_ratio_)
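########################## Univariate Selection ###########################
# The remaining imports (SelectKBest, RFE, ExtraTreesClassifier) suggest the
# file continued with further selection methods; a sketch of the univariate
# step, assuming step03's features are already numeric after preprocessing:
test = SelectKBest(score_func=f_classif, k=4)
kbest = test.fit(X, Y)
set_printoptions(precision=3)
print("\nFeature scores : %s" % kbest.scores_)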
'''This file should be used for cleaning datasets and joining them to make
one large pandas dataframe.'''
import datetime

import numpy as np
import pandas as pd

import functions as f

countries = f.openfile("iso3.txt")
iso3 = []
for country in countries:
    iso3.append(country.get('alpha-3'))

arable = f.openfile("arable land.h5")
migration = f.openfile("migration.h5")
pop_growth = f.openfile("population growth.h5")
rain = f.openfile("rain.h5")
temperature = f.openfile("temperature.h5")
total_pop = f.openfile("total population.h5")

varlist = [arable, temperature, pop_growth, rain, total_pop]
dataset = migration
for var in varlist:
    dataset = pd.merge(dataset, var, on=['year', 'country'])
dataset.columns = dataset.columns.str.replace('%', '%25')

data = pd.DataFrame()
for country in iso3:
    append = dataset.loc[dataset['country'] == str(country)]
    append = append.sort_values(by='year')
    # the per-column aggregation dict was cut off in the original; a plain
    # five-year mean and accumulation into `data` are assumed here
    append = append.groupby(np.arange(len(append)) // 5).mean(numeric_only=True)
    data = pd.concat([data, append], ignore_index=True)
import matplotlib.pyplot as plt
import numpy as np

import functions as f

# inj_period, kplr_id and kplr_filename_list come from earlier in this file
inj_offset = 0.0
inj_depth = 0.00989188
inj_width = 1.21325

new_lightdata_list = []
time_list = []
raw_flux_list = []
med_flux_list = []
variance_list = []

# Set filter box width
filter_size = 80
# Filter size 80 was chosen during the group meeting on September 26th, 2014.

for i in kplr_filename_list:
    lightdata = f.openfile(kplr_id, i)
    time, flux, flux_err = f.fix_data(lightdata)
    time_list.append(time)
    raw_flux_list.append(flux)
    variance_list.append(flux_err**2)
    # This is where the injection happens
    flux = f.raw_injection(inj_period, inj_offset, inj_depth, inj_width,
                           time, flux)
    # This is where the filtering happens
    median = f.median_filter(flux, filter_size)
    med_flux_list.append(flux / median)

time = np.concatenate(time_list)
raw_flux = np.concatenate(raw_flux_list)
med_flux = np.concatenate(med_flux_list)
variance = np.concatenate(variance_list)

fig1 = plt.figure()
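# For reference, the box median filter step could be reproduced with SciPy
# (an illustration only; f.median_filter is the project's own implementation
# and may differ in edge handling):
# from scipy.ndimage import median_filter
# median = median_filter(flux, size=filter_size, mode='nearest')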
'''This file is used to query the API for the needed data.'''
import functions as f

countries = f.openfile("iso3.txt")
iso3 = []
for country in countries:
    iso3.append(country.get('alpha-3'))

temperature = f.wbclimate("tas", "year", iso3, export=True,
                          name='temperature')
rain = f.wbclimate("pr", "year", iso3, export=True, name='rain')
migration = f.wbdataset('SM.POP.NETM', iso3, 1960, 2012,
                        export=True, name='migration')
arable_land = f.wbdataset('AG.LND.ARBL.ZS', iso3, 1900, 2012,
                          export=True, name='arable land')
pop_growth = f.wbdataset('SP.POP.GROW', iso3, 1900, 2012,
                         export=True, name='population growth')
# -*- coding: utf-8 -*-
##########################################################################
# Project: COMP6004 - Machine learning pipeline for data analysis
# File: 03-featureExtraction.py
# Author: Diego Bueno - [email protected]
# Date: 20/04/2021
# Description: Applying feature extraction to step03 of ML pipeline.
#
##########################################################################
# Maintenance
# Author:
# Date:
# Description:
#
##########################################################################
import numpy as np
import pandas as pd
from functions import openfile
from functions import savefile
from functions import convert
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error

# calling the function to load the data pre-read in task 1
print("\nReading the step02 file\n")
db = openfile('data/step02.csv')

print("\nChecking the current shape of the data:")
rows, columns = db.shape
print(str(rows) + " rows and " + str(columns) + " columns")
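# The imports above point at variance-based feature selection as the next
# step; a sketch under that assumption (the features/target column split
# mirrors the other pipeline files and is a placeholder):
# selector = VarianceThreshold(threshold=0.0)
# reduced = selector.fit_transform(db.values[:, :-1])
# print("\nFeatures kept after VarianceThreshold: %d" % reduced.shape[1])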
# -*- coding: utf-8 -*-
##########################################################################
# Project: COMP6004 - Machine learning pipeline for data analysis
# File: 06-visualisation.py
# Author: Diego Bueno - [email protected]
# Date: 26/04/2021
# Description: Visualising results of the ML pipeline.
#
##########################################################################
# Maintenance
# Author:
# Date:
# Description:
#
##########################################################################
import numpy as np
import pandas as pd
from functions import openfile
import matplotlib.pyplot as plt

# calling the function to load the data pre-read in task 1
print("\nReading the step06 file\n")
_df = openfile('data/step06.csv')

print("\nChecking the current shape of the data:")
rows, columns = _df.shape
print(str(rows) + " rows and " + str(columns) + " columns")

print("\nBrief summary of data:\n")
print(_df.head(12))

lenDf = len(_df.index)
if lenDf > 0:
    jet = plt.get_cmap('jet')
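    # The plotting body was cut off in the original; one way the jet colormap
    # might have been used (the column choice here is a placeholder):
    # colors = jet(np.linspace(0, 1, lenDf))
    # plt.scatter(range(lenDf), _df.iloc[:, 0], c=colors)
    # plt.show()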
# -*- coding: utf-8 -*-
##########################################################################
# Project: COMP6004 - Machine learning pipeline for data analysis
# File: 05-post-processing.py
# Author: Vanessa Gomes - [email protected]
# Date: 20/04/2021
# Description: Libraries used for the dimensionality reduction process
#
##########################################################################
# Maintenance
# Author: Diego Bueno
# Date: 26/04/2021
# Description: Adaptation to the final processed data.
#
##########################################################################
import pandas as pd
import numpy as np
from functions import openfile
from numpy import set_printoptions
from functions import savefile
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
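# The file ends after the imports; a sketch of how they might combine in a
# polynomial-regression post-processing step (an assumption based on the
# imports alone; the file path and column split are placeholders):
# db = openfile('data/step05.csv')
# X, y = db.values[:, :-1], db.values[:, -1]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# poly = PolynomialFeatures(degree=2)
# model = LinearRegression().fit(poly.fit_transform(X_train), y_train)
# print(mean_absolute_error(y_test, model.predict(poly.transform(X_test))))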