Example #1
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
import functions as f

# import data
df = f.openfile("data.h5")

periods = df.year.unique()

fig = plt.figure(figsize=(20, 20))

for c, num in zip(periods, range(1, 12)):  # zip caps the plot at 11 periods

    # subset the data for this period
    features = df[df['year'] == c].drop(columns=['year', 'country'])

    ax = fig.add_subplot(5, 3, num)

    # perform the elbow method for this period
    errors = []
    for i in range(1, 15):
        # n_init=10 pins the long-standing default and avoids the
        # FutureWarning raised by newer scikit-learn versions
        kmeans = KMeans(n_clusters=i, n_init=10).fit(features)
        errors.append(kmeans.inertia_)

    ax.plot(range(1, 15), errors)
    ax.set_title('Elbow Method')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia')

plt.tight_layout()
plt.show()
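# The elbow is read off the plots by eye above; a small heuristic sketch
# for picking k numerically (the point where the inertia drop levels off
# fastest, via the largest second difference), assuming the `errors` list
# built in the loop above:
import numpy as np

def elbow_k(inertias, k_values):
    drops = np.diff(inertias)      # inertia decrease between successive k
    curvature = np.diff(drops)     # how sharply that decrease levels off
    return k_values[int(np.argmax(curvature)) + 1]

# e.g. elbow_k(errors, list(range(1, 15))) -> a rough elbow estimate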
Example #2
import functions as f
import matplotlib.pyplot as plt
import numpy as np
import time as t

t0 = t.perf_counter()  # time.clock() was removed in Python 3.8

kplr_id = '008191672'
kplr_file = 'kplr008191672-2013011073258_llc.fits'
jdadj, obsobject, lightdata = f.openfile(kplr_id, kplr_file)

time, flux, flux_err = f.fix_data(lightdata)
flux, variance = f.rescale(flux, flux_err)
time -= np.median(time)

depth = 0.00650010001
width = 0.177046694669

period_interval = np.arange(2.00, 8.0, 0.01)
offset_intervals = np.arange(0.00, 7.2, 0.01)

# Change to numpy arrays to optimize.
# z = [[f.sum_chi_squared(flux, f.box(p,o,depth,width,time),variance) for o in offset_intervals]
# for p in period_interval]

z = []
for p in period_interval:
    line = []
    for o in offset_intervals:
        if o < p:
            line.append(f.sum_chi_squared(flux, f.box(p, o, depth, width, time), variance))
    z.append(line)  # the original snippet truncates here; collecting each row is assumed
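# The "change to numpy arrays" note above suggests vectorising the grid.
# A minimal sketch of the idea, broadcasting over all valid offsets at
# once; `box_model` is a hypothetical plain-NumPy stand-in for f.box,
# whose internals are not shown in this fragment:
def box_model(period, offsets, depth, width, time):
    # Rectangular transit model: flux drops by `depth` wherever the
    # phase-folded time falls inside a box of the given `width`.
    phase = (time[None, :] - offsets[:, None]) % period
    return 1.0 - depth * (phase < width)

def chi_squared_grid(flux, variance, time, periods, offsets, depth, width):
    grid = np.full((len(periods), len(offsets)), np.nan)
    for i, p in enumerate(periods):
        valid = offsets < p  # mirrors the o < p guard in the loop above
        models = box_model(p, offsets[valid], depth, width, time)
        grid[i, valid] = np.sum((flux - models) ** 2 / variance, axis=1)
    return grid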
Example #4
import functions as f
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

data = f.openfile('data.h5')
'''Read in the data, set the target as y and the explanatory variables as x.
Years are converted to strings, since they behave more like categories than numbers here.
Dummy variables are created, and both the independent variables x and the target y are standardised.'''

y = np.array(data['Net migration'])
x = data.drop('Net migration', axis=1)
x['year'] = x['year'].apply(lambda d: str(d.year))  # `d` avoids shadowing the dataframe x
x = pd.get_dummies(x)
features_list = list(x.columns)
x = StandardScaler().fit_transform(x)
y = y.reshape(-1, 1)
y = StandardScaler().fit_transform(y)
'''Next we define the PCA parameters and apply them to the data.
Initially the two main principal components are kept so they can be plotted.
They are stored in the principals dataframe, to which the target variable is added,
giving the result dataframe used for plotting.'''
pca_init = PCA(n_components=2)
pca = pca_init.fit_transform(x)
principals = pd.DataFrame(
    data=pca, columns=['Principal component 1', 'Principal component 2'])
target = pd.DataFrame(data=y, columns=['Target'])
result = pd.concat([principals, target], axis=1)
'''below we visualise the result dataframe to get a better understanding of our components'''
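# A minimal sketch of one way to draw that visualisation: a scatter of the
# two components, coloured by the standardised target (styling assumed):
fig, ax = plt.subplots(figsize=(8, 6))
sc = ax.scatter(result['Principal component 1'],
                result['Principal component 2'],
                c=result['Target'], cmap='viridis', s=10)
ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
fig.colorbar(sc, label='Net migration (standardised)')
plt.show()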
Example #5
from functions import openfile
from numpy import set_printoptions
from functions import savefile
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier

# Column names in the database:
# arrival_date_month, meal, country, market_segment, distribution_channel,
# reserved_room_type, assigned_room_type, customer_type

# Open the database saved with preprocessing alterations

db = openfile('data/step03.csv')

print(db.shape)

# Get the features
array = db.values
X = array[:, 0:8]
Y = array[:, 8]

########################## PCA Method #####################################

# PCA feature extraction
pca = PCA(n_components=4)  # keep the first four principal components
fit = pca.fit(X)
# summarize components
print("\nExplained Variance Ratio : %s" % fit.explained_variance_ratio_)
Example #6
'''This file should be used for cleaning datasets and joining them to make one large pandas dataframe'''
import datetime

import numpy as np
import pandas as pd
import functions as f

countries = f.openfile("iso3.txt")
iso3 = [country.get('alpha-3') for country in countries]

arable = f.openfile("arable land.h5")
migration = f.openfile("migration.h5")
pop_growth = f.openfile("population growth.h5")
rain = f.openfile("rain.h5")
temperature = f.openfile("temperature.h5")
total_pop = f.openfile("total population.h5")

varlist = [arable, temperature, pop_growth, rain, total_pop]
dataset = migration
for var in varlist:
    dataset = pd.merge(dataset, var, on=['year', 'country'])
dataset.columns = dataset.columns.str.replace('%', '%25')

data = pd.DataFrame()
for country in iso3:
    append = dataset.loc[dataset['country'] == str(country)]
    append = append.sort_values(by='year')
    # the aggregation spec is truncated in the original; 5-year means of
    # the data columns are assumed below
    append = append.groupby(np.arange(len(append)) // 5).agg(
        {col: 'mean' for col in append.columns if col not in ('year', 'country')})
    data = pd.concat([data, append], ignore_index=True)
Example #7
import functions as f
import matplotlib.pyplot as plt
import numpy as np

# kplr_id, kplr_filename_list and inj_period are assumed to be defined
# earlier in the original script; only this fragment was preserved.
inj_offset = 0.0
inj_depth = 0.00989188
inj_width = 1.21325

new_lightdata_list = []
time_list = []
raw_flux_list = []
med_flux_list = []
variance_list = []

# Set filter box width
filter_size = 80
# Filter size 80 was chosen during the group meeting on September 26th, 2014.

for i in kplr_filename_list:
    lightdata = f.openfile(kplr_id, i)
    time, flux, flux_err = f.fix_data(lightdata)
    time_list.append(time)
    raw_flux_list.append(flux)
    variance_list.append(flux_err**2)
    # This is where the injection happens
    flux = f.raw_injection(inj_period, inj_offset, inj_depth, inj_width, time,
                           flux)
    # This is where the filtering happens
    median = f.median_filter(flux, filter_size)
    med_flux_list.append(flux / median)
time = np.concatenate(time_list)
raw_flux = np.concatenate(raw_flux_list)
med_flux = np.concatenate(med_flux_list)
variance = np.concatenate(variance_list)

fig1 = plt.figure()
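# The snippet breaks off at fig1; a minimal sketch of one way it might
# continue, plotting the raw and median-normalised flux (labels assumed):
ax1 = fig1.add_subplot(2, 1, 1)
ax1.plot(time, raw_flux, ',k')
ax1.set_ylabel('Raw flux')
ax2 = fig1.add_subplot(2, 1, 2)
ax2.plot(time, med_flux, ',k')
ax2.set_ylabel('Median-normalised flux')
ax2.set_xlabel('Time (days)')
plt.show()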
Example #9
'''this file is used to query the API for the needed data'''

import functions as f

countries = f.openfile("iso3.txt")
iso3 = [country.get('alpha-3') for country in countries]

temperature = f.wbclimate("tas", "year", iso3, export=True, name='temperature')
rain = f.wbclimate("pr", "year", iso3, export=True, name='rain')
migration = f.wbdataset('SM.POP.NETM',
                        iso3,
                        1960,
                        2012,
                        export=True,
                        name='migration')
arable_land = f.wbdataset('AG.LND.ARBL.ZS',
                          iso3,
                          1900,
                          2012,
                          export=True,
                          name='arable land')
pop_growth = f.wbdataset('SP.POP.GROW',
                         iso3,
                         1900,
                         2012,
                         export=True,
                         name='population growth')
Example #10
# -*- coding: utf-8 -*-
##########################################################################
# Project: COMP6004 - Machine learning pipeline for data analysis
# File: 03-featureExtraction.py
# Author: Diego Bueno - [email protected] 
# Date: 20/04/2021
# Description: Applying feature extraction to step03 of ML pipeline.
#
##########################################################################
# Maintenance
# Author:
# Date:
# Description:
#
##########################################################################
import numpy as np
import pandas as pd
from functions import openfile
from functions import savefile
from functions import convert
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import mean_absolute_error
# Call the function to load the data prepared in the earlier steps
print("\nReading the step02 file\n")
db = openfile('data/step02.csv')
print("\nChecking the current shape of the data:")
rows, columns = db.shape
print(str(rows) + " rows and " + str(columns) + " columns")
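# The imports above suggest the script continues with a VarianceThreshold
# pass; a hedged sketch of that step on the numeric columns of the frame:
selector = VarianceThreshold(threshold=0.0)  # drop zero-variance columns
reduced = selector.fit_transform(db.select_dtypes(include=[np.number]))
print("\nShape after removing zero-variance features:", reduced.shape)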
Example #11
# -*- coding: utf-8 -*-
##########################################################################
# Project: COMP6004 - Machine learning pipeline for data analysis
# File: 06-visualisation.py
# Author: Diego Bueno - [email protected]
# Date: 26/04/2021
# Description: Visualising results of the ML pipeline.
#
##########################################################################
# Maintenance
# Author:
# Date:
# Description:
#
##########################################################################
import numpy as np
import pandas as pd
from functions import openfile
import matplotlib.pyplot as plt
# Call the function to load the data prepared in the earlier steps
print("\nReading the step06 file\n")
_df = openfile('data/step06.csv')
print("\nChecking the current shape of the data:")
rows, columns = _df.shape
print(str(rows) + " rows and " + str(columns) + " columns")
print("\nBrief summary of data:\n")
print(_df.head(12))
lenDf = len(_df.index)
if lenDf > 0:
    jet = plt.get_cmap('jet')
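    # The snippet breaks off here; a hedged sketch of one way the colormap
    # might be used, drawing one line per numeric column of the results
    # frame (the actual columns depend on data/step06.csv):
    colors = jet(np.linspace(0, 1, columns))
    for i, col in enumerate(_df.select_dtypes('number').columns):
        plt.plot(_df.index, _df[col], color=colors[i], label=col)
    plt.legend(loc='best', fontsize='small')
    plt.show()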
Example #12
# -*- coding: utf-8 -*-
##########################################################################
# Project: COMP6004 - Machine learning pipeline for data analysis
# File: 05-post-processing.py
# Author: Vanessa Gomes - [email protected]
# Date: 20/04/2021
# Description: Libraries used to dimensionality reduction process
#
##########################################################################
# Maintenance
# Author: Diego Bueno
# Date:26/04/2021
# Description: Adaption to the final processed data.
#
##########################################################################
import pandas as pd
import numpy as np
from functions import openfile
from numpy import set_printoptions
from functions import savefile
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
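# The fragment preserved above is only the import block; a hedged sketch of
# the kind of step these imports point to (polynomial regression evaluated
# with a train/test split), shown on synthetic data since the real X and y
# are not part of this fragment:
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = X[:, 0] ** 2 + rng.normal(scale=0.1, size=200)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
poly = PolynomialFeatures(degree=2)
model = LinearRegression().fit(poly.fit_transform(X_train), y_train)
pred = model.predict(poly.transform(X_test))
print("MAE: %.4f" % mean_absolute_error(y_test, pred))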