Code example #1
import utils  # local helper module providing series_to_supervised()


def transform_values(data, n_lags, n_series, dim):
    """Frame the series as supervised learning and split it into train/test sets."""
    reframed = utils.series_to_supervised(data, n_lags, n_series)

    # Index separating the training rows from the last (25 - n_series) test rows
    # wall = 200 - (n_lags - 1)
    wall = len(reframed) - 25 + n_series

    values = reframed.values
    n_features = data.shape[1]
    n_obs = n_lags * n_features

    cols = ['var1(t)']
    cols += ['var1(t+%d)' % (i) for i in range(1, n_series)]
    y_o = reframed[cols].values

    train_X, train_y = values[:wall, :n_obs], y_o[:wall, -n_series:]
    test_X, test_y = values[wall:, :n_obs], y_o[wall:, -n_series:]

    if dim:
        # Reshape inputs to 3D [samples, timesteps, features], as expected by an LSTM
        train_X = train_X.reshape((train_X.shape[0], n_lags, n_features))
        test_X = test_X.reshape((test_X.shape[0], n_lags, n_features))
    return train_X, test_X, train_y, test_y
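
Every snippet in this collection calls a series_to_supervised() helper that is not reproduced here. As a reference, a minimal sketch of what such a helper typically looks like follows; the column names ('var1(t-1)', 'var1(t)', 'var1(t+1)', ...) match the ones used above, but the exact implementation in utils may differ.

import pandas as pd


def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a (possibly multivariate) time series as a supervised-learning table."""
    df = pd.DataFrame(data)
    n_vars = df.shape[1]
    cols, names = [], []
    # Input sequence: t-n_in, ..., t-1
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # Forecast sequence: t, t+1, ..., t+n_out-1
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        names += [('var%d(t)' % (j + 1)) if i == 0 else ('var%d(t+%d)' % (j + 1, i))
                  for j in range(n_vars)]
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg.dropna(inplace=True)
    return agg
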
Code example #2
# plt.plot(dataset)
# plt.xlabel('time(ms)')
# plt.subplot(2,1,2)
# plt.semilogy(Pxx_den)
# plt.xlabel('frequency (hz)')
#plt.xlim(0,100)

import numpy as np

from utils import series_to_supervised, fit_lstm, forecast_lstm, make_forecasts

# Series to supervised
num_lookback = 200
num_predict = 10

# n_in is the number of samples for "input" and n_out is the number for "output", or prediction
supervised_dataset = series_to_supervised(dataset,
                                          n_in=num_lookback,
                                          n_out=num_predict)
sup_ds = supervised_dataset.values

# Scale: center each supervised row on its own mean and divide by its own range
row_mean = sup_ds.mean(axis=1, keepdims=True)
row_max = sup_ds.max(axis=1, keepdims=True)
row_min = sup_ds.min(axis=1, keepdims=True)

scl_sup_ds = (sup_ds - row_mean) / (row_max - row_min)

# Train test split
N_train = int(0.8 * np.size(scl_sup_ds, 0))

scaled_train = scl_sup_ds[0:N_train, :]
scaled_test = scl_sup_ds[N_train:, :]
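
Because the scaling above is per row, forecasts made on the scaled data have to be mapped back with the same row statistics. A minimal sketch, assuming scaled model predictions scaled_pred aligned row-wise with scaled_test (scaled_pred is an assumed name, not part of the snippet above):

# Hedged sketch: undo the per-row scaling for predictions on the test rows
test_mean = row_mean[N_train:]
test_range = (row_max - row_min)[N_train:]
unscaled_pred = scaled_pred * test_range + test_mean
unscaled_true = scaled_test[:, -num_predict:] * test_range + test_mean
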
Code example #3
# %%%%% Gating Network - Perceptron %%%%%%%%%%%%%%%%%%%%
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Xtr - training input
# Ytr - training output
# Wg - gating network
# W - experts

import os

import pandas

# series_to_supervised() and k_fold() are assumed to be defined earlier in this script
filename = os.path.realpath(
    os.path.join(os.getcwd(),
                 os.path.dirname('treinamento.txt'))) + '/treinamento-1.txt'
series = pandas.read_csv(filename, header=None)

if __name__ == "__main__":

    D = series_to_supervised(series, 21).values
    k = 4

    folded_partitions = k_fold(D, k)

    for partition in folded_partitions:
        Dtr = folded_partitions[partition][0]
        Dv = folded_partitions[partition][1]

        Xtr = Dtr[:, 0:-1]
        Ytr = Dtr[:, -1].reshape(Xtr.shape[0], 1)

        Xv = Dv[:, 0:-1]
        Yv = Dv[:, -1].reshape(Xv.shape[0], 1)

        m = 6
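
k_fold() is not shown in this snippet. A minimal sketch consistent with the loop reproduced in Code example #8 (split D row-wise into k contiguous folds, then pair each fold as a validation set with the concatenation of the others) might look like this; the real helper may shuffle rows or size the folds differently.

import numpy as np


def k_fold(D, k):
    """Return {fold_index: [training_rows, validation_rows]} for k contiguous folds of D."""
    fold_size = len(D) // k
    folds = {i: D[(i - 1) * fold_size:i * fold_size] for i in range(1, k + 1)}

    folded_partitions = {}
    for fold in folds:
        # Use each fold once for validation, the concatenation of the rest for training
        train_folds = [folds[x] for x in range(1, k + 1) if x != fold]
        folded_partitions[fold] = [np.concatenate(train_folds), folds[fold]]
    return folded_partitions
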
Code example #4
plt.plot(time, LFP[:, 0])
plt.plot(time, LFP_filt)
plt.xlabel('time (ms)')
plt.subplot(2, 1, 2)
plt.semilogy(f, Pxx_den)
plt.xlabel('frequency (Hz)')
plt.show()
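
# Note (assumption, not in the original excerpt): f and Pxx_den plotted above are typically the
# output of a power spectral density estimate of the raw LFP channel, e.g. Welch's method:
#     from scipy import signal
#     f, Pxx_den = signal.welch(LFP[:, 0], fs=sampling_rate_hz, nperseg=1024)
# where sampling_rate_hz is the recording's sampling rate.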

############# Process data for Neural Network ##############

# Series to supervised
num_lookback = 100
num_predict = 15

# n_in is the number of samples for "input" and n_out is the number for "output", or prediction
supervised_dataset = series_to_supervised(LFP_filt[:, 0:1], n_in=num_lookback, n_out=num_predict)
sup_ds_filt = supervised_dataset.values

supervised_dataset = series_to_supervised(LFP[:, 0:1], n_in=num_lookback, n_out=num_predict)
sup_ds_raw = supervised_dataset.values


# Train the LSTM
model_lstm = fit_lstm(sup_ds_filt, num_predict, 10, nb_epoch=1, n_neurons=1000)
model_lstm.summary()

# Make forecasts on test data
scaled_forecasts = make_forecasts(model_lstm, 1, sup_ds_filt, num_predict)

# Make persistence forecasts: repeat the last observed value num_predict times for every sample
y_pers_test = np.transpose(np.tile(sup_ds_filt[:, -num_predict - 1], (num_predict, 1)))
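
A hedged sketch of how the two sets of forecasts could be compared, assuming scaled_forecasts is an array shaped (n_samples, num_predict) and the true targets are the last num_predict columns of sup_ds_filt (the names come from the snippet above, the shapes are assumptions):

y_true = sup_ds_filt[:, -num_predict:]
rmse_lstm = np.sqrt(np.mean((np.asarray(scaled_forecasts) - y_true) ** 2, axis=0))
rmse_pers = np.sqrt(np.mean((y_pers_test - y_true) ** 2, axis=0))
print('RMSE per horizon (LSTM):        ', rmse_lstm)
print('RMSE per horizon (persistence): ', rmse_pers)
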
Code example #5
    def __init__(self, csv_file: str = '../data/daily_MSFT.csv', use_keras: bool = False,
                 index_of_plotted_feature: int = 0, num_of_previous_days: int = 7, num_of_future_days: int = 3,
                 num_of_hidden_neurons: int = 256, train_percentage: int = 80, bias_term: int = 1):
        """
        Constructor parses the CSV file and initializes the training and test data

        Parameters
        ----------
        csv_file: str
            The path to the CSV file (from the AlphaVantage API) to be parsed
        use_keras: bool
            If true, the model will be a "standard" neural network implemented using the Keras API; if false,
            the model will be an ELM
        index_of_plotted_feature: int
            Index of the feature that will be plotted after training and predicting. This number is between 0 and 4:
            0: Open, 1: High, 2: Low, 3: Close, 4: Volume. Note: 4 (Volume) is currently not supported.
        num_of_previous_days: int
            The number of previous days needed in order to make a single prediction
        num_of_future_days: int
            Given num_of_previous_days, we predict num_of_future_days future days
        num_of_hidden_neurons: int
            Number of neurons in the hidden layer
        train_percentage: int
            Number between 0 and 100 that represents the percentage of the CSV file data that will be training data;
            the remainder will be test data
        bias_term: int
            Value used for the bias column that is prepended to the X data matrix
        """
        df = pd.read_csv(csv_file)  # By default header will be read from file
        # print('Head of data frame: \n' + str(df.head()))
        # print('Dimensions of data frame (row x col)' + str(df.shape))
        self.index_of_plotted_feature = index_of_plotted_feature
        self.plotted_feature_str = \
            {0: 'Open', 1: 'High', 2: 'Low', 3: 'Close', 4: 'Volume'}[self.index_of_plotted_feature]
        # Stores the bias (an integer)
        self.bias = bias_term
        self.use_keras = use_keras
        self.model_type_str = None
        self.num_of_rows_in_csv = int(df.shape[0])
        # Assume 4 features because volume is ignored
        self.num_features = 4
        self.num_hidden_layer_neurons = num_of_hidden_neurons
        self.num_prev_timesteps = num_of_previous_days
        self.num_future_timesteps = num_of_future_days
        # Stores the amount of columns in a row that are needed to make a future prediction
        self.num_prev_attributes = self.num_prev_timesteps * self.num_features
        # Stores the amount of columns in a row that represent a future prediction
        self.num_future_attributes = self.num_future_timesteps * self.num_features
        # Initialize a NumPy array of values that will store our stock data
        self.df_values = df[['open', 'high', 'low', 'close']].values
        # Frame our data as rolling window series prediction using series_to_supervised() method
        self.reframed_x_y = utils.series_to_supervised(self.df_values, self.num_prev_timesteps,
                                                       self.num_future_timesteps)
        # Convert the Pandas DataFrame to NumPy array
        self.x_y_values = self.reframed_x_y.values
        self.plotable_dates = utils.convert_to_matplot_dates(df)
        self.plotable_dates = np.reshape(self.plotable_dates, newshape=(-1, 1)).copy()
        self.reframed_dates = utils.series_to_supervised(self.plotable_dates, self.num_prev_timesteps,
                                                         self.num_future_timesteps)
        # Initialize variables that will store the Mean Squared Error
        self.mse_train_cost = -1
        self.mse_test_cost = -1
        # All of the variables whose name contains "plotable" are standard Python lists that contain
        # values of a specific attribute (for example "Close") as specified by index_of_plotted_feature
        self.plotable_y_train_real = None
        self.plotable_y_train_pred = None
        self.plotable_y_test_real = None
        self.plotable_y_test_pred = None
        self.plotable_y_future_pred = None
        # The amount of rows in our matrix
        self.data_size = self.x_y_values.shape[0]
        # Separate our data into training and test sets
        self.training_set_size = int((train_percentage / 100) * self.data_size)
        self.test_set_size = int(self.data_size - self.training_set_size)
        print('Rolling window model: Training set size: ' + str(self.training_set_size))
        print('Rolling window model: Test set size: ' + str(self.test_set_size))
        # Below we store the integers that matplotlib uses for the plot_date() method so we can plot
        # dates on the X-axis in a neat way
        self.train_dates = self.reframed_dates.values[self.test_set_size:, -1:].flatten()
        self.test_dates = self.reframed_dates.values[:self.test_set_size, -1:].flatten()
        self.model = None

        # In our y, each row contains more than 1 time step worth of data (due to rolling window prediction)
        # However it makes sense only to plot 1 y in each row
        # Therefore we only plot the last prediction of y given a sequence of values
        # For example if num_future_timesteps is 5, then from each row plot the selected feature at t+4
        # (Since then our y would contain predictions for t, t+1, t+2, t+3, t+4)

        # If we predict 2 future days given 1 day, then one row in self.x_y_values would have the following format:
        # [Open(t-1), High(t-1), Low(t-1), Close(t-1), Open(t), High(t), Low(t), Close(t), Open(t+1), High(t+1), Low(t+1), Close(t+1)]
        # self.num_prev_attributes in this example is 4, because the first 4 values in the row represent the data needed to predict the future
        # self.num_future_attributes would be 8, because the last 8 values represent all the future data
        if self.use_keras:
            self.model_type_str = 'Keras'
            self.x_tr = self.x_y_values[self.test_set_size:, :self.num_prev_attributes]
            self.x_te = self.x_y_values[:self.test_set_size, :self.num_prev_attributes]
            self.y_tr = self.x_y_values[self.test_set_size:, -self.num_future_attributes:]
            self.y_te = self.x_y_values[:self.test_set_size, -self.num_future_attributes:]
            self.plotable_y_train_real = self.y_tr[:, -(self.num_features + self.index_of_plotted_feature)].flatten()
            self.plotable_y_test_real = self.y_te[:, -(self.num_features + self.index_of_plotted_feature)].flatten()
            self.input_shape = (self.num_prev_attributes,)
            self.model = Sequential()
            self.model.add(Dense(self.num_hidden_layer_neurons, input_shape=self.input_shape, activation='linear'))
            self.model.add(Dense(self.num_future_attributes))
            self.model.compile(loss='mse', optimizer='adam')
            print('Created a keras model for disjoint X and Y stock data, summary: ')
            self.model.summary()
        else:
            self.model_type_str = 'ELM'
            self.x_tr = np.mat(self.x_y_values[self.test_set_size:, :self.num_prev_attributes])
            self.x_te = np.mat(self.x_y_values[:self.test_set_size, :self.num_prev_attributes])
            self.y_tr = np.mat(self.x_y_values[self.test_set_size:, -self.num_future_attributes:])
            self.y_te = np.mat(self.x_y_values[:self.test_set_size, -self.num_future_attributes:])
            self.plotable_y_train_real = \
                self.y_tr[:, -(self.num_features + self.index_of_plotted_feature)].flatten().tolist()[0]
            self.plotable_y_test_real = \
                self.y_te[:, -(self.num_features + self.index_of_plotted_feature)].flatten().tolist()[0]
            # Add bias term of ones to test X and train X
            self.x_tr = np.concatenate((np.ones(shape=(self.x_tr.shape[0], 1)) * self.bias, self.x_tr), axis=1)
            self.x_te = np.concatenate((np.ones(shape=(self.x_te.shape[0], 1)) * self.bias, self.x_te), axis=1)
            # Our beta will be a weight matrix between the hidden and output layer
            self.input_layer_weights = np.mat(
                utils.rand_init(shape=(self.num_hidden_layer_neurons, self.num_prev_attributes + 1)))
            self.beta = None
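    # Hedged sketch (not part of the original source): for the ELM branch, training typically
    # amounts to computing the hidden-layer activations H and solving for beta with a
    # pseudo-inverse in a later training step, e.g.:
    #
    #     H = np.tanh(self.x_tr @ self.input_layer_weights.T)   # (n_samples, n_hidden)
    #     self.beta = np.linalg.pinv(H) @ self.y_tr              # (n_hidden, n_future_attributes)
    #     y_train_pred = H @ self.beta
    #
    # np.linalg.pinv gives the Moore-Penrose pseudo-inverse, so beta is the least-squares
    # solution of H @ beta = y_tr.
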
    def __init__(self,
                 csv_file: str = '../data/daily_MSFT.csv',
                 index_of_plotted_feature: int = 0,
                 num_of_previous_days: int = 7,
                 num_of_future_days: int = 3,
                 num_of_hidden_neurons: int = 256,
                 train_percentage: int = 80):
        self.dataset = read_csv(csv_file, header=0)
        values = self.dataset[['open', 'high', 'low', 'close',
                               'volume']].values
        values = values.astype('float32')
        self.num_of_prev_timesteps = num_of_previous_days
        self.num_of_future_timesteps = num_of_future_days
        self.num_features = 5
        self.num_prev_objs = self.num_features * self.num_of_prev_timesteps
        self.num_future_objs = self.num_features * self.num_of_future_timesteps
        # open = 0, high = 1, low = 2, close = 3, volume = 4
        self.index_of_plotted_feature = index_of_plotted_feature  # The feature that shall be plotted (all will be predicted)
        self.plotted_feature_str = \
            {0: 'Open', 1: 'High', 2: 'Low', 3: 'Close', 4: 'Volume'}[self.index_of_plotted_feature]
        self.num_rolling_days_ahead = 30

        self.unscaled_values = values.copy()
        # normalize features
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.scaled = self.scaler.fit_transform(values)

        # frame as supervised learning
        self.reframed = utils.series_to_supervised(
            self.scaled, self.num_of_prev_timesteps,
            self.num_of_future_timesteps)
        reframed_unscaled = utils.series_to_supervised(
            self.unscaled_values, self.num_of_prev_timesteps,
            self.num_of_future_timesteps)
        self.unscaled_values = reframed_unscaled.values
        self.plotable_dates = utils.convert_to_matplot_dates(self.dataset)
        self.plotable_dates = np.reshape(self.plotable_dates,
                                         newshape=(-1, 1)).copy()
        self.reframed_dates = utils.series_to_supervised(
            self.plotable_dates, self.num_of_prev_timesteps,
            self.num_of_future_timesteps)
        self.scaled_values = self.reframed.values  # Extract numpy array from a pandas DataFrame

        self.plotable_y_train_real = None
        self.plotable_y_train_pred = None
        self.plotable_y_test_real = None
        self.plotable_y_test_pred = None
        self.plotable_y_future_pred = None

        self.mse_train_cost = -1
        self.mse_test_cost = -1

        self.last_observations = self.scaled_values[0, -self.num_prev_objs:]

        self.data_size = self.reframed.shape[0]

        # Training set is train_percentage% of the examples (80% by default)
        self.training_set_size = int((train_percentage / 100) * self.data_size)
        self.test_set_size = self.data_size - self.training_set_size

        self.train_dates = self.reframed_dates.values[self.test_set_size:,
                                                      -1:].flatten()
        self.test_dates = self.reframed_dates.values[:self.test_set_size,
                                                     -1:].flatten()

        # split into input and outputs
        # Training set contains the older (time-wise) part of data
        self.training_set_x_y = self.scaled_values[self.test_set_size:, :]
        # Test set has the newer (time-wise) part of data
        self.test_set_x_y = self.scaled_values[:self.test_set_size, :]
        print(self.dataset.head())
        self.unscaled_train_x_y = self.unscaled_values[self.test_set_size:, :]
        self.unscaled_test_x_y = self.unscaled_values[:self.test_set_size, :]
        unscaled_train_y = self.unscaled_train_x_y[:, -self.num_future_objs:]
        unscaled_test_y = self.unscaled_test_x_y[:, -self.num_future_objs:]

        self.train_x = self.training_set_x_y[:, :self.num_prev_objs]
        self.train_y = self.training_set_x_y[:, -self.num_future_objs:]
        self.test_x = self.test_set_x_y[:, :self.num_prev_objs]
        self.test_y = self.test_set_x_y[:, -self.num_future_objs:]

        self.plotable_y_train_real = unscaled_train_y[:, -(
            self.num_features + self.index_of_plotted_feature)].flatten()
        self.plotable_y_test_real = unscaled_test_y[:, -(
            self.num_features + self.index_of_plotted_feature)].flatten()

        # reshape input to be 3D [samples, timesteps, features] as expected by LSTM
        self.train_x = self.train_x.reshape(self.train_x.shape[0],
                                            self.num_of_prev_timesteps,
                                            self.num_features)
        self.test_x = self.test_x.reshape(self.test_x.shape[0],
                                          self.num_of_prev_timesteps,
                                          self.num_features)
        self.last_observations_reshaped = self.last_observations.reshape(
            1, self.num_of_prev_timesteps, self.num_features)

        print('LSTM Training input size: ' + str(self.train_x.shape) + '\n' +
              'Training output size: ' + str(self.train_y.shape) + '\n' +
              'Test input size: ' + str(self.test_x.shape) + '\n' +
              'Test output size: ' + str(self.test_y.shape))
        print('test_set_x_y: \n' + str(self.test_set_x_y))

        # Create LSTM model
        self.model = Sequential()
        self.model.add(
            LSTM(num_of_hidden_neurons,
                 input_shape=(self.train_x.shape[1], self.train_x.shape[2])))
        self.model.add(Dense(self.num_future_objs))
        self.model.compile(loss='mae', optimizer='adam')
        self.model.summary()
    def __init__(self, csv_file: str = '../data/daily_MSFT.csv'):
        self.dataset = read_csv(csv_file, header=0)
        print('CSV columns: ' + str(self.dataset.columns.tolist()))
        values = self.dataset[['open', 'high', 'low', 'close',
                               'volume']].values
        values = values.astype('float32')
        self.num_of_prev_timesteps = 7
        self.num_of_future_timesteps = 2
        self.num_features = 5
        self.num_prev_objs = self.num_features * self.num_of_prev_timesteps
        self.num_future_objs = self.num_features * self.num_of_future_timesteps
        # open = 0, high = 1, low = 2, close = 3, volume = 4
        self.index_of_plotted_feature = 0  # The feature that shall be plotted (all will be predicted)
        self.num_rolling_days_ahead = 30

        # normalize features
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        print('Shape of values before transforming: ' + str(values.shape))
        self.scaled = self.scaler.fit_transform(values)

        # frame as supervised learning
        self.reframed = utils.series_to_supervised(
            self.scaled, self.num_of_prev_timesteps,
            self.num_of_future_timesteps)

        print('reframed: \n' + str(self.reframed.head()))
        self.scaled_values = self.reframed.values  # Extract numpy array from a pandas DataFrame

        print('Dim of scaled_values: ' + str(self.scaled_values.shape))
        self.last_observations = self.scaled_values[:self.num_of_future_timesteps,
                                                    -self.num_prev_objs:]
        print('last_observations: \n' + str(self.last_observations))

        self.data_size = self.reframed.shape[0]
        print('Data size: ' + str(self.data_size))
        # Training set is 80% of the examples
        self.training_set_size = int(0.8 * self.data_size)
        self.test_set_size = self.data_size - self.training_set_size
        # split into input and outputs
        # Training set contains the older (time-wise) part of data
        self.training_set_x_y = self.scaled_values[self.test_set_size:, :]
        # Test set has the newer (time-wise) part of data
        self.test_set_x_y = self.scaled_values[:self.test_set_size, :]
        print(self.dataset.head())

        self.train_x = self.training_set_x_y[:, :self.num_prev_objs]
        self.train_y = self.training_set_x_y[:, -self.num_features:]
        self.test_x = self.test_set_x_y[:, :self.num_prev_objs]
        self.test_y = self.test_set_x_y[:, -self.num_features:]
        # reshape input to be 3D [samples, timesteps, features] as expected by LSTM
        self.train_x = self.train_x.reshape(self.train_x.shape[0],
                                            self.num_of_prev_timesteps,
                                            self.num_features)
        self.test_x = self.test_x.reshape(self.test_x.shape[0],
                                          self.num_of_prev_timesteps,
                                          self.num_features)
        self.last_observations_reshaped = self.last_observations.reshape(
            self.last_observations.shape[0], self.num_of_prev_timesteps,
            self.num_features)

        print('Training input size: ' + str(self.train_x.shape) + '\n' +
              'Training output size: ' + str(self.train_y.shape) + '\n' +
              'Test input size: ' + str(self.test_x.shape) + '\n' +
              'Test output size: ' + str(self.test_y.shape))
        print('test_set_x_y: \n' + str(self.test_set_x_y))

        # Create LSTM model
        self.model = Sequential()
        self.model.add(
            LSTM(512,
                 input_shape=(self.train_x.shape[1], self.train_x.shape[2])))
        self.model.add(Dense(self.num_features))
        self.model.compile(loss='mae', optimizer='adam')
        self.model.summary()
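
The constructors above only build and compile the models; a hedged sketch of the training and forecasting step they set up (the method name and hyperparameters below are assumptions, not code from the original source) could look like:

    def train_and_forecast(self, epochs=50, batch_size=32):
        # Fit on the scaled training windows, validating on the held-out (most recent) windows
        self.model.fit(self.train_x, self.train_y,
                       epochs=epochs, batch_size=batch_size,
                       validation_data=(self.test_x, self.test_y),
                       verbose=2, shuffle=False)
        # Forecast the next future timesteps from the latest observations; to recover prices,
        # reshape the output to (-1, self.num_features) and apply self.scaler.inverse_transform
        return self.model.predict(self.last_observations_reshaped)
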
Code example #8
    # Use each fold once as the validation set; concatenate the remaining folds for training
    for fold in folds:
        train_folds = [folds[x] for x in range(1, k + 1) if x != fold]
        Dtr = np.concatenate(train_folds)
        Dv = folds[fold]

        folded_partitions[fold] = [Dtr, Dv]

    return folded_partitions


if __name__ == '__main__':

    filename = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname('treinamento.txt'))) + '/treinamento-1.txt'
    series = pandas.read_csv(filename, header=None)

    D = series_to_supervised(series, 15).values
    k = 4

    folded_partitions = k_fold(D, k)

    for partition in folded_partitions:
        Dtr = folded_partitions[partition][0]
        Dv = folded_partitions[partition][1]

        Xtr = Dtr[:, 0:-1]
        Ytr = Dtr[:, -1].reshape(Xtr.shape[0], 1)

        Xv = Dv[:, 0:-1]
        Yv = Dv[:, -1].reshape(Xv.shape[0], 1)

        print("Xtr", partition, ": ", Xtr)
Code example #9
SEQ_LEN = 10
BATCH = 1
EPOCH = 100

# In[30]:

time_points = np.linspace(start=0, stop=1, num=1000, dtype=np.float32)
time_points = np.reshape(time_points, (-1, 1))

# In[31]:

# data_seq_generator = rnn_minibatch_sequencer(raw_data=time_points, batch_size=BATCH, sequence_size=SEQ_LEN,
#                                             nb_epochs=EPOCH)

data_seq_generator = series_to_supervised(time_points, n_in=SEQ_LEN,
                                          n_out=1).values
print(data_seq_generator)
x = data_seq_generator[:, 0:SEQ_LEN].reshape(-1, SEQ_LEN, 1)  # input windows, 3D for the LSTM
x = x[0:len(x) - (len(x) % BATCH)]  # trim so the sample count is divisible by BATCH

y = data_seq_generator[:, SEQ_LEN]  # target: the value one step after each window
y = y[0:len(y) - (len(y) % BATCH)]

print(x.shape)
print(y.shape)

# In[40]:

from keras import Sequential
from keras.layers import LSTM, Dense
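
The excerpt stops right after the Keras imports; a hedged sketch of the model that would typically follow, matching the shapes of x and y built above (the layer sizes are assumptions):

model = Sequential()
model.add(LSTM(32, input_shape=(SEQ_LEN, 1)))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')
model.fit(x, y, batch_size=BATCH, epochs=EPOCH, verbose=2)
print(model.predict(x[:3]))
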
Code example #10
def build_and_train_td_file_level(horizon_param, project_param,
                                  project_files_param, regressor_param,
                                  ground_truth_param):
    """
    Build file-level TD forecasting models and return forecasts for a horizon specified by the user.
    Arguments:
        horizon_param: The forecasting horizon up to which forecasts will be produced.
        project_param: The project for which the forecasts will be produced.
        project_files_param: The number of files for which the forecasts will be produced.
        regressor_param: The regressor models that will be used to produce forecasts.
        ground_truth_param: If the model will return also ground truth values or not.
    Returns:
        A dictionary containing file-level forecasted values (and ground truth
        values if ground_truth_param is set to yes) of the selected project, for
        a number of files specified by the user and for each intermediate step
        ahead up to the specified horizon.
    """

    # Read file-level dataset
    try:
        dataset_td_file = pd.read_csv('data/%s_class.csv' % project_param,
                                      sep=";")
    except FileNotFoundError as e:
        if debug:
            print(e)
        return -2

    # Select indicators that will be used as model variables
    metrics_td = [
        'code_smells', 'ncloc', 'complexity', 'duplicated_blocks',
        'total_principal'
    ]

    # Select sliding window length
    window_size = 2

    # Compute change proneness and TD change proneness for each file
    files_change_metrics_df = pd.DataFrame()
    for file_id in dataset_td_file['class_id'].unique().tolist():
        # create temporary file dataframe
        temp_file_df = dataset_td_file[dataset_td_file['class_id'] == file_id]

        temp_file_metr_dict = {}
        temp_file_name = temp_file_df.class_name.iloc[0]
        temp_file_metr_dict['file_id'] = file_id
        temp_file_metr_dict['file_name'] = temp_file_name
        temp_file_metr_dict['versions'] = temp_file_df.shape[0]
        temp_file_metr_dict[
            'td_of_last_version'] = temp_file_df.total_principal.iloc[-1]
        temp_file_metr_dict[
            'complexity_of_last_version'] = temp_file_df.complexity.iloc[-1]

        # compute number of changes in LOC across versions of a file
        ncloc_has_changed_list = temp_file_df.ncloc == temp_file_df.ncloc.shift(
        )
        ncloc_has_changed_list = [
            1 if i == False else 0 for i in ncloc_has_changed_list
        ]
        file_df_changes = sum(ncloc_has_changed_list)
        temp_file_metr_dict['number_of_changes'] = file_df_changes

        # compute LOC Change Proneness of a file
        file_df_cp = (file_df_changes / temp_file_df.shape[0])
        temp_file_metr_dict['change_proneness_(CP)'] = file_df_cp

        # compute number of changes in TD across versions of a file
        td_has_changed_list = temp_file_df.total_principal == temp_file_df.total_principal.shift(
        )
        td_has_changed_list = [
            1 if i == False else 0 for i in td_has_changed_list
        ]
        file_df_changes_td = sum(td_has_changed_list)
        temp_file_metr_dict['number_of_td_changes'] = file_df_changes_td

        # compute TD Change Proneness of a file
        file_df_cp_td = (file_df_changes_td / temp_file_df.shape[0])
        temp_file_metr_dict['change_proneness_td_(CP-TD)'] = file_df_cp_td

        # compute number of changes in complexity across versions of a file
        complexity_has_changed_list = temp_file_df.complexity == temp_file_df.complexity.shift(
        )
        complexity_has_changed_list = [
            1 if i == False else 0 for i in complexity_has_changed_list
        ]
        file_df_complexity_changes = sum(complexity_has_changed_list)
        temp_file_metr_dict[
            'number_of_complexity_changes'] = file_df_complexity_changes

        # compute complexity Change Proneness of a file
        file_df_CP_complexity = (file_df_complexity_changes /
                                 temp_file_df.shape[0])
        temp_file_metr_dict[
            'change_proneness_complexity_(CP-COMP)'] = file_df_CP_complexity

        # compute average size of changes in LOC across versions of a file
        ncloc_changes_volume_list = temp_file_df['ncloc'].diff(periods=1)
        ncloc_changes_volume_list.fillna(0, inplace=True)
        file_df_expected_changes = sum(ncloc_changes_volume_list) / (
            temp_file_df.shape[0] - 1)
        temp_file_metr_dict[
            'expected_size_change_(ED-LOC)'] = file_df_expected_changes

        # compute average size of changes in TD across versions of a file
        td_changes_volume_list = temp_file_df['total_principal'].diff(
            periods=1)
        td_changes_volume_list.fillna(0, inplace=True)
        file_df_expected_td_changes = sum(td_changes_volume_list) / (
            temp_file_df.shape[0] - 1)
        temp_file_metr_dict[
            'expected_td_change_(ED-TD)'] = file_df_expected_td_changes

        # compute average size of changes in complexity across versions of a file
        complexity_changes_volume_list = temp_file_df['complexity'].diff(
            periods=1)
        complexity_changes_volume_list.fillna(0, inplace=True)
        file_df_expected_complexity_changes = sum(
            complexity_changes_volume_list) / (temp_file_df.shape[0] - 1)
        temp_file_metr_dict[
            'expected_complexity_change_(ED-COMP)'] = file_df_expected_complexity_changes

        temp_file_metr_df = pd.DataFrame.from_records(
            [temp_file_metr_dict],
            index='file_id',
            columns=temp_file_metr_dict.keys())
        files_change_metrics_df = files_change_metrics_df.append(
            temp_file_metr_df)

    # Sort files by Change Proneness (CP)
    files_change_metrics_df.sort_values(by=['change_proneness_(CP)'],
                                        ascending=False,
                                        inplace=True)

    # Keep only first n files, where n = project_files_param
    files_change_metrics_df = files_change_metrics_df.head(project_files_param)

    # Initialise variables
    dict_result = {
        'parameters': {
            'project': project_param,
            'files': project_files_param,
            'horizon': horizon_param,
            'regressor': regressor_param,
            'ground_truth': ground_truth_param
        }
    }
    list_forecasts = []
    list_metrics = []
    list_ground_truth = []

    # Compute forecasts for each file
    for index, file_instance in files_change_metrics_df.iterrows():
        if debug:
            print(
                '=========================== File: %s ============================'
                % file_instance['file_name'])
        temp_file_df = dataset_td_file.loc[dataset_td_file['class_id'] ==
                                           index]
        temp_file_df.reset_index(inplace=True, drop=True)

        temp_dataset_td_file = temp_file_df[metrics_td]

        # Fill list with metrics of files
        temp_metrics_dict = {
            file_instance['file_name']:
            pd.DataFrame(file_instance).T.to_dict('records')[0]
        }
        list_metrics.append(temp_metrics_dict)

        temp_list_forecasts = []

        # Make forecasts using the Direct approach, i.e. train separate ML models for each forecasting horizon
        for intermediate_horizon in range(1, horizon_param + 1):
            if debug:
                print(
                    '=========================== Horizon: %s ============================'
                    % intermediate_horizon)

            # Add time-shifted prior and future period
            data = series_to_supervised(temp_dataset_td_file, n_in=window_size)

            # Append the dependent variable column with value equal to total_principal of the target horizon's version
            data['forecasted_total_principal'] = data[
                'total_principal(t)'].shift(-intermediate_horizon)
            data = data.drop(data.index[-intermediate_horizon:])

            # Remove TD as independent variable
            data = data.drop(columns=[
                'total_principal(t-%s)' % (i)
                for i in range(window_size, 0, -1)
            ])

            # Define independent and dependent variables
            x_array = data.iloc[:, data.
                                columns != 'forecasted_total_principal'].values
            y_array = data.iloc[:, data.columns ==
                                'forecasted_total_principal'].values

            # Deploy model
            # Assign version counter
            version_counter = len(temp_dataset_td_file) + intermediate_horizon
            # Define X to deploy the model for real forecasts
            x_real = series_to_supervised(temp_dataset_td_file,
                                          n_in=window_size,
                                          dropnan=False)
            x_real = x_real.drop(columns=[
                'total_principal(t-%s)' % (i)
                for i in range(window_size, 0, -1)
            ])
            x_real = x_real.iloc[-1, :].values
            x_real = x_real.reshape(1, -1)
            # Make real forecasts
            regressor = create_regressor(regressor_param, x_array, y_array)
            if regressor == -1:
                return -1
            y_pred = regressor.predict(x_real)

            # Fill list with forecasts
            temp_forecasts_dict = {
                'version': version_counter,
                'value': float(y_pred[0])
            }
            temp_list_forecasts.append(temp_forecasts_dict)

        # Fill list with forecasts
        temp_file_forecasts_dict = {
            file_instance['file_name']: temp_list_forecasts
        }
        list_forecasts.append(temp_file_forecasts_dict)

        # If the model will return also ground truth values
        if ground_truth_param == 'yes':
            temp_list_ground_truth = []
            # Fill the list with ground truth values
            for intermediate_horizon in range(
                    0, len(temp_dataset_td_file['total_principal'])):
                temp_ground_truth_dict = {
                    'version':
                    intermediate_horizon + 1,
                    'value':
                    float(temp_dataset_td_file['total_principal']
                          [intermediate_horizon])
                }
                temp_list_ground_truth.append(temp_ground_truth_dict)
            # Fill list with files
            temp_ground_truth_dict = {
                file_instance['file_name']: temp_list_ground_truth
            }
            list_ground_truth.append(temp_ground_truth_dict)

    # Fill results dictionary with change proneness and TD change proneness for each file
    dict_result['change_metrics'] = list_metrics

    # Fill results dictionary with forecasts for each file
    dict_result['forecasts'] = list_forecasts

    # If the model will return also ground truth values
    if ground_truth_param == 'yes':
        # Fill results dictionary with ground truth values
        dict_result['ground_truth'] = list_ground_truth

    if debug:
        print(dict_result)

    return dict_result
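
create_regressor() is used above but not shown. A hedged sketch of such a factory, inferred only from how it is called (the supported regressor names are assumptions; the -1 return value mirrors the error handling above):

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR


def create_regressor(regressor_param, x_array, y_array):
    """Hedged sketch: instantiate and fit the requested regressor, or return -1 if unknown."""
    regressors = {
        'linear': LinearRegression(),
        'ridge': Ridge(),
        'lasso': Lasso(),
        'svr': SVR(kernel='rbf'),
    }
    if regressor_param not in regressors:
        return -1
    regressor = regressors[regressor_param]
    regressor.fit(x_array, y_array.ravel())
    return regressor
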
Code example #11
def build_and_train_td(horizon_param, project_param, regressor_param,
                       ground_truth_param):
    """
    Build TD forecasting models and return forecasts for a horizon specified by the user.
    Arguments:
        horizon_param: The forecasting horizon up to which forecasts will be produced.
        project_param: The project for which the forecasts will be produced.
        regressor_param: The regressor models that will be used to produce forecasts.
        ground_truth_param: If the model will return also ground truth values or not.
    Returns:
        A dictionary containing forecasted values (and ground truth values if
        ground_truth_param is set to yes) for each intermediate step ahead up
        to the specified horizon.
    """

    # Select indicators that will be used as model variables
    metrics_td = [
        'code_smells', 'ncloc', 'complexity', 'duplicated_blocks',
        'sqale_index', 'reliability_remediation_effort',
        'security_remediation_effort'
    ]
    # Select sliding window length
    window_size = 2

    # Read dataset
    try:
        dataset_td = pd.read_csv('data/%s.csv' % project_param,
                                 sep=";",
                                 usecols=metrics_td)
    except FileNotFoundError as e:
        if debug:
            print(e)
        return -2

    dataset_td['total_principal'] = dataset_td[
        'reliability_remediation_effort'] + dataset_td[
            'security_remediation_effort'] + dataset_td['sqale_index']
    dataset_td = dataset_td.drop(columns=[
        'sqale_index', 'reliability_remediation_effort',
        'security_remediation_effort'
    ])

    # Initialise variables
    dict_result = {
        'parameters': {
            'project': project_param,
            'horizon': horizon_param,
            'regressor': regressor_param,
            'ground_truth': ground_truth_param
        }
    }
    list_forecasts = []
    list_ground_truth = []

    # Make forecasts using the Direct approach, i.e. train separate ML models for each forecasting horizon
    for intermediate_horizon in range(1, horizon_param + 1):
        if debug:
            print(
                '=========================== Horizon: %s ============================'
                % intermediate_horizon)

        # Add time-shifted prior and future period
        data = series_to_supervised(dataset_td, n_in=window_size)

        # Append the dependent variable column with value equal to total_principal of the target horizon's version
        data['forecasted_total_principal'] = data['total_principal(t)'].shift(
            -intermediate_horizon)
        data = data.drop(data.index[-intermediate_horizon:])

        # Remove TD as independent variable
        data = data.drop(columns=[
            'total_principal(t-%s)' % (i) for i in range(window_size, 0, -1)
        ])

        # Define independent and dependent variables
        x_array = data.iloc[:, data.
                            columns != 'forecasted_total_principal'].values
        y_array = data.iloc[:, data.columns ==
                            'forecasted_total_principal'].values

        # Deploy model
        # Assign version counter
        version_counter = len(dataset_td) + intermediate_horizon
        # Define X to deploy the model for real forecasts
        x_real = series_to_supervised(dataset_td,
                                      n_in=window_size,
                                      dropnan=False)
        x_real = x_real.drop(columns=[
            'total_principal(t-%s)' % (i) for i in range(window_size, 0, -1)
        ])
        x_real = x_real.iloc[-1, :].values
        x_real = x_real.reshape(1, -1)
        # Make real forecasts
        regressor = create_regressor(regressor_param, x_array, y_array)
        if regressor == -1:
            return -1
        y_pred = regressor.predict(x_real)

        # Fill the list with forecasts
        temp_dict = {'version': version_counter, 'value': float(y_pred[0])}
        list_forecasts.append(temp_dict)

    # Fill results dictionary with forecasts
    dict_result['forecasts'] = list_forecasts

    # If the model will return also ground truth values
    if ground_truth_param == 'yes':
        # Fill the list with ground truth values
        for intermediate_horizon in range(0,
                                          len(dataset_td['total_principal'])):
            temp_dict = {
                'version': intermediate_horizon + 1,
                'value':
                float(dataset_td['total_principal'][intermediate_horizon])
            }
            list_ground_truth.append(temp_dict)
        # Fill results dictionary with ground truth values
        dict_result['ground_truth'] = list_ground_truth

    if debug:
        print(dict_result)

    return dict_result
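
A hedged usage example for build_and_train_td(); the project name and regressor key below are placeholders, not values taken from the original code:

result = build_and_train_td(horizon_param=5,
                            project_param='example_project',  # expects data/example_project.csv to exist
                            regressor_param='ridge',          # assumed key, see the create_regressor sketch above
                            ground_truth_param='yes')
if result not in (-1, -2):
    for forecast in result['forecasts']:
        print(forecast['version'], forecast['value'])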