def main(self):
     """Run the modelling step: engineer features, then fit Multinomial NB."""
     engineering = FeatureEngineering()
     (features_train, labels_train, features_test, labels_test,
      clean_data_frame) = engineering.main()
     # Random-forest variant kept disabled:
     # self.RFC(features_train, labels_train, features_test, labels_test, clean_data_frame)
     self.MNB(features_train, labels_train, features_test,
              labels_test, clean_data_frame)
class PreprocessedData:
    """Combine the feature-engineering and feature-selection classes to
    preprocess the data.

    Takes the cleaned data and the target column name (ratings).
    """

    def __init__(self, data, y_column_name):
        self.data = data
        self.y_column_name = y_column_name
        self.feature_selection = FeatureSelection(data, y_column_name)
        self.feature_engineering = FeatureEngineering(data, y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        """Preprocess the cleaned data and keep the n best features.

        :param num_of_features_to_select: n best features of the model
        :return: full preprocessed data with the selected features
        """
        # Each engineering step returns the updated frame; apply them in order.
        for step in (self.feature_engineering.input_rare_categorical,
                     self.feature_engineering.encode_categorical_features,
                     self.feature_engineering.scale_features):
            self.data = step()
        self.data = self.feature_selection.perform_feature_selection(
            num_of_features_to_select)
        return self.data
class FeatureSelection:
    """Select the most important features from combined train/test data.

    Preprocesses the concatenated frame via FeatureEngineering, fits an
    ExtraTreesRegressor on the training rows, and keeps the n features
    with the highest importances (plus the id and target columns).
    """

    def __init__(self, train, test, id_column, y_column_name):
        self.train = train
        self.test = test
        self.y_column_name = y_column_name
        self.id_column = id_column
        # Remember where the training rows end in the concatenated frame.
        self.number_of_train = train.shape[0]
        self.data = pd.concat([train, test], ignore_index=True)
        self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                      y_column_name)
        # Keep id + target aside so they can be re-attached after selection.
        self.y = self.data[[self.id_column, self.y_column_name]]

    def preprocess_my_data(self):
        """Run the full feature-engineering chain and return the result."""
        for step in (self.feature_engineering.fill_na_categorical,
                     self.feature_engineering.fill_na_numerical,
                     self.feature_engineering.input_rare_categorical,
                     self.feature_engineering.label_encoder,
                     self.feature_engineering.get_scale_features):
            self.data = step()
        return self.data

    def perform_feature_selection(self, num_of_features_to_select):
        """Keep the n most important features.

        :param num_of_features_to_select: number of top features to retain
        :return: data restricted to id, target and the selected features
        """
        full_data = self.preprocess_my_data()
        train_part = full_data[:self.number_of_train]
        y_train = train_part[[self.y_column_name]]
        x_train = train_part.drop([self.id_column, self.y_column_name],
                                  axis=1)
        selector = ExtraTreesRegressor().fit(x_train, y_train)
        importances = pd.Series(selector.feature_importances_,
                                index=x_train.columns)
        top_features = importances.nlargest(num_of_features_to_select)
        chosen = list(top_features.index)
        self.data = pd.concat([self.y, self.data[chosen]], axis=1)
        return self.data
 def __init__(self,
              *args,
              location_type="Location",
              location_type_plural="Locations"):
     """Initialise the base class, then record display labels for locations.

     :param location_type: singular label used in output (default "Location")
     :param location_type_plural: plural label (default "Locations")
     """
     # Initialise the base class explicitly with the positional arguments.
     FeatureEngineering.__init__(self, *args)
     self.location_type, self.location_type_plural = (location_type,
                                                      location_type_plural)
 def __init__(self, train, test, id_column, y_column_name):
     """Combine the splits, remember the split point, and wire up helpers."""
     self.train, self.test = train, test
     self.id_column = id_column
     self.y_column_name = y_column_name
     # Remember where train ends so the concatenated frame can be split back.
     self.number_of_train = train.shape[0]
     self.data = pd.concat([train, test], ignore_index=True)
     self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                   y_column_name)
     # Keep id + target aside so they can be re-attached after selection.
     self.y = self.data[[self.id_column, self.y_column_name]]
Esempio n. 6
0
 def __init__(self, train, test, id_column, y_column_name):
     """Store the raw splits and wire up engineering + selection helpers."""
     self.train, self.test = train, test
     self.y_column_name = y_column_name
     self.id_column = id_column
     # Work on one concatenated frame with a fresh range index.
     self.data = pd.concat([train, test], ignore_index=True)
     self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                   y_column_name)
     self.feature_selection = FeatureSelection(train, test, id_column,
                                               y_column_name)
Esempio n. 7
0
    def fit_pipeline(self, input_csv_directory_path, input_csv_file_name):
        """Run the test pipeline end to end on one CSV file.

        Loads the CSV, prepares and feature-engineers it, scores it with the
        persisted model pipeline, and writes a confusion matrix and ROC curve
        back into the input directory.

        :param input_csv_directory_path: directory containing the input CSV
        :param input_csv_file_name: name of the CSV file to evaluate
        :raises Exception: re-raises whatever failed inside the pipeline
        """
        print('Start Testing pipeline')

        target_name = 'income'

        try:
            data_test = pd.read_csv(
                os.path.join(input_csv_directory_path, input_csv_file_name))

            print(data_test.head())
            print(data_test.shape)

            data_prepare = DataPrepare()
            df_clean = data_prepare.dataPrepare(data_test)

            feature_engineering = FeatureEngineering()
            df_features_target = feature_engineering.featureEngineering(
                df_clean)

            # Dropping missing values if any
            df_features_target.dropna(axis=0, inplace=True)

            model_pipeline = helper_models.load_pipeline()

            # Build the feature frame once instead of dropping the target
            # column separately for predict and predict_proba.
            features = df_features_target.drop(columns=target_name)
            prediction = model_pipeline.predict(features)
            probability = model_pipeline.predict_proba(features)

            print(
                "Classification report: \n ",
                classification_report(df_features_target[target_name],
                                      prediction))

            helper_models.fill_confusion_matrix_and_save(
                df_features_target[target_name],
                prediction,
                f_name='Test Confusion matrix',
                out_dir=input_csv_directory_path)

            helper_models.plot_roc_curve_and_save(
                df_features_target[target_name],
                probability,
                f_name='Test Roc Curve',
                out_dir=input_csv_directory_path)

            print(
                'Pipeline completed successfully and results are stored in data directory'
            )

        except Exception as ex:
            # BUG FIX: the original passed the message and the exception as
            # two separate print() arguments, leaving a dangling '%s'
            # placeholder in the output; format the message properly.
            print('Something went wrong with the Pipeline %s' % ex)
            # Bare raise preserves the original traceback ('raise ex' would
            # re-anchor it here).
            raise
Esempio n. 8
0
 def create_reader_new(self):
     """Build and initialise a reader for the configured data source.

     Reads ``dataSource.source_type`` and ``dataSource.parameters`` from the
     model description, constructs a kafka or HDFS-file reader, wraps it with
     feature engineering, and initialises it with the current context.

     :return: the initialised reader
     :raises ValueError: if the configured source_type is not supported
     """
     source_type = self._model_desc_obj.get("dataSource").get("source_type")
     parameters = self._model_desc_obj.get("dataSource").get("parameters")
     print('--------- create_reader parameters start ---------')
     for k in sorted(parameters.keys()):
         print(k, parameters[k])
     print('---------- create_reader parameters end ----------')
     if source_type == "kafka":
         reader = self.tensor_dict_from_kafka(parameters)
     elif source_type == "file":
         reader = self.tensor_dict_from_hdfs(parameters)
     else:
         # BUG FIX: an unrecognised source_type previously left `reader`
         # unbound and crashed below with NameError; fail fast instead.
         raise ValueError("unsupported source_type: %r" % (source_type,))
     fe = FeatureEngineering()
     reader = fe.get_tensor_dict(reader)
     reader.init(self.context)
     return reader
Esempio n. 9
0
    def get_data_and_pipeline(self):
        """Build the feature matrix and preprocessing pipeline.

        Instantiates FeatureEngineering on ``self.data_path``, builds the
        pipeline with a hash size of 100, and stores the transformed data in
        ``self.data`` and the pipeline object in ``self.pipeline``.

        Returns:
            None
        """
        engineering = FeatureEngineering(self.data_path)
        transformed, pipeline = engineering.build_pipe(hash_size=100)
        self.data = transformed
        self.pipeline = pipeline
class FeatureSelection:
    """Perform feature selection on cleaned data.

    Takes the cleaned data and the target column name (ratings), then keeps
    the n features ranked most important by an extra-trees classifier.
    """

    def __init__(self, data, y_column_name):
        self.data = data
        self.y_column_name = y_column_name
        # Target column kept as its own frame so it can be re-attached later.
        self.y = self.data[[self.y_column_name]]
        self.feature_engineering = FeatureEngineering(data, y_column_name)

    def preprocess_my_data(self):
        """Input rare categoricals, encode and scale the features.

        :return: preprocessed full data
        """
        for step in (self.feature_engineering.input_rare_categorical,
                     self.feature_engineering.encode_categorical_features,
                     self.feature_engineering.scale_features):
            self.data = step()
        return self.data

    def perform_feature_selection(self, num_of_features_to_select):
        """Select the n best features via extra-trees importances.

        :param num_of_features_to_select: number of best features to select
        :return: full data with the n selected features plus the target
        """
        full_data = self.preprocess_my_data()
        # NOTE(review): the first 300000 rows are assumed to be the training
        # split — confirm against how the frame is assembled upstream.
        self.train = full_data[0:300000]
        encoder = LabelEncoder()
        y_train = encoder.fit_transform(self.train[self.y_column_name])
        x_train = self.train.drop([self.y_column_name], axis=1)
        selector = ExtraTreesClassifier().fit(x_train, y_train)
        importances = pd.Series(selector.feature_importances_,
                                index=x_train.columns)
        top_features = importances.nlargest(num_of_features_to_select)
        chosen = list(top_features.index)
        self.data = pd.concat([self.y, self.data[chosen]], axis=1)
        return self.data
Esempio n. 11
0
class PreprocessedData:
    """Run the full preprocessing chain on concatenated train/test data."""

    def __init__(self, train, test, id_column, y_column_name):
        self.train, self.test = train, test
        self.y_column_name = y_column_name
        self.id_column = id_column
        # Operate on one concatenated frame with a clean range index.
        self.data = pd.concat([train, test], ignore_index=True)
        self.feature_engineering = FeatureEngineering(train, test, id_column,
                                                      y_column_name)
        self.feature_selection = FeatureSelection(train, test, id_column,
                                                  y_column_name)

    def preprocess_my_data(self, num_of_features_to_select):
        """Impute, encode and scale, then keep the n best features.

        :param num_of_features_to_select: number of top features to retain
        :return: preprocessed data restricted to the selected features
        """
        for step in (self.feature_engineering.fill_na_categorical,
                     self.feature_engineering.fill_na_numerical,
                     self.feature_engineering.input_rare_categorical,
                     self.feature_engineering.label_encoder,
                     self.feature_engineering.get_scale_features):
            self.data = step()
        self.data = self.feature_selection.perform_feature_selection(
            num_of_features_to_select)
        return self.data
Esempio n. 12
0
def setUpDataFrame():
    """Load the raw bulldozers training data, clean it, engineer date
    features, and cache the result as a feather file under ``tmp/raw``.

    Reads the train file path from ``baseConfig.yaml``; best-effort on the
    feather write (I/O errors are printed, not raised).
    """
    train_filepath = read_yaml('baseConfig.yaml')
    df_raw = pd.read_csv(train_filepath, low_memory=False,
                         parse_dates=['saledate'])
    print('The shape of dataframe is %s' % (str(df_raw.shape)))
    cleaning = Cleaning()
    print('Converting sale price to log of sale price')
    df_raw = cleaning.convertFeatureToItsLog(df_raw, 'SalePrice')

    print("Turning string to categorical variables")
    df_raw = cleaning.turnStringToCategorical(df_raw)
    # Aligning the levels properly.
    # BUG FIX: `inplace=True` on Categorical.set_categories was deprecated in
    # pandas 1.3 and removed in 2.0; assign the result back instead.
    df_raw['UsageBand'] = df_raw['UsageBand'].cat.set_categories(
        ['High', 'Medium', 'Low'], ordered=True)

    # Converting date and time to features.
    feat_eng = FeatureEngineering()
    feat_eng.convertDatesToFeatures(df_raw, 'saledate')
    # Saving as feather for fast re-loading; failures are non-fatal.
    try:
        os.makedirs('tmp', exist_ok=True)
        df_raw.to_feather('tmp/raw')
    except (FileNotFoundError, IOError) as e:
        print(e)
class ProcessedData:
    """Bundle train/test data with feature engineering and selection."""

    def __init__(self, train, test, id_column, y_column_name):
        self.train, self.test = train, test
        self.y_column_name = y_column_name
        self.id_column = id_column
        # Single concatenated frame; ignore_index gives a clean range index.
        self.data = pd.concat([self.train, self.test], ignore_index=True)
        shared_args = (self.train, self.test, self.id_column,
                       self.y_column_name)
        self.feature_engineering = FeatureEngineering(*shared_args)
        self.feature_selection = FeatureSelection(*shared_args)

    def preprocess_my_data(self, num_of_features_to_select):
        """Scale the features, then keep the n best ones.

        :param num_of_features_to_select: number of top features to retain
        :return: the processed data frame
        """
        self.data = self.feature_engineering.scale_features()
        self.data = self.feature_selection.perform_feature_selection(
            num_of_features_to_select)
        return self.data
Esempio n. 14
0
def main(data_directory_path, merge_csv_file_name, prepared_csv_file_name,
         features_target_csv_file_name):
    """Assemble the model pipeline stages and fit them end to end.

    :param data_directory_path: directory holding all csv artifacts
    :param merge_csv_file_name: name of the merged-data csv
    :param prepared_csv_file_name: name of the prepared-data csv
    :param features_target_csv_file_name: name of the features/target csv
    """
    print("Model Process starts")

    start = time.time()

    # Construct each stage; the actual work happens in the pipeline's fit().
    data_read_and_merge = DataReadAndMerge(data_directory_path,
                                           merge_csv_file_name)
    data_prepare = DataPrepare(data_directory_path, merge_csv_file_name)
    feature_engineering = FeatureEngineering(data_directory_path,
                                             prepared_csv_file_name)
    modelling = Modelling(data_directory_path, features_target_csv_file_name)

    model_pipeline = ModelPipeline(data_read_and_merge, data_prepare,
                                   feature_engineering, modelling)
    model_pipeline.fit(data_directory_path, merge_csv_file_name,
                       prepared_csv_file_name, features_target_csv_file_name)

    elapsed = time.time() - start
    print("Model Process ends", elapsed, "s")
Esempio n. 15
0
def main(data_directory_path):
    """Build every pipeline stage with defaults and fit on the directory.

    :param data_directory_path: directory holding the input data
    """
    print("Model Process starts")

    start = time.time()

    # Construct each stage; fitting is delegated to the pipeline below.
    data_read = DataRead(data_directory_path)
    data_prepare = DataPrepare()
    data_explore = DataExploration()
    feature_engineering = FeatureEngineering()
    modelling = Modelling()

    model_pipeline = ModelPipeline(data_read, data_explore, data_prepare,
                                   feature_engineering, modelling)
    model_pipeline.fit(data_directory_path)

    elapsed = time.time() - start
    print("Model Process ends", elapsed, "s")
Esempio n. 16
0
    def __init__(self):
        """Run all feature-engineering steps, then drop and dummy-encode."""
        fe = FeatureEngineering()
        # Derive every engineered feature, in the same order as before.
        for build in (fe.feature_age,
                      fe.feature_days_admitted,
                      fe.feature_total_medical_history,
                      fe.feature_total_preop_medication,
                      fe.feature_total_symptoms,
                      fe.feature_lab_results_ratios):
            build()

        # Drop unneeded columns first, then expand categoricals to dummies.
        self.df_data = self.__get_dummy_vars(self.__drop_cols(fe.df_data))
# from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import numpy as np

from feature_engineering import FeatureEngineering
from cross_validation import CrossValidation
from multi_log_loss import multi_log_loss

# Build features from the gender/age train+test csvs keyed by device_id,
# joining the selected wide and long feature files.
f = FeatureEngineering('../data/gender_age_train.csv',
                       '../data/gender_age_test.csv',
                       'device_id',
                       wide_files=[#'../features/apps_per_event.csv', '../features/avg_position.csv',
                                   #'../features/count_by_hour.csv', '../features/count_by_period.csv',
                                   '../features/event_counts.csv', '../features/sd_position.csv'],
                       long_files=[#'../features/active_app_category_counts.csv',
                                   #'../features/installed_app_category_counts.csv',
                                   '../features/phone_brand.csv'])
labels, features, colnames = f.extract_features()
# Reset to a 0..n-1 positional index so the row filters below line up.
labels.set_index(np.arange(labels.shape[0]), inplace=True)
colnames.set_index(np.arange(colnames.shape[0]), inplace=True)
# Rows with a known age are training rows; NaN age marks the test set.
train_filter = [i for i, x in enumerate(labels['age'].tolist()) if not np.isnan(x)]
test_filter = [i for i, x in enumerate(labels['age'].tolist()) if np.isnan(x)]

# BUG FIX: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
# .loc is equivalent here because the index was reset to 0..n-1 above.
cv = CrossValidation(features[train_filter, :],
                     labels.loc[train_filter, 'group'],
                     features[test_filter, :],
                     multi_log_loss)
model = MultinomialNB()
# Report class probabilities instead of hard labels for the log-loss metric.
model.predict = model.predict_proba
out = cv.run(model, 'test')
Esempio n. 18
0
 def __init__(self):
     """Initialise every mixin base class explicitly.

     NOTE(review): the bases are initialised directly rather than via
     super(); this assumes none of them require constructor arguments and
     that their initialisation order does not matter — confirm before
     restructuring.
     """
     Helpers.__init__(self)
     WordEmbeddings.__init__(self)
     FeatureEngineering.__init__(self)
from feature_engineering import FeatureEngineering

# Example driver: build features from a train/test csv pair keyed by 'key',
# joining two wide feature files and one long feature file.
f = FeatureEngineering('ex_train.csv', 'ex_test.csv', 'key',
                       ['ex_wide_1.csv', 'ex_wide_2.csv'], 'ex_long.csv')
d = f.extract_features()
import mlflow.tensorflow

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

from data_processing import DataProcessing
from feature_engineering import FeatureEngineering
from classifier import QuestionAnswerClassifer

if __name__ == '__main__':

    # NOTE(review): `new_df` is not defined anywhere in this excerpt — as
    # written this block raises NameError; confirm where the frame is loaded.
    Feature_Engineering = FeatureEngineering(new_df, 'Question', 'Label',
                                             'Sentence')

    # Pad length: longest observed sequence plus a small safety margin.
    maxlen = Feature_Engineering.determine_maxlen() + 10
    print(f'Maxlen:{maxlen}')

    #Split the dataset into train and test set
    new_df['Question_Sentence'] = new_df['Question'] + ' ' + new_df['Sentence']
    features = new_df['Question_Sentence']
    target = new_df['Label']
    x_train, x_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.25,
                                                        random_state=0)

    # Fit the tokenizer on training text only, to avoid test-set leakage.
    tokenizer = Feature_Engineering.text_tokenize(x_train.values,
                                                  num_words=5000)
 def __init__(self, data, y_column_name):
     """Store the data and target column; keep the target frame aside."""
     self.data = data
     self.y_column_name = y_column_name
     # Target kept as its own frame so it can be re-attached after selection.
     self.y = self.data[[self.y_column_name]]
     self.feature_engineering = FeatureEngineering(data, y_column_name)
Esempio n. 22
0
    # NOTE(review): orphan fragment — `cleaning` and `df_raw` are defined
    # above this excerpt; presumably the tail of a data-setup function.
    print("Turning string to categorical variables")
    df_raw = cleaning.turnStringToCategorical(df_raw)
    #Aligning the levels properly
    # NOTE(review): `inplace=True` on Categorical.set_categories was removed
    # in pandas 2.0 — this line fails on modern pandas; confirm the pin.
    df_raw.UsageBand.cat.set_categories(['High', 'Medium', 'Low'], ordered=True, inplace=True)

    #converting date and time to features
    feat_eng = FeatureEngineering()
    feat_eng.convertDatesToFeatures(df_raw, 'saledate')
    #saving as feather
    try:
        os.makedirs('tmp', exist_ok=True)
        df_raw.to_feather('tmp/raw')
    except (FileNotFoundError, IOError) as e:
        print(e)

feat_eng = FeatureEngineering()
# Sanity-check that date handling works before loading anything big.
print(feat_eng.testIfDateTimeWorks())


base_config = read_yaml('baseConfig.yaml')

# Try the cached feather file first; fall back to the initial setup if absent.
try:
    df_raw = pd.read_feather(base_config.parameters.bulldozer_train_feather)
    print('Finished reading feather file')
    if 'saleYear' in df_raw.columns:
        print('Features from dates are present in this feather file')
except (IOError, OSError) as err:
    print('Feather file does not exist')
    print(err)
    print('Doing the initial setup')
 def __init__(self, data, y_column_name):
     """Wire up feature engineering and selection for the given data/target."""
     self.data = data
     self.y_column_name = y_column_name
     self.feature_selection = FeatureSelection(data, y_column_name)
     self.feature_engineering = FeatureEngineering(data, y_column_name)