def __init__(self):
    """Prepare the training frame: drop sparse columns, then fill nulls."""
    pp = preprocessing_data()
    # These columns are dropped one by one in the original; a loop keeps the
    # identical call sequence on the same column names.
    for sparse_col in ('Alley', 'PoolQC', 'Fence', 'MiscFeature'):
        train_data = pp.drop_col(sparse_col)
    pp.fill_null_values()
    print(train_data.columns)
def test_train_Multi(X, Y):
    '''
    Input: X and Y dataframes produced by the data_generating class.
        X shape: (number of gate-net, 49 unprocessed features)
        Y shape: (number of labels,)

    Split the data into train and test with an 80/20 ratio, transform each
    split with the preprocessing_data class, then separate gate and net
    columns into their own dataframes.

    Return dataframes: gate_train, gate_test, net_train, net_test, Y_train, Y_test
        gate_train shape: (number of gate-net*0.8, 17 processed features)
        gate_test shape:  (number of gate-net*0.2, 17 processed features)
        net_train shape:  (number of gate-net*0.8, 24 processed features)
        net_test shape:   (number of gate-net*0.2, 24 processed features)
    '''
    def _transform(df):
        # Single definition of the 4-stage pipeline (was duplicated for
        # train and test in the original).
        df = preprocess.filter_columns(df)
        df = preprocess.normalise(df)
        df = preprocess.categorical(df)
        return preprocess.location_transform(df)

    # Each label row corresponds to 10 gate-net rows, hence the x10 factor
    # for the feature split point (assumption — TODO confirm with caller).
    split_Y = int(len(Y) * 0.8)
    split = split_Y * 10
    X_train, X_test = X[:split], X[split:]
    Y_train, Y_test = Y[:split_Y], Y[split_Y:]

    # Preprocessing statistics are fitted on the full X, then applied per split.
    preprocess = preprocessing_data(X, 0.8)
    X_train = _transform(X_train)
    X_test = _transform(X_test)

    # Re-index after slicing. NOTE(review): X_test starts at 1 while Y_test
    # starts at 0 — kept as-is; confirm downstream alignment is intended.
    X_test.index = list(range(1, len(X_test) + 1))
    Y_test.index = list(range(len(Y_test)))

    Y_train = generating_tensor_Y(Y_train)
    Y_test = generating_tensor_Y(Y_test)

    # Gate features: first 8 columns + last 9 columns; net: the middle block.
    gate_train = pd.concat([X_train.iloc[:, :8], X_train.iloc[:, -9:]], axis=1)
    net_train = X_train.iloc[:, 8:-9]
    gate_test = pd.concat([X_test.iloc[:, :8], X_test.iloc[:, -9:]], axis=1)
    net_test = X_test.iloc[:, 8:-9]
    return gate_train, gate_test, net_train, net_test, Y_train, Y_test
def test_train(X, Y):
    '''
    Input: X and Y dataframes produced by the data_generating class.
        X shape: (number of gate-net, 49 unprocessed features)
        Y shape: (number of labels,)

    Split the data into train and test with an 80/20 ratio and transform each
    split with the preprocessing_data class.

    Return X_train, Y_train, X_test, Y_test: the split train/test tensors
        X_train shape (number of gate-net*0.8, 41 processed features)
        Y_train shape (number of labels*0.8,)
        X_test shape  (number of gate-net*0.2, 41 processed features)
        Y_test shape  (number of labels*0.2,)
    '''
    def _transform(df):
        # Single definition of the 4-stage pipeline (was duplicated for
        # train and test in the original).
        df = preprocess.filter_columns(df)
        df = preprocess.normalise(df)
        df = preprocess.categorical(df)
        return preprocess.location_transform(df)

    # Each label row corresponds to 10 gate-net rows, hence the x10 factor
    # for the feature split point (assumption — TODO confirm with caller).
    split_Y = int(len(Y) * 0.8)
    split = split_Y * 10
    X_train, X_test = X[:split], X[split:]
    Y_train, Y_test = Y[:split_Y], Y[split_Y:]

    # Preprocessing statistics are fitted on the full X, then applied per split.
    preprocess = preprocessing_data(X, 0.8)
    X_train = _transform(X_train)
    X_test = _transform(X_test)

    # Re-index after slicing. NOTE(review): X_test starts at 1 while Y_test
    # starts at 0 — kept as-is; confirm downstream alignment is intended.
    X_test.index = list(range(1, len(X_test) + 1))
    Y_test.index = list(range(len(Y_test)))

    X_train = generating_tensor_X(X_train)
    Y_train = generating_tensor_Y(Y_train)
    X_test = generating_tensor_X_test(X_test)
    Y_test = generating_tensor_Y(Y_test)
    return X_train, Y_train, X_test, Y_test
"2 - exit \n" option = int(input(menu_message)) # data should be preprocessed and saved to disk # with this enabled, reading from disk will take time load_from_disk = False if option >= 0 or option <= 2: load_target_column() if option == 0: print('Classifying started!') if load_from_disk: # TODO: fix this if someone wants to read data from disk preprocessed_data = load_preprocessed_data_from_disk() else: preprocessed_data = preprocessing_data(False) # return one bag of words initialize_data(preprocessed_data) classifying() elif option == 1: print('Ranking started!') preprocessed_data = preprocessing_data(True) # return two bag of words ''' set index 0, for testing purposing of each method increase this index and comment other preprocessing methods because of we don't want to load all data in RAM ''' start_ranking(preprocessed_data[0][0], preprocessed_data[0][1]) correct_input = True else: print('Incorrect input')
# X.index = [i for i in range(len(X))] # X = X.drop(X.index[0]) # X.to_pickle("Data.pkl") # Y = pd.DataFrame(Y_init) # Y.to_pickle("label724") ##### save the initial data ############## X = pd.read_pickle('Data.pkl') Y = pd.read_pickle("label724") ## train test split split_Y = int(len(Y) * 0.8) split = split_Y * 10 X_train = X[:split] X_test = X[split:] Y_train = Y[:split_Y] Y_test = Y[split_Y:] preprocess = preprocessing_data(X, 0.8) X_train = preprocess.filter_columns(X_train) X_train = preprocess.normalise(X_train) X_train = preprocess.categorical(X_train) X_train = preprocess.location_transform(X_train) X_test = preprocess.filter_columns(X_test) X_test = preprocess.normalise(X_test) X_test = preprocess.categorical(X_test) X_test = preprocess.location_transform(X_test) X_test.index = [i for i in range(1, len(X_test) + 1)] Y_test.index = [i for i in range(len(Y_test))] SVM_X_Train = generating_tensor_X_SVM(X_train) SVM_X_Test = generating_tensor_X_SVM(X_test) Y_train = generating_tensor_Y(Y_train) Y_test = generating_tensor_Y(Y_test) parameters = {
default="softmax", help="activate function") parser.add_argument("-v", "--verbose", action="store_true", help="display each epoch on training") return parser.parse_args() if __name__ == '__main__': args = parse_args() try: df = pd.read_csv(args.dataset, header=None) except Exception as e: sys.exit(print("{}: {}".format(type(e).__name__, e))) X_train, X_test, Y_train, Y_test = preprocessing_data(df, args.activate) if args.model == "train": num_iterations = 56000 learning_rate = 0.007 layers_dims = [X_train.shape[0], 40, 20, 10, 5, 1 ] if args.activate == "sigmoid" else [ X_train.shape[0], 40, 20, 10, 5, 2 ] parameters = L_layer_model(X_train, Y_train, X_test, Y_test, layers_dims, learning_rate, num_iterations, args.activate,
def preprocessing():
    """Run the scraped-data preprocessing pipeline and acknowledge success.

    Returns:
        tuple: (JSON response body, 200) — a Flask-style response pair.
    """
    preprocessing_data()
    # Typo fix in the client-facing message: "succefuly" -> "successfully".
    return jsonify("news preprocessed successfully"), 200
import extraction
import preprocessing
import string_similarity

"""
The following code executes the contracting chain script and returns a CSV
with the contracting chain for all interadministrative contracts of INVIAS
"""

#Extraction
# Pull the raw entity and municipality-name frames from the source.
df_entity_raw, df_names_raw = extraction.extracting_data()

# Preprocessing
# Clean both frames before string matching.
df_entity_clean, names_mun_clean = preprocessing.preprocessing_data(
    df_entity_raw, df_names_raw)

# Small hard-coded sample used for a quick run; uncomment the line below the
# list to process the full cleaned name set instead.
test_names = [
    "HUILA - ALCALDÍA MUNICIPIO DE NEIVA",
    "SANTANDER - ALCALDÍA MUNICIPIO DE BUCARAMANGA",
    "VALLE DEL CAUCA - ALCALDÍA MUNICIPIO DE PALMIRA"
]
# test_names = names_mun_clean

# Chain construction
# The `3` argument's meaning is not visible here — presumably a similarity
# depth/top-k parameter; TODO confirm against string_similarity.
chain = string_similarity.contracting_chain(test_names, 3, df_entity_clean)

# Printing csv
chain.to_csv('contracting_chain.csv')
def preprocessing():
    """Run the preprocessing pipeline and acknowledge success.

    Returns:
        tuple: (JSON response body, 200) — a Flask-style response pair.
        The response text is French for "The news has been processed!!"
        (runtime string, left unchanged).
    """
    preprocessing_data()
    return jsonify("Les nouvelles traitées !!"), 200