def main(): train_path = "data/data.json" #test_path = "data/test.csv" dc_train = DataCleaning(train_path) #dc_test = DataCleaning(test_path) X_train, y_train = dc_train.clean() #X_test, y_test = dc_test.clean() # dc_train_reg = DataCleaning(train_path) # dc_test_reg = DataCleaning(test_path) # X_train_reg, y_train_reg = dc_train_reg.clean(regression=True) # X_test_reg, y_test_reg = dc_test_reg.clean(regression=True) train_col_names = dc_train.get_column_names() # train_col_names_reg = dc_train_reg.get_column_names() rf = RandomForestClassifier gb = GradientBoostingClassifier logr = LogisticRegression svm_model = svm.SVC pipe = Pipeline([gb]) pipe.fit_predict(X_train, y_train) pipe.print_cv_results(train_col_names, X_train, y_train) with open('model.pkl', 'w') as f: pickle.dump(pipe.trained_models[1], f)
def clean_text(self):
    logging.info("Text Cleaning")
    dc = DataCleaning()
    texts = []
    for line in tqdm(self.dataset["COMPLAINT_TEXT"], total=self.dataset.shape[0]):
        texts.append(dc.normalize(line))
    return texts
def main_deprecated():
    # This function is deprecated; please do not use it.
    print("This is main, alhumdulliah")

    ##### This block is for data cleaning #####
    missing_values = ["n/a", "na", "--", "?"]
    raw_data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                           delimiter=',', na_values=missing_values)
    # print(raw_data.head())                    # head of the data
    # print(raw_data.describe())                # statistics of numerical columns, e.g. count, mean, std, min, max
    # print(raw_data.shape)                     # shape of the dataset: (101766, 50)
    # print(raw_data["weight"].isnull().sum())  # number of null values in the weight column
    # print(raw_data["weight"].shape[0])        # number of rows in the weight column

    data_cleaning = DataCleaning()
    raw_data = data_cleaning.clean_columns(raw_data, missing_bound=.2)
    cols_having_missing_values = data_cleaning.get_cols_having_missing_values(
        raw_data, False)  # columns having missing values
    # raw_data.dtypes  # column data types
    raw_data = data_cleaning.fill_missing_values(
        raw_data, cols_having_missing_values)
    # print(get_cols_having_missing_values(raw_data, False))  # no columns with missing values remain

    raw_data = data_cleaning.just_remove_columns(
        raw_data,
        columns=["encounter_id", "patient_nbr", "admission_type_id",
                 "discharge_disposition_id", "admission_source_id",
                 "num_procedures"])

    df = raw_data
    my_util = Util()
    my_util.save_df(df, "../only_calculated_datasets/cleaned_df.pkl")
    print("Filled the missing values either by the mode or mean value")
def ingest():
    seed = 1000
    data = DataInitializer()
    data.initialize("data/train.csv")
    data = DataCleaning(data)
    data.cleanup(DataCleaner())
    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    data = data.processed_data[['sentiment', 'text']]
    print('dataset loaded with shape', data.shape)
    print("Distribution of sentiments: ",
          pd.Series(data["sentiment"]).value_counts())
    # data["sentiment"] = data["sentiment"].map(codes)
    return data
def clean():
    missing_values = ["n/a", "na", "--", "?"]
    raw_data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                           delimiter=',', na_values=missing_values)
    data_cleaning = DataCleaning()
    raw_data = data_cleaning.clean_columns(raw_data, missing_bound=.2)
    cols_having_missing_values = data_cleaning.get_cols_having_missing_values(
        raw_data, False)  # columns having missing values
    raw_data = data_cleaning.fill_missing_values(
        raw_data, cols_having_missing_values)
    raw_data = data_cleaning.just_remove_columns(
        raw_data,
        columns=["encounter_id", "patient_nbr", "admission_type_id",
                 "discharge_disposition_id", "admission_source_id",
                 "num_procedures"])
    df = raw_data
    my_util = Util()
    my_util.save_df(df, "../only_calculated_datasets/cleaned_df.pkl")
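# `Util` is project-specific and not shown in these snippets. A minimal sketch of
# what its `save_df` helper might look like, assuming it simply pickles a
# DataFrame to disk (class and method names come from the calls above; the body
# is an assumption):
import os

import pandas as pd


class Util:
    """Hypothetical helper that persists/loads DataFrames via pandas pickling."""

    def save_df(self, df: pd.DataFrame, path: str) -> None:
        directory = os.path.dirname(path)
        if directory:
            # Make sure the target folder exists before writing.
            os.makedirs(directory, exist_ok=True)
        df.to_pickle(path)

    def load_df(self, path: str) -> pd.DataFrame:
        # Counterpart loader, also an assumption.
        return pd.read_pickle(path)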
def clean_text(self, text):
    dc = DataCleaning()
    return [dc.normalize(text)]
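# `DataCleaning.normalize` is not defined in these snippets. A rough sketch of
# what a text normalizer like this commonly does (lowercasing, stripping URLs
# and punctuation, collapsing whitespace); the exact rules are an assumption:
import re


def normalize(text: str) -> str:
    text = text.lower()
    text = re.sub(r"https?://\S+", " ", text)  # drop URLs
    text = re.sub(r"[^a-z0-9\s]", " ", text)   # drop punctuation and symbols
    text = re.sub(r"\s+", " ", text).strip()   # collapse whitespace
    return text


# Example: normalize("Check THIS out: https://example.com!!") -> "check this out"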
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None):
    if duration:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/clean_test_with_sentiments.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/clean_train_with_sentiments.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)

    data = DataTokenize(data)
    data.tokenize()
    data.stem()

    data = WordList(data)
    data.build_wordlist(min_occurrences=min_occurrences)

    word2vec_data = data
    data = BagOfWords(data.processed_data, data.wordlist, is_testing)
    data.build_data_model()
    print("data model head: ", data.data_model.head(5))

    """
    Word 2 vec
    """
    word2vec = Word2VecProvider()
    # REPLACE PATH TO THE FILE
    word2vec.load("../twitter/data/glove.twitter.27B.200d.txt")

    word2vec_data = RedditData(word2vec_data)
    word2vec_data.build_final_model(word2vec)
    word2vec_data_model = word2vec_data.data_model

    if "index" in word2vec_data_model.columns:
        word2vec_data_model.drop("index", axis=1, inplace=True)
    word2vec_data_model.dropna(axis=0, inplace=True)
    word2vec_data_model.reset_index(inplace=True)
    word2vec_data_model.index = word2vec_data_model['timestamp_ms']

    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    """
    Tokenizing the data
    """
    texts = []
    sentiments = []
    tokenized_data = pd.DataFrame()
    for text in data.processed_data["summary"]:
        texts.append(text)
    for sentiment in data.processed_data["sentiment"]:
        sentiments.append(sentiment)
    print("texts: ", texts[0:5])

    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=200)

    print("\n\n##################################################\n"
          "padded sequence head: \n", padded_sequences[0:5])
    print("\n####################################################\n"
          " padded sequence length \n", len(padded_sequences))

    if not is_testing:
        data = Plotting(data)
        data.plot()

    if cache_bow_output is not None:
        data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
        word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    with open('sequences', 'wb') as fp:
        pickle.dump(padded_sequences, fp)
    with open('sentiments', 'wb') as fp:
        pickle.dump(sentiments, fp)

    return data.data_model, word2vec_data_model
# path to root folder
root = '../'

sys.path.insert(0, root + 'pre_clustering/')
import read_tcga_data

sys.path.insert(1, root + 'cleaning/')
from data_cleaning import DataCleaning

# Load data
gene_df = read_tcga_data.read_data(root)
print(gene_df.info())

# Clean data - uncomment desired method
cleaner = DataCleaning()
cleaner.check_sparsity(gene_df)
gene_df = cleaner.remove_sparsity(gene_df)

# Get the average standard deviation of the TCGA data features
STD = sum(gene_df.std()) / len(gene_df.std())

# Generate data with one cluster
X, y = make_blobs(n_samples=112, centers=1, n_features=14085,
                  cluster_std=STD, random_state=0)
generated_df = pd.DataFrame(X)

k_finder = KFinder()
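# `KFinder` is project-specific and not defined in these snippets. A minimal
# sketch of what such a cluster-count finder might do, assuming it wraps
# scikit-learn's KMeans and compares inertia across candidate k values (the
# class name comes from the snippet above; the method and body are assumptions):
import pandas as pd
from sklearn.cluster import KMeans


class KFinder:
    """Hypothetical k finder: fits KMeans for a range of k and reports inertia."""

    def find(self, df: pd.DataFrame, k_range=range(1, 11), random_state=0):
        inertias = {}
        for k in k_range:
            km = KMeans(n_clusters=k, random_state=random_state, n_init=10)
            km.fit(df)
            # Inertia = within-cluster sum of squared distances; look for the "elbow".
            inertias[k] = km.inertia_
        return inertias


# Example usage on the generated blob data above:
# k_finder = KFinder()
# print(k_finder.find(generated_df))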
import pandas as pd  # needed for pd.read_csv below

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from visualization import Visualization
from data_cleaning import DataCleaning

missing_values = ["n/a", "na", "--", "?"]
data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                   delimiter=',', na_values=missing_values)

data_cleaning = DataCleaning()
data = data_cleaning.clean_columns(data, missing_bound=0.2)
colsMissingValues = data_cleaning.get_cols_having_missing_values(data, False)
data = data_cleaning.fill_missing_values(data, colsMissingValues)

"""
data = data.values
features = []
for i in range(50):
    if isinstance(data[0][i], str):
        a = np.unique(data[:, i])
        features.append(a)
"""

data = data.to_numpy()
def preprocess(data_path, is_testing, min_occurrences=5, cache_bow_output=None,
               cache_word2vec_output=None, duration=None, sentiment_method=None):
    if duration and cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    elif cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, cache_bow_output=cache_bow_output,
                        cache_word2vec_output=cache_word2vec_output)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if not os.path.isfile("data/Train_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()
    if not os.path.isfile("data/Test_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data, sentiment_method=sentiment_method)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())

    if is_testing:
        data.processed_data.to_csv("data/one_month_clean_test_data_with_prices.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/one_month_clean_data_with_prices.csv",
                                   sep=',', encoding='utf-8', index=False)
        # os.remove(data_path)

    if os.path.isfile(cache_word2vec_output):
        print("cache_word2vec_output file name: ", cache_word2vec_output)
        word2vec_data_model = pd.read_csv(cache_word2vec_output)
        data.data_model = pd.read_csv(cache_bow_output)
        print("data model head: ", data.data_model.head(5))
    else:
        data = DataTokenize(data)
        data.tokenize()
        data.stem()

        data = WordList(data)
        data.build_wordlist(min_occurrences=min_occurrences)

        word2vec_data = data
        data = BagOfWords(data.processed_data, data.wordlist, is_testing)
        data.build_data_model()
        print("data model head: ", data.data_model.head(5))

        """
        Word 2 vec
        """
        word2vec = Word2VecProvider()
        # REPLACE PATH TO THE FILE
        word2vec.load("data/glove.twitter.27B.200d-with2num.txt")

        word2vec_data = TwitterData(word2vec_data)
        word2vec_data.build_final_model(word2vec)
        word2vec_data_model = word2vec_data.data_model

        if "original_id" in word2vec_data_model.columns:
            word2vec_data_model.drop("original_id", axis=1, inplace=True)
        word2vec_data_model.dropna(axis=0, inplace=True)
        word2vec_data_model.reset_index(inplace=True, drop=True)
        word2vec_data_model.index = word2vec_data_model['timestamp']

        print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    # if not is_testing:
    #     data = Plotting(data)
    #     data.plot()

    if not is_testing:
        if not os.path.isfile("train_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()
            for text in data.processed_data["text"]:
                texts.append(text)
            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)
            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_train_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            train_targets = data.processed_data[["close"]]
            print("shape of merged train data: ", merged_train_data.shape)

            with open('data/train_sequences', 'wb') as fp:
                pickle.dump(merged_train_data, fp)
            with open('data/train_prices', 'wb') as fp:
                pickle.dump(train_targets, fp)

            # load the whole embedding into memory
            embeddings_index = dict()
            with open("data/glove.twitter.27B.200d-with2num.txt", "r", encoding="utf-8") as my_file:
                for line in my_file:
                    values = line.split()
                    word = values[0]
                    coefs = numpy.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
            # f.close()
            print("*" * 80, "\n" * 10)
            print('Loaded %s train word vectors.' % len(embeddings_index))
            print('Total %s of word indexes.' % len(tokenizer.word_index))

            with open('data/embeddings_index', 'wb') as fp:
                pickle.dump(embeddings_index, fp)
            with open('data/train_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # encode class values as integers
            # encoder = LabelEncoder()
            # encoder.fit(sentiments)
            # encoded_sentiments = encoder.transform(sentiments)

            # convert integers to dummy variables (i.e. one hot encoded)
            # dummy_sentiments = np_utils.to_categorical(encoded_sentiments)

            # for text in data.processed_data.loc[data.processed_data['sentiment'] != 0, "text"]:
            #     texts.append(text)
            #
            # for sentiment in data.processed_data.loc[data.processed_data['sentiment'] != 0, "sentiment"]:
            #     sentiments.append(sentiment)
    else:
        if not os.path.isfile("test_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()
            for text in data.processed_data["text"]:
                texts.append(text)
            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)
            print("texts: ", texts[0:5])

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_test_data = pd.concat([
                padded_sequences,
                data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume", "weightedAverage"
                ]]
            ], axis=1)
            test_targets = data.processed_data[["close"]]
            print("shape of merged test data: ", merged_test_data.shape)

            with open('data/test_sequences', 'wb') as fp:
                pickle.dump(merged_test_data, fp)
            with open('data/test_prices', 'wb') as fp:
                pickle.dump(test_targets, fp)
            with open('data/test_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)
            # padded_sequences = pd.DataFrame(data=padded_sequences)

    print("\n\n##################################################\n"
          "padded sequence head: \n", padded_sequences[0:5])
    print("\n####################################################\n"
          " padded sequence length \n", len(padded_sequences))

    if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile(
            test_data_word2vec_file_name):
        if cache_bow_output is not None:
            data.data_model.to_csv(cache_bow_output, index=False, float_format="%.6f")
            word2vec_data_model.to_csv(cache_word2vec_output, index=False, float_format="%.6f")

    return data.data_model, word2vec_data_model
def main(): train_path = "../data/churn_train.csv" test_path = "../data/churn_test.csv" dc_train = DataCleaning(train_path) dc_test = DataCleaning(test_path) X_train, y_train = dc_train.clean() X_test, y_test = dc_test.clean() dc_train_reg = DataCleaning(train_path) dc_test_reg = DataCleaning(test_path) X_train_reg, y_train_reg = dc_train_reg.clean(regression=True) X_test_reg, y_test_reg = dc_test_reg.clean(regression=True) train_col_names = dc_train.get_column_names() train_col_names_reg = dc_train_reg.get_column_names() rf = RandomForestClassifier gb = GradientBoostingClassifier logr = LogisticRegression pipe = Pipeline([rf, gb]) pipe.fit_predict(X_train, y_train) pipe.print_cv_results(train_col_names, X_train, y_train) pipe2 = Pipeline([logr]) pipe2.fit_predict(X_train_reg, y_train_reg) pipe2.print_cv_results(train_col_names_reg, X_train_reg, y_train_reg) plot_rocs([pipe, pipe2], [[X_train, y_train], [X_train_reg, y_train_reg]]) test_scores = pipe.score(X_test, y_test)
def main():
    # Data cleaning
    missing_values = ["n/a", "na", "--", "?"]
    data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                       delimiter=',', na_values=missing_values)
    data_cleaning = DataCleaning()
    data = data_cleaning.clean_columns(data, missing_bound=0.2)
    colsMissingValues = data_cleaning.get_cols_having_missing_values(
        data, False)
    data = data_cleaning.fill_missing_values(data, colsMissingValues)
    # Data cleaning done

    data = data.to_numpy()
    le = LabelEncoder()
    for i in range(50):
        if isinstance(data[0][i], str):
            data[:, i] = le.fit_transform(data[:, i])
    print(data)
    print(data.shape)

    X_train, X_test = data[0:80000, 0:49], data[80000:101766, 0:49]
    Y_train, Y_test = data[0:80000, 49:50], data[80000:101766, 49:50]
    Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')
    print(X_train)
    print(X_train.shape)
    print(Y_train)
    print(Y_train.shape)

    grid_params = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [2, 4, 6],
        'min_samples_leaf': [0.02, 0.04],
        'min_samples_split': [0.2, 0.5, 0.8]
    }

    dt = DecisionTreeClassifier(random_state=50)

    # Builds a model for each possible combination of the hyperparameter values
    # provided, using 5-fold cross-validation (cv=5). Scoring is set to accuracy,
    # so the best model is chosen by its accuracy.
    grid_object = GridSearchCV(estimator=dt, param_grid=grid_params,
                               scoring='accuracy', cv=5, n_jobs=-1)

    print("\nHyper Parameter Tuning Begins\n")

    # Fit the grid object to the training data
    grid_object.fit(X_train, Y_train)

    print("\n\nBest Param Values \t\t\n\n")
    print(grid_object.best_params_)
    # ---- Hyperparameter tuning ends ----

    # ---- Report accuracy on the test set using the model with the best
    # parameters learned through hyperparameter tuning ----

    # Build a decision tree with the best parameters from the grid search
    best_params = grid_object.best_params_
    dt = DecisionTreeClassifier(
        criterion=best_params['criterion'],
        splitter=best_params['splitter'],
        max_depth=best_params['max_depth'],
        min_samples_leaf=best_params['min_samples_leaf'],
        min_samples_split=best_params['min_samples_split'],
        random_state=50)
    # dt = DecisionTreeClassifier(criterion='gini')
    dt.fit(X_train, Y_train)
    Y_pred = dt.predict(X_test)

    print("Accuracy score Test = ", accuracy_score(Y_test, Y_pred) * 100)
    print("Accuracy score 5-Fold = ", kFoldVal(X_train, Y_train, dt, 5))
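# `kFoldVal` is a project helper that is not shown here. A minimal sketch,
# assuming it just reports the mean k-fold cross-validated accuracy (in percent,
# to match the print above) for the given estimator -- the exact behavior is an
# assumption:
import numpy as np
from sklearn.model_selection import cross_val_score


def kFoldVal(X, y, estimator, k):
    # Mean accuracy over k folds, scaled to a percentage.
    scores = cross_val_score(estimator, X, np.ravel(y), cv=k, scoring='accuracy')
    return scores.mean() * 100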
import numpy as np
import pandas as pd

from data_cleaning import DataCleaning, accuracy, recall
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    df = pd.read_csv('data/churn_train.csv')

    # Create the dependent churn variable: a customer is labelled as churned
    # (churn = 1) if they have not used the service in the last month.
    condition = df['last_trip_date'] < '2014-06-01'
    df['churn'] = 1
    df.loc[~condition, 'churn'] = 0
    y = df['churn']

    clean = DataCleaning()
    df = clean.transform(df)

    # p = Pipeline([
    #     ('dc', DataCleaning()),
    #     ('rf', RandomForestClassifier())
    # ])

    # GridSearch for RF
    params = {
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 5, 7],
        'max_features': ['auto', 'sqrt', 'log2']
    }

    gb_params = {
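# The snippet above is truncated at `gb_params`. As a rough sketch of how the
# random-forest grid in `params` might be used with the cleaned `df` and churn
# labels `y` from that snippet -- the train/test split, scorer, and estimator
# wiring below are assumptions, not the original author's code:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42)

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42),
                       param_grid=params, scoring='recall',
                       cv=5, n_jobs=-1)
rf_grid.fit(X_train, y_train)

print("best RF params:", rf_grid.best_params_)
print("held-out recall:", rf_grid.score(X_test, y_test))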