def main():
    train_path = "data/data.json"
    #test_path = "data/test.csv"
    dc_train = DataCleaning(train_path)
    #dc_test = DataCleaning(test_path)
    X_train, y_train = dc_train.clean()
    #X_test, y_test = dc_test.clean()

    # dc_train_reg = DataCleaning(train_path)
    # dc_test_reg = DataCleaning(test_path)
    # X_train_reg, y_train_reg = dc_train_reg.clean(regression=True)
    # X_test_reg, y_test_reg = dc_test_reg.clean(regression=True)

    train_col_names = dc_train.get_column_names()
    # train_col_names_reg = dc_train_reg.get_column_names()

    rf = RandomForestClassifier
    gb = GradientBoostingClassifier
    logr = LogisticRegression
    svm_model = svm.SVC

    pipe = Pipeline([gb])
    pipe.fit_predict(X_train, y_train)
    pipe.print_cv_results(train_col_names, X_train, y_train)

    with open('model.pkl', 'w') as f:
        pickle.dump(pipe.trained_models[1], f)
 def clean_text(self):
     logging.info("Text Cleaning")
     dc = DataCleaning()
     texts = []
     for line in tqdm(self.dataset["COMPLAINT_TEXT"],
                      total=self.dataset.shape[0]):
         texts.append(dc.normalize(line))
     return texts
Exemple #3
0
def main_deprecated():
    # This is deprecated, never use this please.
    print("This is main, alhumdulliah")
    ##### This block is for data cleaning #####
    missing_values = ["n/a", "na", "--", "?"]
    raw_data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                           delimiter=',', na_values=missing_values)
    # print(raw_data.head()) # print head of the data
    # print(raw_data.describe()) # shows numerical columns statistics e.g. count, mean, std, min, max etc
    # print(raw_data.shape) # prints shape of the dataset (101766, 50)
    # print(raw_data["weight"].isnull().sum()) #prints number of null values in weight column
    # print(raw_data["weight"].shape[0]) #prints number of columns in weight column
    data_cleaning = DataCleaning()
    raw_data = data_cleaning.clean_columns(raw_data, missing_bound=.2)
    cols_having_missing_values = data_cleaning.get_cols_having_missing_values(
        raw_data, False)  # cols having missing values
    # raw_data.dtypes #shows the column data types
    raw_data = data_cleaning.fill_missing_values(
        raw_data, cols_having_missing_values)
    # print(get_cols_having_missing_values(raw_data, False)) #no columns with missing values
    raw_data = data_cleaning.just_remove_columns(raw_data, columns=[
                                                 "encounter_id", "patient_nbr", "admission_type_id", "discharge_disposition_id", "admission_source_id", "num_procedures"])
    df = raw_data
    my_util = Util()
    my_util.save_df(df, "../only_calculated_datasets/cleaned_df.pkl")
    print("Filled the missing values either by the mode or mean value")
def ingest():
    seed = 1000

    data = DataInitializer()
    data.initialize("data/train.csv")

    data = DataCleaning(data)
    data.cleanup(DataCleaner())

    data = Sentiments(data)
    data.sentiment_analysis_by_text()

    data = data.processed_data[['sentiment', 'text']]
    print('dataset loaded with shape', data.shape)
    print("Distribution of sentiments: ",
          pd.Series(data["sentiment"]).value_counts())

    # data["sentiment"] = data["sentiment"].map(codes)

    return data
Exemple #5
0
def clean():
    missing_values = ["n/a", "na", "--", "?"]
    raw_data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                           delimiter=',', na_values=missing_values)
    data_cleaning = DataCleaning()
    raw_data = data_cleaning.clean_columns(raw_data, missing_bound=.2)
    cols_having_missing_values = data_cleaning.get_cols_having_missing_values(
        raw_data, False)  # cols having missing values
    raw_data = data_cleaning.fill_missing_values(
        raw_data, cols_having_missing_values)
    raw_data = data_cleaning.just_remove_columns(raw_data, columns=[
                                                 "encounter_id", "patient_nbr", "admission_type_id", "discharge_disposition_id", "admission_source_id", "num_procedures"])
    df = raw_data
    my_util = Util()
    my_util.save_df(df, "../only_calculated_datasets/cleaned_df.pkl")
 def clean_text(self, text):
     dc = DataCleaning()
     return [dc.normalize(text)]
def preprocess(data_path,
               is_testing,
               min_occurrences=5,
               cache_bow_output=None,
               cache_word2vec_output=None,
               duration=None):
    if duration:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if os.path.isfile("data/BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data)
    data.sentiment_analysis_by_text()
    print("First five rows with sentiment: ", data.processed_data.head())
    if is_testing:
        data.processed_data.to_csv("data/clean_test_with_sentiments.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/clean_train_with_sentiments.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)

    data = DataTokenize(data)
    data.tokenize()
    data.stem()

    data = WordList(data)
    data.build_wordlist(min_occurrences=min_occurrences)

    word2vec_data = data
    data = BagOfWords(data.processed_data, data.wordlist, is_testing)
    data.build_data_model()
    print("data model head: ", data.data_model.head(5))
    """
    Word 2 vec
    """

    word2vec = Word2VecProvider()

    # REPLACE PATH TO THE FILE
    word2vec.load("../twitter/data/glove.twitter.27B.200d.txt")

    word2vec_data = RedditData(word2vec_data)
    word2vec_data.build_final_model(word2vec)

    word2vec_data_model = word2vec_data.data_model
    if "index" in word2vec_data_model.columns:
        word2vec_data_model.drop("index", axis=1, inplace=True)
    word2vec_data_model.dropna(axis=0, inplace=True)
    word2vec_data_model.reset_index(inplace=True)
    word2vec_data_model.index = word2vec_data_model['timestamp_ms']
    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")
    """
    Tokenizing the data
    """
    texts = []
    sentiments = []
    tokenized_data = pd.DataFrame()
    for text in data.processed_data["summary"]:
        texts.append(text)
    for sentiment in data.processed_data["sentiment"]:
        sentiments.append(sentiment)
    print("texts: ", texts[0:5])
    tokenizer = Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=200)

    print(
        "\n\n##################################################\npadded sequence head: \n",
        padded_sequences[0:5])
    print(
        "\n####################################################\n padded sequence length \n",
        len(padded_sequences))

    if not is_testing:
        data = Plotting(data)
        data.plot()

    if cache_bow_output is not None:
        data.data_model.to_csv(cache_bow_output,
                               index=False,
                               float_format="%.6f")
        word2vec_data_model.to_csv(cache_word2vec_output,
                                   index=False,
                                   float_format="%.6f")
        with open('sequences', 'wb') as fp:
            pickle.dump(padded_sequences, fp)
        with open('sentiments', 'wb') as fp:
            pickle.dump(sentiments, fp)

    return data.data_model, word2vec_data_model
#path to root folder
root = '../'

sys.path.insert(0, root + 'pre_clustering/')
import read_tcga_data

sys.path.insert(1, root + 'cleaning/')
from data_cleaning import DataCleaning

#Load data
gene_df = read_tcga_data.read_data(root)
print(gene_df.info())

#clean data - uncomment desired method
cleaner = DataCleaning()
cleaner.check_sparsity(gene_df)
gene_df = cleaner.remove_sparsity(gene_df)

#get avg std of tcga data features
STD = sum(gene_df.std()) / len(gene_df.std())

#generate data with one cluster
X, y = make_blobs(n_samples=112,
                  centers=1,
                  n_features=14085,
                  cluster_std=STD,
                  random_state=0)
generated_df = pd.DataFrame(X)

k_finder = KFinder()
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from visualization import Visualization
from data_cleaning import DataCleaning

missing_values = ["n/a", "na", "--", "?"]
data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                   delimiter=',',
                   na_values=missing_values)

data_cleaning = DataCleaning()
data = data_cleaning.clean_columns(data, missing_bound=0.2)

colsMissingValues = data_cleaning.get_cols_having_missing_values(data, False)
data = data_cleaning.fill_missing_values(data, colsMissingValues)
"""
data = data.values
features = []
for i in range(50):
	if isinstance(data[0][i], str):
		a = np.unique(data[:,i])
		features.append(a)
"""

data = data.to_numpy()
Exemple #10
0
def preprocess(data_path,
               is_testing,
               min_occurrences=5,
               cache_bow_output=None,
               cache_word2vec_output=None,
               duration=None,
               sentiment_method=None):
    if duration and cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path, is_testing, duration=duration)
    elif cache_bow_output and cache_word2vec_output:
        data = DataInitializer()
        data.initialize(data_path,
                        is_testing,
                        cache_bow_output=cache_bow_output,
                        cache_word2vec_output=cache_word2vec_output)
    else:
        data = DataInitializer()
        data.initialize(data_path, is_testing)

    if not os.path.isfile("data/Train_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    if not os.path.isfile("data/Test_BTC.csv"):
        prices_data = GetPricesData()
        prices_data.main()

    data = DataCleaning(data, is_testing)
    data.cleanup(DataCleaner(is_testing))

    if is_testing:
        print("Testing data shape:", data.processed_data.shape)
    else:
        print("Training data shape:", data.processed_data.shape)

    data = Sentiments(data, sentiment_method=sentiment_method)
    data.sentiment_analysis_by_text()

    print("First five rows with sentiment: ", data.processed_data.head())
    if is_testing:
        data.processed_data.to_csv(
            "data/one_month_clean_test_data_with_prices.csv",
            sep=',',
            encoding='utf-8',
            index=False)
        # os.remove(data_path)
    else:
        data.processed_data.to_csv("data/one_month_clean_data_with_prices.csv",
                                   sep=',',
                                   encoding='utf-8',
                                   index=False)
        # os.remove(data_path)

    if os.path.isfile(cache_word2vec_output):
        print("cache_word2vec_output file name: ", cache_word2vec_output)
        word2vec_data_model = pd.read_csv(cache_word2vec_output)
        data.data_model = pd.read_csv(cache_bow_output)
        print("data model head: ", data.data_model.head(5))
    else:
        data = DataTokenize(data)
        data.tokenize()
        data.stem()

        data = WordList(data)
        data.build_wordlist(min_occurrences=min_occurrences)

        word2vec_data = data
        data = BagOfWords(data.processed_data, data.wordlist, is_testing)
        data.build_data_model()
        print("data model head: ", data.data_model.head(5))
        """
        Word 2 vec
        """

        word2vec = Word2VecProvider()

        # REPLACE PATH TO THE FILE
        word2vec.load("data/glove.twitter.27B.200d-with2num.txt")
        word2vec_data = TwitterData(word2vec_data)
        word2vec_data.build_final_model(word2vec)
        word2vec_data_model = word2vec_data.data_model

        if "original_id" in word2vec_data_model.columns:
            word2vec_data_model.drop("original_id", axis=1, inplace=True)
        word2vec_data_model.dropna(axis=0, inplace=True)
        word2vec_data_model.reset_index(inplace=True, drop=True)
        word2vec_data_model.index = word2vec_data_model['timestamp']

    print("final word2vec data model: \n", word2vec_data_model.head(), "\n")

    # if not is_testing:
    #     data = Plotting(data)
    #     data.plot()

    if not is_testing:
        if not os.path.isfile("train_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()

            for text in data.processed_data["text"]:
                texts.append(text)

            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)

            print("texts: ", texts[0:5])
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences,
                                             maxlen=20,
                                             padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_train_data = pd.concat([
                padded_sequences, data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume",
                    "weightedAverage"
                ]]
            ],
                                          axis=1)
            train_targets = data.processed_data[["close"]]
            print("shape of merged train data: ", merged_train_data.shape)

            with open('data/train_sequences', 'wb') as fp:
                pickle.dump(merged_train_data, fp)
            with open('data/train_prices', 'wb') as fp:
                pickle.dump(train_targets, fp)

            # load the whole embedding into memory
            embeddings_index = dict()
            with open("data/glove.twitter.27B.200d-with2num.txt",
                      "r",
                      encoding="utf-8") as my_file:
                for line in my_file:
                    values = line.split()
                    word = values[0]
                    coefs = numpy.asarray(values[1:], dtype='float32')
                    embeddings_index[word] = coefs
            # f.close()
            print("*" * 80, "\n" * 10)
            print('Loaded %s train word vectors.' % len(embeddings_index))
            print('Total %s of word indexes.' % len(tokenizer.word_index))

            with open('data/embeddings_index', 'wb') as fp:
                pickle.dump(embeddings_index, fp)
            with open('data/train_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # encode class values as integers
            # encoder = LabelEncoder()
            # encoder.fit(sentiments)
            # encoded_sentiments = encoder.transform(sentiments)

            # convert integers to dummy variables (i.e. one hot encoded)
            # dummy_sentiments = np_utils.to_categorical(encoded_sentiments)

            # for text in data.processed_data.loc[data.processed_data['sentiment'] != 0, "text"]:
            #     texts.append(text)
            #
            # for sentiment in data.processed_data.loc[data.processed_data['sentiment'] != 0, "sentiment"]:
            #     sentiments.append(sentiment)

    else:
        if not os.path.isfile("test_sequences"):
            print("\n##########################\n"
                  "Tokenizing the tweets\n"
                  "############################\n")
            texts = []
            sentiments = []
            tokenized_data = pd.DataFrame()

            for text in data.processed_data["text"]:
                texts.append(text)

            for sentiment in data.processed_data['sentiment']:
                sentiments.append(sentiment)

            print("texts: ", texts[0:5])
            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(texts)
            sequences = tokenizer.texts_to_sequences(texts)
            padded_sequences = pad_sequences(sequences,
                                             maxlen=20,
                                             padding='post')
            padded_sequences = pd.DataFrame(data=padded_sequences)

            merged_test_data = pd.concat([
                padded_sequences, data.processed_data[[
                    "high", "low", "open", "quoteVolume", "volume",
                    "weightedAverage"
                ]]
            ],
                                         axis=1)
            test_targets = data.processed_data[["close"]]
            print("shape of merged test data: ", merged_test_data.shape)

            with open('data/test_sequences', 'wb') as fp:
                pickle.dump(merged_test_data, fp)
            with open('data/test_prices', 'wb') as fp:
                pickle.dump(test_targets, fp)
            with open('data/test_word_indexes', 'wb') as fp:
                pickle.dump(tokenizer.word_index, fp)

            # padded_sequences = pd.DataFrame(data=padded_sequences)

    print(
        "\n\n##################################################\npadded sequence head: \n",
        padded_sequences[0:5])
    print(
        "\n####################################################\n padded sequence length \n",
        len(padded_sequences))

    if not os.path.isfile(train_data_word2vec_file_name) or not os.path.isfile(
            test_data_word2vec_file_name):
        if cache_bow_output is not None:
            data.data_model.to_csv(cache_bow_output,
                                   index=False,
                                   float_format="%.6f")
            word2vec_data_model.to_csv(cache_word2vec_output,
                                       index=False,
                                       float_format="%.6f")
    return data.data_model, word2vec_data_model
Exemple #11
0
def main():
    train_path = "../data/churn_train.csv"
    test_path = "../data/churn_test.csv"
    dc_train = DataCleaning(train_path)
    dc_test = DataCleaning(test_path)
    X_train, y_train = dc_train.clean()
    X_test, y_test = dc_test.clean()

    dc_train_reg = DataCleaning(train_path)
    dc_test_reg = DataCleaning(test_path)
    X_train_reg, y_train_reg = dc_train_reg.clean(regression=True)
    X_test_reg, y_test_reg = dc_test_reg.clean(regression=True)

    train_col_names = dc_train.get_column_names()
    train_col_names_reg = dc_train_reg.get_column_names()

    rf = RandomForestClassifier
    gb = GradientBoostingClassifier
    logr = LogisticRegression

    pipe = Pipeline([rf, gb])
    pipe.fit_predict(X_train, y_train)
    pipe.print_cv_results(train_col_names, X_train, y_train)

    pipe2 = Pipeline([logr])
    pipe2.fit_predict(X_train_reg, y_train_reg)
    pipe2.print_cv_results(train_col_names_reg, X_train_reg, y_train_reg)

    plot_rocs([pipe, pipe2], [[X_train, y_train], [X_train_reg, y_train_reg]])

    test_scores = pipe.score(X_test, y_test)
Exemple #12
0
def main():
    #Data Cleaning

    missing_values = ["n/a", "na", "--", "?"]
    data = pd.read_csv('../dataset_diabetes/diabetic_data.csv',
                       delimiter=',',
                       na_values=missing_values)

    data_cleaning = DataCleaning()
    data = data_cleaning.clean_columns(data, missing_bound=0.2)

    colsMissingValues = data_cleaning.get_cols_having_missing_values(
        data, False)
    data = data_cleaning.fill_missing_values(data, colsMissingValues)

    #Data Cleaning Done

    data = data.to_numpy()

    le = LabelEncoder()

    for i in range(50):
        if isinstance(data[0][i], str):
            data[:, i] = le.fit_transform(data[:, i])

    print(data)
    print(data.shape)

    X_train, X_test = data[0:80000, 0:49], data[80000:101766, 0:49]
    Y_train, Y_test = data[0:80000, 49:50], data[80000:101766, 49:50]
    Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

    print(X_train)
    print(X_train.shape)
    print(Y_train)
    print(Y_train.shape)

    grid_params = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [2, 4, 6],
        'min_samples_leaf': [0.02, 0.04],
        'min_samples_split': [0.2, 0.5, 0.8]
    }

    dt = DecisionTreeClassifier(random_state=50)

    # Builds a model for each possible combination of all the hyperparamter values provided using cv = 5 (5 fold cross validation)
    # cv = 5, builds a 5 fold cross validated GridSearch Object
    # Set scoring parameter as accuracy as we choose the best model based on the accuracy value
    grid_object = GridSearchCV(estimator=dt,
                               param_grid=grid_params,
                               scoring='accuracy',
                               cv=5,
                               n_jobs=-1)

    print "\nHyper Parameter Tuning Begins\n"
    # fit grid object to the training data
    grid_object.fit(X_train, Y_train)

    print "\n\nBest Param Values \t\t\n\n"
    print(grid_object.best_params_)

    #---- Hyper Parameter Tuning Ends ----

    #----- Reporting Accuracy on Test Set using Model with Best Parameters learned through Hyper Parameter Tuning -----------

    #Building Decision Tree With Best Parameters learned from Hyper Parameter Tuning
    best_params = grid_object.best_params_
    dt = DecisionTreeClassifier(
        criterion=best_params['criterion'],
        splitter=best_params['splitter'],
        max_depth=best_params['max_depth'],
        min_samples_leaf=best_params['min_samples_leaf'],
        min_samples_split=best_params['min_samples_split'],
        random_state=50)

    #dt = DecisionTreeClassifier(criterion='gini')
    dt.fit(X_train, Y_train)
    Y_pred = dt.predict(X_test)

    print "Accuracy score Test = ", accuracy_score(Y_test, Y_pred) * 100

    print "Accuracy score 5-Fold = ", kFoldVal(X_train, Y_train, dt, 5)
import numpy as np
import pandas as pd
from data_cleaning import DataCleaning, accuracy, recall
from sklearn.model_selection import train_test_split

if __name__ == '__main__':
    df = pd.read_csv('data/churn_train.csv')
    # creating dependent churn variables
    # labelled customers churned if they hadn't used the service in the last
    # month

    condition = df['last_trip_date'] < '2014-06-01'
    df['churn'] = 1
    df.loc[~condition, 'churn'] = 0
    y = df['churn']
    clean = DataCleaning()
    df = clean.transform(df)

    #p = Pipeline([
    #    ('dc', DataCleaning()),
    #    ('rf', RandomForestClassifier())
    #])

    # GridSearch for RF
    params = {
        'n_estimators': [100, 200, 500],
        'max_depth': [3, 5, 7],
        'max_features': ['auto', 'sqrt', 'log2']
    }

    gb_params = {