def train_and_predict(train_df, test_df):
    # clean the data
    cleaner = DataCleaner()
    cleaner.columns_with_no_nan(train_df)
    cleaner.columns_with_no_nan(test_df)
    train_df = cleaner.drop_columns(train_df)
    train_df = cleaner.resolve_nan(train_df)
    test_df = cleaner.drop_columns(test_df)
    test_df = cleaner.resolve_nan(test_df)

    # features engineering
    train_df, test_df = engineer_features(train_df, test_df)

    # build the model from Model
    model = Classifier()
    model = model.model()

    # LabelEncoding/OneHotEncoding
    train_df = model.encode(train_df)
    test_df = model.encode(test_df)

    # training progress and results
    model = model.train(model, train_df)

    # predict on test_df with predict method from Model
    y_test = model.predict(model, test_df)
    return y_test
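The encode step above hints at label/one-hot encoding without showing it. A minimal sketch of a consistent train/test label encoding with scikit-learn, assuming pandas DataFrames; the helper name encode_labels is hypothetical, not part of the project:

import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_labels(train_df, test_df, columns):
    """Fit each encoder on the union of train and test values so both
    frames share the same integer codes (hypothetical helper)."""
    for col in columns:
        le = LabelEncoder()
        combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
        le.fit(combined)
        train_df[col] = le.transform(train_df[col].astype(str))
        test_df[col] = le.transform(test_df[col].astype(str))
    return train_df, test_df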
def get_cleaned_data(read_cleaned_csv, file_name):
    if file_name == "":
        file_name = Constants.FILENAME
    if not read_cleaned_csv:
        df_311 = utilFor311.read_data_from_csv(file_name)
        df_311 = DataCleaner.remove_space_from_col_names(df_311)
        df_311 = DataCleaner.drop_unwanted_cols(df_311, Constants.DROP_COLS1)
        df_311 = DataCleaner.drop_below_threshold(df_311)
        df_311 = DataCleaner.filter_frequent_request_types(df_311)
        df_311 = DataCleaner.capitalize_cols(df_311, Constants.CAPITALIZE_COLS)
        df_311 = DataCleaner.format_zip_code(df_311)
        df_311 = DataCleaner.update_burrow_city_from_zip_code(df_311)
        df_311 = DataCleaner.drop_empty_null_values(df_311)
        df_311 = DataCleaner.calculate_time_to_resolve_in_seconds(df_311)
        cleaned_df = DataCleaner.create_separate_day_month_year_col(df_311)
        cleaned_df = utilFor311.rearrange_cols(cleaned_df)
    else:
        cleaned_df = utilFor311.read_data_from_csv(file_name)
    return cleaned_df
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, f1_score

from question_query import create_questions_df
from answer_query import create_answers_df
from data_cleaning import DataCleaner
from model_tester import FindOptimalModels

if __name__ == '__main__':
    numrows = 1e6
    print("Connecting and getting ~{} rows".format(numrows))
    a = create_answers_df(numrows)
    print("Got rows, cleaning data")
    a_train_dc = DataCleaner(a, questions=False, training=True,
                             simple_regression=True, time_split=False,
                             normalize=False)
    A, b = a_train_dc.get_clean()
    default_models = [RandomForestRegressor, GradientBoostingRegressor]
    param_dict = {'rf': {'n_estimators': [50, 100, 5000],
                         'max_depth': [2, 3, 5]},
                  'gbr': {'learning_rate': [.001, .01, .1, .2],
                          'max_depth': [2, 3, 5],
                          'n_estimators': [50, 100, 5000]}}
    print('Finding optimal models')
    finder = FindOptimalModels(A, b, question=False, time_split=False)
    finder.baseline_model()
    fitted_models = finder.run_default_models(default_models)
    print("starting grid search")
    opt_params = finder.run_grid_search(fitted_models, param_dict)
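FindOptimalModels is project code, but its grid-search step most likely wraps the GridSearchCV already imported above. A sketch of what run_grid_search might do for a single fitted estimator; the actual implementation may differ:

def run_grid_search_sketch(model, param_grid, X, y):
    # Exhaustive cross-validated search over param_grid (3-fold here);
    # scoring choice is an assumption, not taken from the project.
    search = GridSearchCV(model, param_grid, cv=3,
                          scoring='neg_mean_squared_error', n_jobs=-1)
    search.fit(X, y)
    return search.best_params_, search.best_score_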
import os

import colorama
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# project-local modules (import paths assumed)
import data_cleaning
from data_cleaning import DataCleaner
from java_file_inspector import JavaFileInspector


def main():
    colorama.init()
    dir_path = os.path.dirname(os.path.realpath(__file__))
    inspector = JavaFileInspector("{}/comment_resources".format(dir_path))
    comments_data = inspector.get_comments()
    comments_file = open("comments.txt", "w+")
    comments = []
    print("Comment data obtained")
    for key, value in comments_data.items():
        print(colorama.Fore.GREEN + "[FILE] : " + key)
        comments_file.write("** [FILE] : " + key + "\n")
        for item in value:
            # keep only unique, sufficiently long, non-copyright comments
            if item.comment not in comments:
                if len(item.comment.split()) > 10:
                    if not DataCleaner.check_if_copyright(item.comment):
                        print(colorama.Fore.YELLOW + "   [COMMENT] : " + item.comment + "\n")
                        comments_file.write("   [COMMENT] : " + item.comment + "\n")
                        comments.append(item.comment)
        comments_file.write("\n")
        print(colorama.Fore.RESET)
    print("Corpus preparation done. Size: ", len(comments))
    comments_file.write("Found {} comments in {} files".format(
        len(comments), len(comments_data)))
    comments_file.close()

    count_vector = CountVectorizer()
    count_result = count_vector.fit_transform(comments)
    print("CountVectorizer Array:")
    print(count_result.toarray())

    # ngram_range lower bound must be at least 1; the original (0, 1)
    # produces spurious empty-string n-grams
    vector = TfidfVectorizer(analyzer='word', stop_words='english',
                             lowercase=True, max_features=5000,
                             tokenizer=data_cleaning.data_tokenize_clean,
                             strip_accents='ascii', ngram_range=(1, 1))
    tfidf_v = vector.fit_transform(comments)
    # get_feature_names was renamed get_feature_names_out in scikit-learn >= 1.2
    print(colorama.Fore.YELLOW + "Number of feature names : ", len(vector.get_feature_names()))
    print(colorama.Fore.YELLOW + "Number of comments : ", len(comments))
    print(colorama.Fore.RESET)
    print("TF-IDF Vectorizer size : ", len(tfidf_v.toarray()))

    db = DBSCAN(eps=0.7, min_samples=3).fit(tfidf_v)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # remove noise (cluster id -1) from the cluster count
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('Estimated number of clusters: %d' % n_clusters_)

    cluster_data = {}
    avg = []
    for index, item in enumerate(labels, start=0):
        if cluster_data.get(item) is None:
            cluster_data[item] = []
        cluster_data[item].append(comments[index])
    print("Cluster data prepared")

    cluster_file = open("comments_cluster_dbscan.txt", "w+")
    for index, data in cluster_data.items():
        if index == -1:
            continue
        print(colorama.Fore.YELLOW + "Cluster ", index)
        cluster_file.write("** [CLUSTER] : {} \n".format(index))
        for comment in data:
            cluster_file.write("   [COMMENT] : " + comment + "\n")
            print(colorama.Fore.GREEN + comment + "\n")
        print("TOTAL : ", len(data))
        print(colorama.Fore.YELLOW + "** END **\n")
        cluster_file.write("\n")
        avg.append(len(data))
    cluster_file.close()

    total = 0
    for i in avg:
        total = total + i
    print("AVERAGE: ", total / len(avg))

    plt.subplot(121)
    plt.imshow(count_result.toarray())
    plt.colorbar()
    plt.title("CountVectorizer")
    plt.subplot(122)
    plt.imshow(tfidf_v.toarray())
    plt.colorbar()
    plt.title("TF-IDF Vectorizer")
    plt.show()
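DBSCAN labels noise points as -1, so one hedged way to sanity-check the clustering above is a silhouette score over the non-noise points. This helper is an addition, not part of the original script:

from sklearn.metrics import silhouette_score

def dbscan_silhouette(tfidf_matrix, labels):
    # exclude noise points before scoring; cosine distance matches
    # the TF-IDF representation better than Euclidean
    mask = labels != -1
    if len(set(labels[mask])) < 2:
        return None  # silhouette needs at least two clusters
    return silhouette_score(tfidf_matrix[mask], labels[mask], metric='cosine')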
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, f1_score

from question_query import create_questions_df
from answer_query import create_answers_df
from data_cleaning import DataCleaner
from model_tester import FindOptimalModels

if __name__ == '__main__':
    numrows = 1e6
    print("Connecting and getting ~{} rows".format(numrows))
    q = create_questions_df(numrows)
    print("Got rows, cleaning data")
    q_train_dc = DataCleaner(q, questions=True, training=True,
                             simple_regression=True, time_split=True,
                             normalize=True)
    X, y = q_train_dc.get_clean()
    default_models = [RandomForestRegressor, GradientBoostingRegressor]
    param_dict = {'rf': {'n_estimators': [50, 100, 5000],
                         'max_depth': [2, 3, 5]},
                  'gbr': {'learning_rate': [.001, .01, .1, .2],
                          'max_depth': [2, 3, 5],
                          'n_estimators': [50, 100, 5000]}}
def print_valid_entry_count_for_each_col(cleaned_df):
    missing_value_count_df = DataCleaner.get_missing_value_count(cleaned_df)
    print_df_row_as_dict(missing_value_count_df.collect()[0])
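get_missing_value_count is project code, but the .collect() call above suggests a PySpark DataFrame. A sketch of how a per-column null count could be computed under that assumption; the project's implementation may differ:

from pyspark.sql import functions as F

def get_missing_value_count_sketch(df):
    # one row, one column per input column, each holding the null count
    return df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c)
                      for c in df.columns])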
gps = pd.read_csv('./data/Longitud_Latitud.csv')

# Create sub_area categorical with all levels shared
# between train and test to avoid errors
test['price_doc'] = -99
merged = pd.concat([train, test], axis=0)
merged = merged.merge(gps, how='left', on='sub_area')
merged['sub_area'] = merged.sub_area.astype('category')
train = merged[merged.price_doc != -99]
test = merged[merged.price_doc == -99]
test.pop('price_doc')

macro = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train = train.merge(macro, how='left', on='timestamp',
                    suffixes=('_train', '_macro'))

# Clean
dc = DataCleaner(data=train, sample_rate=0.3)
data, y = dc.clean()
y = np.array(y)
y = np.log(y + 1)

# Train / test split
data_train, data_test, y_train, y_test = train_test_split(data, y,
                                                          random_state=77)
house_ids_test = data_test.id

# Featurize training data set
feat_train = Featurizer()
X_train = feat_train.featurize(data_train)

# Grid search tune all estimators
ms = ModelSelector()
print(' # {} | X_train shape: {}'.format(now(), X_train.shape))
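Since the target above is transformed with np.log(y + 1) (equivalently np.log1p), any prediction comes out on the log scale and must be mapped back with np.expm1 before being reported as a price. A sketch with illustrative names, fitted_model and X_test_feat standing in for whatever the ModelSelector search returns and the featurized test split:

# hypothetical: fitted_model and X_test_feat are not from the script above
preds_log = fitted_model.predict(X_test_feat)
preds_price = np.expm1(preds_log)  # exact inverse of np.log1p / log(y + 1)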
import numpy as np
from data_cleaning import DataCleaner
from features_engineering import FeatureExtractor
from model_selection import ModelSelector
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

plt.interactive(True)

if __name__ == '__main__':
    # read and clean the data
    dc = DataCleaner()
    data = dc.clean()

    # Debug transformations
    # data.to_csv('./data/debug.csv', index=False, encoding='latin1')
    # assert False

    # separate target variable
    target = data.pop('Target')

    # train test split
    data_train, data_test, target_train, target_test = train_test_split(
        data, target)

    # featurize data
    featurizer = FeatureExtractor()
    X_train = featurizer.featurize(data_train)
    X_test = featurizer.featurize(data_test)

    # Convert to numpy arrays
    y_train = np.array(target_train)
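The script is truncated above; if it continues past the conversion step, a quick baseline fit might look like the following. RandomForestRegressor is illustrative only (the project's ModelSelector may use other estimators), and a regression target is assumed:

from sklearn.ensemble import RandomForestRegressor

# hypothetical continuation of the truncated script above
y_test = np.array(target_test)
baseline = RandomForestRegressor(n_estimators=100, random_state=0)
baseline.fit(X_train, y_train)
print('baseline R^2:', baseline.score(X_test, y_test))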
test = pd.read_csv('./data/test.csv')
gps = pd.read_csv('./data/Longitud_Latitud.csv')

# Create sub_area categorical with all levels shared
# between train and test to avoid errors
test['price_doc'] = -99
merged = pd.concat([train, test], axis=0)
merged = merged.merge(gps, how='left', on='sub_area')
merged['sub_area'] = merged.sub_area.astype('category')
train = merged[merged.price_doc != -99]

macro = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train = train.merge(macro, how='left', on='timestamp',
                    suffixes=('_train', '_macro'))

dc = DataCleaner(data=train)
train, y = dc.clean()
y = np.array(y)
y = np.log(y + 1)

# Featurize training data set
feat_train = Featurizer()
train = feat_train.featurize(train)
print('train shape', train.shape)

# # Remove all categorical variables for now
# mask = ~(train.dtypes == 'object').values
# train = train.iloc[:, mask]
# print('train shape with only numerical features', train.shape)