def train_and_predict(train_df, test_df):

    # data cleaning: drop columns and resolve NaNs in train and test
    cleaner = DataCleaner()
    cleaner.columns_with_no_nan(train_df)
    cleaner.columns_with_no_nan(test_df)
    train_df = cleaner.drop_columns(train_df)
    train_df = cleaner.resolve_nan(train_df)
    test_df = cleaner.drop_columns(test_df)
    test_df = cleaner.resolve_nan(test_df)


    # feature engineering
    train_df, test_df = engineer_features(train_df, test_df)

    # build the model from Classifier
    clf = Classifier()
    model = clf.model()

    # encode categorical features (label encoding / one-hot encoding)
    train_df = model.encode(train_df)
    test_df = model.encode(test_df)

    # train the model
    model = model.train(model, train_df)

    # predict on test_df with predict method from Model
    y_test = model.predict(model, test_df)
    return y_test
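
# A minimal usage sketch for train_and_predict; the CSV paths are hypothetical
# and the files are assumed to contain the columns that DataCleaner and
# engineer_features expect:
#
#     import pandas as pd
#
#     train_df = pd.read_csv("train.csv")
#     test_df = pd.read_csv("test.csv")
#     predictions = train_and_predict(train_df, test_df)
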
def get_cleaned_data(read_cleaned_csv, file_name):
    if file_name == "":
        file_name = Constants.FILENAME
    if not read_cleaned_csv:
        df_311 = utilFor311.read_data_from_csv(file_name)
        df_311 = DataCleaner.remove_space_from_col_names(df_311)
        df_311 = DataCleaner.drop_unwanted_cols(df_311, Constants.DROP_COLS1)
        df_311 = DataCleaner.drop_below_threshold(df_311)
        df_311 = DataCleaner.filter_frequent_request_types(df_311)
        df_311 = DataCleaner.capitalize_cols(df_311, Constants.CAPITALIZE_COLS)
        df_311 = DataCleaner.format_zip_code(df_311)
        df_311 = DataCleaner.update_burrow_city_from_zip_code(df_311)
        df_311 = DataCleaner.drop_empty_null_values(df_311)
        df_311 = DataCleaner.calculate_time_to_resolve_in_seconds(df_311)
        cleaned_df = DataCleaner.create_separate_day_month_year_col(df_311)
        cleaned_df = utilFor311.rearrange_cols(cleaned_df)
    else:
        cleaned_df = utilFor311.read_data_from_csv(file_name)
    return cleaned_df
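
# Usage sketch for get_cleaned_data; "cleaned_311.csv" is a hypothetical path,
# and passing an empty file_name falls back to Constants.FILENAME:
#
#     # run the full cleaning pipeline on the raw 311 export
#     cleaned_df = get_cleaned_data(read_cleaned_csv=False, file_name="")
#
#     # or load a CSV that has already been cleaned
#     cleaned_df = get_cleaned_data(read_cleaned_csv=True, file_name="cleaned_311.csv")
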
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, f1_score

from question_query import create_questions_df
from answer_query import create_answers_df
from data_cleaning import DataCleaner
from model_tester import FindOptimalModels


if __name__ == '__main__':
    numrows = 1e6
    print("Connecting and getting ~{}".format(numrows))
    a = create_answers_df(numrows)
    print("Got rows, cleaning data")
    a_train_dc = DataCleaner(a, questions=False, training=True,
                             simple_regression=True, time_split=False,
                             normalize=False)
    A, b = a_train_dc.get_clean()

    default_models = [RandomForestRegressor, GradientBoostingRegressor]

    param_dict = {'rf': {'n_estimators': [50, 100, 5000],
                         'max_depth': [2, 3, 5]},
                  'gbr': {'learning_rate': [.001, .01, .1, .2],
                          'max_depth': [2, 3, 5],
                          'n_estimators': [50, 100, 5000]}}
    print('Finding optimal models')
    finder = FindOptimalModels(A, b, question=False, time_split=False)
    finder.baseline_model()
    fitted_models = finder.run_default_models(default_models)
    print("starting grid search")
    opt_params = finder.run_grid_search(fitted_models, param_dict)

import os

import colorama
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# project-local modules; the exact import paths are assumed from how the names
# are used below
import data_cleaning
from data_cleaning import DataCleaner
from java_file_inspector import JavaFileInspector


def main():
    colorama.init()
    dir_path = os.path.dirname(os.path.realpath(__file__))
    inspector = JavaFileInspector("{}/comment_resources".format(dir_path))

    comments_data = inspector.get_comments()
    comments_file = open("comments.txt", "w+")
    comments = []
    print("Comment data obtained")

    for key, value in comments_data.items():
        print(colorama.Fore.GREEN + "[FILE] : " + key)
        comments_file.write("** [FILE] : " + key + "\n")

        for item in value:
            if item.comment not in comments:
                if len(item.comment.split()) > 10:
                    if not DataCleaner.check_if_copyright(item.comment):
                        print(colorama.Fore.YELLOW + "      [COMMENT] : " +
                              item.comment + "\n")
                        comments_file.write("      [COMMENT] : " +
                                            item.comment + "\n")
                        comments.append(item.comment)
        comments_file.write("\n")

    print(colorama.Fore.RESET)
    print("Corpus preparation done. Size: ", len(comments))
    comments_file.write("Found {} comments in {} files".format(
        len(comments), len(comments_data.items())))
    comments_file.close()

    count_vector = CountVectorizer()
    count_result = count_vector.fit_transform(comments)
    print("CountVectorizer Array:")
    print(count_result.toarray())

    vector = TfidfVectorizer(analyzer='word',
                             stop_words='english',
                             lowercase=True,
                             max_features=5000,
                             tokenizer=data_cleaning.data_tokenize_clean,
                             strip_accents='ascii',
                             ngram_range=(1, 1))
    tfidf_v = vector.fit_transform(comments)

    print(colorama.Fore.YELLOW + "Number of feature names : ",
          len(vector.get_feature_names()))
    print(colorama.Fore.YELLOW + "Number of comments : ", len(comments))
    print(colorama.Fore.RESET)

    print("TF-IDF Vectorizer size : ", len(tfidf_v.toarray()))

    db = DBSCAN(eps=0.7, min_samples=3).fit(tfidf_v)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # number of clusters, ignoring the noise label (-1)
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)

    cluster_data = {}
    avg = []
    for index, item in enumerate(labels, start=0):
        if cluster_data.get(item) is None:
            cluster_data[item] = []

        cluster_data[item].append(comments[index])

    print("Cluster data prepared")
    cluster_file = open("comments_cluster_dbscan.txt", "w+")

    for index, data in cluster_data.items():
        if index == -1:
            continue

        print(colorama.Fore.YELLOW + "Cluster ", index)
        cluster_file.write("** [CLUSTER] : {} \n".format(index))
        for comment in data:
            cluster_file.write("      [COMMENT] : " + comment + "\n")
            print(colorama.Fore.GREEN + comment + "\n")

        print("TOTAL : ", len(data))
        print(colorama.Fore.YELLOW + "** END **\n")
        cluster_file.write("\n")
        avg.append(len(data))

    cluster_file.close()

    # average cluster size (noise points were skipped above)
    print("AVERAGE: ", sum(avg) / len(avg))

    plt.subplot(121)
    plt.imshow(count_result.toarray())
    plt.colorbar()
    plt.title("CountVectorizer")

    plt.subplot(122)
    plt.imshow(tfidf_v.toarray())
    plt.colorbar()
    plt.title("TF-IDF Vectorizer")

    plt.show()
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, f1_score

from question_query import create_questions_df
from answer_query import create_answers_df
from data_cleaning import DataCleaner
from model_tester import FindOptimalModels

if __name__ == '__main__':
    numrows = 1e6
    print("Connecting and getting ~{}".format(numrows))
    q = create_questions_df(numrows)
    print("Got rows, cleaning data")
    q_train_dc = DataCleaner(q,
                             questions=True,
                             training=True,
                             simple_regression=True,
                             time_split=True,
                             normalize=True)
    X, y = q_train_dc.get_clean()

    default_models = [RandomForestRegressor, GradientBoostingRegressor]

    param_dict = {
        'rf': {
            'n_estimators': [50, 100, 5000],
            'max_depth': [2, 3, 5]
        },
        'gbr': {
            'learning_rate': [.001, .01, .1, .2],
            'max_depth': [2, 3, 5],
            'n_estimators': [50, 100, 5000]
        }
    }

def print_valid_entry_count_for_each_col(cleaned_df):
    missing_value_count_df = DataCleaner.get_missing_value_count(cleaned_df)
    print_df_row_as_dict(missing_value_count_df.collect()[0])
    gps = pd.read_csv('./data/Longitud_Latitud.csv')
    # Create sub_area categorical with all levels shared
    # between train and test to avoid errors
    test['price_doc'] = -99
    merged = pd.concat([train, test], axis=0)
    merged = merged.merge(gps, how='left', on='sub_area')
    merged['sub_area'] = merged.sub_area.astype('category')
    train = merged[merged.price_doc != -99]
    test = merged[merged.price_doc == -99]
    test.pop('price_doc')

    macro = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
    train = train.merge(macro, how='left', on='timestamp', suffixes=('_train', '_macro'))

    # Clean
    dc = DataCleaner(data=train, sample_rate=0.3)
    data, y = dc.clean()
    y = np.array(y)
    y = np.log(y+1)

    # Train / test split
    data_train, data_test, y_train, y_test = train_test_split(data, y, random_state=77)
    house_ids_test = data_test.id

    # Featurize training data set
    feat_train = Featurizer()
    X_train = feat_train.featurize(data_train)

    # Grid search tune all estimators
    ms = ModelSelector()
    print(' # {} | X_train shape: {}'.format(now(), X_train.shape))
import numpy as np
from data_cleaning import DataCleaner
from features_engineering import FeatureExtractor
from model_selection import ModelSelector
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
plt.interactive(True)

if __name__ == '__main__':
    # read and clean the data
    dc = DataCleaner()
    data = dc.clean()

    # Debug transformations
    # data.to_csv('./data/debug.csv', index=False, encoding='latin1')
    # assert False

    # separate target variable
    target = data.pop('Target')

    # train test split
    data_train, data_test, target_train, target_test = train_test_split(
        data, target)

    # featurize data
    featurizer = FeatureExtractor()
    X_train = featurizer.featurize(data_train)
    X_test = featurizer.featurize(data_test)

    # Convert to numpy arrays
    y_train = np.array(target_train)
    test = pd.read_csv('./data/test.csv')
    gps = pd.read_csv('./data/Longitud_Latitud.csv')
    # Create sub_area categorical with all levels shared
    # between train and test to avoid errors
    test['price_doc'] = -99
    merged = pd.concat([train, test], axis=0)
    merged = merged.merge(gps, how='left', on='sub_area')
    merged['sub_area'] = merged.sub_area.astype('category')
    train = merged[merged.price_doc != -99]

    train = train.merge(macro,
                        how='left',
                        on='timestamp',
                        suffixes=('_train', '_macro'))

    dc = DataCleaner(data=train)
    train, y = dc.clean()
    y = np.array(y)
    y = np.log(y + 1)

    # Featurize training data set
    feat_train = Featurizer()
    train = feat_train.featurize(train)

    print('train shape', train.shape)

    # # Remove all categorical variables for now
    # mask = ~(train.dtypes == 'object').values
    # train = train.iloc[:, mask]
    # print('train shape with only numerical features', train.shape)