def feature_engineering_solution():
    train = load_data('train.csv')
    test = load_data('test.csv')
    # Encode the string target labels as integers
    le = preprocessing.LabelEncoder()
    le.fit(train['target'])
    train['target'] = le.transform(train['target'])
    feature_cols = [col for col in train.columns if col not in ['target', 'id']]
    X_train = feature_engineering(train[feature_cols])
    X_test = feature_engineering(test[feature_cols])
    feature_cols = list(X_train.columns)  # std 0.607958003167 mean 0.615741311533
    X_train = X_train[feature_cols]
    X_test = X_test[feature_cols]
    y = train['target']
    test_ids = test['id']
    print('feature_engineering_solution')
    cross_v(get_rf(), X_train.values, y.values)  # 0.600017926514
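These snippets rely on project-local helpers (`load_data`, `cross_v`, `get_rf`) that are not shown. A minimal sketch of what `cross_v` might look like, assuming it reports cross-validated log loss with scikit-learn; the body is an assumption, not the original implementation:

# Hypothetical sketch of the cross_v helper used above (assumed behavior:
# k-fold cross-validated log loss; not the original implementation).
from sklearn.model_selection import cross_val_score

def cross_v(clf, X, y, folds=5):
    # neg_log_loss is negated so that higher is better; flip the sign back
    scores = -cross_val_score(clf, X, y, cv=folds, scoring='neg_log_loss')
    print('log loss: mean %.6f, std %.6f' % (scores.mean(), scores.std()))
    return scores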
Example #2
def main():
    df = feature_engineering()
    df_train = df.iloc[0:357, :]
    df_test = df.iloc[357:447, :]

    X = df_train.iloc[:, 1:58]
    y = df_train.iloc[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=1 / 4,
                                                        random_state=0)

    # The following uses RandomForestRegressor; other models are available in model.py
    RF = RandomForestRegressor(max_depth=2,
                               random_state=0,
                               max_features='sqrt',
                               n_estimators=100)
    RF.fit(X_train, y_train)

    y_pred = RF.predict(X_test)

    print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))

    # Refit on the full training set before predicting on the held-out test rows
    RF.fit(X, y)
    test = df_test.iloc[:, 1:58]
    y_pred = RF.predict(test)

    df = pd.DataFrame(y_pred, columns=['playtime_forever'])
    df['id'] = df.index
    df = df[['id', 'playtime_forever']]
    df.to_csv(r'./result/submission.csv', index=False)
Example #3
def get_data(datafile):
    # Load in data
    df = pd.read_json(datafile)
    # Perform data cleaning and feature engineering
    df = feature_engineering(df)
    # Perform over-sampling of the minority class
    df = oversampling(df)
    # Make train test split
    X_train, X_test, y_train, y_test = make_split(df)
    return X_train, X_test, y_train, y_test
Example #4
def on_button_click1(self, df):
    dataset_location = self.entry1.get()
    loc = dataset_location.split("/")
    filename = loc[-1]
    dest_loc = "./" + filename
    shutil.copyfile(dataset_location, dest_loc)
    preprocessing.missing_value_filling(filename)
    df = feature_engineering.feature_engineering(df)
    todo = 'classification'  # the NLP module will supply this
    models.model(df, todo)
Example #5
def main(paths):
    df_train = pd.read_csv(paths[0])
    df_test = pd.read_csv(paths[1])
    print('Read {}'.format(paths))
    df_train = df_train.set_index('PassengerId')
    df_test = df_test.set_index('PassengerId')
    df_train = fea_eng.feature_engineering(df_train)
    df_test = fea_eng.feature_engineering(df_test)
    df_train = fea_ext.feature_extraction(df_train)
    df_test = fea_ext.feature_extraction(df_test)
    df_test.Fare = df_test.Fare.fillna(df_test.Fare.median())

    df_train, df_test = fea_ext.process_age(df_train, df_test)
    drop_cols = [
        'Name', 'Ticket', 'Cabin', 'Age', 'Sex', 'Embarked', 'Title', 'Surname'
    ]
    df_train = df_train.drop(drop_cols, axis=1)
    df_test = df_test.drop(drop_cols, axis=1)

    # Save the files, adding "-processed" as a suffix
    df_train.to_csv('./data/train-processed.csv')
    df_test.to_csv('./data/test-processed.csv')
def get_data(datafile):
    ''' Loads raw data from a JSON file into a pandas DataFrame, performs
    data cleaning and feature engineering, over-samples the minority class,
    and splits the data into training and test sets.
    '''
    # Load raw data
    df = pd.read_json(datafile)
    # Perform data cleaning and feature engineering
    df = feature_engineering(df)
    # Perform over-sampling of the minority class
    df = oversampling(df)
    # Make train test split
    X_train, X_test, y_train, y_test = make_split(df)
    return X_train, X_test, y_train, y_test
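The `oversampling` helper is not shown; a minimal sketch of one way to implement it, assuming simple random over-sampling of the minority class with pandas (the label column name 'class' is an assumption):

# Hypothetical sketch of the oversampling helper (assumed: random
# over-sampling of the minority class; 'class' as the label column).
import pandas as pd

def oversampling(df, label_col='class', random_state=42):
    counts = df[label_col].value_counts()
    minority_label = counts.idxmin()
    n_extra = counts.max() - counts.min()
    extra = df[df[label_col] == minority_label].sample(
        n=n_extra, replace=True, random_state=random_state)
    # Shuffle so minority duplicates are not clustered at the end
    return pd.concat([df, extra]).sample(frac=1, random_state=random_state)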
def feature_selection_solution():
    train = load_data('train.csv')
    test = load_data('test.csv')
    le = preprocessing.LabelEncoder()
    le.fit(train['target'])
    train['target'] = le.transform(train['target'])
    feature_cols = [col for col in train.columns if col not in ['target', 'id']]
    X_train = feature_engineering(train[feature_cols])
    X_test = feature_engineering(test[feature_cols])
    # Drop the engineered/raw columns that hurt the CV score
    feature_cols = [col for col in X_train.columns
                    if col not in ['mean', 'std', 'nonzero', 'feat_6', 'feat_82', 'feat_84']]
    X_train = X_train[feature_cols]
    X_test = X_test[feature_cols]
    print(X_train.columns)
    y = train['target']
    test_ids = test['id']
    print('feature_selection_solution')
    cross_v(get_rf(), X_train.values, y.values)  # mean 0.595288515439  std 0.593551044059  nonzero 0.597406303207
    # no fg 6 82 84: 0.603600594376
    # 0.600058535601
    clf = get_rf()
    clf.fit(X_train, y)
    preds = clf.predict_proba(X_test)
    write_submission(test_ids, preds, 'submissions/feature_selection_rf100_84_82_6_nofg.csv')
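`write_submission` is another project-local helper; a minimal sketch, assuming a Kaggle-style CSV of per-class probabilities (the column naming is an assumption):

# Hypothetical sketch of the write_submission helper (assumed format:
# an id column followed by one probability column per class).
import pandas as pd

def write_submission(ids, preds, path):
    cols = ['Class_%d' % (i + 1) for i in range(preds.shape[1])]
    sub = pd.DataFrame(preds, columns=cols)
    sub.insert(0, 'id', list(ids))
    sub.to_csv(path, index=False)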
Example #8
from feature_engineering import feature_engineering
from feature_selection import feature_selection
from Models import linear_model, xgb_model
import argparse
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--train_dataset', help='path to the train dataset')
    parser.add_argument('--test_dataset', help='path to the test dataset')
    parser.add_argument('--model', help='model to run: linear or xgb')
    args = parser.parse_args()

    train_dataset = args.train_dataset
    test_dataset = args.test_dataset
    model = args.model
    feature_engineering(train_dataset, test_dataset)
    feature_selection()

    if model == 'linear':
        linear_model()
    elif model == 'xgb':
        xgb_model()
    else:
        linear_model()
        xgb_model()

from tools import load_data, cross_v
from sklearn import ensemble
from sklearn.metrics import log_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split, StratifiedKFold
from feature_engineering import feature_engineering
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def get_rf():
    forest = ensemble.RandomForestClassifier(n_estimators=100)
    return forest

train = load_data('train.csv')
feature_cols = [col for col in train.columns if col not in ['id', 'target']]
X_train = feature_engineering(train[feature_cols]).values
y = train['target'].values
X_train, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.33, random_state=1)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
calibration_method = 'isotonic'
clf = get_rf()
ccv = CalibratedClassifierCV(clf, method=calibration_method, cv=skf)

#ccv.fit(X_train, y_train)
#pred = ccv.predict_proba(X_test)
clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)
score = log_loss(y_test, pred)
#0.487707826761
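The calibrated path is left commented out above; a possible continuation for comparing the calibrated and uncalibrated forests (the comparison itself is an assumption, not part of the original):

# Possible continuation (assumption): compare calibrated vs. raw log loss.
ccv.fit(X_train, y_train)
ccv_pred = ccv.predict_proba(X_test)
print('raw RF log loss:        %.6f' % score)
print('calibrated RF log loss: %.6f' % log_loss(y_test, ccv_pred))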
Example #10
    # fill_null_values: fill missing entries in the 'model' column with 'no-value'
    fill_null_values(updated_data, 'model', 'no-value')
    # EDA: bar chart for 'model'
    bar_chart(updated_data['model'], 'model')
    # describe data: display null counts
    display_null_counts(updated_data)
    # -----------------------------------------------------------
    # EDA: bar chart for 'fuelType'
    bar_chart(updated_data['fuelType'], 'fuelType')
    # fill_null_values: fill missing entries in the 'fuelType' column with 'benzin'
    fill_null_values(updated_data, 'fuelType', 'benzin')
    # EDA: bar chart for 'fuelType' after filling
    bar_chart(updated_data['fuelType'], 'fuelType')
    # describe data: display null counts
    display_null_counts(updated_data)

    #------------------------------ Feature Engineering ------------------------------#
    updated_data = feature_engineering(updated_data)

    #----------------------------- Prepare Data for Training -------------------------#
    x_train, x_test, y_train, y_test = prepare_data(updated_data, 'price')

    # ----------------------------- Modelling: Random Forests ------------------------#
    model_random_forests(x_train, y_train, x_test, y_test)

    # ----------------------------- Modelling: Linear Regression ---------------------#
    model_linear_regression(x_train, y_train, x_test, y_test)

    print(
        '\n\n#----------------------THE END OF THE PROJECT----------------------#'
    )
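The EDA helpers (`fill_null_values`, `bar_chart`, `display_null_counts`) are project-local and not shown; minimal sketches of what they might do, assuming pandas and matplotlib (the names match the calls above, but the bodies are assumptions):

# Hypothetical sketches of the project-local EDA helpers used above.
import matplotlib.pyplot as plt

def fill_null_values(df, column, value):
    # Replace NaNs in one column, in place
    df[column] = df[column].fillna(value)

def bar_chart(series, title):
    # Bar chart of value counts for a categorical column
    series.value_counts().plot(kind='bar', title=title)
    plt.show()

def display_null_counts(df):
    print(df.isnull().sum())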
import argparse
import numpy as np
import pandas as pd
import xgboost as xgb
import pickle
from feature_engineering import feature_engineering

# Parsing script arguments
parser = argparse.ArgumentParser(description='Process input')
parser.add_argument('tsv_path', type=str, help='tsv file path')
args = parser.parse_args()

# Reading input TSV
data = pd.read_csv(args.tsv_path, sep="\t")
data_X, data_Y = feature_engineering(data, test=True)

# Load model (the fresh XGBRegressor() instance only supplies the class name for the file path)
model = xgb.XGBRegressor()
with open(f"models/{model.__class__.__name__}.pkl", 'rb') as f:
    model = pickle.load(f)

# Prediction (expm1 inverts a log1p transform presumably applied to the target during training)
pred = np.expm1(model.predict(data_X))

prediction_df = pd.DataFrame(columns=['id', 'revenue'])
prediction_df['id'] = data['id']
prediction_df['revenue'] = pred

# Export prediction results
prediction_df.to_csv("prediction.csv", index=False, header=False)
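Since the prediction is passed through `np.expm1`, the model was presumably trained on `log1p(revenue)`; a minimal sketch of the matching training side (the file path, target handling, and save step are assumptions):

# Hypothetical training-side counterpart (assumed: target trained as log1p(revenue)).
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from feature_engineering import feature_engineering

train = pd.read_csv("train.tsv", sep="\t")  # assumed path
train_X, train_y = feature_engineering(train, test=False)
model = xgb.XGBRegressor()
model.fit(train_X, np.log1p(train_y))       # log-transform the target
with open(f"models/{model.__class__.__name__}.pkl", 'wb') as f:
    pickle.dump(model, f)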
Example #12
import pandas as pd
from feature_engineering import feature_engineering
from Modeling1 import Modeling1

DATA_PATH = r'C:\Users\Davis\Desktop\Dat and aud Hotel'
dft = pd.read_csv(DATA_PATH + r'\train.csv', header=0)

# function calls
X_train, y_train = feature_engineering(dft)
feature_imp = Modeling1(X_train, y_train)

import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from feature_engineering import feature_engineering, \
    test_train_split, create_X_and_y

def read_data(filename):
    return pd.read_json(filename)

if __name__ == '__main__':
    # need to figure out how to generalize load data
    df = read_data(filename)

    # run data through pipeline for features and transformations
    df_features = feature_engineering(df)

    # create X and y for splits, models
    X, y = create_X_and_y(df_features)

    # create test and training sets
    X_train, X_val, y_train, y_val = test_train_split(X, y)

    # instantiate scaler, scale data for the Naive Bayes model
    # (MultinomialNB requires non-negative features, hence MinMaxScaler)
    min_max = MinMaxScaler()
    min_max.fit(X_train)
    X_scaled = min_max.transform(X_train)

    # instantiate, fit Naive Bayes model
    nb = MultinomialNB()
    nb.fit(X_scaled, y_train)
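A possible continuation for scoring the validation split; reusing the scaler fitted on the training data is the important part (the evaluation step itself is an assumption):

    # Possible continuation (assumption): evaluate on the validation split,
    # transforming it with the scaler fitted on the training data only.
    X_val_scaled = min_max.transform(X_val)
    print('validation accuracy: %.4f' % nb.score(X_val_scaled, y_val))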
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

from data_loading import read_co_data
from feature_engineering import feature_engineering
from model_evaluation import model_evaluation
from model_visualization import model_visualization

if __name__ == '__main__':
    # Read the raw data
    raw_data = read_co_data()
    # Feature engineering
    fed_data = feature_engineering(raw_data)
    # Feature vectors: all columns except the last
    X = fed_data.take(list(range(fed_data.shape[1] - 1)), axis=1)
    # Target: the last column
    y = np.ravel(fed_data.take([fed_data.shape[1] - 1], axis=1))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    # Define an MLP (BP neural network) regressor
    reg = MLPRegressor(solver='lbfgs',
                       alpha=1e-5,
                       hidden_layer_sizes=(15, ),
                       random_state=1)
    # Train
    reg.fit(X_train, y_train)
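The snippet imports `model_evaluation` and `model_visualization` but cuts off after training; a possible continuation, assuming those helpers take the fitted model and the test split (their real signatures are not shown):

    # Possible continuation (the helper signatures below are hypothetical,
    # guessed from the imports; reg.score is standard scikit-learn API).
    print('R^2 on test set: %.4f' % reg.score(X_test, y_test))
    model_evaluation(reg, X_test, y_test)      # hypothetical signature
    model_visualization(reg, X_test, y_test)   # hypothetical signature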
def main():
    train = load_data('train.csv')
    feature_cols = [col for col in train.columns if col not in ['target', 'id']]
    X_train = feature_engineering(train[feature_cols])
    y = train['target']
    grid_search(X_train, y, get_clfs())
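`grid_search` and `get_clfs` are not defined in this snippet; a minimal sketch of one way they could be written with scikit-learn (the estimator choices and parameter grids are assumptions):

# Hypothetical sketches of get_clfs and grid_search (grids are assumptions).
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

def get_clfs():
    # (estimator, parameter grid) pairs to try
    return [(ensemble.RandomForestClassifier(),
             {'n_estimators': [100, 300], 'max_depth': [None, 10]})]

def grid_search(X, y, clfs):
    for clf, grid in clfs:
        gs = GridSearchCV(clf, grid, scoring='neg_log_loss', cv=5)
        gs.fit(X, y)
        print(gs.best_params_, gs.best_score_)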
def train(fold):
    df = pd.read_csv(config.training_file)

    ## mark all missing values as NaN (the raw dataset labels them with ' ?')
    df[df == ' ?'] = np.nan

    ## label encoding
    ### define numerical columns
    num_feas = [
        'age', 'wage per hour', 'capital gains', 'capital losses',
        'dividends from stocks', 'num persons worked for employer',
        'own business or self employed', 'weeks worked in year', 'kfold', 'y'
    ]

    ## for this competition, map the target labels to 0 and 1 as suggested by the organizer
    df.y = df.y.map({' - 50000.': 0, ' 50000+.': 1})

    ## define all categorical features
    cat_feas = [i for i in df.columns if i not in num_feas]

    ## call the feature engineering function for categorical features
    df = feature_engineering(df, cat_feas)

    ## all features
    features = [i for i in df.columns if i not in ('kfold', 'y')]

    ## fill NaNs in categorical columns with the string 'NONE'
    for col in features:
        if col not in num_feas:
            # fillna must run before astype(str); otherwise NaN becomes the string 'nan'
            df.loc[:, col] = df[col].fillna('NONE').astype(str)

    ## label encoding each column
    ## add each encoder to the dictionary
    encoder = {}
    for col in features:
        if col not in num_feas:
            lbl = preprocessing.LabelEncoder()
            lbl.fit(df[col])
            df.loc[:, col] = lbl.transform(df[col])
            encoder[col] = lbl

    ## create data for training and validation
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    ## prepare data for training
    x_train = df_train.drop(['kfold', 'y'], axis=1).values
    y_train = df_train[config.target].values

    ## similarly, prepare data for validation
    x_valid = df_valid.drop(['kfold', 'y'], axis=1).values
    y_valid = df_valid[config.target].values

    ## initialize a model
    model = xgb.XGBClassifier(n_jobs=-1)

    print('Training starting!!!')
    ## fit
    model.fit(x_train, y_train)

    ## predict on validation dataset
    valid_preds = model.predict_proba(x_valid)[:, 1]

    ## get roc auc score
    auc = metrics.roc_auc_score(y_valid, valid_preds)

    ## print auc
    print(f"Fold = {fold}, AUC = {auc}")

    ## save the model
    joblib.dump(model,
                os.path.join(config.model_output, f'xgb_fe_fold{fold}.bin'))

    ## save the columns used to fit the model
    joblib.dump(
        df_train.drop(['kfold', 'y'], axis=1).columns,
        os.path.join(config.model_output, f'xgb_fe_cols_fold{fold}.pkl'))

    ## save the label encoder
    joblib.dump(
        encoder,
        os.path.join(config.model_output, f'xgb_fe_encoder_fold{fold}.pkl'))
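A minimal driver for running all folds, assuming the usual kfold setup (the fold count of 5 is an assumption):

# Hypothetical driver (assumption: the kfold column was created with 5 folds).
if __name__ == '__main__':
    for fold in range(5):
        train(fold)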
import argparse
import pandas as pd
from training import train, run_kaggle_submission
from feature_engineering import feature_engineering
import constants as CN
# get_dataloaders and get_test_data are assumed to come from a project data module

# Parse arguments
parser = argparse.ArgumentParser()
# If the flag is present, send the run's logs to the "trash" folder (for testing phases)
parser.add_argument("-t", action="store_false")
# Specify the number of epochs
parser.add_argument("-n", default=100, type=int)
# Say whether or not the results should be submitted to Kaggle
parser.add_argument("--kaggle", action="store_true", default=False)
args = parser.parse_args()

# Load and preprocess data
train_df = pd.read_csv(CN.TRAIN_FILE)
train_df = feature_engineering(train_df)
train_loader, val_loader = get_dataloaders(train_df=train_df)

test_df = pd.read_csv(CN.TEST_FILE)
test_df = feature_engineering(test_df)
test_df, test_tensors = get_test_data(test_df)

lr = 0.001
n_epochs = args.n
dropout_rate = 0.0
regul = 0.00
submit_to_kaggle = args.kaggle
if submit_to_kaggle:
    print("Kaggle submission is enabled")
else:
    print("Kaggle submission is disabled")
Example #18
def semi_sup_cf():
    # df_review = pd.read_csv("./Datasets/labeled_reviews.csv")
    # df_tweet = preprocess_file("./Datasets/test.csv")
    # df_tweet = df_tweet.rename(columns={"tweet": "comment"})
    # df_review = df_review[['comment', 'class']]
    # df = pd.concat([df_tweet, df_review])

    # freq_inverted = create_inverted_frequency_dict(df, "comment")
    # json_dict = json.dumps(freq_inverted)
    # f = open("dict_inverted.json", "w")
    # f.write(json_dict)
    # f.close()

    with open('./dict_inverted.json') as f:
        freq_inverted = json.load(f)

    for classifier, name in zip(classifiers, classifiers_names):
        model = make_pipeline(classifier)

        df_labeled_train, df_labeled_test = train_test_balanced_reviews_tweets(
            1000, 1000)
        df_labeled_train = feature_engineering(df_labeled_train, "comment",
                                               freq_inverted)
        df_labeled_test = feature_engineering(df_labeled_test, "comment",
                                              freq_inverted)

        df_unlabeled = pd.read_csv("unlabeled_reviews.csv")
        df_unlabeled = feature_engineering(df_unlabeled, "comment",
                                           freq_inverted)

        i = 0
        while True:
            model.fit(
                df_labeled_train[[
                    'spoken', "rarity", "meanings", "lexical", "emoticon"
                ]].to_numpy(), df_labeled_train["class"].to_numpy())
            predicted_categories = model.predict(df_unlabeled[[
                'spoken', "rarity", "meanings", "lexical", "emoticon"
            ]].to_numpy())
            predicted_categories_prob = model.predict_proba(df_unlabeled[[
                'spoken', "rarity", "meanings", "lexical", "emoticon"
            ]].to_numpy())

            prob_false = predicted_categories_prob[:, 0]
            prob_true = predicted_categories_prob[:, 1]

            df_prob = pd.DataFrame([])
            df_prob['predicted'] = predicted_categories
            df_prob['prob_false'] = prob_false
            df_prob['prob_true'] = prob_true
            df_prob.index = df_unlabeled.index

            high_prob = pd.concat([
                df_prob.loc[df_prob['prob_false'] > 0.99],
                df_prob.loc[df_prob['prob_true'] > 0.99]
            ],
                                  axis=0)

            # copy() avoids pandas SettingWithCopyWarning when adding the class column
            pseudos = df_unlabeled.loc[high_prob.index].copy()
            pseudos["class"] = high_prob['predicted']
            df_labeled_train = pd.concat([
                df_labeled_train, pseudos[[
                    'spoken', "rarity", "meanings", "lexical", "emoticon",
                    'class'
                ]]
            ],
                                         axis=0)
            df_unlabeled = df_unlabeled.drop(index=high_prob.index)

            if len(df_unlabeled) == 0 or len(high_prob) == 0:
                test = model.predict(df_labeled_test[[
                    'spoken', "rarity", "meanings", "lexical", "emoticon"
                ]].to_numpy())
                report = classification_report(
                    df_labeled_test["class"].to_numpy(),
                    test,
                    output_dict=True)
                plot(report, f"{name}_cf.png", f"{name} with custom features")
                joblib.dump(model, f"{name}_cf.sav")
                break

            i += 1
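The loop iterates over module-level `classifiers` and `classifiers_names` that are not shown (as are `train_test_balanced_reviews_tweets`, `feature_engineering`, and `plot`); a minimal sketch of the setup this function assumes, where the classifier choices are assumptions:

# Hypothetical module-level setup assumed by semi_sup_cf (classifier choices are guesses).
import json
import joblib
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

classifiers = [LogisticRegression(max_iter=1000), RandomForestClassifier()]
classifiers_names = ['logistic_regression', 'random_forest']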