indices = np.argsort(importances)[::-1] # Print the feature ranking #print 'Model score:', RFC.score(Feats_test, Labels_test) print("Feature ranking:") for f in range(0, 50): print("%d. feature %s (%f)" % (f + 1, Feats.columns[indices[f]], importances[indices[f]])) return RFC #AgeGender = pd.read_csv('age_gender_bkts.csv') #Countries = pd.read_csv('countries.csv') #Sessions = pd.read_csv('sessions.csv') #For now drop 'date_first_booking' - use it later with sessions maybe Train = pd.read_csv('train_users_2.csv').drop('date_first_booking', axis=1) Test = pd.read_csv('test_users.csv').drop('date_first_booking', axis=1) Train_Feats = bf.getFeats(Train, 'Train') Test_Feats = bf.getFeats(Test, 'Test') ## # Things that appear in the training set and not in the test set: add a column of all zeros for col in Train_Feats.columns: if(col not in Test_Feats.columns): Test_Feats[col] = np.zeros(Test_Feats['id'].shape) # Things that appear in the test set and not in the training set: remove these columns for col in Test_Feats.columns: if(col not in Train_Feats.columns): print 'Dropping:', col Test_Feats.drop(col, axis = 1, inplace = True) # Get rid of the columns left after the sessions join Train_Feats.drop(['user_id_x', 'user_id_y'], axis = 1, inplace = True)
def readDataMain(FOLDER, skip):
    """Train one first-layer model per ordered label pair and dump its probabilities.

    Reads ``train_users_2.csv`` and ``test_users.csv`` from the current working
    directory, builds feature frames through ``bf.getFeats``, aligns the test
    columns to the training columns, splits the training features 50/50 and,
    for every ordered pair of distinct target labels, trains two models and
    writes their positive-class probabilities to CSV files under ``FOLDER``.

    Args:
        FOLDER: Passed to ``bf.getFeats`` and used as the output directory
            prefix for every CSV written here. Presumably a feature-set
            folder name -- confirm against ``bf.getFeats``.
        skip: When truthy, label pairs whose first label index is below 7, or
            equal to 7 with a second label index below 2, are skipped
            (looks like a way to resume an interrupted run).

    Files written (all under ``FOLDER``):
        ``IntTestLabels.csv``, ``IntTestProbs<a>with<b>no<k>.csv`` and
        ``ExtTestProbs<a>with<b>no<k>.csv`` where ``a``/``b`` are label
        indices and ``k`` is 1 or 2.
    """
    warnings.filterwarnings("ignore")

    # For now drop 'date_first_booking' - use it later with sessions maybe
    Train = pd.read_csv("train_users_2.csv").drop("date_first_booking", axis=1)
    Test = pd.read_csv("test_users.csv").drop("date_first_booking", axis=1)
    Train_Feats = bf.getFeats(Train, "Train", FOLDER)
    Test_Feats = bf.getFeats(Test, "Test", FOLDER)

    # Columns present in train but not in test: add them to test as all zeros.
    for col in Train_Feats.columns:
        if col not in Test_Feats.columns:
            Test_Feats[col] = np.zeros(Test_Feats["id"].shape)

    # Columns present in test but not in train: remove them from test.
    for col in Test_Feats.columns:
        if col not in Train_Feats.columns:
            print("Dropping: %s" % (col,))
            Test_Feats.drop(col, axis=1, inplace=True)

    # Get rid of the columns left after the sessions join (only present when
    # the features were produced by a merge).
    if "user_id_x" in Train_Feats.columns:
        Train_Feats.drop(["user_id_x", "user_id_y"], axis=1, inplace=True)
        Test_Feats.drop(["user_id_x", "user_id_y"], axis=1, inplace=True)

    # Sometimes happens to be there for some reason...
    if "user_id" in Train_Feats.columns:
        Train_Feats.drop("user_id", axis=1, inplace=True)
    if "user_id" in Test_Feats.columns:
        Test_Feats.drop("user_id", axis=1, inplace=True)

    # Put the columns of both frames in the same (label-sorted) order.
    # sort_index(axis=1) is the supported spelling of the old sort(axis=1).
    Train_Feats = Train_Feats.sort_index(axis=1)
    Test_Feats = Test_Feats.sort_index(axis=1)
    Train_Feats.fillna(-1, inplace=True)
    Test_Feats.fillna(-1, inplace=True)

    # Array of all possible labels
    Labels = ["AU", "CA", "DE", "ES", "FR", "GB", "IT", "NDF", "NL", "PT", "US", "other"]
    LabelIndex = ["TargetIs" + x for x in Labels]
    print("Total number of labels: %d" % len(LabelIndex))
    Remove = LabelIndex + ["TargetIsNull"]
    print("Number of features in Train: %d" % (len(Train_Feats.columns) - 1))
    print("Number of features in Test: %d" % (len(Test_Feats.columns) - 1))

    # The raw frames are no longer needed; release them to save RAM.
    del Test
    del Train

    # Half of the data for the first layer, half held out for the second layer.
    X_train, X_test, y_train, y_test = train_test_split(
        Train_Feats.drop(Remove, axis=1), Train_Feats.loc[:, LabelIndex], test_size=0.5
    )

    # Flag held-out rows first active after March 2014.
    X_test["IsTest"] = (
        X_test["FirstActiveYear"].isin([2014]) & (X_test["FirstActiveMonth"] > 3)
    ).astype(int)
    y_test["IsTest"] = X_test["IsTest"]
    del Train_Feats

    fa = "FirstActive"
    FirstActiveDrop = [
        fa + "Year",
        fa + "Month",
        fa + "DayOfMonth",
        fa + "WeekOfYear",
        fa + "DayOfWeek",
        fa + "Quarter",
        fa + "Hour",
    ]
    # Dropping this in most cases cause was added only for the train/test split
    # NOTE(review): DataFrame.drop is not in-place here and its result is
    # discarded, so these two calls currently remove nothing (they only raise
    # if a listed column is missing). Making them effective would also require
    # dropping the same columns from Test_Feats, otherwise the external
    # predict_proba below would see a different feature set. Left unchanged
    # until the intended behavior is confirmed.
    if "BasicInfo" in FOLDER:
        X_train.drop(FirstActiveDrop, axis=1)
        X_test.drop(FirstActiveDrop, axis=1)

    y_test.to_csv(FOLDER + "/IntTestLabels.csv")
    del y_test

    # Train classifier for all 2-label combinations
    for idx1, li1 in enumerate(LabelIndex):
        for idx2, li2 in enumerate(LabelIndex):
            if idx1 == idx2:
                continue
            # Optionally skip the pairs that precede (7, 2) in iteration order.
            if skip and (idx1 < 7 or (idx1 == 7 and idx2 < 2)):
                continue
            for i in range(2):
                name = li1 + "with" + li2 + str(i + 1)
                Probs = pd.DataFrame()
                Ext_Test_Probs = pd.DataFrame()
                print(datetime.datetime.now().time())
                print("Training model %s" % (name,))

                # enrich models in their data of interest
                Enriched_X_train, Enriched_y_train = enrichPairwise(X_train, y_train, li1, li2)
                Brain = trainModel(
                    Enriched_X_train.drop("id", axis=1), Enriched_y_train[li1], None, None
                )
                print(datetime.datetime.now().time())
                print("Getting probs for model no %s" % (name,))

                # Internal (held-out) data. The feature views are rebuilt per
                # iteration on purpose so no extra copies stay alive while a
                # model is being trained.
                Probs["id"] = X_test["id"]
                Probs[name] = Brain.predict_proba(X_test.drop(["id", "IsTest"], axis=1))[:, 1]
                Probs["IsTest"] = X_test["IsTest"]
                FileNameIndex = str(idx1) + "with" + str(idx2) + "no" + str(i + 1)
                Probs.to_csv(FOLDER + "/IntTestProbs" + FileNameIndex + ".csv")

                # External data
                Ext_Test_Probs["id"] = Test_Feats["id"]
                Ext_Test_Probs[name] = Brain.predict_proba(
                    Test_Feats.drop("id", axis=1).drop(Remove, axis=1)
                )[:, 1]
                Ext_Test_Probs.to_csv(FOLDER + "/ExtTestProbs" + FileNameIndex + ".csv")
                gc.collect()

    print("Probs written to CSV")
    print(datetime.datetime.now().time())
def blenderXGMain(FOLDER, TOP, fraction, data_dir='/Users/piotrgrudzien/Desktop/Airbnb'):
    """Blend the top features of several feature folders and train the second layer.

    For each folder the ``TOP[i]`` first feature names are read from
    ``RawResults//<folder>FeatImp.csv`` (relative to the working directory at
    call time). The process then changes directory to ``data_dir``, builds the
    train/test feature frames through ``bf.getFeats``, merges the folders on
    ``id``, trains the second-layer model via ``getSsRFC`` and writes the
    external probabilities and the top-5 country guesses per test user to CSV.

    Args:
        FOLDER: List of feature-folder names, passed one by one to
            ``bf.getFeats``. The last folder also contributes the
            ``FirstActiveYear``/``FirstActiveMonth`` and ``TargetIs*`` columns.
        TOP: List parallel to ``FOLDER``; number of rows to read from each
            folder's feature-importance file.
        fraction: Fraction of the training rows to sample (``DataFrame.sample``).
        data_dir: Directory to ``os.chdir`` into before reading the user CSVs.
            Defaults to the path that used to be hard-coded.

    Files written (relative to ``data_dir``):
        ``RawResults/100FeatsEach0.4sample1ExtProbs.csv`` and
        ``RawResults/100FeatsEach0.4sample1.csv``.
    """
    RAW = 'RawResults/'
    NAME = RAW + '100FeatsEach0.4sample1'
    print('Starting %s' % (datetime.datetime.now().time(),))
    warnings.filterwarnings("ignore")
    N_guesses = 5
    Countries = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
    Labels = ['TargetIs' + country for country in Countries]
    # Deal with the files being read in 'alphabetical' order
    Twisted_names = ['AU', 'CA', 'US', 'other', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT']

    # Feature-importance files are read BEFORE the chdir below, i.e. relative
    # to the caller's working directory.
    Chosen_Feats = []
    for i, folder in enumerate(FOLDER):
        feats = pd.read_csv(RAW + '/' + folder + 'FeatImp.csv', nrows=TOP[i])['0'].tolist()
        Chosen_Feats.append(feats)

    os.chdir(data_dir)
    Train = pd.read_csv('train_users_2.csv').drop('date_first_booking', axis=1)
    Train = Train.sample(frac=fraction).reset_index(drop=True)
    Test = pd.read_csv('test_users.csv').drop('date_first_booking', axis=1)

    Train_Feats = None
    Test_Feats = None
    last = len(FOLDER) - 1
    for idx, f in enumerate(FOLDER):
        cf = Chosen_Feats[idx] + ['id']

        # Need to add targets to train: carry over any target-like column
        # already accumulated from the previous folders.
        if idx == last and Train_Feats is not None:
            cf.extend(col for col in Train_Feats.columns if 'Target' in col)

        # The last folder additionally supplies the split columns and labels.
        train_cols = cf
        if idx == last:
            train_cols = cf + ['FirstActiveYear', 'FirstActiveMonth'] + Labels
        train_part = bf.getFeats(Train, 'Train', f)[train_cols]
        if Train_Feats is None:
            Train_Feats = train_part
        else:
            Train_Feats = pd.merge(Train_Feats, train_part, how='left',
                                   left_on='id', right_on='id', suffixes=('', '_extra'))

        # Things that appear in the training set and not in the test set: add
        # a column of all zeros. The zeros must go into the frame that is
        # subsequently indexed with ``cf``; otherwise that selection raises.
        Taken = bf.getFeats(Test, 'Test', f)
        for col in cf:
            if col not in Taken.columns:
                Taken[col] = np.zeros(Taken['id'].shape)
        Taken = Taken[cf]
        if Test_Feats is None:
            Test_Feats = Taken
        else:
            Test_Feats = pd.merge(Test_Feats, Taken, how='left',
                                  left_on='id', right_on='id', suffixes=('', '_extra'))
        print('Added folder %s features: %d' % (f, len(Test_Feats.columns) - 1))

    # Drop all repeated feats (the right-hand duplicates produced by the merges).
    for frame in (Train_Feats, Test_Feats):
        repeated = sorted(set(col for col in frame.columns if 'extra' in col))
        if repeated:
            frame.drop(repeated, axis=1, inplace=True)

    # Move all the 'target' feats out of the training features.
    IntLabels = pd.DataFrame()
    IntLabels['id'] = Train_Feats['id']
    for col in list(Train_Feats.columns):
        if 'Target' in col:
            IntLabels[col] = Train_Feats[col]
            Train_Feats.drop(col, axis=1, inplace=True)

    # And add the true label
    # assumes Train_Feats rows are in the same order as Train -- TODO confirm
    Train_Feats['Target'] = Train['country_destination']

    # sort_index(axis=1) is the supported spelling of the old sort(axis=1).
    Train_Feats = Train_Feats.sort_index(axis=1)
    Test_Feats = Test_Feats.sort_index(axis=1)
    Train_Feats.fillna(-1, inplace=True)
    Test_Feats.fillna(-1, inplace=True)

    print('Total number of labels: %d' % len(Labels))

    # Flag rows first active after March 2014.
    Train_Feats['IsTest'] = (
        Train_Feats['FirstActiveYear'].isin([2014]) & (Train_Feats['FirstActiveMonth'] > 3)
    ).astype(int)

    # Only include FirstActive in BasicInfo
    if 'BasicInfo' not in FOLDER:
        for col in ['FirstActiveYear', 'FirstActiveMonth']:
            if col in Train_Feats.columns:
                Train_Feats.drop(col, axis=1, inplace=True)
            if col in Test_Feats.columns:
                Test_Feats.drop(col, axis=1, inplace=True)

    IntLabels['IsTest'] = Train_Feats['IsTest']
    IntLabels['country'] = IntLabels.apply(getCountryID, axis=1)
    IntTest = Train_Feats.drop('Target', axis=1)
    ExtTest = Test_Feats
    print('Number of features in Train: %d' % (len(IntTest.columns) - 2))
    print('Number of features in Test: %d' % (len(ExtTest.columns) - 1))

    # NOTE(review): the first model is immediately replaced by the second one;
    # both calls are kept because getSsRFC may have side effects (e.g. output
    # written under NAME) -- confirm before removing the lr=0.01 run.
    RFC = getSsRFC(IntTest, IntLabels[['country', 'IsTest']], Countries, 'Int', NAME, lr=0.01)
    RFC = getSsRFC(IntTest, IntLabels[['country', 'IsTest']], Countries, 'Int', NAME, lr=0.1)
    print('Second layer RFC trained')
    del IntTest

    Probs = RFC.predict_proba(ExtTest.drop('id', axis=1))
    pd.DataFrame(Probs).to_csv(NAME + 'ExtProbs.csv', index=False)
    print('Probs written to CSV')

    id_test = ExtTest['id']
    le = LabelEncoder()
    ids = []
    cts = []
    le.fit(Twisted_names)
    for i in range(len(Probs)):
        df = pd.DataFrame(columns=range(0, 12))
        df.loc[i, :] = Probs[i, :]
        Trans1 = df.apply(getReverseOrder, axis=1)
        # Keep the last N_guesses columns, in reverse order.
        Trans2 = Trans1[Trans1.columns[::-1][:N_guesses]]
        # Probs rows are positional, so look the id up by position as well.
        idx = id_test.iloc[i]
        ids += [idx] * N_guesses
        cts += le.inverse_transform(Trans2).tolist()[0]

    ExtResult = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    ExtResult.to_csv(NAME + '.csv', index=False)
    print('External result written to CSV!')
    print(datetime.datetime.now().time())
def blenderXGMain(FOLDER, fraction, namepart, data_dir='/Users/piotrgrudzien/Desktop/Airbnb'):
    """Merge all features of several feature folders and train the second layer.

    Changes directory to ``data_dir``, reads the user CSVs, builds the
    train/test feature frames through ``bf.getFeats`` for every folder and
    left-merges them on ``id``. Target columns are moved into a label frame,
    test columns are aligned to the training columns, the second-layer model
    is trained via ``getSsRFC`` and the external probabilities plus the top-5
    country guesses per test user are written to CSV.

    Args:
        FOLDER: List of feature-folder names, passed one by one to
            ``bf.getFeats``.
        fraction: Fraction of the training rows to sample (``DataFrame.sample``).
        namepart: Suffix appended to ``'RawResults/'`` to form the output
            file prefix.
        data_dir: Directory to ``os.chdir`` into before reading the user CSVs.
            Defaults to the path that used to be hard-coded.

    Files written (relative to ``data_dir``):
        ``RawResults/<namepart>ExtProbs.csv`` and ``RawResults/<namepart>.csv``.
    """
    RAW = 'RawResults/'
    NAME = RAW + namepart
    print('Starting %s' % (datetime.datetime.now().time(),))
    warnings.filterwarnings("ignore")
    N_guesses = 5
    Countries = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
    # Deal with the files being read in 'alphabetical' order
    Twisted_names = ['AU', 'CA', 'US', 'other', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT']

    os.chdir(data_dir)
    Train = pd.read_csv('train_users_2.csv').drop('date_first_booking', axis=1)
    Train = Train.sample(frac=fraction).reset_index(drop=True)
    Test = pd.read_csv('test_users.csv').drop('date_first_booking', axis=1)

    for idx, f in enumerate(FOLDER):
        if idx == 0:
            Train_Feats = bf.getFeats(Train, 'Train', f)
            Test_Feats = bf.getFeats(Test, 'Test', f)
        else:
            # Default merge suffixes apply, so overlapping columns come out
            # as '<name>_x' / '<name>_y'.
            Train_Feats = pd.merge(Train_Feats, bf.getFeats(Train, 'Train', f),
                                   how='left', left_on='id', right_on='id')
            Test_Feats = pd.merge(Test_Feats, bf.getFeats(Test, 'Test', f),
                                  how='left', left_on='id', right_on='id')
        print('Added folder %s features: %d' % (f, len(Test_Feats.columns) - 1))

    # Move all the 'target' feats out of the training features.
    IntLabels = pd.DataFrame()
    IntLabels['id'] = Train_Feats['id']
    for col in list(Train_Feats.columns):
        if 'Target' in col:
            IntLabels[col] = Train_Feats[col]
            Train_Feats.drop(col, axis=1, inplace=True)

    # Add a column of zeros to Test if it appears in Train
    for col in Train_Feats.columns:
        if col not in Test_Feats.columns:
            Test_Feats[col] = 0

    # Remove all columns from Test that don't appear in Train
    surplus = sorted(set(col for col in Test_Feats.columns if col not in Train_Feats.columns))
    if surplus:
        Test_Feats.drop(surplus, axis=1, inplace=True)

    # Remove all id columns except 'id' itself.
    # NOTE: this is a substring match, so any column whose name merely
    # contains "id" is removed as well.
    for frame in (Train_Feats, Test_Feats):
        id_like = sorted(set(col for col in frame.columns if 'id' in col and col != 'id'))
        if id_like:
            frame.drop(id_like, axis=1, inplace=True)

    # And add the true label
    # assumes Train_Feats rows are in the same order as Train -- TODO confirm
    Train_Feats['Target'] = Train['country_destination']

    # sort_index(axis=1) is the supported spelling of the old sort(axis=1).
    Train_Feats = Train_Feats.sort_index(axis=1)
    Test_Feats = Test_Feats.sort_index(axis=1)
    Train_Feats.fillna(-1, inplace=True)
    Test_Feats.fillna(-1, inplace=True)

    print('Total number of labels: %d' % len(Countries))

    # After a merge the split columns carry the '_x' suffix; with a single
    # folder no merge happens and the plain names are used instead.
    FirstActiveYear = 'FirstActiveYear_x'
    if FirstActiveYear not in Train_Feats.columns:
        FirstActiveYear = 'FirstActiveYear'
    FirstActiveMonth = 'FirstActiveMonth_x'
    if FirstActiveMonth not in Train_Feats.columns:
        FirstActiveMonth = 'FirstActiveMonth'

    # Flag rows first active after March 2014.
    Train_Feats['IsTest'] = (
        Train_Feats[FirstActiveYear].isin([2014]) & (Train_Feats[FirstActiveMonth] > 3)
    ).astype(int)

    IntLabels['IsTest'] = Train_Feats['IsTest']
    IntLabels['country'] = IntLabels.apply(getCountryID, axis=1)
    IntTest = Train_Feats.drop('Target', axis=1)
    ExtTest = Test_Feats
    print('Number of features in Train: %d' % (len(IntTest.columns) - 2))
    print('Number of features in Test: %d' % (len(ExtTest.columns) - 1))

    RFC = getSsRFC(IntTest, IntLabels[['country', 'IsTest']], Countries, 'Int', NAME)
    print('Second layer RFC trained')
    del IntTest

    Probs = RFC.predict_proba(ExtTest.drop('id', axis=1))
    pd.DataFrame(Probs).to_csv(NAME + 'ExtProbs.csv', index=False)
    print('Probs written to CSV')

    id_test = ExtTest['id']
    le = LabelEncoder()
    ids = []
    cts = []
    le.fit(Twisted_names)
    for i in range(len(Probs)):
        df = pd.DataFrame(columns=range(0, 12))
        df.loc[i, :] = Probs[i, :]
        Trans1 = df.apply(getReverseOrder, axis=1)
        # Keep the last N_guesses columns, in reverse order.
        Trans2 = Trans1[Trans1.columns[::-1][:N_guesses]]
        # Probs rows are positional, so look the id up by position as well.
        idx = id_test.iloc[i]
        ids += [idx] * N_guesses
        cts += le.inverse_transform(Trans2).tolist()[0]

    ExtResult = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    ExtResult.to_csv(NAME + '.csv', index=False)
    print('External result written to CSV!')
    print(datetime.datetime.now().time())