Ejemplo n.º 1
    indices = np.argsort(importances)[::-1]
    # Print the feature ranking
    #print 'Model score:', RFC.score(Feats_test, Labels_test)
    print("Feature ranking:")
    for f in range(0, 50):
        print("%d. feature %s (%f)" % (f + 1, Feats.columns[indices[f]], importances[indices[f]]))
    return RFC

#AgeGender = pd.read_csv('age_gender_bkts.csv')
#Countries = pd.read_csv('countries.csv')
#Sessions = pd.read_csv('sessions.csv')
# For now drop 'date_first_booking' - use it later with sessions maybe
Train = pd.read_csv('train_users_2.csv').drop('date_first_booking', axis=1)
Test = pd.read_csv('test_users.csv').drop('date_first_booking', axis=1)
Train_Feats = bf.getFeats(Train, 'Train')
Test_Feats = bf.getFeats(Test, 'Test')

# Align the test columns with the training columns.
# Columns that appear in the training set but not in the test set are added
# to the test set as all zeros.
for col in Train_Feats.columns:
    if col not in Test_Feats.columns:
        Test_Feats[col] = np.zeros(Test_Feats['id'].shape)
# Columns that appear in the test set but not in the training set are removed.
for col in Test_Feats.columns:
    if col not in Train_Feats.columns:
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print('Dropping: %s' % (col,))
        Test_Feats.drop(col, axis=1, inplace=True)

# Get rid of the columns left after the sessions join. Only the columns that
# are actually present are dropped, so this no longer raises KeyError when the
# features were built without the join.
leftover_join_cols = [c for c in ('user_id_x', 'user_id_y') if c in Train_Feats.columns]
if leftover_join_cols:
    Train_Feats.drop(leftover_join_cols, axis=1, inplace=True)
Ejemplo n.º 2
def readDataMain(FOLDER, skip):
    """Train pairwise first-layer models and write their probabilities to CSV.

    Reads ``train_users_2.csv`` and ``test_users.csv`` from the working
    directory, builds features with ``bf.getFeats``, aligns the test columns
    with the training columns, and splits the training features 50/50 into a
    part used for training and a held-out part. The held-out labels are
    written to ``FOLDER + "/IntTestLabels.csv"``. For every ordered pair of
    distinct target labels two models are trained (``enrichPairwise`` +
    ``trainModel``); each writes its held-out probabilities to
    ``FOLDER + "/IntTestProbs<a>with<b>no<k>.csv"`` and its probabilities for
    the external test set to ``FOLDER + "/ExtTestProbs<a>with<b>no<k>.csv"``.

    Args:
        FOLDER: String passed to ``bf.getFeats`` and used as the output
            directory prefix.
        skip: When truthy, pairs whose first label index is below 7, or is 7
            with a second label index below 2, are not trained.

    Returns:
        None. Results are written to CSV files.
    """
    # For now drop 'date_first_booking' - use it later with sessions maybe
    Train = pd.read_csv("train_users_2.csv").drop("date_first_booking", axis=1)
    Test = pd.read_csv("test_users.csv").drop("date_first_booking", axis=1)
    Train_Feats = bf.getFeats(Train, "Train", FOLDER)
    Test_Feats = bf.getFeats(Test, "Test", FOLDER)
    # The raw frames are not used again; release them to save RAM.
    del Test
    del Train

    # Things that appear in the training set and not in the test set: add a column of all zeros
    for col in Train_Feats.columns:
        if col not in Test_Feats.columns:
            Test_Feats[col] = np.zeros(Test_Feats["id"].shape)

    # Things that appear in the test set and not in the training set: remove these columns
    for col in Test_Feats.columns:
        if col not in Train_Feats.columns:
            print("Dropping: %s" % (col,))
            Test_Feats.drop(col, axis=1, inplace=True)

    # Get rid of the columns left after the sessions join (only present when merging)
    if "user_id_x" in Train_Feats.columns:
        Train_Feats.drop(["user_id_x", "user_id_y"], axis=1, inplace=True)
        Test_Feats.drop(["user_id_x", "user_id_y"], axis=1, inplace=True)

    # Sometimes happens to be there for some reason...
    if "user_id" in Train_Feats.columns:
        Train_Feats.drop("user_id", axis=1, inplace=True)
    if "user_id" in Test_Feats.columns:
        Test_Feats.drop("user_id", axis=1, inplace=True)

    # Put the columns of both frames in the same (sorted) order.
    # DataFrame.sort(axis=1) was removed from pandas; sort_index(axis=1) is the
    # equivalent call.
    Train_Feats = Train_Feats.sort_index(axis=1)
    Test_Feats = Test_Feats.sort_index(axis=1)
    Train_Feats.fillna(-1, inplace=True)
    Test_Feats.fillna(-1, inplace=True)

    # Array of all possible labels
    Labels = ["AU", "CA", "DE", "ES", "FR", "GB", "IT", "NDF", "NL", "PT", "US", "other"]
    LabelIndex = ["TargetIs" + x for x in Labels]
    print("Total number of labels: %d" % len(LabelIndex))
    Remove = LabelIndex + ["TargetIsNull"]

    print("Number of features in Train: %d" % (len(Train_Feats.columns) - 1))
    print("Number of features in Test: %d" % (len(Test_Feats.columns) - 1))

    # Half of the data trains the first layer, the other half is held out for
    # the second layer (test_size=0.5).
    X_train, X_test, y_train, y_test = train_test_split(
        Train_Feats.drop(Remove, axis=1), Train_Feats.loc[:, LabelIndex], test_size=0.5
    )
    # Flag held-out rows first active after March 2014.
    X_test["IsTest"] = ((X_test["FirstActiveYear"].isin([2014])) & (X_test["FirstActiveMonth"] > 3)).astype(int)
    y_test["IsTest"] = X_test["IsTest"]

    del Train_Feats

    fa = "FirstActive"
    FirstActiveDrop = [
        fa + "Year",
        fa + "Month",
        fa + "DayOfMonth",
        fa + "WeekOfYear",
        fa + "DayOfWeek",
        fa + "Quarter",
        fa + "Hour",
    ]
    # Dropping this in most cases cause was added only for the train/test split
    # NOTE(review): the results of these two drop() calls are discarded, so the
    # columns are NOT actually removed. This is kept as-is on purpose: making
    # the drop effective would also require dropping the same columns from
    # Test_Feats, otherwise predict_proba below would see a different feature
    # set. Confirm the intended behaviour (and whether the condition should be
    # "in" or "not in") before changing it.
    if "BasicInfo" in FOLDER:
        X_train.drop(FirstActiveDrop, axis=1)
        X_test.drop(FirstActiveDrop, axis=1)

    y_test.to_csv(FOLDER + "/IntTestLabels.csv")
    del y_test

    # Train classifiers for all ordered 2-label combinations, two per pair.
    for idx1, li1 in enumerate(LabelIndex):
        for idx2, li2 in enumerate(LabelIndex):
            if li1 == li2:
                continue
            # NOTE(review): the body of this guard was missing in the source;
            # "continue" (skip this pair) is assumed - confirm.
            if skip and (idx1 < 7 or (idx1 == 7 and idx2 < 2)):
                continue
            for i in range(0, 2):
                name = li1 + "with" + li2 + str(i + 1)
                print(datetime.datetime.now().time())
                print("Training model %s" % name)
                # enrich models in their data of interest
                Enriched_X_train, Enriched_y_train = enrichPairwise(X_train, y_train, li1, li2)
                Brain = trainModel(Enriched_X_train.drop("id", axis=1), Enriched_y_train[li1], None, None)
                print(datetime.datetime.now().time())
                print("Getting probs for model no %s" % name)
                FileNameIndex = str(idx1) + "with" + str(idx2) + "no" + str(i + 1)

                # Held-out (internal) data
                Probs = pd.DataFrame()
                Probs["id"] = X_test["id"]
                Probs[name] = Brain.predict_proba(X_test.drop(["id", "IsTest"], axis=1))[:, 1]
                Probs["IsTest"] = X_test["IsTest"]
                Probs.to_csv(FOLDER + "/IntTestProbs" + FileNameIndex + ".csv")

                # External data
                Ext_Test_Probs = pd.DataFrame()
                Ext_Test_Probs["id"] = Test_Feats["id"]
                Ext_Test_Probs[name] = Brain.predict_proba(Test_Feats.drop("id", axis=1).drop(Remove, axis=1))[:, 1]
                Ext_Test_Probs.to_csv(FOLDER + "/ExtTestProbs" + FileNameIndex + ".csv")

    print("Probs written to CSV")
    print(datetime.datetime.now().time())
Ejemplo n.º 3
def blenderXGMain(FOLDER, TOP, fraction):
    """Build features from selected per-folder columns, train a model, write predictions.

    Reads the top ``TOP[i]`` rows of ``RawResults//<FOLDER[i]>FeatImp.csv``
    (column ``'0'``) for each folder, samples ``fraction`` of
    ``train_users_2.csv``, builds features with ``bf.getFeats``, trains via
    ``getSsRFC`` and writes the external-test probabilities to
    ``NAME + 'ExtProbs.csv'`` and ``N_guesses`` predicted countries per test id
    to ``NAME + '.csv'`` (``NAME`` is hard-coded below).

    Args:
        FOLDER: Indexable sequence of folder names (iterated and indexed).
        TOP: Per-folder number of feature-importance rows to read.
        fraction: Fraction of training rows to sample (``DataFrame.sample``).

    NOTE(review): this listing appears to have lost several lines (marked
    below); as shown it is not valid Python. Another function with the same
    name is defined later in this file and would shadow this one if both live
    in the same module.
    """

    RAW = 'RawResults/'
    NAME = RAW + '100FeatsEach0.4sample1'
    print 'Starting', datetime.datetime.now().time()
    N_guesses = 5
    N_trees = 100
    ExtTest = pd.DataFrame()
    IntTest = pd.DataFrame()
    ExtResult = pd.DataFrame(columns = ('id', 'country'))
    Labels = ['TargetIsAU', 'TargetIsCA', 'TargetIsDE', 'TargetIsES', 'TargetIsFR', 'TargetIsGB', 'TargetIsIT', 'TargetIsNDF', 'TargetIsNL', 'TargetIsPT', 'TargetIsUS', 'TargetIsother']
    Countries = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
    # Deal with the files being read in 'alphabetical' order
    # (Twisted_names is not used anywhere in the visible body.)
    Twisted_names = ['AU', 'CA', 'US', 'other', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT']
    Chosen_Feats = []
    # Read the top TOP[i] feature names for every folder.
    for i in range(0, len(FOLDER)):
        feats = pd.read_csv(RAW + '/' + FOLDER[i] + 'FeatImp.csv', nrows = TOP[i])['0'].tolist()
#        feats = [x + '_' + str(i) for x in feats]
    # NOTE(review): feats is never stored in Chosen_Feats in the visible code,
    # so Chosen_Feats[...] below would raise IndexError - a line seems missing.
    Train = pd.read_csv('train_users_2.csv').drop('date_first_booking', axis=1)
    # Work on a random sample of the training rows, re-indexed from 0.
    Train = Train.sample(frac = fraction).reset_index(drop = True)
    Test = pd.read_csv('test_users.csv').drop('date_first_booking', axis=1)
    cf = []
    for f in FOLDER:
        # Columns to take from this folder: its chosen features plus the join key.
        cf = Chosen_Feats[FOLDER.index(f)] + ['id']
        # Need to add targets to train
        if(f is FOLDER[-1]):
            for col in Train_Feats.columns:
                if('Target' in col):
        # NOTE(review): the body of the 'if' above is missing from this listing.
        # remove duplicates
#        cfe = list(set(cfe))
        if(FOLDER.index(f) == 0):
            Train_Feats = bf.getFeats(Train, 'Train', f)[cf]
            # NOTE(review): an 'else:' branch appears to be missing here; as
            # written the merges below run only for the first folder.
            if(f is FOLDER[-1]):
                Train_Feats = pd.merge(Train_Feats, bf.getFeats(Train, 'Train', f)[cf + ['FirstActiveYear', 'FirstActiveMonth'] + Labels], how='left', left_on = 'id', right_on = 'id', suffixes = ('', '_extra'))
                Train_Feats = pd.merge(Train_Feats, bf.getFeats(Train, 'Train', f)[cf], how='left', left_on = 'id', right_on = 'id', suffixes = ('', '_extra'))

        if(FOLDER.index(f) == 0):
            Test_Feats = bf.getFeats(Test, 'Test', f)
            # Chosen columns absent from the test features are added as zeros.
            for col in cf:
                if(col not in Test_Feats.columns):
                    Test_Feats[col] = np.zeros(Test_Feats['id'].shape)
            Test_Feats = Test_Feats[cf]
            # NOTE(review): an 'else:' branch appears to be missing before the
            # next line, same pattern as for Train_Feats above.
            Taken = bf.getFeats(Test, 'Test', f)
            # Things that appear in the training set and not in the test set: add a column of all zeros
            for col in cf:
                if(col not in Taken.columns):
                    Test_Feats[col] = np.zeros(Test_Feats['id'].shape)
            Test_Feats = pd.merge(Test_Feats, Taken[cf], how='left', left_on = 'id', right_on = 'id', suffixes = ('', '_extra'))

        print 'Added folder', f, 'features:', len(Test_Feats.columns) - 1
    # Drop all repeated feats (columns that received the '_extra' merge suffix)
    for col in Train_Feats.columns:
        if('extra' in col):
            if(col in Train_Feats.columns):
                Train_Feats.drop(col, axis = 1, inplace = True)
    for col in Test_Feats.columns:
        if('extra' in col):
            if(col in Test_Feats.columns):
                Test_Feats.drop(col, axis = 1, inplace = True)
    # Drop all the 'target' feats, keeping a copy of them in IntLabels
    IntLabels = pd.DataFrame()
    IntLabels['id'] = Train_Feats['id']
    for col in Train_Feats.columns:
        if('Target' in col):
            IntLabels[col] = Train_Feats[col]
            Train_Feats.drop(col, axis = 1, inplace = True)
    # And add the true label 
    Train_Feats['Target'] = Train['country_destination']
    # Put the columns of both frames in sorted order and fill missing values with -1
    Train_Feats = Train_Feats.sort(axis = 1)
    Test_Feats = Test_Feats.sort(axis = 1)
    Train_Feats.fillna(-1, inplace = True)
    Test_Feats.fillna(-1, inplace = True)
    # Array of all possible labels (rebinds Labels defined at the top)
    Labels = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
    l = 'TargetIs'
    LabelIndex = [l + x for x in Labels]
    print 'Total number of labels:', len(LabelIndex)
    Remove = LabelIndex + ['TargetIsNull']
    # Flag rows first active after March 2014
    Train_Feats['IsTest'] = ((Train_Feats['FirstActiveYear'].isin([2014])) & (Train_Feats['FirstActiveMonth'] > 3)).astype(int)
    # Only include FirstActive in BasicInfo
    # NOTE(review): FOLDER is indexed like a list above, so this is an exact
    # element test, not a substring test - confirm that is intended.
    if('BasicInfo' not in FOLDER):
        for col in ['FirstActiveYear', 'FirstActiveMonth']:
            if(col in Train_Feats.columns):
                Train_Feats.drop(col, axis = 1, inplace = True)
            if(col in Test_Feats.columns):
                Test_Feats.drop(col, axis = 1, inplace = True)

    IntLabels['IsTest'] = Train_Feats['IsTest']
    IntLabels['country'] = IntLabels.apply(getCountryID, axis = 1)

    IntTest = Train_Feats.drop('Target', axis = 1)
    ExtTest = Test_Feats
    print 'Number of features in Train:', len(IntTest.columns) - 2
    print 'Number of features in Test:', len(ExtTest.columns) - 1 
    # The model is trained twice; the result of the first call (lr=0.01) is
    # overwritten by the second (lr=0.1).
    RFC= getSsRFC(IntTest, IntLabels[['country', 'IsTest']], Countries, 'Int', NAME, lr=0.01)
    RFC= getSsRFC(IntTest, IntLabels[['country', 'IsTest']], Countries, 'Int', NAME, lr=0.1)    
    print 'Second layer RFC trained'
    del IntTest
    Probs = RFC.predict_proba(ExtTest.drop('id', axis = 1))
    pd.DataFrame(Probs).to_csv(NAME + 'ExtProbs.csv', index = False)
    print 'Probs written to CSV'
    id_test = ExtTest['id']
    # NOTE(review): le is never fitted in the visible code before
    # inverse_transform is called below - confirm a fit step is not missing.
    le = LabelEncoder()
    ids = []
    cts = []
    # For each test row keep N_guesses entries of the transformed probability
    # row (ordering is defined by getReverseOrder, which is not visible here).
    for i in range(len(Probs)):
        df = pd.DataFrame(columns = range(0, 12))
        df.loc[i, :] = Probs[i, :]
        Trans1 = df.apply(getReverseOrder, axis = 1)
        Trans2 = Trans1[Trans1.columns[::-1][:N_guesses]]
        idx = id_test[i]
        ids += [idx] * N_guesses
        cts += le.inverse_transform(Trans2).tolist()[0]
    ExtResult = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    ExtResult.to_csv(NAME + '.csv', index = False)
    print 'External result written to CSV!'
    print datetime.datetime.now().time()
def blenderXGMain(FOLDER, fraction, namepart):
    """Train the second-layer model on merged per-folder features and write predictions.

    Samples ``fraction`` of ``train_users_2.csv``, builds features for every
    folder in ``FOLDER`` with ``bf.getFeats`` and left-merges them on ``id``,
    aligns the test columns with the training columns, trains a model via
    ``getSsRFC`` and writes the external-test probabilities to
    ``'RawResults/' + namepart + 'ExtProbs.csv'`` and ``N_guesses`` predicted
    countries per test id to ``'RawResults/' + namepart + '.csv'``.

    Args:
        FOLDER: Non-empty sequence of feature folder names.
        fraction: Fraction of training rows to sample (``DataFrame.sample``).
        namepart: File-name stem for the output files under ``RawResults/``.

    Raises:
        ValueError: If ``FOLDER`` is empty.
    """
    if not FOLDER:
        raise ValueError('FOLDER must contain at least one feature folder')

    RAW = 'RawResults/'
    NAME = RAW + namepart
    print('Starting %s' % datetime.datetime.now().time())
    N_guesses = 5
    Countries = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
    Train = pd.read_csv('train_users_2.csv').drop('date_first_booking', axis=1)
    Train = Train.sample(frac=fraction).reset_index(drop=True)
    Test = pd.read_csv('test_users.csv').drop('date_first_booking', axis=1)

    Train_Feats = None
    Test_Feats = None
    for position, f in enumerate(FOLDER):
        train_part = bf.getFeats(Train, 'Train', f)
        test_part = bf.getFeats(Test, 'Test', f)
        if position == 0:
            Train_Feats = train_part
            Test_Feats = test_part
        else:
            # NOTE(review): the source listing had no 'else:' here, which made
            # the first folder merge with itself and ignored all later
            # folders; the if/else split is the assumed original intent.
            Train_Feats = pd.merge(Train_Feats, train_part, how='left', left_on='id', right_on='id')
            Test_Feats = pd.merge(Test_Feats, test_part, how='left', left_on='id', right_on='id')
        print('Added folder %s features: %d' % (f, len(Test_Feats.columns) - 1))

    # Move all the 'target' feats out of the training features into IntLabels
    target_cols = [col for col in Train_Feats.columns if 'Target' in col]
    IntLabels = pd.DataFrame()
    IntLabels['id'] = Train_Feats['id']
    for col in target_cols:
        IntLabels[col] = Train_Feats[col]
    Train_Feats.drop(target_cols, axis=1, inplace=True)

    # Add a column of zeros to Test if it appears in Train
    for col in Train_Feats.columns:
        if col not in Test_Feats.columns:
            Test_Feats[col] = 0
    # Remove all columns from Test that don't appear in Train
    test_only = [col for col in Test_Feats.columns if col not in Train_Feats.columns]
    Test_Feats.drop(test_only, axis=1, inplace=True)

    # Remove all id columns except the join key itself.
    # NOTE(review): this is a substring match, so any feature whose name
    # merely contains "id" (e.g. "...Android...") is dropped too - confirm.
    for frame in (Train_Feats, Test_Feats):
        extra_ids = [col for col in frame.columns if 'id' in col and col != 'id']
        frame.drop(extra_ids, axis=1, inplace=True)

    # And add the true label
    Train_Feats['Target'] = Train['country_destination']
    # DataFrame.sort(axis=1) was removed from pandas; sort_index(axis=1) is the
    # equivalent call for ordering the columns.
    Train_Feats = Train_Feats.sort_index(axis=1)
    Test_Feats = Test_Feats.sort_index(axis=1)
    Train_Feats.fillna(-1, inplace=True)
    Test_Feats.fillna(-1, inplace=True)

    # Array of all possible labels
    LabelIndex = ['TargetIs' + x for x in Countries]
    print('Total number of labels: %d' % len(LabelIndex))

    # Merging frames that share column names suffixes them with _x/_y; fall
    # back to the plain names when no such merge happened (single folder).
    FirstActiveYear = 'FirstActiveYear_x'
    if FirstActiveYear not in Train_Feats.columns:
        FirstActiveYear = 'FirstActiveYear'
    FirstActiveMonth = 'FirstActiveMonth_x'
    if FirstActiveMonth not in Train_Feats.columns:
        FirstActiveMonth = 'FirstActiveMonth'
    # Flag rows first active after March 2014
    Train_Feats['IsTest'] = ((Train_Feats[FirstActiveYear].isin([2014])) & (Train_Feats[FirstActiveMonth] > 3)).astype(int)

    IntLabels['IsTest'] = Train_Feats['IsTest']
    IntLabels['country'] = IntLabels.apply(getCountryID, axis=1)

    IntTest = Train_Feats.drop('Target', axis=1)
    ExtTest = Test_Feats
    print('Number of features in Train: %d' % (len(IntTest.columns) - 2))
    print('Number of features in Test: %d' % (len(ExtTest.columns) - 1))
    RFC = getSsRFC(IntTest, IntLabels[['country', 'IsTest']], Countries, 'Int', NAME)
    print('Second layer RFC trained')
    del IntTest
    Probs = RFC.predict_proba(ExtTest.drop('id', axis=1))

    pd.DataFrame(Probs).to_csv(NAME + 'ExtProbs.csv', index=False)
    print('Probs written to CSV')
    id_test = ExtTest['id']
    # NOTE(review): le is never fitted before inverse_transform is called
    # below, which scikit-learn rejects. It presumably needs fitting on the
    # class list in the same order as the columns of Probs - confirm against
    # getSsRFC / getCountryID before adding the fit call.
    le = LabelEncoder()
    ids = []
    cts = []
    # For each test row keep N_guesses entries of the transformed probability
    # row (ordering is defined by getReverseOrder, defined elsewhere).
    for i in range(len(Probs)):
        df = pd.DataFrame(columns=range(0, 12))
        df.loc[i, :] = Probs[i, :]
        Trans1 = df.apply(getReverseOrder, axis=1)
        Trans2 = Trans1[Trans1.columns[::-1][:N_guesses]]
        idx = id_test[i]
        ids += [idx] * N_guesses
        cts += le.inverse_transform(Trans2).tolist()[0]
    ExtResult = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    ExtResult.to_csv(NAME + '.csv', index=False)
    print('External result written to CSV!')
    print(datetime.datetime.now().time())