Python BuildFeats Examples

Programming Language: Python

Class/Type: BuildFeats

Examples at hotexamples.com: 4

Python BuildFeats - 4 examples found. These are the top-rated real-world Python examples of BuildFeats extracted from open-source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

getFeats(4)

Example #1

Show file

File: ReadData.py Project: piotrgrudzien/Airbnb-Kaggle

    indices = np.argsort(importances)[::-1]
    # Print the feature ranking
    #print 'Model score:', RFC.score(Feats_test, Labels_test)
    print("Feature ranking:")
    for f in range(0, 50):
        print("%d. feature %s (%f)" % (f + 1, Feats.columns[indices[f]], importances[indices[f]]))
    return RFC

#AgeGender = pd.read_csv('age_gender_bkts.csv')
#Countries = pd.read_csv('countries.csv')
#Sessions = pd.read_csv('sessions.csv')
    
#For now drop 'date_first_booking' - use it later with sessions maybe
Train = pd.read_csv('train_users_2.csv').drop('date_first_booking', axis=1)
Test = pd.read_csv('test_users.csv').drop('date_first_booking', axis=1)
Train_Feats = bf.getFeats(Train, 'Train')
Test_Feats = bf.getFeats(Test, 'Test')
##
# Things that appear in the training set and not in the test set: add a column of all zeros
for col in Train_Feats.columns:
    if(col not in Test_Feats.columns):
        Test_Feats[col] = np.zeros(Test_Feats['id'].shape)
        
# Things that appear in the test set and not in the training set: remove these columns
for col in Test_Feats.columns:
    if(col not in Train_Feats.columns):
        print 'Dropping:', col
        Test_Feats.drop(col, axis = 1, inplace = True)

# Get rid of the columns left after the sessions join
Train_Feats.drop(['user_id_x', 'user_id_y'], axis = 1, inplace = True)

Example #2

Show file

File: ReadDataXG.py Project: piotrgrudzien/Airbnb-Kaggle

def readDataMain(FOLDER, skip):

    warnings.filterwarnings("ignore")

    # For now drop 'date_first_booking' - use it later with sessions maybe
    Train = pd.read_csv("train_users_2.csv").drop("date_first_booking", axis=1)
    Test = pd.read_csv("test_users.csv").drop("date_first_booking", axis=1)
    Train_Feats = bf.getFeats(Train, "Train", FOLDER)
    Test_Feats = bf.getFeats(Test, "Test", FOLDER)

    # Things that appear in the training set and not in the test set: add a column of all zeros
    for col in Train_Feats.columns:
        if col not in Test_Feats.columns:
            Test_Feats[col] = np.zeros(Test_Feats["id"].shape)

    # Things that appear in the test set and not in the training set: remove these columns
    for col in Test_Feats.columns:
        if col not in Train_Feats.columns:
            print "Dropping:", col
            Test_Feats.drop(col, axis=1, inplace=True)

    # Get rid of the columns left after the sessions join
    # Only do this when your merging - comment out when using only BasicInfo
    if "user_id_x" in Train_Feats.columns:
        Train_Feats.drop(["user_id_x", "user_id_y"], axis=1, inplace=True)
        Test_Feats.drop(["user_id_x", "user_id_y"], axis=1, inplace=True)

    # Sometimes happens to be there for some reason...
    if "user_id" in Train_Feats.columns:
        Train_Feats.drop("user_id", axis=1, inplace=True)
    if "user_id" in Test_Feats.columns:
        Test_Feats.drop("user_id", axis=1, inplace=True)

    Train_Feats = Train_Feats.sort(axis=1)
    Test_Feats = Test_Feats.sort(axis=1)
    Train_Feats.fillna(-1, inplace=True)
    Test_Feats.fillna(-1, inplace=True)

    # Array of all possible labels
    Labels = ["AU", "CA", "DE", "ES", "FR", "GB", "IT", "NDF", "NL", "PT", "US", "other"]
    l = "TargetIs"
    LabelIndex = [l + x for x in Labels]
    print "Total number of labels:", len(LabelIndex)
    Remove = LabelIndex + ["TargetIsNull"]

    print "Number of features in Train:", len(Train_Feats.columns) - 1
    print "Number of features in Test:", len(Test_Feats.columns) - 1

    #    FOLDER = 'BasicInfoOnly'

    # To save RAM write Test_Feats and Train_Feats to CSV and delete all DataFrames from memory
    #    Test_Feats.to_csv(FOLDER + '/Test_Feats.csv', index = False)
    #    Train_Feats.to_csv(FOLDER + '/Train_Feats.csv', index = False)
    del Test
    del Train
    # del Test_Feats
    # del Train_Feats

    #    Train_Feats_Only = Train_Feats.loc[~((Train_Feats['FirstActiveYear'].isin([2014])) & (Train_Feats['FirstActiveMonth'] > 3)), :]

    # 0.7 of data for first layer, 0.3 for second layer
    X_train, X_test, y_train, y_test = train_test_split(
        Train_Feats.drop(Remove, axis=1), Train_Feats.loc[:, LabelIndex], test_size=0.5
    )

    #    X_test = X_test.append(Train_Feats.loc[(Train_Feats['FirstActiveYear'].isin([2014])) & (Train_Feats['FirstActiveMonth'] > 3), :].drop(Remove, axis = 1))
    #    y_test = y_test.append(Train_Feats.loc[(Train_Feats['FirstActiveYear'].isin([2014])) & (Train_Feats['FirstActiveMonth'] > 3), LabelIndex])
    X_test["IsTest"] = ((X_test["FirstActiveYear"].isin([2014])) & (X_test["FirstActiveMonth"] > 3)).astype(int)
    y_test["IsTest"] = X_test["IsTest"]

    del Train_Feats
    #    del Test_Feats

    fa = "FirstActive"
    FirstActiveDrop = [
        fa + "Year",
        fa + "Month",
        fa + "DayOfMonth",
        fa + "WeekOfYear",
        fa + "DayOfWeek",
        fa + "Quarter",
        fa + "Hour",
    ]
    # Dropping this in most cases cause was added only for the train/test split
    if "BasicInfo" in FOLDER:
        X_train.drop(FirstActiveDrop, axis=1)
        X_test.drop(FirstActiveDrop, axis=1)

    # Again, save these to files rather than keep them in RAM

    #    X_test.reset_index(drop = True).to_csv(FOLDER + '/X_test.csv', index = False)
    #    del X_test
    #    y_train.reset_index(drop = True).to_csv(FOLDER + '/y_train.csv', index = False)
    #    del y_train
    #    y_test.reset_index(drop = True).to_csv(FOLDER + '/y_test.csv', index = False)
    #    X_train.reset_index(drop = True).to_csv(FOLDER + '/X_train.csv', index = False)
    #    del X_train

    #    X_train = pd.read_csv(FOLDER + '/X_train.csv')
    #    y_train = pd.read_csv(FOLDER + '/y_train.csv')
    #    X_test = pd.read_csv(FOLDER + '/X_test.csv')
    #    Test_Feats = pd.read_csv(FOLDER + '/Test_Feats.csv')

    y_test.to_csv(FOLDER + "/IntTestLabels.csv")
    del y_test
    # Train a separate classifier for each label
    # for li in LabelIndex:
    #    if(LabelIndex.index(li) == 7):
    #        Probs = pd.DataFrame()
    #        Ext_Test_Probs = pd.DataFrame()
    #        print datetime.datetime.now().time()
    #        print 'Training model no', LabelIndex.index(li)
    #        # enrich models in their data of interest
    #        Enriched_X_train, Enriched_y_train = enrich(pd.read_csv(FOLDER + '/X_train.csv'), pd.read_csv(FOLDER + '/y_train.csv'), li)
    #        # Only for reference - for scoring models
    #        #Brain = trainModel(Enriched_X_train.drop('id', axis = 1), Enriched_y_train[li], X_test.drop('id', axis = 1), y_test[li])
    #        Brain = trainModel(Enriched_X_train.drop('id', axis = 1), Enriched_y_train[li], None, None)
    #        print datetime.datetime.now().time()
    #        print 'Getting probs for model no', LabelIndex.index(li)
    #        Probs['id'] = pd.read_csv(FOLDER + '/X_test.csv')['id']
    #        Probs[li] = Brain.predict_proba(pd.read_csv(FOLDER + '/X_test.csv').drop('id', axis = 1))[:, 1]
    #        # External data
    #        Ext_Test_Probs['id'] = pd.read_csv(FOLDER + '/Test_Feats.csv')['id']
    #        Ext_Test_Probs[str(li)] = Brain.predict_proba(pd.read_csv(FOLDER + '/Test_Feats.csv').drop('id', axis = 1).drop(Remove, axis = 1))[:, 1]
    #        FileNameIndex = LabelIndex.index(li)
    #        Probs.to_csv(FOLDER + '/IntTestProbs' + str(FileNameIndex) + '.csv')
    #        Ext_Test_Probs.to_csv(FOLDER + '/ExtTestProbs' + str(FileNameIndex) + '.csv')

    # Train classifier for all 2-label combinations
    Freqs = [0.27, 0.66, 0.51, 1.06, 2.50, 1.02, 1.15, 58.34, 0.42, 0.06, 29.55, 4.46]
    Repeat = [10, 10, 10, 10, 7, 10, 10, 1, 10, 10, 4, 6]
    for li1 in LabelIndex:
        for li2 in LabelIndex:
            if li1 is not li2:
                for i in range(0, 2):
                    #                for i in range(0, int(np.true_divide(Freqs[LabelIndex.index(li2)], 2 * Freqs[LabelIndex.index(li1)])) + 1):
                    if skip:
                        if LabelIndex.index(li1) < 7 or (LabelIndex.index(li1) == 7 and LabelIndex.index(li2) < 2):
                            continue
                    name = li1 + "with" + li2 + str(i + 1)
                    Probs = pd.DataFrame()
                    Ext_Test_Probs = pd.DataFrame()
                    print datetime.datetime.now().time()
                    print "Training model", name
                    # enrich models in their data of interest
                    #                Enriched_X_train, Enriched_y_train = enrichPairwise(pd.read_csv(FOLDER + '/X_train.csv'), pd.read_csv(FOLDER + '/y_train.csv'), li1, li2)
                    Enriched_X_train, Enriched_y_train = enrichPairwise(X_train, y_train, li1, li2)
                    Brain = trainModel(Enriched_X_train.drop("id", axis=1), Enriched_y_train[li1], None, None)
                    print datetime.datetime.now().time()
                    print "Getting probs for model no", name
                    #                Probs['id'] = pd.read_csv(FOLDER + '/X_test.csv')['id']
                    Probs["id"] = X_test["id"]
                    #                Probs[name] = Brain.predict_proba(pd.read_csv(FOLDER + '/X_test.csv').drop('id', axis = 1))[:, 1]
                    Probs[name] = Brain.predict_proba(X_test.drop(["id", "IsTest"], axis=1))[:, 1]
                    Probs["IsTest"] = X_test["IsTest"]
                    FileNameIndex = str(LabelIndex.index(li1)) + "with" + str(LabelIndex.index(li2)) + "no" + str(i + 1)
                    Probs.to_csv(FOLDER + "/IntTestProbs" + FileNameIndex + ".csv")

                    # External data
                    #                Ext_Test_Probs['id'] = pd.read_csv(FOLDER + '/Test_Feats.csv')['id']
                    Ext_Test_Probs["id"] = Test_Feats["id"]
                    Ext_Test_Probs[name] = Brain.predict_proba(Test_Feats.drop("id", axis=1).drop(Remove, axis=1))[:, 1]
                    Ext_Test_Probs.to_csv(FOLDER + "/ExtTestProbs" + FileNameIndex + ".csv")
                    gc.collect()

    print "Probs written to CSV"
    print datetime.datetime.now().time()

Example #3

Show file

File: BlenderRawXG.py Project: piotrgrudzien/Airbnb-Kaggle

def blenderXGMain(FOLDER, TOP, fraction):

    RAW = 'RawResults/'
    NAME = RAW + '100FeatsEach0.4sample1'
    print 'Starting', datetime.datetime.now().time()
    warnings.filterwarnings("ignore")
    N_guesses = 5
    N_trees = 100
    ExtTest = pd.DataFrame()
    IntTest = pd.DataFrame()
    ExtResult = pd.DataFrame(columns = ('id', 'country'))
    
    Labels = ['TargetIsAU', 'TargetIsCA', 'TargetIsDE', 'TargetIsES', 'TargetIsFR', 'TargetIsGB', 'TargetIsIT', 'TargetIsNDF', 'TargetIsNL', 'TargetIsPT', 'TargetIsUS', 'TargetIsother']
    
    Countries = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
    
    # Deal with the files being read in 'alphabetical' order
    Twisted_names = ['AU', 'CA', 'US', 'other', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT']
    
    Chosen_Feats = []
    for i in range(0, len(FOLDER)):
        feats = pd.read_csv(RAW + '/' + FOLDER[i] + 'FeatImp.csv', nrows = TOP[i])['0'].tolist()
#        feats = [x + '_' + str(i) for x in feats]
        Chosen_Feats.append(feats)
    
    os.chdir('/Users/piotrgrudzien/Desktop/Airbnb')
    Train = pd.read_csv('train_users_2.csv').drop('date_first_booking', axis=1)
    Train = Train.sample(frac = fraction).reset_index(drop = True)
    Test = pd.read_csv('test_users.csv').drop('date_first_booking', axis=1)
    cf = []
    for f in FOLDER:
        cf = Chosen_Feats[FOLDER.index(f)] + ['id']
        # Need to add targets to train
        if(f is FOLDER[-1]):
            for col in Train_Feats.columns:
                if('Target' in col):
                    cf.append(col)
        # remove duplicates
#        cfe = list(set(cfe))
        if(FOLDER.index(f) == 0):
            Train_Feats = bf.getFeats(Train, 'Train', f)[cf]
            
        else:
            if(f is FOLDER[-1]):
                Train_Feats = pd.merge(Train_Feats, bf.getFeats(Train, 'Train', f)[cf + ['FirstActiveYear', 'FirstActiveMonth'] + Labels], how='left', left_on = 'id', right_on = 'id', suffixes = ('', '_extra'))
            else:
                Train_Feats = pd.merge(Train_Feats, bf.getFeats(Train, 'Train', f)[cf], how='left', left_on = 'id', right_on = 'id', suffixes = ('', '_extra'))

        
        if(FOLDER.index(f) == 0):
            Test_Feats = bf.getFeats(Test, 'Test', f)
            for col in cf:
                if(col not in Test_Feats.columns):
                    Test_Feats[col] = np.zeros(Test_Feats['id'].shape)
            Test_Feats = Test_Feats[cf]
        else:
            Taken = bf.getFeats(Test, 'Test', f)
            # Things that appear in the training set and not in the test set: add a column of all zeros
            for col in cf:
                if(col not in Taken.columns):
                    Test_Feats[col] = np.zeros(Test_Feats['id'].shape)
            Test_Feats = pd.merge(Test_Feats, Taken[cf], how='left', left_on = 'id', right_on = 'id', suffixes = ('', '_extra'))



        print 'Added folder', f, 'features:', len(Test_Feats.columns) - 1
        
    # Drop all repeated feats
    for col in Train_Feats.columns:
        if('extra' in col):
            if(col in Train_Feats.columns):
                Train_Feats.drop(col, axis = 1, inplace = True)
    for col in Test_Feats.columns:
        if('extra' in col):
            if(col in Test_Feats.columns):
                Test_Feats.drop(col, axis = 1, inplace = True)
    
    # Drop all the 'target' feats
    IntLabels = pd.DataFrame()
    IntLabels['id'] = Train_Feats['id']
    for col in Train_Feats.columns:
        if('Target' in col):
            IntLabels[col] = Train_Feats[col]
            Train_Feats.drop(col, axis = 1, inplace = True)
        
    # And add the true label 
    Train_Feats['Target'] = Train['country_destination']
    
    Train_Feats = Train_Feats.sort(axis = 1)
    Test_Feats = Test_Feats.sort(axis = 1)
    Train_Feats.fillna(-1, inplace = True)
    Test_Feats.fillna(-1, inplace = True)
    
    # Array of all possible labels
    Labels = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
    l = 'TargetIs'
    LabelIndex = [l + x for x in Labels]
    print 'Total number of labels:', len(LabelIndex)
    Remove = LabelIndex + ['TargetIsNull']
    
    Train_Feats['IsTest'] = ((Train_Feats['FirstActiveYear'].isin([2014])) & (Train_Feats['FirstActiveMonth'] > 3)).astype(int)
    # Only include FirstActive in BasicInfo
    if('BasicInfo' not in FOLDER):
        for col in ['FirstActiveYear', 'FirstActiveMonth']:
            if(col in Train_Feats.columns):
                Train_Feats.drop(col, axis = 1, inplace = True)
            if(col in Test_Feats.columns):
                Test_Feats.drop(col, axis = 1, inplace = True)
    


    #############
    IntLabels['IsTest'] = Train_Feats['IsTest']
    IntLabels['country'] = IntLabels.apply(getCountryID, axis = 1)

    
    IntTest = Train_Feats.drop('Target', axis = 1)
    ExtTest = Test_Feats
    
    print 'Number of features in Train:', len(IntTest.columns) - 2
    print 'Number of features in Test:', len(ExtTest.columns) - 1 
    
    RFC= getSsRFC(IntTest, IntLabels[['country', 'IsTest']], Countries, 'Int', NAME, lr=0.01)
    RFC= getSsRFC(IntTest, IntLabels[['country', 'IsTest']], Countries, 'Int', NAME, lr=0.1)    
    print 'Second layer RFC trained'
    
    del IntTest
    
    Probs = RFC.predict_proba(ExtTest.drop('id', axis = 1))
    
    pd.DataFrame(Probs).to_csv(NAME + 'ExtProbs.csv', index = False)
    print 'Probs written to CSV'
    
    id_test = ExtTest['id']
    
    le = LabelEncoder()
    ids = []
    cts = []
    le.fit(Twisted_names)
    for i in range(len(Probs)):
        df = pd.DataFrame(columns = range(0, 12))
        df.loc[i, :] = Probs[i, :]
        Trans1 = df.apply(getReverseOrder, axis = 1)
        Trans2 = Trans1[Trans1.columns[::-1][:N_guesses]]
        idx = id_test[i]
        ids += [idx] * N_guesses
        cts += le.inverse_transform(Trans2).tolist()[0]
    
    ExtResult = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    
    ExtResult.to_csv(NAME + '.csv', index = False)
    print 'External result written to CSV!'
    print datetime.datetime.now().time()

Example #4

Show file

File: BlenderRawSimple.py Project: piotrgrudzien/Airbnb-Kaggle

def blenderXGMain(FOLDER, fraction, namepart):

    RAW = 'RawResults/'
    NAME = RAW + namepart
    print 'Starting', datetime.datetime.now().time()
    warnings.filterwarnings("ignore")
    N_guesses = 5
    N_trees = 100
    ExtTest = pd.DataFrame()
    IntTest = pd.DataFrame()
    ExtResult = pd.DataFrame(columns = ('id', 'country'))
    
    Labels = ['TargetIsAU', 'TargetIsCA', 'TargetIsDE', 'TargetIsES', 'TargetIsFR', 'TargetIsGB', 'TargetIsIT', 'TargetIsNDF', 'TargetIsNL', 'TargetIsPT', 'TargetIsUS', 'TargetIsother']
    
    Countries = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
    
    # Deal with the files being read in 'alphabetical' order
    Twisted_names = ['AU', 'CA', 'US', 'other', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT']
    
    os.chdir('/Users/piotrgrudzien/Desktop/Airbnb')
    Train = pd.read_csv('train_users_2.csv').drop('date_first_booking', axis=1)
    Train = Train.sample(frac = fraction).reset_index(drop = True)
    Test = pd.read_csv('test_users.csv').drop('date_first_booking', axis=1)
    for f in FOLDER:
        # remove duplicates
#        cfe = list(set(cfe))
        if(FOLDER.index(f) == 0):
            Train_Feats = bf.getFeats(Train, 'Train', f)
            Test_Feats = bf.getFeats(Test, 'Test', f)
        else:
            Train_Feats = pd.merge(Train_Feats, bf.getFeats(Train, 'Train', f), how='left', left_on = 'id', right_on = 'id')
            Test_Feats = pd.merge(Test_Feats, bf.getFeats(Test, 'Test', f), how='left', left_on = 'id', right_on = 'id')

        print 'Added folder', f, 'features:', len(Test_Feats.columns) - 1
        
        # Drop all the 'target' feats
    IntLabels = pd.DataFrame()
    IntLabels['id'] = Train_Feats['id']
    for col in Train_Feats.columns:
        if('Target' in col):
            IntLabels[col] = Train_Feats[col]
            Train_Feats.drop(col, axis = 1, inplace = True)
        
    # Add a column of zeros to Test if it appears in Train
    for col in Train_Feats.columns:
        if(col not in Test_Feats.columns):
            Test_Feats[col] = 0
            
    # Remove all columns from Test that don't appear in Train
    for col in Test_Feats.columns:
        if(col not in Train_Feats.columns):
            Test_Feats.drop(col, axis = 1, inplace = True)
            
    # Remove all id columns
    for col in Train_Feats.columns:
        if(('id' in col) & (col != 'id')):
            if(col in Train_Feats.columns):
                Train_Feats.drop(col, axis = 1, inplace = True)
    for col in Test_Feats.columns:
        if(('id' in col) & (col != 'id')):
            if(col in Test_Feats.columns):
                Test_Feats.drop(col, axis = 1, inplace = True)
       
    # And add the true label 
    Train_Feats['Target'] = Train['country_destination']
    
    Train_Feats = Train_Feats.sort(axis = 1)
    Test_Feats = Test_Feats.sort(axis = 1)
    Train_Feats.fillna(-1, inplace = True)
    Test_Feats.fillna(-1, inplace = True)
    
    # Array of all possible labels
    Labels = ['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US', 'other']
    l = 'TargetIs'
    LabelIndex = [l + x for x in Labels]
    print 'Total number of labels:', len(LabelIndex)
    Remove = LabelIndex + ['TargetIsNull']
    
    FirstActiveYear = 'FirstActiveYear_x'
    FirstActiveMonth = 'FirstActiveMonth_x'
    Train_Feats['IsTest'] = ((Train_Feats[FirstActiveYear].isin([2014])) & (Train_Feats[FirstActiveMonth] > 3)).astype(int)
    


    #############
    IntLabels['IsTest'] = Train_Feats['IsTest']
    IntLabels['country'] = IntLabels.apply(getCountryID, axis = 1)

    
    IntTest = Train_Feats.drop('Target', axis = 1)
    ExtTest = Test_Feats
    
    print 'Number of features in Train:', len(IntTest.columns) - 2
    print 'Number of features in Test:', len(ExtTest.columns) - 1 
    
    RFC = getSsRFC(IntTest, IntLabels[['country', 'IsTest']], Countries, 'Int', NAME)    
    print 'Second layer RFC trained'
    
    del IntTest
    
    
    Probs = RFC.predict_proba(ExtTest.drop('id', axis = 1))

    pd.DataFrame(Probs).to_csv(NAME + 'ExtProbs.csv', index = False)
    print 'Probs written to CSV'
    
    id_test = ExtTest['id']
    
    le = LabelEncoder()
    ids = []
    cts = []
    le.fit(Twisted_names)
    for i in range(len(Probs)):
        df = pd.DataFrame(columns = range(0, 12))
        df.loc[i, :] = Probs[i, :]
        Trans1 = df.apply(getReverseOrder, axis = 1)
        Trans2 = Trans1[Trans1.columns[::-1][:N_guesses]]
        idx = id_test[i]
        ids += [idx] * N_guesses
        cts += le.inverse_transform(Trans2).tolist()[0]
    
    ExtResult = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    
    ExtResult.to_csv(NAME + '.csv', index = False)
    print 'External result written to CSV!'
    print datetime.datetime.now().time()