Example #1
def learn_dtree(data, csvfile):
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
    k = data.groupby(['operator'])
    f = k['isCovered'].agg({'mean_kill': np.mean, 'number of mutants': len, 'number of killed':np.sum})
    f.to_csv(csvfile)
    fig = plt.figure()
    # ax = Axes3D(fig)
    # ax = fig.add_subplot(111, projection='3d')
    plt.scatter(standardize(f['mean_kill']), f['number of killed'])
    plt.ylabel('mutant_size')
    plt.xlabel('expected_kill (standardized)')
    # print f[f['len'] > 25000] 
    # ax.set_xlabel('mean')
    # ax.set_ylabel('len')
    # ax.set_zlabel('sum')
    
    plt.show()

    # plt.show()
    # for m in k.groups:
    #   print m,len(k.groups[m]),
    data['op'] = pd.factorize(data['operator'])[0]
    data['m'] = pd.factorize(data['method'])[0]
    data['c'] = pd.factorize(data['class'])[0]

    # plt.show()
    plt.close()
    x = data[['op', 'c', 'testId']].values
    y = data['isCovered'].values
    clf.fit(x,y)
    dot_data = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    return dot_data.getvalue()
Example #2
def main(stack_setting_):

    """
     [rawdata2filterdata Step]
      1. Reading raw datasets
      2. Dropping unused feature columns from the training set
      3. Dropping unused feature columns from the test set
    """

    raw_train_path = os.path.join(Config.get_string('data.path'), 
                                  stack_setting_['0-Level']['folder'],
                                  stack_setting_['0-Level']['raw']['train'])
    raw_test_path = os.path.join(Config.get_string('data.path'), 
                                 stack_setting_['0-Level']['folder'],
                                 stack_setting_['0-Level']['raw']['test'])
    print("= Reading raw datasets ...")

    names = ("age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country, TARGET").split(', ')
    raw_train = pd.read_csv(raw_train_path, names=names, skiprows=1)#, index_col=0, sep=','
    raw_train['TARGET'] = (raw_train['TARGET'].values == ' >50K').astype(np.int32)
    raw_train = raw_train.apply(lambda x: pd.factorize(x)[0])
    train_path = os.path.join(Config.get_string('data.path'), 
                              stack_setting_['0-Level']['folder'],
                              stack_setting_['0-Level']['train'])
    raw_train.to_csv(train_path, index=True, index_label='ID')


    raw_test = pd.read_csv(raw_test_path, names=names, skiprows=1)#, index_col=0, sep=','
    raw_test['TARGET'] = (raw_test['TARGET'].values == ' >50K').astype(np.int32)
    raw_test = raw_test.apply(lambda x: pd.factorize(x)[0])
    test_path = os.path.join(Config.get_string('data.path'), 
                             stack_setting_['0-Level']['folder'],
                             stack_setting_['0-Level']['test'])
    raw_test.to_csv(test_path, index=True, index_label='ID')
def fact(df):
	pge = pd.factorize(df['page'])
	time = pd.factorize(df['Time'])
	df['pageFCT'] = pge[0]
	df['TimeFCT'] = time[0]
	
	return df
def buildModel(df):
	train_y = df['arr_del15'][:train_len]
	train_x = df[cols][:train_len]

	# transform categorical features
	train_x['unique_carrier'] = pd.factorize(train_x['unique_carrier'])[0]
	train_x['dep_conditions'] = pd.factorize(train_x['dep_conditions'])[0]
	train_x['arr_conditions'] = pd.factorize(train_x['arr_conditions'])[0]
	
	pd.set_option('display.max_rows', 500)
	print(train_x)

	# train_x['origin'] = pd.factorize(train_x['origin'])[0]
	#	train_x['dest'] = pd.factorize(train_x['dest'])[0]
	# print(train_x)
	train_x = enc.fit_transform(train_x)
	print(train_x.shape)

	# Create Random Forest classifier with 50 trees
	clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
	clf_rf.fit(train_x.toarray(), train_y)

	del train_x, train_y
	print("Model built")
	return clf_rf
def model_data(data, LECAT=False, NAMEAN=False, NA999=False, OH=False, ONLYCONT=False, ONLYCAT=False, ONLYCATOH=False, COLSREMOVAL=False, cols=[], maxCategories=300):

    data = data.copy()

    cat_var = list(data.select_dtypes(["object"]).columns)
    cont_var = list(data.select_dtypes(["float", "int"]).columns)

    if COLSREMOVAL:
        data = data.drop(cols, 1, inplace=False)
        cat_var = list(data.select_dtypes(["object"]).columns)
        cont_var = list(data.select_dtypes(["float", "int"]).columns)


    if NAMEAN:
        for col in cont_var:
            data.loc[data[col].isnull(), col] = data[col].mean()

    if NA999:
        for col in cont_var:
            data.loc[data[col].isnull(), col] = -999

    if LECAT:
        for col in data[cat_var]: data[col] = pd.factorize(data[col])[0]

    if OH:
        cols2dummy = [col for col in data[cat_var] if len(data[col].unique()) <= maxCategories]
        colsNot2dummy = [col for col in data[cat_var] if len(data[col].unique()) > maxCategories]
        data = pd.get_dummies(data, dummy_na=True, columns=cols2dummy)

        #binning
        for col in colsNot2dummy:
            data[col] = pd.factorize(data[col])[0]
            dcb = DummycolumnsBins(cols=col, prefix=col, nb_bins=2000)
            dcb.fit(data)
            pd_binned = dcb.transform(data)
            data = pd.concat([data,pd_binned],1)
    if ONLYCONT:
        data = data[cont_var]

    if ONLYCAT:
        test_idx = data['ID']
        Y = data['target']
        data = data[cat_var]
        data['ID'] = test_idx
        data['target'] = Y

    if ONLYCATOH:
        test_idx = data['ID']
        Y = data['target']
        cols = list(set(data.columns).difference(set(cont_var))) ; print(cols)
        data = data[cols]
        data['ID'] = test_idx
        data['target'] = Y


    return data
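
A quick way to exercise the label-encoding path (LECAT) and the mean imputation (NAMEAN) above, on a toy frame with made-up column names:

import numpy as np
import pandas as pd

# Toy frame: one object (categorical) column plus numeric columns (illustrative names only).
toy = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'target': [0, 1, 0, 1],
    'cat_a': ['x', 'y', None, 'x'],
    'num_a': [1.0, np.nan, 3.0, 4.0],
})
encoded = model_data(toy, LECAT=True, NAMEAN=True)
# 'cat_a' is now an integer code column (missing value -> -1);
# the NaN in 'num_a' has been replaced by the column mean.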
Example #6
def featurize_videogames(filename):
	'''

    Input: A cleaned version of the original dataset 
    Output: A pickled version of a subset of the raw data including all features that will be trained on.
    
    '''

	df = pd.read_pickle(filename)

	Spore = df[df['asin'] =='B000FKBCX4']
	Spore['title_name'] = 'Spore'

	Sim_City = df[df['asin'] == 'B007VTVRFA']
	Sim_City['title_name'] = 'SimCity'

	Diablo_3 = df[df['asin'] == 'B00178630A']
	Diablo_3['title_name'] = 'Diablo_3'

	Starcraft_2 = df[df['asin'] == 'B000ZKA0J6']
	Starcraft_2['title_name'] = 'Starcraft_2'

	Sid_Meiers_Civilization_V = df[df['asin'] == 'B0038TT8QM']
	Sid_Meiers_Civilization_V['title_name'] = 'Sid_Meiers_Civilization_V'

	subset_data = pd.concat([Diablo_3,Spore,Starcraft_2,Sim_City,Sid_Meiers_Civilization_V])
	corpus = subset_data['reviewText']
	sentiment = map(text_stats.get_sentiment,corpus)

	subset_data['rating'] = subset_data['overall']
	subset_data['percent_helpful'] = subset_data['helpful'].apply(lambda x: x[0] / float(x[1])) 
	subset_data['percent_helpful'] = subset_data['percent_helpful'].apply(lambda x: round(x,2)) 
	subset_data['title'] = pd.factorize(subset_data.title_name)[0]
	subset_data['review_year'] = subset_data['reviewTime'].apply(lambda x : x.split(',')[1])
	subset_data['review_year'] = pd.factorize(subset_data.review_year)[0]
	subset_data['pros_cons'] = map(text_stats.pros_cons,corpus)
	subset_data['punctuation_count'] = map(text_stats.count_punc,corpus)
	subset_data['word_count'] = corpus.apply(lambda x: len(x.split()))
	subset_data['num_char'] = map(text_stats.count_chars,corpus)
	subset_data['upper_case_count'] = map(text_stats.count_upper,corpus)
	subset_data['url_count'] = map(text_stats.count_urls,corpus)

	# Grabs only the first item of the list returned by get_sentiment which is polarity
	subset_data['polarity'] = [n[0] for n in sentiment]
	# Grabs only the second item of the list returned by get_sentiment which is subjectivity
	subset_data['subjectivity'] = [n[1] for n in sentiment]

	# Saves subsetted data to pickle
	with open('subset_'+filename.split('.')[0].split('_' , 1)[1]+'.pkl', 'w') as f:
			pickle.dump(subset_data, f)
Example #7
def process_data(X, y):
    X = X.drop(41, 1)
    X[1], uniques = pandas.factorize(X[1])
    X[2], uniques = pandas.factorize(X[2])
    X[3], uniques = pandas.factorize(X[3])

    num_examples = 10**6
    X = X[0:num_examples]
    y = y[0:num_examples]

    X = numpy.array(X)
    y = numpy.array(y).ravel()

    return X, y
Example #8
def Load_data():
    train = pd.read_csv(path_train)
    test = pd.read_csv(path_test)

    # combine train and test
    data_comb = train.append(test)

    # Found at https://www.kaggle.com/marcellonegro/prudential-life-insurance-assessment/xgb-offset0501/run/137585/code
    # create any new variables    
    data_comb['Product_Info_2_char'] = data_comb.Product_Info_2.str[0]
    data_comb['Product_Info_2_num'] = data_comb.Product_Info_2.str[1]

    # factorize categorical variables
    data_comb['Product_Info_2'] = pd.factorize(data_comb['Product_Info_2'])[0]
    data_comb['Product_Info_2_char'] = pd.factorize(data_comb['Product_Info_2_char'])[0]
    data_comb['Product_Info_2_num'] = pd.factorize(data_comb['Product_Info_2_num'])[0]

    data_comb['BMI_Age'] = data_comb['BMI'] * data_comb['Ins_Age']

    med_keyword_columns = data_comb.columns[data_comb.columns.str.startswith('Medical_Keyword_')]
    data_comb['Med_Keywords_Count'] = data_comb[med_keyword_columns].sum(axis=1)

    print('Encode missing values')    
    data_comb.fillna(-1, inplace=True)

    # fix the dtype on the label column
    data_comb['Response'] = data_comb['Response'].astype(int)

    # split train and test
    train = data_comb[data_comb['Response']>0].copy()
    test = data_comb[data_comb['Response']<1].copy()

    target = train['Response'].values 
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(target) 

    train.drop(['Id', 'Response', 'Medical_History_10','Medical_History_24'], axis=1, inplace=True)
    test.drop(['Id', 'Response', 'Medical_History_10','Medical_History_24'], axis=1, inplace=True)
    train = train.as_matrix()
    test = test.as_matrix()

    print('Construct labels for bumping')
    num_class = len(np.unique(target))
    labels = np.zeros(shape=(train.shape[0],num_class-1))
    labels[:, 0][target==1]=1
    labels[:, 6][target<8]=1
    for i in range(1, num_class-2):
        labels[:, i][target<i+2]=1
    return train, test, target, labels   
Example #9
def processCabin():
    """ Generate features from the Cabin variable

    Cabin numbers, when present, contain a single (or space-delimited list) cabin number that is composed of
    a letter and number with no space or other character between. This is a sparse variable: < 30% is populated
    """
    global df
    # Replace missing values with "U0"
    df['Cabin'][df.Cabin.isnull()] = 'U0'

    # create feature for the alphabetical part of the cabin number
    df['CabinLetter'] = df['Cabin'].map( lambda x : getCabinLetter(x))
    df['CabinLetter'] = pd.factorize(df['CabinLetter'])[0]

    # create binary features for each cabin letters
    if keep_binary:
        cletters = pd.get_dummies(df['CabinLetter']).rename(columns=lambda x: 'CabinLetter_' + str(x))
        df = pd.concat([df, cletters], axis=1)

    # create feature for the numerical part of the cabin number
    df['CabinNumber'] = df['Cabin'].map( lambda x : getCabinNumber(x)).astype(int) + 1
    # scale the number to process as a continuous feature
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['CabinNumber_scaled'] = scaler.fit_transform(df['CabinNumber'])
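
processCabin depends on two helpers, getCabinLetter and getCabinNumber, that are not shown in this excerpt. A minimal sketch of what they might look like, assuming the usual Titanic-style cabin strings (e.g. 'C85', 'B57 B59 B63 B66', 'U0'):

import re

# Hypothetical helpers, not the original implementations.
def getCabinLetter(cabin):
    # alphabetical prefix of the (first) cabin code, 'U' (unknown) if none
    match = re.search('([A-Za-z]+)', cabin)
    return match.group(1)[0] if match else 'U'

def getCabinNumber(cabin):
    # numeric part of the (first) cabin code, 0 if there is none
    match = re.search('([0-9]+)', cabin)
    return int(match.group(1)) if match else 0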
Example #10
def dataTransform(filepath):
    import pandas as pd
    import numpy as np

    from sklearn.preprocessing import MinMaxScaler



    # Assumes filepath points to a CSV export of the dataset ("all" shadows the Python built-in).
    all = pd.read_csv(filepath)

    numeric_cols = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
    all[ numeric_cols ] = all[ numeric_cols ].astype(float)
    all[ numeric_cols ]=all[ numeric_cols ].apply(lambda x: MinMaxScaler().fit_transform(x))
    x_num_all = all[ numeric_cols ].as_matrix()

    # categorical
    x_cat_all = all.drop( numeric_cols + [ 'readmitted'], axis = 1 )


    fac_x_cat_all = pd.DataFrame()
    cat_cols = list(x_cat_all.columns.values)
    for col in cat_cols:
        all_cur, _ = pd.factorize(x_cat_all[col])
        fac_x_cat_all[col] = all_cur

    fac_x_cat_all = fac_x_cat_all.as_matrix()

    x_all = np.hstack(( x_num_all,  fac_x_cat_all))
    y_all = all.readmitted

    return x_all, y_all
Example #11
def processName():
    global df
    # how many different names do they have?
    df['Names'] = df['Name'].map(lambda x: len(re.split(' ', x)))

    # what is each person's title?
    df['Title'] = df['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])

    # group low-occurring, related titles together
    df['Title'][df.Title == 'Jonkheer'] = 'Master'
    df['Title'][df.Title.isin(['Ms','Mlle'])] = 'Miss'
    df['Title'][df.Title == 'Mme'] = 'Mrs'
    df['Title'][df.Title.isin(['Capt', 'Don', 'Major', 'Col', 'Sir'])] = 'Sir'
    df['Title'][df.Title.isin(['Dona', 'Lady', 'the Countess'])] = 'Lady'

    # Build binary features
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Title']).rename(columns=lambda x: 'Title_' + str(x))], axis=1)

    # process scaling
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Names_scaled'] = scaler.fit_transform(df['Names'])

    if keep_bins:
        df['Title_id'] = pd.factorize(df['Title'])[0]+1

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Title_id_scaled'] = scaler.fit_transform(df['Title_id'])
Example #12
def processAge():
    global df
    setMissingAges()

    # center the mean and scale to unit variance
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Age_scaled'] = scaler.fit_transform(df['Age'])

    # have a feature for children
    df['isChild'] = np.where(df.Age < 13, 1, 0)

    # bin into quartiles and create binary features
    df['Age_bin'] = pd.qcut(df['Age'], 4)
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Age_bin']).rename(columns=lambda x: 'Age_' + str(x))], axis=1)

    if keep_bins:
        df['Age_bin_id'] = pd.factorize(df['Age_bin'])[0]+1

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Age_bin_id_scaled'] = scaler.fit_transform(df['Age_bin_id'])

    if not keep_strings:
        df.drop('Age_bin', axis=1, inplace=True)
def polyFeatures(X, X_test, polyOrder=2, verbose=True):
    '''
        Given a set of matrices, we shall add a number of features,
        dependent upon the chosen polynomial order.
    '''

    X_all = np.vstack((X, X_test))

    def colStack( cols ):
        strs = map( lambda m: '-'.join(map(str, m)), zip( *(X_all[:,c] for c in cols)) )
        return strs
    
    if verbose: print 'polynomial order: ', polyOrder

    orders = map(list, combinations(range(np.shape(X)[1]), polyOrder))
    N      = len(orders)

    if verbose: print 'Number of orders: ', N

    allLists = []
    for i, cols in enumerate(orders):
        print i+1, 'of', N, cols 
        vals = colStack( cols )
        newCol, _ = pd.factorize(  vals  )
        allLists.append(newCol)

    allLists = np.array(allLists)
        
    return allLists.T
Example #14
def processFare():
    global df

    # replace missing values as the median fare. Currently the datasets only contain one missing Fare value
    df['Fare'][ np.isnan(df['Fare']) ] = df['Fare'].median()

    # zero values cause problems with our division interaction variables so set to 1/10th of the lowest fare
    df['Fare'][ np.where(df['Fare']==0)[0] ] = df['Fare'][ df['Fare'].nonzero()[0] ].min() / 10

    # bin into quartiles for binary features
    df['Fare_bin'] = pd.qcut(df['Fare'], 4)
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))], axis=1)

    if keep_bins:
        df['Fare_bin_id'] = pd.factorize(df['Fare_bin'])[0]+1

    # center and scale the fare to use as a continuous variable
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Fare_scaled'] = scaler.fit_transform(df['Fare'])

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Fare_bin_id_scaled'] = scaler.fit_transform(df['Fare_bin_id'])


    if not keep_strings:
        df.drop('Fare_bin', axis=1, inplace=True)
def cleanData(train, test):
    target = train['target']
    toDrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47']
    print 'Drop features:', toDrop
    trainDrop = ['ID', 'target']
    trainDrop.extend(toDrop)
    testDrop = ['ID']
    testDrop.extend(toDrop)
    train = train.drop(trainDrop, axis=1)
    test = test.drop(testDrop, axis=1) # test = test.drop(['ID','v22'], axis=1)
    
    for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(),test.iteritems()): # Iterator over (column name, Series) pairs
        if train_series.dtype == 'O':
            #for objects: factorize: to convert Object/String/Category to 0-based int value (index is -1 if None!!)
            #The pandas factorize function assigns each unique value in a series to a sequential, 0-based index, and calculates which index each series entry belongs to.
            train[train_name], tmp_indexer = pd.factorize(train[train_name])
            test[test_name] = tmp_indexer.get_indexer(test[test_name])
        else:
            #for int or float: fill NaN
            tmp_len = len(train[train_series.isnull()])
            if tmp_len>0:
                train.loc[train_series.isnull(), train_name] = train_series.median() #train_series.mean() #
            tmp_len = len(test[test_series.isnull()])
            if tmp_len>0:
                test.loc[test_series.isnull(), test_name] = train_series.median() #train_series.mean() #
    return train, target, test
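
The factorize/get_indexer pairing in cleanData is what keeps the train and test encodings consistent; a standalone sketch of the idea:

import pandas as pd

train_col = pd.Series(['red', 'blue', 'red', 'green'])
test_col = pd.Series(['blue', 'purple', 'green'])

codes, uniques = pd.factorize(train_col)    # codes: [0, 1, 0, 2]; uniques: Index(['red', 'blue', 'green'])
test_codes = uniques.get_indexer(test_col)  # [1, -1, 2] -- values unseen in train map to -1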
Example #16
def unique_value_groups(ar, sort=True):
    """Group an array by its unique values.

    Parameters
    ----------
    ar : array-like
        Input array. This will be flattened if it is not already 1-D.
    sort : boolean, optional
        Whether or not to sort unique values.

    Returns
    -------
    values : np.ndarray
        Sorted, unique values as returned by `np.unique`.
    indices : list of lists of int
        Each element provides the integer indices in `ar` with values given by
        the corresponding value in `unique_values`.
    """
    inverse, values = pd.factorize(ar, sort=sort)
    groups = [[] for _ in range(len(values))]
    for n, g in enumerate(inverse):
        if g >= 0:
            # pandas uses -1 to mark NaN, but doesn't include them in values
            groups[g].append(n)
    return values, groups
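
For example, a small check of the helper above:

import numpy as np

ar = np.array(['b', 'a', 'b', 'c', 'a'])
values, groups = unique_value_groups(ar)
# values -> array(['a', 'b', 'c'])
# groups -> [[1, 4], [0, 2], [3]]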
Example #17
def setMissingAges(df):
    
    # Grab all the features that can be included in a Random Forest Regressor
    age_df = df[['Age','Embarked','Fare', 'Parch', 'SibSp', 'Title','Pclass']]
    
    # Split into sets with known and unknown Age values
    knownAge = age_df.loc[ (df.Age.notnull()) ]
    unknownAge = age_df.loc[ (df.Age.isnull()) ]
    
    # All age values are stored in a target array
    y = knownAge.values[:, 0]
    
    # All the other values are stored in the feature array
    X = knownAge.values[:, 1::]
    
    # Create and fit a model
    rtr = RandomForestRegressor(n_estimators=20, n_jobs=-1)
    rtr.fit(X, y)
    
    # Use the fitted model to predict the missing values
    predictedAges = rtr.predict(unknownAge.values[:, 1::])
    
    # Assign those predictions to the full data set
    df.loc[ (df.Age.isnull()), 'AgeFill' ] = predictedAges.astype(int)
    k = df['Age'].notnull()
    df['Age'][k] = pd.factorize(pd.cut(df[df['Age'].notnull()]['Age'],8))[0]
    df['Age'][~k] = df['Age'].max()+1
    #all[u'Age_bin'] = pd.factorize(pd.qcut(all[u'Age'],8))[0]
    df['Fare'] = pd.factorize(pd.qcut(df['Fare'], 8))[0]
    
    return df
Example #18
def calc_MI_cate_feat_target(column, target, num_bins):

    vals, tmp_indexer = pd.factorize(column, na_sentinel=-1)

    p_neg = 0.238801
    p_pos = 0.761199

    max_cate = np.max(vals)
    densitys, bin_edges = np.histogram(vals, density=True)
    #print densitys


    #print 'start'
    final_mi = 0
    for level in xrange(-1, max_cate+1):
        p_cate_pos = np.sum((vals == level) & (target == 1)) / float(column.shape[0])
        p_cate_neg = np.sum((vals == level) & (target == 0)) / float(column.shape[0])
        p_cate = np.sum((vals == level)) / float(column.shape[0])
        if p_cate_pos == 0 or p_cate_neg == 0:
            continue
        final_mi += p_cate_pos * np.log2(p_cate_pos / (p_cate * p_pos))
        final_mi += p_cate_neg * np.log2(p_cate_neg / (p_cate * p_neg))
        #print '%d, %f' %(level, final_mi)

    return final_mi
    def factorize_data(self, x, cols, in_place=False):
        """Replace column in cols with one-hot representation of cols

        Parameters
        ----------
        x : np.ndarray
            Matrix with categorical data

        cols: tuple <int>
            Index of columns with categorical data

        Returns
        -------
        d : np.ndarray
            Matrix with categorical columns replaced by integer factor codes
        """

        if in_place:
            data = x
        else:
            data = np.copy(x)

        factors_labels = {}
        for col in cols:
            factors, labels = pd.factorize(data[:,col])
            factors_labels[col] = (factors, labels)
            data[:,col] = factors

        return data, factors_labels
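
The core of factorize_data is just column-wise integer encoding of an object array; a minimal standalone sketch:

import numpy as np
import pandas as pd

x = np.array([['a', 1.0], ['b', 2.0], ['a', 3.0]], dtype=object)
for col in (0,):                       # index of the categorical column
    codes, labels = pd.factorize(x[:, col])
    x[:, col] = codes                  # column 0 becomes [0, 1, 0]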
Example #20
def group_sums(x, group, use_bincount=True):
    """simple bincount version, again

    group : array, integer
        assumed to be consecutive integers

    no dtype checking because I want to raise in that case

    uses loop over columns of x

    for comparison, simple python loop
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2 and use_bincount:
        raise ValueError('not implemented yet')

    if use_bincount:

        # re-label groups or bincount takes too much memory
        if np.max(group) > 2 * x.shape[0]:
            group = pd.factorize(group)[0]

        return np.array([np.bincount(group, weights=x[:, col])
                         for col in range(x.shape[1])])
    else:
        uniques = np.unique(group)
        result = np.zeros([len(uniques)] + list(x.shape[1:]))
        for ii, cat in enumerate(uniques):
            result[ii] = x[group == cat].sum(0)
        return result
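
A quick check of group_sums on a tiny array (rows of the result correspond to columns of x, columns to group ids):

import numpy as np
import pandas as pd

x = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])
group = np.array([0, 0, 1])
group_sums(x, group)
# -> array([[ 3.,  3.],
#           [30., 30.]])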
Example #21
def processName(df,keep_binary=False,keep_bins=False,keep_scaled=False):
    """
    Parameters:
        keep_binary: include 'Title_Mr', 'Title_Mrs', ...
        keep_scaled && keep_bins: include 'Names_scaled', 'Title_id_scaled'
    Note: the string feature 'Name' can be deleted
    """
    # how many different names do they have? this feature 'Names'
    df['Names']=df['Name'].map(lambda x:len(re.split('\\(',x)))
    
    #what is each person's title? 
    df['Title']=df['Name'].map(lambda x:re.compile(", (.*?)\.").findall(x)[0])
    # group low-occurring, related titles together
    df['Title'][df.Title.isin(['Mr','Don','Major','Capt','Jonkheer','Rev','Col','Sir','Dona'])] = 'Mr'
    df['Title'][df.Title.isin(['Master'])] = 'Master'
    df['Title'][df.Title.isin(['Countess','Mme','Mrs','Lady','the Countess'])] = 'Mrs'
    df['Title'][df.Title.isin(['Mlle','Ms','Miss'])] = 'Miss'
    df['Title'][(df.Title.isin(['Dr']))&(df['Sex']=='male')]='Mr'
    df['Title'][(df.Title.isin(['Dr']))&(df['Sex']=='female')]='Mrs'
    df['Title'][df.Title.isnull()][df['Sex']=='male']='Master'
    df['Title'][df.Title.isnull()][df['Sex']=='female']='Miss'
    #build binary features
    if keep_binary:
        df=pd.concat([df,pd.get_dummies(df['Title']).rename(columns=lambda x:'Title_'+str(x))],axis=1)
    #process_scaled
    if keep_scaled:
        scaler=preprocessing.StandardScaler()
        df['Names_scaled']=scaler.fit_transform(df['Names'])
    if keep_bins:
        df['Title_id']=pd.factorize(df['Title'])[0]+1
    if keep_bins and keep_scaled:
        scaler=preprocessing.StandardScaler()
        df['Title_id_scaled']=scaler.fit_transform(df['Title_id'])
    del df['Name']
    return df
Example #22
    def test_factorize_empty(self, data):
        labels, uniques = pd.factorize(data[:0])
        expected_labels = np.array([], dtype=np.intp)
        expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

        tm.assert_numpy_array_equal(labels, expected_labels)
        self.assert_extension_array_equal(uniques, expected_uniques)
Example #23
  def __init__(self, data_file):
    self.dataFrame = pd.read_csv(data_file)
    for k in self.dataFrame.columns[1:]:
      self.dataFrame[k], _ = pd.factorize(self.dataFrame[k])

    self.classes = np.array(sorted(pd.Categorical(self.dataFrame['class']).categories))
    self.features = self.dataFrame.columns[self.dataFrame.columns != 'class']
Example #24
 def extract_puid_feats(self, df_data):    
     """
     Extract product uid feature.
     """
     df_feat = pd.DataFrame()    
     df_feat['p_uid'] = pd.factorize(df_data['product_uid'])[0]    
     saveit(df_feat, 'df_puid_feats')
Example #25
def factorise(df, na_sentinel):
    df = df.copy()  # Don't modify in place.
    for column_name in df.columns:
        factorised, uniques = pd.factorize(
            df[column_name], na_sentinel=na_sentinel)
        df[column_name] = factorised
    return df
def convert(data):
    number = preprocessing.LabelEncoder()
    for i in data.columns:
        if data[i].dtype == 'object':
            data[i] = pd.factorize(data[i])[0]
    data = data.fillna(-9999)
    return data
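
A small usage sketch (note that the LabelEncoder instance is created but never used; pd.factorize does the actual encoding):

import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({'city': ['NY', 'LA', 'NY'], 'size': [8.4, None, 8.4]})
df = convert(df)
# 'city' becomes integer codes [0, 1, 0]; the missing 'size' value becomes -9999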
Example #27
def joint_factorize(train, test, column_name):
    joint_factors = pd.factorize(list(train[column_name]) + list(test[column_name]))
    train_length = len(train[column_name])
    range_train = range(train_length)
    train[column_name] = joint_factors[0][range_train]
    range_test = range(train_length,len(joint_factors[0]))
    test[column_name] = joint_factors[0][range_test]
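
joint_factorize encodes a column over the concatenation of train and test, so the same category always receives the same integer code in both frames; for example:

import pandas as pd

train = pd.DataFrame({'color': ['red', 'blue', 'red']})
test = pd.DataFrame({'color': ['blue', 'green']})
joint_factorize(train, test, 'color')
# train['color'] -> [0, 1, 0]; test['color'] -> [1, 2]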
Example #28
    def printDecisionTreeForBelow3(self):
        from sklearn.ensemble import RandomForestClassifier
        import pandas as pd
        import numpy as np
        from sklearn.cross_validation import train_test_split
        data = self.dataManager.loadData(["QueryName","TimeStamp","Sid","Aid","Country","IsFirst","Browser","Os","Continent"],transformFields=True)


        features = data.columns[1:4]
        result = data.groupby('Aid').apply(
            lambda group: (group.Sid.nunique()<=3)
        )
        below = result[result == True]
        data['IsBelow'] = data['Aid'].isin(below.index)
        y, _ = pd.factorize(data['IsBelow'])
        X_train, X_test, y_train, y_test = train_test_split(data[features], y, test_size=0.33, random_state=42)
        from sklearn.metrics import accuracy_score
        from sklearn import svm
        #clf = svm.SVC();
        clf = RandomForestClassifier(n_jobs=2)
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        accuracy_score(y_test, y_pred)
        from sklearn import tree
        i_tree = 0
        for tree_in_forest in clf.estimators_:
            with open('tree_' + str(i_tree) + '.dot', 'w') as my_file:
                my_file = tree.export_graphviz(tree_in_forest, out_file = my_file)
            i_tree = i_tree + 1
Example #29
def runModel(X_train, Y_train, X_eval, var = "Ca", seed=42):
    #25% in Test Sample, 75% in train and cv
    X_train, Y_train, X_test, Y_test, X_train_location_labels = generateTrainTestSplit(X_train, Y_train, seed)
    
    
    labels = np.array(pd.factorize(X_train_location_labels)[0])
    
    #n_folds = 10
    n_folds = 10
    
    labels2 = (labels * (n_folds+1)/labels.max())
    labels2[-1]=n_folds
    
    #Model is Ridge Regression
    cv_iterator = LeaveOneLabelOut(labels2)
    #test = [0.00001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
    #modelCV = RidgeCV(test, normalize=True, score_func="mse", scoring="mse", cv=cv_iterator )
    #modelCV.fit(X_train, Y_train[var])
    
    if var == "Ca":
        prediction, test_score = CaModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator)
    elif var == "P":
        prediction, test_score = PModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator)
    elif var == "pH":
        prediction, test_score = pHModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator)
    elif var == "SOC":
        prediction, test_score = SOCModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator)
    elif var == "Sand":
        prediction, test_score = SandModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator)
    else:
        print "Error"

    
    return prediction, test_score
Example #30
def processTicket():
    """
    Generate features from the Ticket variable
    """
    global df

    df['TicketPrefix'] = df['Ticket'].map( lambda x : getTicketPrefix(x.upper()))
    df['TicketPrefix'] = df['TicketPrefix'].map( lambda x: re.sub('[\.?\/?]', '', x) )
    df['TicketPrefix'] = df['TicketPrefix'].map( lambda x: re.sub('STON', 'SOTON', x) )
    #print len(df['TicketPrefix'].unique()), "ticket codes:", np.sort(df['TicketPrefix'].unique())

    df['TicketPrefixId'] = pd.factorize(df['TicketPrefix'])[0]

    # create binary features for each cabin letters
    if keep_binary:
        prefixes = pd.get_dummies(df['TicketPrefix']).rename(columns=lambda x: 'TicketPrefix_' + str(x))
        df = pd.concat([df, prefixes], axis=1)

    df.drop(['TicketPrefix'], axis=1, inplace=True)

    df['TicketNumber'] = df['Ticket'].map( lambda x: getTicketNumber(x) )
    df['TicketNumberDigits'] = df['TicketNumber'].map( lambda x: len(x) ).astype(np.int)
    df['TicketNumberStart'] = df['TicketNumber'].map( lambda x: x[0:1] ).astype(np.int)
    #print np.sort(df.TicketNumberStart.unique())

    df['TicketNumber'] = df.TicketNumber.astype(np.int)
    #print np.sort(df['TicketNumber'])

    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['TicketNumber_scaled'] = scaler.fit_transform(df['TicketNumber'])
Example #31
#def data_plotter(data):

if __name__ == '__main__':
    with open(
            'C://Users//k_mathin//PycharmProjects//Masters//ciena_trials//Kamal//data//vodafone_data_clusters_filtered.pkl',
            'rb') as f:
        data_set = pickle.load(f)
    data = []
    for d in data_set['data']:
        data.append(d)
    data = np.asarray(data)
    #data = data[:,:15]
    print(data.shape[0])
    label_data = np.asarray(data_set['osid'])
    labels, levels = pd.factorize(label_data)
    shelves = np.asarray(data_set['shelf'])
    cluster_num = levels.shape[0]
    print(cluster_num)
    clusters = kshape(zscore(data, axis=1), cluster_num)
    #clusters = kshape(data,cluster_num)
    y_pred = []
    for i in range(0, data.shape[0]):
        for j in range(0, cluster_num):
            if i in clusters[j][1]:
                y_pred.append(j)
                continue
    conf = conf_mat(labels, y_pred)

    print(conf_mat(labels, y_pred))
    print("done")
# Fill missing values with the mean (numeric columns only)

# +
# Convert User_Score to a numeric column
df["User_Score"] = df["User_Score"].astype("float")

# Convert Year_of_Release to a string column
df["Year_of_Release"] = df["Year_of_Release"].astype("str")
# -

# Label encoding
cate_cols = df.select_dtypes(
    include=["object", "category", "bool"]).columns.to_list()
for col in cate_cols:
    df[col], uni = pd.factorize(df[col])

# +
train = df.iloc[:train.shape[0]]
test = df.iloc[train.shape[0]:].reset_index(drop=True)

# Target (sales) columns
sales_cols = [
    "NA_Sales",
    "EU_Sales",
    "JP_Sales",
    "Other_Sales",
    "Global_Sales",
]

train_drop_sales = train.drop(sales_cols, axis=1)
Example #33
2    58
dtype: int64
'''

categorize_map = {v: k for k, v in enumerate(cities.unique())}

# encoding the object as an enumerated type (categorical variable).
as_categories = cities.map(categorize_map)

print(as_categories.apply(sys.getsizeof))
'''
0    24
1    28
2    28
dtype: int64
'''
'''
Notice immediately that memory usage is just about cut in half compared to when the full strings are used with object dtype.

Note: I used sys.getsizeof() to show the memory occupied by each individual value in the Series. Keep in mind these are Python objects that have some overhead in the first place. (sys.getsizeof('') will return 49 bytes.)

There is also cities.memory_usage(), which sums up the memory usage and relies on the .nbytes attribute of the underlying NumPy array. Don't get too bogged down in these details: what is important is the relative memory usage that results from type conversion, as you'll see next.
'''
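
# A quick way to confirm the savings (illustrative; exact byte counts depend on the
# pandas version and platform):
print(cities.memory_usage(deep=True))         # object dtype: each string stored as a Python object
print(as_categories.memory_usage(deep=True))  # integer codes: a fixed 8 bytes per value
print(pd.factorize(cities)[0].nbytes)         # same idea, via the raw code array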

# Another way to do this same thing is with pandas' pd.factorize(cities):
# pandas.factorize encodes input values as an enumerated type or categorical variable

print(pd.factorize(cities)[0])
# [0 1 2]

# https://realpython.com/python-pandas-tricks/#reader-comments
dataset = df.copy()
dataset.pop('visit_date')
dataset.pop('min_visitors')
dataset.pop('median_visitors')
dataset.pop('max_visitors')
dataset.pop('count_observations')

# adding date to dataset
s = pd.Series(date)
df1 = pd.DataFrame({'date':s})
dataset=dataset.join(df1)

#adding store id
s = pd.Series(ids)
labels, levels = pd.factorize(s)
df1 = pd.DataFrame({'air_store_id':(labels)})
dataset.pop('air_store_id')
dataset=dataset.join(df1)


# In[6]:


train_dataset = dataset.sample(frac=0.8,random_state=0)
test_dataset = dataset.drop(train_dataset.index)


# In[7]:

def load_data(path_train=DATA_TRAIN_PATH,
              path_test=DATA_TEST_PATH,
              path_brand_phne=BRAND_PATH,
              events_path=EVENTS_PATH,
              app_events_path=APP_EVENTS_PATH,
              app_labels_path=APP_LABELS_PATH,
              labels_category_path=LABELS_CATEGORY):
    # Phone brand

    brand = pd.read_csv(path_brand_phne)
    brand.drop_duplicates('device_id', keep='first', inplace=True)
    brand['phone_brand'] = pd.factorize(brand['phone_brand'], sort=True)[0]
    # add new feature: the number of occurrences (popularity) of each phone brand
    dict_phone = dict(brand.phone_brand.value_counts())
    brand['phone_brand_occurence'] = brand.phone_brand.apply(dict_phone.get)

    brand['device_model'] = pd.factorize(brand['device_model'], sort=True)[0]
    # add new feature: the number of occurrences (popularity) of each device model
    dict_device = dict(brand.device_model.value_counts())
    brand['device_model_occurence'] = brand.device_model.apply(dict_device.get)

    train_loader = pd.read_csv(path_train)
    train = train_loader.drop(['age', 'gender'], axis=1)
    train['group'] = pd.factorize(train['group'], sort=True)[0]
    train = pd.merge(train, brand, how='left', on='device_id', left_index=True)
    gc.collect()

    #installed app
    appevents = pd.read_csv(app_events_path, header=0, nrows=10000)
    events = pd.read_csv(events_path, header=0)
    appencoder = LabelEncoder().fit(appevents.app_id)
    appevents['app'] = appencoder.transform(appevents.app_id)
    instaled_app = pd.merge(appevents,
                            events.loc[:, ['event_id', 'device_id']],
                            how='left',
                            on='event_id')
    #instaled_app.reset_index(inplace=True)
    gc.collect()
    #label data
    applabels = pd.read_csv(app_labels_path)
    labels_ctegory = pd.read_csv(labels_category_path)

    #factorize the labels category
    labels_ctegory.category = pd.factorize(labels_ctegory.category)[0]

    #merge the app labels and labels category
    applabels = pd.merge(applabels, labels_ctegory, on='label_id', how='left')
    gc.collect()

    #add new feature: the number of occurrences (popularity) of each label id
    dict_label = dict(applabels.label_id.value_counts())
    applabels['label-occurence'] = applabels.label_id.apply(dict_label.get)
    gc.collect()
    #select only those in application events
    applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())]

    #transforme the app_id with the encoder already defined
    applabels['app'] = appencoder.transform(applabels.app_id)

    # perform a one hot encoding for the label id
    labelencoder = LabelEncoder().fit(applabels.label_id)
    applabels['label'] = labelencoder.transform(applabels.label_id)
    gc.collect()
    #merge the installed_app and the applabels so by then to merge them with the train data
    label_features = pd.merge(
        instaled_app.loc[:, ['device_id', 'app']],
        applabels.loc[:, ['app', 'label', 'label-occurence', 'category']],
        on='app',
        how='left')
    #label_features.reset_index(inplace=True)

    # merge the train data with new set to mark For each device which apps it has installed

    #train = pd.merge(train , instaled_app , how = 'left',on='device_id')

    # merge the train data with label_features to mark For each device the label of the app used.

    train = pd.merge(train, label_features, how='left', on='device_id')
    train.fillna(-1, inplace=True)

    #add features:
    # 1 : #number of same app used per device:
    frame = pd.DataFrame(train.loc[:, ['device_id', 'app']].groupby(
        ['device_id', 'app']).size(),
                         columns=['nbr_same_app']).reset_index()
    gc.collect()
    train = pd.merge(train, frame, on=['device_id', 'app'], how='left')
    #rectify the 1 in the number of same app  for devices with no events
    train.loc[train.app == -1.0, 'nbr_same_app'] = -1

    # 2 : #number of same label used per device:
    frame = pd.DataFrame(train.loc[:, ['device_id', 'label']].groupby(
        ['device_id', 'label']).size(),
                         columns=['nbr_same_label']).reset_index()
    gc.collect()
    train = pd.merge(train, frame, on=['device_id', 'label'], how='left')
    #rectify the 1 in the number of same app  for devices with no events
    train.loc[train.app == -1.0, 'nbr_same_label'] = -1

    # 3 : #number of same category used per device:
    frame = pd.DataFrame(train.loc[:, ['device_id', 'category']].groupby(
        ['device_id', 'category']).size(),
                         columns=['nbr_same_category']).reset_index()
    gc.collect()
    train = pd.merge(train, frame, on=['device_id', 'category'], how='left')
    #rectify the 1 in the number of same app  for devices with no events
    train.loc[train.app == -1.0, 'nbr_same_category'] = -1

    #4 : number of occurence of each device
    dict_device = dict(train.loc[:, ['device_id', 'app']].groupby(
        ['device_id'])['app'].agg(np.size))
    train['device_occur'] = train.device_id.apply(dict_device.get)
    # 5 sum of labels

    dict_device = dict(train.loc[:, ['device_id', 'nbr_same_label']].groupby(
        ['device_id'])['nbr_same_label'].agg(np.sum))
    train['sum_of_labels'] = train.device_id.apply(dict_device.get)

    #6 sum of app
    dict_device = dict(train.loc[:, ['device_id', 'nbr_same_app']].groupby(
        ['device_id'])['nbr_same_app'].agg(np.sum))
    train['sum_of_app'] = train.device_id.apply(dict_device.get)

    #7 sum of category
    dict_device = dict(train.loc[:,
                                 ['device_id', 'nbr_same_category']].groupby([
                                     'device_id'
                                 ])['nbr_same_category'].agg(np.sum))
    train['sum_of_category'] = train.device_id.apply(dict_device.get)

    #done adding new features, drop the duplicated device_ids
    train.drop_duplicates('device_id', keep='first', inplace=True)
    train.reset_index(drop=True, inplace=True)
    train.loc[train.app != -1, ['app', 'label', 'category']] = 1
    train.drop([
        'label-occurence', 'nbr_same_app', 'nbr_same_label',
        'nbr_same_category'
    ],
               axis=1,
               inplace=True)

    # target
    target = train.group

    #drop the device id and target from the train data
    train.drop(['device_id', 'group'], axis=1, inplace=True)

    test_loader = pd.read_csv(path_test)
    test = pd.merge(test_loader,
                    brand,
                    how='left',
                    on='device_id',
                    left_index=True)
    # merge the test data with new set to mark For each device which apps it has installed

    #test = pd.merge(test , instaled_app , how = 'left',on='device_id')
    # merge the train data with label_features to mark For each device the label of the app used.
    test = pd.merge(test, label_features, how='left', on='device_id')

    test.fillna(-1, inplace=True)

    # same for the test set: add the same features (train and test are processed separately to keep memory usage down)
    #add features:
    # 1 : #number of times of the same app used per device:
    frame = pd.DataFrame(test.loc[:, ['device_id', 'app']].groupby(
        ['device_id', 'app']).size(),
                         columns=['nbr_same_app']).reset_index()
    gc.collect()
    test = pd.merge(test, frame, on=['device_id', 'app'], how='left')
    #rectify the 1 in the number of same app  for devices with no events
    test.loc[test.app == -1.0, 'nbr_same_app'] = -1

    # 2 : #number of times of the same label used per device:
    frame = pd.DataFrame(test.loc[:, ['device_id', 'label']].groupby(
        ['device_id', 'label']).size(),
                         columns=['nbr_same_label']).reset_index()
    gc.collect()
    test = pd.merge(test, frame, on=['device_id', 'label'], how='left')
    #rectify the 1 in the number of times of the same app  for devices with no events
    test.loc[test.app == -1.0, 'nbr_same_label'] = -1

    # 3 : #number of times of the same category used per device:
    frame = pd.DataFrame(test.loc[:, ['device_id', 'category']].groupby(
        ['device_id', 'category']).size(),
                         columns=['nbr_same_category']).reset_index()
    gc.collect()
    test = pd.merge(test, frame, on=['device_id', 'category'], how='left')
    #rectify the 1 in the number of times of the same app  for devices with no events
    test.loc[test.app == -1.0, 'nbr_same_category'] = -1

    #4 : number of occurence of each device
    dict_device = dict(test.groupby(['device_id'])['app'].agg(np.size))
    test['device_occur'] = test.device_id.apply(dict_device.get)

    # 5 sum of labels
    dict_device = dict(test.loc[:, ['device_id', 'nbr_same_label']].groupby(
        ['device_id'])['nbr_same_label'].agg(np.sum))
    test['sum_of_labels'] = test.device_id.apply(dict_device.get)

    #6 sum of app
    dict_device = dict(test.loc[:, ['device_id', 'nbr_same_app']].groupby(
        ['device_id'])['nbr_same_app'].agg(np.sum))
    test['sum_of_app'] = test.device_id.apply(dict_device.get)

    #7 sum of category
    dict_device = dict(test.loc[:, ['device_id', 'nbr_same_category']].groupby(
        ['device_id'])['nbr_same_category'].agg(np.sum))
    test['sum_of_category'] = test.device_id.apply(dict_device.get)

    test.drop_duplicates('device_id', keep='first', inplace=True)
    test.reset_index(drop=True, inplace=True)

    test.loc[test.app != -1, ['app', 'label', 'category']] = 1
    test.drop([
        'device_id', 'label-occurence', 'nbr_same_app', 'nbr_same_label',
        'nbr_same_category'
    ],
              axis=1,
              inplace=True)

    return train, test, target
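
The recurring pattern in load_data, factorizing a column and then attaching its value counts as a popularity feature, in isolation:

import pandas as pd

brand = pd.DataFrame({'phone_brand': ['samsung', 'huawei', 'samsung', 'xiaomi']})
brand['phone_brand'] = pd.factorize(brand['phone_brand'], sort=True)[0]
counts = dict(brand.phone_brand.value_counts())
brand['phone_brand_occurence'] = brand.phone_brand.apply(counts.get)
# the code for 'samsung' appears twice, so its occurrence feature is 2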
Example #36
def distance_matrix(data,
                    numeric_distance="euclidean",
                    categorical_distance="jaccard"):
    """ Compute the pairwise distance attribute by attribute in order to account for different variables type:
        - Continuous
        - Categorical
        For ordinal values, provide a numerical representation taking the order into account.
        Categorical variables are transformed into a set of binary ones.
        If both continuous and categorical distance are provided, a Gower-like distance is computed and the numeric
        variables are all normalized in the process.
        If there are missing values, the mean is computed for numerical attributes and the mode for categorical ones.
        
        Note: If weighted-hamming distance is chosen, the computation time increases a lot since it is not coded in C 
        like other distance metrics provided by scipy.
        @params:
            - data                  = pandas dataframe to compute distances on.
            - numeric_distances     = the metric to apply to continuous attributes.
                                      "euclidean" and "cityblock" available.
                                      Default = "euclidean"
            - categorical_distances = the metric to apply to binary attributes.
                                      "jaccard", "hamming", "weighted-hamming" and "euclidean"
                                      available. Default = "jaccard"
        @returns:
            - the distance matrix
    """
    possible_continuous_distances = ["euclidean", "cityblock"]
    possible_binary_distances = [
        "euclidean", "jaccard", "hamming", "weighted-hamming"
    ]
    number_of_variables = data.shape[1]
    number_of_observations = data.shape[0]

    # Get the type of each attribute (Numeric or categorical)
    is_numeric = [
        all(isinstance(n, numbers.Number) for n in data.iloc[:, i])
        for i, x in enumerate(data)
    ]
    is_all_numeric = sum(is_numeric) == len(is_numeric)
    is_all_categorical = sum(is_numeric) == 0
    is_mixed_type = not is_all_categorical and not is_all_numeric

    # Check the content of the distances parameter
    if numeric_distance not in possible_continuous_distances:
        print("The continuous distance " + numeric_distance +
              " is not supported.")
        return None
    elif categorical_distance not in possible_binary_distances:
        print("The binary distance " + categorical_distance +
              " is not supported.")
        return None

    # Separate the data frame into categorical and numeric attributes and normalize numeric data
    if is_mixed_type:
        number_of_numeric_var = sum(is_numeric)
        number_of_categorical_var = number_of_variables - number_of_numeric_var
        data_numeric = data.iloc[:, is_numeric]
        data_numeric = (data_numeric - data_numeric.mean()) / (
            data_numeric.max() - data_numeric.min())
        data_categorical = data.iloc[:, [not x for x in is_numeric]]

    # Replace missing values with column mean for numeric values and mode for categorical ones. With the mode, it
    # triggers a warning: "SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame"
    # but the value are properly replaced
    if is_mixed_type:
        data_numeric.fillna(data_numeric.mean(), inplace=True)
        for x in data_categorical:
            data_categorical[x].fillna(data_categorical[x].mode()[0],
                                       inplace=True)
    elif is_all_numeric:
        data.fillna(data.mean(), inplace=True)
    else:
        for x in data:
            data[x].fillna(data[x].mode()[0], inplace=True)

    # "Dummifies" categorical variables in place
    if not is_all_numeric and not (categorical_distance == 'hamming' or
                                   categorical_distance == 'weighted-hamming'):
        if is_mixed_type:
            data_categorical = pd.get_dummies(data_categorical)
        else:
            data = pd.get_dummies(data)
    elif not is_all_numeric and categorical_distance == 'hamming':
        if is_mixed_type:
            data_categorical = pd.DataFrame([
                pd.factorize(data_categorical[x])[0] for x in data_categorical
            ]).transpose()
        else:
            data = pd.DataFrame([pd.factorize(data[x])[0]
                                 for x in data]).transpose()

    if is_all_numeric:
        result_matrix = cdist(data, data, metric=numeric_distance)
    elif is_all_categorical:
        if categorical_distance == "weighted-hamming":
            result_matrix = weighted_hamming(data)
        else:
            result_matrix = cdist(data, data, metric=categorical_distance)
    else:
        result_numeric = cdist(data_numeric,
                               data_numeric,
                               metric=numeric_distance)
        if categorical_distance == "weighted-hamming":
            result_categorical = weighted_hamming(data_categorical)
        else:
            result_categorical = cdist(data_categorical,
                                       data_categorical,
                                       metric=categorical_distance)
        result_matrix = np.array([[
            1.0 * (result_numeric[i, j] * number_of_numeric_var +
                   result_categorical[i, j] * number_of_categorical_var) /
            number_of_variables for j in range(number_of_observations)
        ] for i in range(number_of_observations)])

    # Fill the diagonal with NaN values
    np.fill_diagonal(result_matrix, np.nan)

    return pd.DataFrame(result_matrix)
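
# Example call on a small mixed-type frame (a sketch; assumes the imports used by
# distance_matrix -- pandas as pd, numpy as np, numbers, and scipy's cdist -- are in scope):
demo = pd.DataFrame({'height': [1.70, 1.80, 1.65],
                     'city': ['NY', 'LA', 'NY']})
demo_dist = distance_matrix(demo, numeric_distance="euclidean", categorical_distance="jaccard")
# demo_dist is a 3x3 DataFrame of Gower-like distances with NaN on the diagonal
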
train_test.describe()

## Apply log transformation
for feats in skewed_feats:
    train_test[feats] = train_test[feats] + 1
    train_test[feats] = np.log(train_test[feats])
train_test.describe()

#Identify categorical features
features = train.columns
cats = [feat for feat in features if 'cat' in feat]
print(cats)

# factorize categorical features
for feat in cats:
    train_test[feat] = pd.factorize(train_test[feat], sort=True)[0]

train_test['cat100'][:10]

# Split back into test and train
x_train = train_test.iloc[:ntrain, :]
x_test = train_test.iloc[ntrain:, :]

train = x_train.copy()
test = x_test.copy()
train.shape
train.head()

## Transform target into np log of loss
train_labels = np.log(np.array(train_loader['loss']))
train_ids = train_loader['id'].values.astype(np.int32)
Example #38
def read_data(files,
              per_chromosome=False,
              chromosomes=[None],
              binarize=True,
              enable_collapse_strands=True,
              drop_ambiguous=True,
              outfile=None,
              verbose=True):
    ''' Reads all specified files and converts them to a sparse matrix in anndata format.

    Parameters:
    files: List of files to containing the individual cell assays.
    per_chromosome: boolean, whether to output an individual anndata file per_chromosome.
    Chromosomes are automatically inferred from the first file or can be specified in chromosomes.
    chromosomes: list of strings, which chromosomes to generate an output for.
    binarize: boolean, whether to return a binary methylation rate
    enable_collapse_strands: boolean, whether to sum reads from neighboring
    methylation sites
    drop_ambiguous: boolean, whether to drop sites with 0.5 methylation rate
    drop_rate_columns: boolean, whether columns met_reads and nonmet_reads
    should be removed
    outfile: str, Prefix name of output file
    verbose: boolean, whether verbose log output should be printed

    Returns:
    anndata.AnnData object with one row per cell and one column per methylation site (written to per-chromosome .h5ad files when per_chromosome is set)
    '''
    # Get cellnames from input files
    cellnames = [
        re.sub('\\.csv|\\.txt|\\.tsv|\\.gz', '', os.path.basename(file))
        for file in files
    ]

    # Find which chromosomes to read if per chromosome
    if per_chromosome:
        if chromosomes[0] is None:
            # Read first file to find which chromosomes exist
            chromosomes = pd.unique(
                pd.read_csv(files[0],
                            sep='\t',
                            usecols=[0],
                            skiprows=1,
                            header=None).iloc[:, 0])

    # Read Files and generate anndata formatted files
    for chromosome in chromosomes:
        allmet = read_cells(files, chromosome)
        allmet = [
            calculate_met_rate(met,
                               binarize=binarize,
                               enable_collapse_strands=enable_collapse_strands,
                               drop_ambiguous=drop_ambiguous) for met in allmet
        ]
        if verbose: print('Concatenating...')
        allmet = pd.concat(allmet, keys=cellnames, copy=False)
        allmet.sort_values(['chr', 'location'], inplace=True)
        allmet.met_rate = allmet.met_rate + 1

        print('Constructing reference index...')
        allmet['ind'] = make_genomic_index(allmet.location.values)
        rowcoord, rownames = pd.factorize(allmet.index.get_level_values(
            0))  # Numerical representation of the index objects
        print('Constructing sparse matrix...')
        obs = pd.DataFrame(index=rownames)
        var = allmet.drop_duplicates(subset='ind')[['chr', 'location'
                                                    ]].reset_index(drop=True)
        metmat = sparse.coo_matrix(
            (allmet.met_rate, (rowcoord, allmet['ind'])),
            dtype=np.int64,
            shape=(obs.shape[0], var.shape[0]))
        metmat = metmat.tocsc()
        print('Constructing anndata object...')
        a = anndata.AnnData(X=metmat, obs=obs, var=var, dtype=np.int32)
        if outfile is not None:
            print('Writing h5ad file')
            if chromosome is not None:
                chr_outfile = re.sub('$', '_chr_%s.h5ad' % chromosome,
                                     re.sub('.h5ad', '', outfile))
            else:
                chr_outfile = outfile
            a.write(chr_outfile)
            a.file.close()
        if not per_chromosome:
            return a
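
# The factorize-then-coo_matrix step at the heart of read_data, reduced to a toy example
# (illustrative column names only):
import numpy as np
import pandas as pd
from scipy import sparse

long_df = pd.DataFrame({'cell': ['c1', 'c1', 'c2', 'c3'],
                        'site': [0, 1, 1, 2],
                        'met_rate': [1, 2, 2, 1]})
rowcoord, rownames = pd.factorize(long_df['cell'])   # integer row id per cell
metmat = sparse.coo_matrix((long_df['met_rate'], (rowcoord, long_df['site'])),
                           shape=(len(rownames), long_df['site'].max() + 1)).tocsc()
# metmat is a 3x3 sparse matrix with one row per cell and one column per site
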
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from scipy.stats import pearsonr

#Read Allstate dataset
allstate = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
allstate.head()
test.head()

#Factorize categorical variables
feature_col = list(allstate.columns[1:-1])

cats = [name for name in feature_col if 'cat' in name]
for name in cats:
    allstate[name] = pd.factorize(allstate[name], sort=True)[0]
    test[name] = pd.factorize(test[name], sort=True)[0]

#Data Exploration
len(cats)  #Number of categorical variables
len(feature_col) - len(cats)  #Number of numerical variables
allstate.isnull().sum().sum()  #Check for missing value
allstate['loss'].describe()  #Range of target variable is huge

#EDA on loss
sns.distplot(allstate["loss"])  # The target variable should be scaled
plt.ylabel("Density")
sns.distplot(np.log(allstate["loss"]))  # The log of loss is close to normally distributed
plt.xlabel("Log(loss)")
plt.ylabel("Density")
def train_test(num_rows=None):
    print("Loading datasets...")
    # load datasets
    train_df = pd.read_csv('../input/train.tsv', sep='\t', nrows=num_rows)
    test_df = pd.read_csv('../input/test.tsv', sep='\t', nrows=num_rows)
    print("Train samples: {}, test samples: {}".format(len(train_df),
                                                       len(test_df)))

    # set the target (visitors) of the test rows to NaN
    test_df['visitors'] = np.nan

    # merge
    df = train_df.append(test_df[['datetime', 'park',
                                  'visitors']]).reset_index()

    del train_df, test_df
    gc.collect()

    # convert the date column to datetime
    df['datetime'] = pd.to_datetime(df['datetime'])

    # add Japanese public holiday data
    df['japanese_holiday'] = getJapaneseHolidays(df['datetime']).replace(2, 1)

    # generate a feature for the length of consecutive holidays
    holidays = df.groupby('datetime')['japanese_holiday'].mean().replace(2, 1)
    holidays = fillHolidays(holidays).replace(2, 1)  # treat weekdays sandwiched between holidays as holidays
    df['num_holidays'] = df['datetime'].map(getNumHolidays(holidays))

    # add seasonality features
    df['day'] = df['datetime'].dt.day.astype(object)
    df['month'] = df['datetime'].dt.month.astype(object)
    df['weekday'] = df['datetime'].dt.weekday.astype(object)
    df['weekofyear'] = df['datetime'].dt.weekofyear.astype(object)
    #    df['day_month'] = df['day'].astype(str)+'_'+df['month'].astype(str)
    #    df['day_weekday'] = df['day'].astype(str)+'_'+df['weekday'].astype(str)
    #    df['day_weekofyear'] = df['day'].astype(str)+'_'+df['weekofyear'].astype(str)
    df['month_weekday'] = df['month'].astype(str) + '_' + df['weekday'].astype(
        str)
    df['month_weekofyear'] = df['month'].astype(
        str) + '_' + df['weekofyear'].astype(str)
    #    df['weekday_weekofyear'] = df['weekday'].astype(str)+'_'+df['weekofyear'].astype(str)
    df['new_years_day'] = getNewYearsDay(df['datetime'])
    df['golden_week'] = getGoldenWeek(df['datetime'])

    #    df['park_day'] = df['park'].astype(str)+'_'+df['day'].astype(str)
    df['park_month'] = df['park'].astype(str) + '_' + df['month'].astype(str)
    df['park_weekday'] = df['park'].astype(str) + '_' + df['weekday'].astype(
        str)
    df['park_japanese_holiday'] = df['park'].astype(
        str) + '_' + df['japanese_holiday'].astype(str)
    #    df['park_weekofyear'] = df['park'].astype(str)+'_'+df['weekofyear'].astype(str)
    df['park_num_holiday'] = df['park'].astype(
        str) + '_' + df['num_holidays'].astype(str)
    df['park_new_years_day'] = df['park'].astype(
        str) + '_' + df['new_years_day'].astype(str)
    df['park_golden_week'] = df['park'].astype(
        str) + '_' + df['golden_week'].astype(str)

    # encode categorical variables
    df_res, cat_cols = one_hot_encoder(df, nan_as_category=False)

    # columns kept for stratification & merging
    df_res['park'] = df['park']
    df_res['weekofyear'] = df['weekofyear'].astype(int)
    df_res['weekday'] = df['weekday'].astype(int)
    df_res['year'] = df['datetime'].dt.year.astype(int)
    df_res['month'] = df['datetime'].dt.month.astype(int)
    df_res['park_month'], _ = pd.factorize(df['park_month'])
    df_res['park_japanese_holiday'], _ = pd.factorize(
        df['park_japanese_holiday'])
    #    df_res['ISESHIMA_summit'] = ((df['park']=='伊勢志摩国立公園')&df['japanese_holiday']&('2016-5-27'>df['datetime'])&(df['datetime']>'2015-6-5')).astype(int) # holiday flag for the period after the 2016 Ise-Shima Summit was announced

    return df_res
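
# The train_test() function above calls a one_hot_encoder helper that is not
# part of this excerpt; a minimal sketch of such a helper, assuming the common
# pd.get_dummies-based pattern (an assumption, not the original code):
def one_hot_encoder(df, nan_as_category=True):
    # one-hot encode every object-dtype column and report the added columns
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns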
Example #41
0
def scvi(
    adata: AnnData,
    n_hidden: int = 128,
    n_latent: int = 10,
    n_layers: int = 1,
    dispersion: str = "gene",
    n_epochs: int = 400,
    lr: float = 1e-3,
    train_size: float = 1.0,
    batch_key: Optional[str] = None,
    use_highly_variable_genes: bool = True,
    subset_genes: Optional[Sequence[Union[int, str]]] = None,
    linear_decoder: bool = False,
    copy: bool = False,
    use_cuda: bool = True,
    return_posterior: bool = True,
    trainer_kwargs: dict = {},
    model_kwargs: dict = {},
) -> Optional[AnnData]:
    """\
    SCVI [Lopez18]_.

    Fits scVI model onto raw count data given an anndata object

    scVI uses stochastic optimization and deep neural networks to aggregate information 
    across similar cells and genes and to approximate the distributions that underlie
    observed expression values, while accounting for batch effects and limited sensitivity.

    To use a linear-decoded Variational AutoEncoder model (implementation of [Svensson20]_.),
    set linear_decoder = True. Compared to a standard VAE, this model is less powerful, but can
    be used to inspect which genes contribute to variation in the dataset. It may also be used
    for all scVI tasks, like differential expression, batch correction, imputation, etc.
    However, batch correction may be less powerful as it assumes a linear model.

    .. note::
        More information and bug reports `here <https://github.com/YosefLab/scVI>`__.

    Parameters
    ----------
    adata
        An anndata file with `X` attribute of unnormalized count data
    n_hidden
        Number of nodes per hidden layer
    n_latent
        Dimensionality of the latent space
    n_layers
        Number of hidden layers used for encoder and decoder NNs
    dispersion
        One of the following
        * `'gene'` - dispersion parameter of NB is constant per gene across cells
        * `'gene-batch'` - dispersion can differ between different batches
        * `'gene-label'` - dispersion can differ between different labels
        * `'gene-cell'` - dispersion can differ for every gene in every cell
    n_epochs
        Number of epochs to train
    lr
        Learning rate
    train_size
        The train size, either a float between 0 and 1 or an integer for the number of training samples to use
    batch_key
        Column name in anndata.obs for batches. 
        If None, no batch correction is performed
        If not None, batch correction is performed per batch category
    use_highly_variable_genes
        If true, uses only the genes in anndata.var["highly_variable"]
    subset_genes
        Optional list of indices or gene names to subset anndata. 
        If not None, use_highly_variable_genes is ignored
    linear_decoder
        If true, uses LDVAE model, which is an implementation of [Svensson20]_.
    copy
        If true, a copy of anndata is returned
    return_posterior
        If true, posterior object is returned
    use_cuda
        If true, uses cuda
    trainer_kwargs
        Extra arguments for UnsupervisedTrainer
    model_kwargs
        Extra arguments for VAE or LDVAE model
    
    Returns
    -------
    If `copy` is true, anndata is returned.
    If `return_posterior` is true, the posterior object is returned
    If both `copy` and `return_posterior` are true, 
    a tuple of anndata and the posterior are returned in that order. 

    `adata.obsm['X_scvi']` stores the latent representations
    `adata.obsm['X_scvi_denoised']` stores the normalized mean of the negative binomial
    `adata.obsm['X_scvi_sample_rate']` stores the mean of the negative binomial
    
    If linear_decoder is true:
    `adata.uns['ldvae_loadings']` stores the per-gene weights in the linear decoder as a
    genes by n_latent matrix.

    """

    try:
        from scvi.models import VAE, LDVAE
        from scvi.inference import UnsupervisedTrainer
        from scvi.dataset import AnnDatasetFromAnnData
    except ImportError:
        raise ImportError(
            "Please install scvi package from https://github.com/YosefLab/scVI"
        )

    # check if observations are unnormalized using first 10
    # code from: https://github.com/theislab/dca/blob/89eee4ed01dd969b3d46e0c815382806fbfc2526/dca/io.py#L63-L69
    if len(adata) > 10:
        X_subset = adata.X[:10]
    else:
        X_subset = adata.X
    norm_error = (
        'Make sure that the dataset (adata.X) contains unnormalized count data.'
    )
    if sp.sparse.issparse(X_subset):
        assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error
    else:
        assert np.all(X_subset.astype(int) == X_subset), norm_error

    if subset_genes is not None:
        adata_subset = adata[:, subset_genes]
    elif use_highly_variable_genes and "highly_variable" in adata.var:
        adata_subset = adata[:, adata.var["highly_variable"]]
    else:
        adata_subset = adata

    if batch_key is not None:
        codes, uniques = pd.factorize(adata_subset.obs[batch_key])
        adata_subset.obs['_tmp_scvi_batch'] = codes
        n_batches = len(uniques)
    else:
        n_batches = 0

    dataset = AnnDatasetFromAnnData(adata_subset.copy(), batch_label='_tmp_scvi_batch')

    if linear_decoder:
        vae = LDVAE(
            n_input=dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers_encoder=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )

    else:
        vae = VAE(
            dataset.nb_genes,
            n_batch=n_batches,
            n_labels=dataset.n_labels,
            n_hidden=n_hidden,
            n_latent=n_latent,
            n_layers=n_layers,
            dispersion=dispersion,
            **model_kwargs,
        )

    trainer = UnsupervisedTrainer(
        model=vae,
        gene_dataset=dataset,
        use_cuda=use_cuda,
        train_size=train_size,
        **trainer_kwargs,
    )

    trainer.train(n_epochs=n_epochs, lr=lr)

    full = trainer.create_posterior(
        trainer.model, dataset, indices=np.arange(len(dataset))
    )
    latent, batch_indices, labels = full.sequential().get_latent()

    if copy:
        adata = adata.copy()

    adata.obsm['X_scvi'] = latent
    adata.obsm['X_scvi_denoised'] = full.sequential().get_sample_scale()
    adata.obsm['X_scvi_sample_rate'] = full.sequential().imputation()

    if linear_decoder:
        loadings = vae.get_loadings()
        df = pd.DataFrame(loadings, index=adata_subset.var_names)
        adata.uns['ldvae_loadings'] = df

    if copy and return_posterior:
        return adata, full
    elif copy:
        return adata
    elif return_posterior:
        return full
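
# Hypothetical usage of the wrapper above; the file name and the 'batch' column
# are illustrative assumptions, not taken from the original.
import scanpy as sc

adata = sc.read_h5ad('raw_counts.h5ad')
posterior = scvi(adata, n_latent=10, n_epochs=100, batch_key='batch')
sc.pp.neighbors(adata, use_rep='X_scvi')   # downstream analysis on the latent space
sc.tl.umap(adata)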
Example #42
0
import os
import pickle
from time import time

import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import randint as sp_randint
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

#===================================================================
# Load Data
os.chdir(os.getenv("HOME") + "/Desktop/Dropbox/CS229/Project")

d = pd.read_table("./data/svz_data.txt", sep=",")
d = d.drop("Unnamed: 0", axis=1)

y = pd.factorize(d.Age)[0]
X = d.iloc[:, 7:]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2, shuffle=True)

#===================================================================
# Basic rf on full dataset
classifier = RandomForestClassifier(n_estimators=200, n_jobs=3, random_state=0)
classifier.fit(X_train, y_train)
acc_train = classifier.score(X_train, y_train) # 0.72
acc_test = classifier.score(X_test, y_test) # 0.71
pred = classifier.predict(X_test)



#===================================================================
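# The imports above bring in GridSearchCV and sp_randint, but the excerpt stops
# before any hyperparameter search; a minimal sketch of what could follow, using
# RandomizedSearchCV because sp_randint supplies distributions (an assumption,
# not the original code).
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    "n_estimators": sp_randint(100, 500),
    "max_depth": sp_randint(3, 20),
    "min_samples_split": sp_randint(2, 11),
}

search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                            param_distributions=param_dist,
                            n_iter=20, cv=3, n_jobs=3, random_state=0)
start = time()
search.fit(X_train, y_train)
print("Search took %.1fs, best params: %s" % (time() - start, search.best_params_))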
Example #43
0
# importing packages
import pandas as pd
import numpy as np
# preprocessing the data
data = pd.read_csv('spam.csv', encoding='latin-1')
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
# encoding Data
data['label'] = pd.factorize(data['label'])[0]
# module for removing unwanted words
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# for stemming words
from nltk.stem.porter import PorterStemmer
temp = []
for row in data.itertuples():
    # to keep a - z letters and 0 - 9
    rev = re.sub("[^0-9a-zA-Z]", " ", row[2])
    rev = rev.lower()
    rev = rev.split()
    ps = PorterStemmer()
    rev = [
        ps.stem(word) for word in rev
        if not word in set(stopwords.words("english"))
    ]
    rev = " ".join(rev)
    temp.append(rev)
data['msg'] = temp
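
# The excerpt ends after building the stemmed corpus; a minimal sketch of the
# usual next step (bag-of-words plus a Naive Bayes classifier), not taken from
# the original snippet.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(data['msg']).toarray()
y = data['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = MultinomialNB()
clf.fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))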

import datetime as dt
from typing import List

import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.axes import Axes
from pandas import factorize


def casePlot(case_index: List[int],
             start: List[dt.date],
             end: List[dt.date],
             hue: List[str] = None,
             hue_lines: List[str] = None,
             hue_lines_colors: List[str] = None,
             na_values=dt.date.today(),
             ax: Axes = None):

    if ax is None:
        fig, ax = plt.subplots()

    if type(start) != list:
        start = list(start)
    if type(end) != list:
        end = list(end)
    if type(case_index) != list:
        case_index = list(case_index)

    # combine the dates and duplicate the case index for the scatter points;
    # the hlines further down use the original-length lists
    date = start + end
    case_index_points = case_index + case_index

    # Plot - get dots
    if hue is None:
        sns.scatterplot(x=date, y=case_index_points, color='blue', ax=ax)
    else:
        if type(hue) != list and hue is not None:
            hue = list(hue)

        hue = hue + hue
        sns.scatterplot(x=date, y=case_index_points, hue=hue, ax=ax)

    # Lines in plt
    isUnfinished = pd.isnull(end)
    end__ = [
        pd.Timestamp(na_values) if unfinished else e
        for e, unfinished in zip(end, isUnfinished)
    ]

    if hue_lines is None:
        plt.hlines(case_index, xmin=start, xmax=end__)

    # if hue_lines non-empty
    elif hue_lines is not None:

        if hue_lines_colors is None:
            hue_lines_colors = [
                "#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e",
                "#2ecc71"
            ]
        for i in range(10):
            hue_lines_colors = hue_lines_colors + hue_lines_colors
        cols = hue_lines_colors
        mapIntToCol = dict(enumerate(cols)).get
        hue_line_colors = list(map(mapIntToCol, factorize(hue_lines)[0]))
        hue_lines_unique = uniqueList(hue_lines)

        plt.hlines(case_index,
                   xmin=start,
                   xmax=end__,
                   colors=hue_line_colors,
                   zorder=1)

        # legend
        handles_lines = [
            mlines.Line2D([], [],
                          color=hue_lines_colors[i],
                          marker='_',
                          markersize=15,
                          label=label)
            for i, label in enumerate(hue_lines_unique)
        ]

        if hue is not None:
            ax.legend(handles=ax.legend_.legendHandles + handles_lines)
        else:
            ax.legend(handles=handles_lines)

    ax.set(xlabel='Year', ylabel='Case Index')
    plt.gca().invert_yaxis()
    plt.yticks(ticks=case_index, labels=case_index)
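
# Illustrative call with made-up data (not from the original); the case labels
# and dates below are purely hypothetical.
cases = [1, 2, 3]
starts = [pd.Timestamp('2019-01-10'), pd.Timestamp('2019-03-02'), pd.Timestamp('2019-06-15')]
ends = [pd.Timestamp('2019-05-01'), pd.Timestamp('2019-08-30'), pd.Timestamp('2019-12-20')]

casePlot(cases, starts, ends, hue=['appeal', 'first instance', 'appeal'])
plt.show()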
Example #45
0
def convert_data(df):
    df.loc[df['Sex'] == 'male', 'Sex'] = 1
    df.loc[df['Sex'] == 'female', 'Sex'] = 0
    df['fare_bin'] = pd.qcut(df.Fare, 4)
    df['fare_id'] = pd.factorize(df.fare_bin)[0] + 1
    df['embarked_id'] = pd.factorize(df.Embarked)[0] + 1
def evaluate(predictor, args):
    
    train_dir = args.train_dir
    train_file = args.filename
    test_file = train_file.replace('train', 'test', 1)
    target = args.target
    training_job_name = args.training_job_name
    s3_output = args.s3_output
    presets = args.presets 

    dataset_name = train_file.split('_')[0]
    logging.info(dataset_name)
    
    test_data = TabularDataset(os.path.join(train_dir, test_file))   
    
    u = urlparse(s3_output, allow_fragments=False)
    bucket = u.netloc
    logging.info(bucket)
    prefix = u.path.strip('/')
    logging.info(prefix)
    s3 = boto3.client('s3')
    
    y_test = test_data[target]
    test_data_nolab = test_data.drop(labels=[target], axis=1)
    
    y_pred = predictor.predict(test_data_nolab)
    y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred})
    pred_file = f'{dataset_name}_test_predictions.csv'
    y_pred_df.to_csv(pred_file, index=False, header=True)

    leaderboard = predictor.leaderboard()
    lead_file = f'{dataset_name}_leaderboard.csv'
    leaderboard.to_csv(lead_file)
    
    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
    #del perf['confusion_matrix']
    perf_file = f'{dataset_name}_model_performance.txt'
    with open(perf_file, 'w') as f:
        print(json.dumps(perf, indent=4, default=pd.DataFrame.to_json), file=f)

    summary = predictor.fit_summary()
    summ_file = f'{dataset_name}_fit_summary.txt'
    with open(summ_file, 'w') as f:
        print(summary, file=f)
    
    y_prob = predictor.predict_proba(test_data_nolab)
    y_prob = y_prob.iloc[:,-1]    
    
    y_test_enc, uniques = pd.factorize(y_test)  # Label Encoding  
            
    fig = plt.figure(figsize=(14,4))
    plt.subplot(1,3,1)
    plot_roc_curve(y_test_enc, y_prob)
    plt.subplot(1,3,2)    
    plot_pr_curve(y_test_enc, y_prob)
    plt.subplot(1,3,3)    
    plot_conf_mtx(y_test_enc, y_prob, 0.5) 
    eval_file = f'{dataset_name}_eval.png'
    plt.savefig(eval_file)
    plt.close(fig)

#     # Feature importance
#     featimp = predictor.feature_importance(test_data)
#     fig, ax = plt.subplots(figsize=(12,5))
#     plot = sns.barplot(x=featimp.index, y=featimp.values)
#     ax.set_title('Feature Importance')
#     plot.set_xticklabels(plot.get_xticklabels(), rotation='vertical')
#     featimp_imgfile = f'{dataset_name}_featimp.png'
#     featimp_csvfile = f'{dataset_name}_featimp.csv'
#     fig.savefig(featimp_imgfile)
#     featimp.to_csv(featimp_csvfile)
#     plt.close(fig)        
        
    # Cleanup data in order to avoid disk space issues
    predictor.save_space()
    predictor.delete_models(models_to_keep='best', dry_run=False)

    files_to_upload = [pred_file, lead_file, perf_file, summ_file, eval_file]
    for file in files_to_upload:
        s3.upload_file(file, bucket, os.path.join(prefix, training_job_name.replace('mxnet-training', 'autogluon', 1), file))   
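
# evaluate() relies on plot_roc_curve, plot_pr_curve and plot_conf_mtx helpers
# that are not part of this excerpt; a minimal sketch of what plot_roc_curve
# could look like, built on sklearn.metrics (an assumption, not the original).
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(y_true, y_prob):
    # plot the ROC curve for binary labels and predicted probabilities on the current axes
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    plt.plot(fpr, tpr, label='AUC = %.3f' % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc='lower right')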
Example #47
0
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
print(len(df))

train, test = df[df['is_train'] == True], df[df['is_train'] == False]

features = df.columns[:4]
#print(train[features].values.tolist())
#print(test.loc[2])#[4.9,3,1.4,0.2]
# model
clf = RandomForestClassifier(n_estimators=100, n_jobs=2)
y, _ = pd.factorize(train['species'])
#print(pd.factorize(train['species']))
#y2, _ = pd.factorize(test['species'])
# train the model
clf.fit(train[features], y)

aa = clf.predict([[6.5, 3.0, 5.2, 2.0]])
print(aa)

preds = iris.target_names[clf.predict(test[features])]
#print(preds)
y2 = pd.crosstab(test['species'],
                 preds,
                 rownames=['actual'],
                 colnames=['preds'])
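
# A quick hold-out accuracy check could follow the crosstab above (not part of
# the original snippet); it assumes preds and test are as defined earlier.
print(y2)
print('hold-out accuracy:', (preds == test['species'].astype(str)).mean())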
Example #48
0
import pandas as pd
from sklearn.metrics import f1_score
#extract accident_id, 1st_road_class, speed_limit, road_surface_conditions, target
accident_columns = [
    "accident_id", "1st_road_class", "speed_limit", "road_surface_conditions",
    "time"
]
vehicle_columns = ["accident_id", "Vehicle_Type", "Age_of_Driver"
                   ]  #note, age of driver isn't always available

accidents_train = pd.read_csv("data/train.csv",
                              usecols=accident_columns + ["target"])
accidents_unseen = pd.read_csv("data/test.csv", usecols=accident_columns)
vehicles = pd.read_csv("data/vehicles.csv", usecols=vehicle_columns)

#split features and target
target = pd.factorize(accidents_train["target"])[0]
train = accidents_train.drop(axis=1, labels=["target"])

#---------------------preprocessing--------------------------#

#join and drop empty values (driver age)
print(accidents_unseen.shape)
train = train.merge(vehicles,
                    on='accident_id',
                    suffixes=('_accidents', '_vehicles')).fillna(999)
unseen = accidents_unseen.merge(vehicles,
                                on='accident_id',
                                suffixes=('_accidents',
                                          '_vehicles')).fillna(999)
print(train.shape)
#create 24 hour buckets
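# The excerpt stops at the bucketing comment above; a minimal sketch, assuming
# the 'time' column holds 'HH:MM' strings (an assumption, not confirmed here).
train['hour'] = pd.to_datetime(train['time'], format='%H:%M').dt.hour
unseen['hour'] = pd.to_datetime(unseen['time'], format='%H:%M').dt.hour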
def prepare_data(df1):
    df=df1.copy()
    df['DOB1']=df['DOB'].map(lambda x: datetime(1900+int(x[6:]),int(x[3:5]),int(x[:2])) if len(str(x))>=8 else datetime(1997,1,1))
    df['Lead_Creation_Date1']=df['Lead_Creation_Date'].map(lambda x: datetime(2000+int(x[6:]),int(x[3:5]),int(x[:2])) if len(str(x))>=8 else datetime(1997,1,1))
    # for i in range(12):
        # df['FEAT_ID_%d'%i],_=pd.factorize(df.index.map(lambda x: x[3+i]))
    df['FEAT_GENDER'],_=pd.factorize(df['Gender'])
    df['FEAT_DOB_YEAR'],_=pd.factorize(df['DOB1'].dt.year)
    df['FEAT_DOB_MONTH'],_=pd.factorize(df['DOB1'].dt.month)
    df['FEAT_LCD_MONTH'],_=pd.factorize(df['Lead_Creation_Date1'].dt.month)
    df['FEAT_LCD_WEEK'],_=pd.factorize(df['Lead_Creation_Date1'].dt.week)
    df['FEAT_LCD_DOW'],_=pd.factorize(df['Lead_Creation_Date1'].dt.dayofweek)
    df['FEAT_DOB_MONTH'],_=pd.factorize(df['DOB1'].dt.month)
    df['FEAT_DOB_LCD_MONTH_DIFF'],_=pd.factorize(df['Lead_Creation_Date1'].dt.month-df['DOB1'].dt.month)
    df['FEAT_DOB_LCD_WEEK_DIFF'],_=pd.factorize(np.clip(df.Lead_Creation_Date1.dt.week-df.DOB1.dt.week,-10,10))
    df['FEAT_DOB_LCD_DAY_DIFF'],_=pd.factorize(np.clip(df.Lead_Creation_Date1.dt.dayofyear-df.DOB1.dt.dayofyear,-10,10))

    for i in range(6):
        df['FEAT_CITY1_%d'%i],_=pd.factorize(df['City_Code'].map(lambda x: str(x)[:i+1]))
    df['FEAT_CITY2'],_=pd.factorize(df['City_Category'])
    for i in range(4):
        df['FEAT_EMP1_%d'%i],_=pd.factorize(df['Employer_Code'].map(lambda x: str(x)[:-i-1]))
    df['FEAT_EMP2'],_=pd.factorize(df['Employer_Category1'])
    df['FEAT_EMP3'],_=pd.factorize(df['Employer_Category2'])
    df['FEAT_INCOME1']=np.clip(np.log1p(df['Monthly_Income']),0,10).astype(np.int64)
    df['CONT_FEAT_INCOME1']=np.clip(df['Monthly_Income'].fillna(-99999),-99999,9500)
    df['FEAT_INCOME2']=df['Monthly_Income'].astype(np.int64)%10
    df['FEAT_BANK1'],_=pd.factorize(df['Customer_Existing_Primary_Bank_Code'])
    df['FEAT_BANK2'],_=pd.factorize(df['Customer_Existing_Primary_Bank_Code'].map(lambda x: str(x)[:-1]))
    df['FEAT_BANK3'],_=pd.factorize(df['Primary_Bank_Type'])
    df['FEAT_CONTACT1'],_=pd.factorize(df['Contacted'])
    df['FEAT_CONTACT2'],_=pd.factorize(df['Source'])
    df['FEAT_CONTACT3'],_=pd.factorize(df['Source'].map(lambda x: x[:3]))
    df['FEAT_CONTACT4'],_=pd.factorize(df['Source'].map(lambda x: x[3]))
    df['FEAT_CONTACT5'],_=pd.factorize(df['Source_Category'])
    df['FEAT_EMI1'],_=pd.factorize(df['Existing_EMI']==0)
    df['FEAT_EMI2'],_=pd.factorize(pd.cut(df['Existing_EMI'].fillna(-99999),[-99999,0,100,300,600,1200,2400,3600,10000000]))
    df['FEAT_EMI3'],_=pd.factorize(df['EMI']==0)
    df['FEAT_EMI4'],_=pd.factorize(pd.cut(df['EMI'].fillna(-99999),[-99999,0,100,300,600,1200,2400,3600,10000000]))
    df['FEAT_LOAN1'],_=pd.factorize(df['Loan_Amount'].isnull())
    df['FEAT_LOAN2'],_=pd.factorize(pd.cut(df['Loan_Amount'].fillna(-1),[-100,-1,0,5000,10000,20000,30000,50000,100000,10000000]))
    df['CONT_FEAT_LOAN1']=np.clip(df['Loan_Amount'].fillna(-99999),-99999,100000)
    df['CONT_FEAT_LOAN2']=df[['Loan_Amount','Loan_Period','Existing_EMI']].apply(lambda x: x[0]/(x[1]*x[2]) if x[1]>0 and x[2]>0 else -99999,axis=1).fillna(-99999)
    df['CONT_FEAT_LOAN3']=df['Interest_Rate'].fillna(-99999)
    df['CONT_FEAT_EMI1']=df['Existing_EMI'].fillna(-99999)
    df['CONT_FEAT_EMI2']=df['EMI'].fillna(-99999)
    df['FEAT_LOAN3'],_=pd.factorize(df['Loan_Period'].fillna(-1))
    df['FEAT_LOAN4'],_=pd.factorize((df['Interest_Rate'].fillna(-10)/5).astype(np.int64))
    df['FEAT_VAR1'],_=pd.factorize(df['Var1'])
    for col in [x for x in df.columns if x.startswith('FEAT')]:
        a=df[col].value_counts()
        df[col.replace('FEAT','COUNT')]=df[col].map(a)
        if a.min()<=100:
            a=a[a>100]
            a1=df[col].astype('category')
            a2=df[col].map(a).fillna(-99999).astype('category')
            df[col]=a1
            df[col+'_CLEAN']=a2
        else:
            df[col]=df[col].astype('category')
    return df[[x for x in df.columns if x[:4] in ('FEAT','Appr','CONT','COUN')]]
Example #50
0
        text_data_file = open(file_name, "rb")
        category_text_data = pickle.load(text_data_file)

        #print("Control point 4")
        #print(process.memory_info()[0])

        category_merged_df = pd.merge(merged_df,
                                      category_text_data,
                                      how="inner",
                                      on=["id"])

        #print("Control point 5")
        #print(process.memory_info()[0])

        category_merged_df['teacher_prefix_code'] = pd.factorize(
            category_merged_df['teacher_prefix'])[0]
        category_merged_df['project_grade_category_code'] = pd.factorize(
            category_merged_df['project_grade_category'])[0]
        category_merged_df['school_state_code'] = pd.factorize(
            category_merged_df['school_state'])[0]

        #print("Control point 6")
        #print(process.memory_info()[0])

        columns_list = [
            "teacher_prefix_code", "project_grade_category_code", "month",
            "quarter", "teacher_number_of_previously_posted_projects",
            "total_price", "school_state_code"
        ]

        if True:  #include_text_data:
Example #51
0
 

y = data['No_Val_Available_8'].values.T
# y = data['No_Val_Available_8'].values.reshape([1, -1]).T

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, shuffle=True, random_state=1)

plt.scatter(X_train['DOM:'], X_train['Original:'], c=pd.factorize(y_train)[0], cmap=plt.cm.coolwarm)
plt.title("Test data community values vs price")
plt.show()

# Import a support vector classifier
from sklearn.svm import LinearSVC

# Instantiate this model
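# The snippet ends at the comment above; a minimal continuation sketch (not the
# original model code), assuming X holds only numeric feature columns. The
# labels are factorized once before splitting so that train and test codes agree.
codes, classes = pd.factorize(y)
X_tr, X_te, y_tr, y_te = train_test_split(X, codes, train_size=.8, shuffle=True, random_state=1)

svc = LinearSVC(max_iter=10000)
svc.fit(X_tr, y_tr)
print('test accuracy:', svc.score(X_te, y_te))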
Example #52
0
            c_select = select.copy()
            c_select[j] = i
            X = x[c_select].values
            Y = circleradviz(X, anchor)
            score = metrics.silhouette_score(Y, y, metric='sqeuclidean')
            if (best_score < score):
                best_score = score
                best_select = c_select
    return (best_score, best_select)


classes = np.unique(y)
colors = matplotlib.pyplot.cm.rainbow(np.linspace(0, 1, len(classes)))
cm = matplotlib.colors.ListedColormap(colors)
col = x.columns.values
cy = pd.factorize(y)[0]

#number of genes
m = 6
(score, select) = GeneSelection(m, x, y)

X = x[select].values
anchor = DimensionAnchor(m)

Y = circleradviz(X, anchor)
best_score = metrics.silhouette_score(Y, y, metric='sqeuclidean')
print('Score: ', best_score)
plt.figure(figsize=(6, 6))
cy = pd.factorize(y)[0]

t = np.linspace(0, 2 * np.pi, 100)
Example #53
0
import os
import wave

import pandas as pd

length = []


# load a wave data
def load_wave_data(audio_dir, file_name):
    file_path = os.path.join(audio_dir, file_name)
    # x, fs = librosa.load(file_path, sr=16000)
    wf = wave.open(audio_dir + "/" + file_name + ".wav", "r")
    length.append(float(wf.getnframes()) / wf.getframerate())
    r = float(wf.getnframes()) / wf.getframerate()
    return r


meta_data = pd.read_table("sample_submit.tsv")
labels, uniques = pd.factorize(meta_data['target'])
meta_data['target'] = labels
print(meta_data)

data_size = meta_data.shape
# arrange target label and its name
class_dict = meta_data["target"].unique()
# print(class_dict)

# get training dataset and target dataset
x = list(meta_data.loc[:, "fileName"])
y = list(meta_data.loc[:, "target"])

for i in range(len(y)):
    wav_len = load_wave_data("test", x[i])
    with open("testLength.csv", "w", newline="") as f:
Example #54
0
def prepare_data(path, config):

    thresh = config.get('Evaluation', 'FilterThresh')
    data = pd.read_csv(path + '/abundance.tsv',
                       index_col=0,
                       sep='\t',
                       header=None)
    labels = np.genfromtxt(path + '/labels.txt', dtype=np.str_, delimiter=',')
    core_filt_thresh = float(thresh)
    opp_filt_thresh = 0.0

    data = data.transpose()

    sums = data.sum(axis=1)
    data = data.divide(sums, axis=0)
    labels, label_set = pd.factorize(labels)

    pos_set = data.iloc[np.where(labels == 1)]
    neg_set = data.iloc[np.where(labels == 0)]

    core = filter_data(data, labels, core_filt_thresh, opp_filt_thresh)

    data = core

    features = list(data.columns.values)
    print("There are %d raw features..." % (len(features)))
    features_df = get_feature_df(features)

    print("Building tree structure...")
    try:
        g = pickle.load(
            open(path + "/PopPhy-tree-" + str(core_filt_thresh) + "-core.pkl",
                 'rb'))
        print("Found tree file...")
    except:
        print("Tree file not found...")
        print("Contsructing tree..")
        g = Graph()
        g.build_graph()
        g.prune_graph(features_df)
        pickle.dump(
            g,
            open(path + "/PopPhy-tree-" + str(core_filt_thresh) + "-core.pkl",
                 'wb'))

    print("Populating trees...")
    results = Parallel(n_jobs=num_cores)(
        delayed(generate_maps)(x, g, features_df) for x in data.values)
    my_maps = np.array(np.take(results, 1, 1).tolist())
    counts = np.count_nonzero(my_maps, axis=0)

    my_benchmark = np.array(np.take(results, 0, 1).tolist())
    my_benchmark_tree = np.array(np.take(results, 2, 1).tolist())

    tree_features = g.graph_vector_features()

    my_benchmark_df = pd.DataFrame(index=tree_features,
                                   data=np.transpose(my_benchmark_tree))
    my_benchmark_df = my_benchmark_df.groupby(my_benchmark_df.index).mean()

    tree_features = my_benchmark_df.index
    my_benchmark_tree = np.transpose(my_benchmark_df.values)

    num_tree_features = len(tree_features)
    print("There are %d tree features..." % (num_tree_features))
    return my_maps, my_benchmark, my_benchmark_tree, features, tree_features, labels, label_set, g, features_df
    
    data = [trace2,trace1]
    fig = go.Figure(data=data,layout=layout)
    py.iplot(fig)

# Plot
plot_radar(dat_rad,1,"Churn -  Customers")
plot_radar(dat_rad,0,"Non Churn - Customers")




###### Correlation matrix 
plt.figure(figsize=(12, 6))
telcom.drop(['customerID'],axis=1, inplace=True)
corr = telcom.apply(lambda x: pd.factorize(x)[0]).corr()
ax = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, 
                 linewidths=.2, cmap="YlGnBu")


# Other Correlation matrix 
correlation = telcom.corr()
matrix_cols = correlation.columns.tolist() #tick labels
corr_array  = np.array(correlation) #convert to array

#Plotting
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
Example #56
0
import pandas as pd
import numpy as np

data = pd.read_csv("final.csv",encoding = "ISO-8859-1")

data.head()

data = data.drop(['Id', 'Company', 'Educational details', 'jobdescription', 'jobid', 'numberofpositions', 'payrate'], axis=1)

feature_cols = ['Education', 'Experience', 'jobtitle', 'loc_1']
X = data[feature_cols] 
label = ['industry']
y = data[label]

data['loc_1']=pd.factorize(data.loc_1)[0]
data['industry']=pd.factorize(data.industry)[0]
data['Education']=pd.factorize(data.Education)[0]
data['jobtitle']=pd.factorize(data.jobtitle)[0]

data.head()

X = data.iloc[:, 0:4].values
y = data.iloc[:, 4].values

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
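# The excerpt ends at the StandardScaler import; a minimal continuation sketch
# (not the original model code), assuming the remaining feature columns are
# numeric after the factorize calls above. The RandomForestClassifier here is a
# hypothetical choice of model.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=200, random_state=0)
clf.fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))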
Example #57
0
def hash_array(vals, encoding="utf8", hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals
    """

    if not hasattr(vals, "dtype"):
        raise TypeError("must pass a ndarray-like")
    dtype = vals.dtype

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        return _hash_categorical(vals, encoding, hash_key)
    elif is_extension_array_dtype(dtype):
        vals, _ = vals._values_for_factorize()
        dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    elif isinstance(dtype, bool):
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view("u{}".format(vals.dtype.itemsize)).astype("u8")
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import factorize, Categorical, Index

            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes,
                              Index(categories),
                              ordered=False,
                              fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hashing.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hashing.hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals
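
# Small usage check via the public wrapper pandas.util.hash_array; the exact
# hash values depend on the pandas version, but they are deterministic and the
# categorize flag only changes the code path, not the result.
import numpy as np
import pandas as pd

vals = np.array(['a', 'b', 'a'], dtype=object)
h1 = pd.util.hash_array(vals, categorize=True)
h2 = pd.util.hash_array(vals, categorize=False)
print(h1.dtype)            # uint64
print((h1 == h2).all())    # True: same hashes either way
print(h1[0] == h1[2])      # True: equal values hash equally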
Example #58
0
def MICAProcess(data, state_column, current_column, max_states, batch_size,
                window_size, sampleRate, stateDict, expected_input_shape,
                expected_output_shape):
    """
        Function that turns data into a minmax scaled, batched dataset ready for input into a deep learning model, with
        options for train test splitting. Also includes "max state consideration" - adds a number of "empty" states to
        one hot encoding to allow for consistent model input size across different state models
    """

    # One hot encoding for state column
    # Temp fix for missing states

    num_states = max_states
    # AAAHHH HORRIBLE CODE!!!!!

    #    if 'Viterbi' in data.columns:
    #        if type(data[state_column][0]) == type('string'):
    #            temp_data = pd.DataFrame([[0, list(stateDict.keys())[i], 0, list(stateDict.keys())[i], 0, 0] for i in range(max_states)], columns=data.columns)
    #        else:
    #            # Holy HELL this is a bad idea.
    #            temp_data = pd.DataFrame([[0, i, 0, i, 0, 0] for i in range(max_states)], columns=data.columns)
    #    else:
    #        temp_data = pd.DataFrame([[0,i,0,0,0] for i in range(max_states)], columns=data.columns)
    #
    #   data = data.append(temp_data)

    states, _ = pd.factorize(data[state_column], sort=True)
    data[state_column] = states
    cat_labels = pd.get_dummies(data[state_column], prefix="state_")
    """
    # Max state consideration - add 0 rows to make up for max states
    for missing_state in range(num_states, max_states):
        cat_labels["state" + str(missing_state)] = 0
    """

    # Replace labels column with new cat_labels dataframe
    data.drop([state_column], axis=1, inplace=True)

    xdata = data
    ydata = cat_labels
    # Truncate data for batching
    dataSize = len(xdata) // (batch_size * window_size) * (batch_size *
                                                           window_size)
    xdata, ydata = xdata[
        current_column][:dataSize].values, ydata[:dataSize].values

    # Minmax scaling
    minmax = MinMaxScaler()
    xdata = minmax.fit_transform(xdata.reshape(-1, 1))

    # Reshape the data for use in batched input
    xdata = xdata.reshape(-1, batch_size, *expected_input_shape)
    ydata = ydata.reshape(-1, batch_size, *expected_output_shape)

    return PPData(x_data=xdata,
                  y_data=ydata,
                  scaler=minmax,
                  sampleRate=sampleRate,
                  batch_size=batch_size,
                  window_size=window_size,
                  max_states=max_states,
                  stateDict=stateDict)
	if args.data is None:
		print("Please specify raw or latent for the data flag")
	else:
		dataset=args.data
		svm_accuracy = []
		svm_roc_auc = []
		svm_precision = []
		svm_recall = []
		svm_f_score = []
		svm_pred = []
		svm_prob = []
		svm_mcc = []
		
		fp = pd.read_csv("diabimmune_metadata_allcountries_allergy_noQuotes.csv", index_col=3)
		allergy = fp["allergy"]
		allergy = pd.factorize(allergy)
		subject = fp["subjectID"]

		labels = allergy[1]
		allergy = allergy[0]

		subject_data = {'ID': subject, 'label': allergy}
		split_df = pd.DataFrame(data=subject_data).groupby("ID").median()

		split_sub = split_df.index.values
		split_lab = np.array(split_df[["label"]].to_numpy()).reshape(-1)
		
		print(len(split_sub))
		print(len(split_lab))
		
		if dataset == "latent":
Example #60
0
def main():
    train = read_train_data(path=os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train/'))
    test = read_test_data(path=os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test/'))
    if adoption_shuffle:
        train['AdoptionSpeed'] = random.sample(train['AdoptionSpeed'].values.tolist(), len(train))
    if densenet_predict:
        dnet_model = densenet_model(weight_path=os.path.join(input.__path__[0], 'densenet-keras/DenseNet-BC-121-32-no-top.h5'))
        train_feats = predict_using_img(dnet_model,
                                        train,
                                        img_path=os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train_images/'))
        test_feats = predict_using_img(dnet_model,
                                       test,
                                       img_path=os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test_images/'))
        train_feats.to_pickle('densenet_train_predict.pkl')
        test_feats.to_pickle('densenet_test_predict.pkl')
    else:
        with open('./densenet_train_predict.pkl', 'rb') as f:
            train_feats = pickle.load(f)
        with open('./densenet_test_predict.pkl', 'rb') as f:
            test_feats = pickle.load(f)

    all_ids = pd.concat([train, test], axis=0, ignore_index=True, sort=False)[['PetID']]

    svd_col = adopt_svd(train_feats, test_feats)

    img_features = pd.concat([all_ids, svd_col], axis=1)

    labels_breed = pd.read_csv(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/breed_labels.csv'))
    labels_color = pd.read_csv(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/color_labels.csv'))
    labels_state = pd.read_csv(os.path.join(input.__path__[0], 'my_state_labels/my_state_labels.csv'))

    train_image_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train_images/*.jpg')))
    train_metadata_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train_metadata/*.json')))
    train_sentiment_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train_sentiment/*.json')))

    test_image_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test_images/*.jpg')))
    test_metadata_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test_metadata/*.json')))
    test_sentiment_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test_sentiment/*.json')))

    # Metadata:
    train_df_metadata = pd.DataFrame(train_metadata_files)
    train_df_metadata.columns = ['metadata_filename']
    train_df_sentiment = pd.DataFrame(train_sentiment_files)
    train_df_sentiment.columns = ['sentiment_filename']
    # Metadata:
    test_df_metadata = pd.DataFrame(test_metadata_files)
    test_df_metadata.columns = ['metadata_filename']
    test_df_sentiment = pd.DataFrame(test_sentiment_files)
    test_df_sentiment.columns = ['sentiment_filename']


    train_pet_ids = train.PetID.unique()
    test_pet_ids = test.PetID.unique()

    if exe_extract_additional_feature:
        dfs_train = Parallel(n_jobs=12, verbose=1)(
            delayed(extract_additional_features)(i, mode='train') for i in train_pet_ids)
        dfs_test = Parallel(n_jobs=12, verbose=1)(
            delayed(extract_additional_features)(i, mode='test') for i in test_pet_ids)
        train_dfs_sentiment = [x[0] for x in dfs_train if isinstance(x[0], pd.DataFrame)]
        train_dfs_metadata = [x[1] for x in dfs_train if isinstance(x[1], pd.DataFrame)]
        train_dfs_sentiment = pd.concat(train_dfs_sentiment, ignore_index=True, sort=False)
        train_dfs_metadata = pd.concat(train_dfs_metadata, ignore_index=True, sort=False)
        test_dfs_sentiment = [x[0] for x in dfs_test if isinstance(x[0], pd.DataFrame)]
        test_dfs_metadata = [x[1] for x in dfs_test if isinstance(x[1], pd.DataFrame)]
        test_dfs_sentiment = pd.concat(test_dfs_sentiment, ignore_index=True, sort=False)
        test_dfs_metadata = pd.concat(test_dfs_metadata, ignore_index=True, sort=False)
        train_dfs_metadata.to_pickle('train_dfs_metadata.pkl')
        train_dfs_sentiment.to_pickle('train_dfs_sentiment.pkl')
        test_dfs_metadata.to_pickle('test_dfs_metadata.pkl')
        test_dfs_sentiment.to_pickle('test_dfs_sentiment.pkl')

    else:
        with open('./train_dfs_metadata.pkl', 'rb') as f:
            train_dfs_metadata = pickle.load(f)
        with open('./train_dfs_sentiment.pkl', 'rb') as f:
            train_dfs_sentiment = pickle.load(f)
        with open('./test_dfs_metadata.pkl', 'rb') as f:
            test_dfs_metadata = pickle.load(f)
        with open('./test_dfs_sentiment.pkl', 'rb') as f:
            test_dfs_sentiment = pickle.load(f)



    # ### group extracted features by PetID:
    train_proc = agg_feature(train, train_dfs_metadata, train_dfs_sentiment)
    test_proc = agg_feature(test, test_dfs_metadata, test_dfs_sentiment)
    train_proc = merge_labels_breed(train_proc, labels_breed)
    test_proc = merge_labels_breed(test_proc, labels_breed)
    train_proc, test_proc = merge_labels_state(train_proc, test_proc, labels_state)
    train_proc = fill_and_drop_feature(train_proc)
    test_proc = fill_and_drop_feature(test_proc)
    train_proc = add_feature(train_proc)
    test_proc = add_feature(test_proc)

    X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False)
    X_temp = X.copy()
    text_columns = ['Description', 'metadata_annots_top_desc', 'sentiment_entities']
    categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName']
    to_drop_columns = ['PetID', 'Name', 'RescuerID']

    rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index()
    rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT']

    X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID')

    for i in categorical_columns:
        try:
            X_temp.loc[:, i] = pd.factorize(X_temp.loc[:, i])[0]
        except:
            pass

    X_text = X_temp[text_columns]
    for i in X_text.columns:
        X_text.loc[:, i] = X_text.loc[:, i].fillna('none')

    X_temp['Length_Description'] = X_text['Description'].map(len)
    X_temp['Length_metadata_annots_top_desc'] = X_text['metadata_annots_top_desc'].map(len)
    X_temp['Lengths_sentiment_entities'] = X_text['sentiment_entities'].map(len)
    X_temp = parse_tfidf(X_temp, X_text)

    X_temp = X_temp.merge(img_features, how='left', on='PetID')


    agg_train_imgs = agg_img_feature(train_image_files)
    agg_test_imgs = agg_img_feature(test_image_files)
    agg_imgs = pd.concat([agg_train_imgs, agg_test_imgs], axis=0).reset_index(drop=True)
    X_temp = X_temp.merge(agg_imgs, how='left', on='PetID')

    # ### Drop ID, name and rescuerID
    X_temp = X_temp.drop(to_drop_columns, axis=1)

    X_train = X_temp.loc[np.isfinite(X_temp.AdoptionSpeed), :]
    X_test = X_temp.loc[~np.isfinite(X_temp.AdoptionSpeed), :]

    X_test = X_test.drop(['AdoptionSpeed'], axis=1)
    assert X_train.shape[0] == train.shape[0]
    assert X_test.shape[0] == test.shape[0]
    train_cols = X_train.columns.tolist()
    train_cols.remove('AdoptionSpeed')

    test_cols = X_test.columns.tolist()

    assert np.all(train_cols == test_cols)

    X_train_non_null = X_train.fillna(-1)
    X_test_non_null = X_test.fillna(-1)
    print(X_train_non_null.isnull().any().any(), X_test_non_null.isnull().any().any())  # sanity check for remaining NaNs

    xgb_params = {
        'eval_metric': 'rmse',
        'objective': 'reg:squarederror',
        'seed': 1337,
        'eta': 0.0123,
        'subsample': 0.8,
        'colsample_bytree': 0.85,
        'tree_method': 'gpu_hist',
        'device': 'gpu',
        'silent': 1,
    }
    X_train_non_null = fill_and_drop_feature_end(X_train_non_null)
    X_test_non_null = fill_and_drop_feature_end(X_test_non_null)

    X_train_non_null.to_csv('./X_train.csv')

    model, oof_train, oof_test, feature_score = run_xgb(xgb_params, X_train_non_null, X_test_non_null)

    optR = OptimizedRounder()
    optR.fit(oof_train, X_train['AdoptionSpeed'].values)
    coefficients = optR.coefficients()
    valid_pred = optR.predict(oof_train, coefficients)
    qwk = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, valid_pred)
    print("QWK = ", qwk)

    coefficients_ = coefficients.copy()
    coefficients_[0] = 1.66
    coefficients_[1] = 2.13
    coefficients_[3] = 2.85
    train_predictions = optR.predict(oof_train, coefficients_).astype(np.int8)
    test_predictions = optR.predict(oof_test.mean(axis=1), coefficients_).astype(np.int8)

    valid_pred = optR.predict(oof_train, coefficients_)
    qwk_change = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, valid_pred)
    print("QWK_change = ", qwk_change)

    submission = pd.DataFrame({'PetID': test['PetID'].values, 'AdoptionSpeed': test_predictions})
    submission.to_csv('submission.csv', index=False)
    str_metric_score = 'qwk' + '_0' + str(int(qwk * 100000))
    storage_process(submission, str_metric_score, qwk, qwk_change, feature_score)