def learn_dtree(data, csvfile):
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4)
    k = data.groupby(['operator'])
    # k = data.groupby(['operator'])
    f = k['isCovered'].agg({'mean_kill': np.mean,
                            'number of mutants': len,
                            'number of killed': np.sum})
    f.to_csv(csvfile)
    fig = plt.figure()
    # ax = Axes3D(fig)
    # ax = fig.add_subplot(111, projection='3d')
    plt.scatter(standardize(f['mean_kill']), f['number of killed'])
    plt.ylabel('mutant_size')
    plt.xlabel('expected_kill (standardized)')
    # print f[f['number of mutants'] > 25000]
    # ax.set_xlabel('mean')
    # ax.set_ylabel('len')
    # ax.set_zlabel('sum')
    plt.show()
    # plt.show()
    # for m in k.groups:
    #     print m, len(k.groups[m])
    data['op'] = pd.factorize(data['operator'])[0]
    data['m'] = pd.factorize(data['method'])[0]
    data['c'] = pd.factorize(data['class'])[0]
    # plt.show()
    plt.close()
    x = data[['op', 'c', 'testId']].values
    y = data['isCovered'].values
    clf.fit(x, y)
    dot_data = StringIO.StringIO()
    tree.export_graphviz(clf, out_file=dot_data)
    return dot_data.getvalue()
def main(stack_setting_): """ [rawdata2filterdata Step] 1. Reading raw datasets 2. Droping useless feat columns in training set 3. Droping useless feat columns in test set """ raw_train_path = os.path.join(Config.get_string('data.path'), stack_setting_['0-Level']['folder'], stack_setting_['0-Level']['raw']['train']) raw_test_path = os.path.join(Config.get_string('data.path'), stack_setting_['0-Level']['folder'], stack_setting_['0-Level']['raw']['test']) print("= Reading raw datasets ...") names = ("age, workclass, fnlwgt, education, education-num, marital-status, occupation, relationship, race, sex, capital-gain, capital-loss, hours-per-week, native-country, TARGET").split(', ') raw_train = pd.read_csv(raw_train_path, names=names, skiprows=1)#, index_col=0, sep=',' raw_train['TARGET'] = (raw_train['TARGET'].values == ' >50K').astype(np.int32) raw_train = raw_train.apply(lambda x: pd.factorize(x)[0]) train_path = os.path.join(Config.get_string('data.path'), stack_setting_['0-Level']['folder'], stack_setting_['0-Level']['train']) raw_train.to_csv(train_path, index=True, index_label='ID') raw_test = pd.read_csv(raw_test_path, names=names, skiprows=1)#, index_col=0, sep=',' raw_test['TARGET'] = (raw_test['TARGET'].values == ' >50K').astype(np.int32) raw_test = raw_test.apply(lambda x: pd.factorize(x)[0]) test_path = os.path.join(Config.get_string('data.path'), stack_setting_['0-Level']['folder'], stack_setting_['0-Level']['test']) raw_test.to_csv(test_path, index=True, index_label='ID')
def fact(df):
    pge = pd.factorize(df['page'])
    time = pd.factorize(df['Time'])
    df['pageFCT'] = pge[0]
    df['TimeFCT'] = time[0]
    return df
def buildModel(df): train_y = df['arr_del15'][:train_len] train_x = df[cols][:train_len] # transform categorical features train_x['unique_carrier'] = pd.factorize(train_x['unique_carrier'])[0] train_x['dep_conditions'] = pd.factorize(train_x['dep_conditions'])[0] train_x['arr_conditions'] = pd.factorize(train_x['arr_conditions'])[0] pd.set_option('display.max_rows', 500) print(train_x) # train_x['origin'] = pd.factorize(train_x['origin'])[0] # train_x['dest'] = pd.factorize(train_x['dest'])[0] # print(train_x) train_x = enc.fit_transform(train_x) print(train_x.shape) # Create Random Forest classifier with 50 trees clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1) clf_rf.fit(train_x.toarray(), train_y) del train_x, train_y print("Model built") return clf_rf
def model_data(data, LECAT=False, NAMEAN=False, NA999=False, OH=False, ONLYCONT=False, ONLYCAT=False, ONLYCATOH=False, COLSREMOVAL=False, cols=[], maxCategories=300): data = data.copy() cat_var = list(data.select_dtypes(["object"]).columns) cont_var = list(data.select_dtypes(["float", "int"]).columns) if COLSREMOVAL: data = data.drop(cols, 1, inplace=False) cat_var = list(data.select_dtypes(["object"]).columns) cont_var = list(data.select_dtypes(["float", "int"]).columns) if NAMEAN: for col in cont_var: data.loc[data[col].isnull(), col] = data[col].mean() if NA999: for col in cont_var: data.loc[data[col].isnull(), col] = -999 if LECAT: for col in data[cat_var]: data[col] = pd.factorize(data[col])[0] if OH: cols2dummy = [col for col in data[cat_var] if len(data[col].unique()) <= maxCategories] colsNot2dummy = [col for col in data[cat_var] if len(data[col].unique()) > maxCategories] data = pd.get_dummies(data, dummy_na=True, columns=cols2dummy) #binning for col in colsNot2dummy: data[col] = pd.factorize(data[col])[0] dcb = DummycolumnsBins(cols=col, prefix=col, nb_bins=2000) dcb.fit(data) pd_binned = dcb.transform(data) data = pd.concat([data,pd_binned],1) if ONLYCONT: data = data[cont_var] if ONLYCAT: test_idx = data['ID'] Y = data['target'] data = data[cat_var] data['ID'] = test_idx data['target'] = Y if ONLYCATOH: test_idx = data['ID'] Y = data['target'] cols = list(set(data.columns).difference(set(cont_var))) ; print(cols) data = data[cols] data['ID'] = test_idx data['target'] = Y return data
def featurize_videogames(filename): ''' Input: A cleaned version of the original dataset Output: A pickled version of a subset of the raw data including all features that will be trained on. ''' df = pd.read_pickle(filename) Spore = df[df['asin'] =='B000FKBCX4'] Spore['title_name'] = 'Spore' Sim_City = df[df['asin'] == 'B007VTVRFA'] Sim_City['title_name'] = 'SimCity' Diablo_3 = df[df['asin'] == 'B00178630A'] Diablo_3['title_name'] = 'Diablo_3' Starcraft_2 = df[df['asin'] == 'B000ZKA0J6'] Starcraft_2['title_name'] = 'Starcraft_2' Sid_Meiers_Civilization_V = df[df['asin'] == 'B0038TT8QM'] Sid_Meiers_Civilization_V['title_name'] = 'Sid_Meiers_Civilization_V' subset_data = pd.concat([Diablo_3,Spore,Starcraft_2,Sim_City,Sid_Meiers_Civilization_V]) corpus = subset_data['reviewText'] sentiment = map(text_stats.get_sentiment,corpus) subset_data['rating'] = subset_data['overall'] subset_data['percent_helpful'] = subset_data['helpful'].apply(lambda x: x[0] / float(x[1])) subset_data['percent_helpful'] = subset_data['percent_helpful'].apply(lambda x: round(x,2)) subset_data['title'] = pd.factorize(subset_data.title_name)[0] subset_data['review_year'] = subset_data['reviewTime'].apply(lambda x : x.split(',')[1]) subset_data['review_year'] = pd.factorize(subset_data.review_year)[0] subset_data['pros_cons'] = map(text_stats.pros_cons,corpus) subset_data['punctuation_count'] = map(text_stats.count_punc,corpus) subset_data['word_count'] = corpus.apply(lambda x: len(x.split())) subset_data['num_char'] = map(text_stats.count_chars,corpus) subset_data['upper_case_count'] = map(text_stats.count_upper,corpus) subset_data['url_count'] = map(text_stats.count_urls,corpus) # Grabs only the first item of the list returned by get_sentiment which is polarity subset_data['polarity'] = [n[0] for n in sentiment] # Grabs only the second item of the list returned by get_sentiment which is subjectivity subset_data['subjectivity'] = [n[1] for n in sentiment] # Saves subsetted data to pickle with open('subset_'+filename.split('.')[0].split('_' , 1)[1]+'.pkl', 'w') as f: pickle.dump(subset_data, f)
def process_data(X, y):
    X = X.drop(41, 1)
    X[1], uniques = pandas.factorize(X[1])
    X[2], uniques = pandas.factorize(X[2])
    X[3], uniques = pandas.factorize(X[3])
    num_examples = 10**6
    X = X[0:num_examples]
    y = y[0:num_examples]
    X = numpy.array(X)
    y = numpy.array(y).ravel()
    return X, y
def Load_data(): train = pd.read_csv(path_train) test = pd.read_csv(path_test) # combine train and test data_comb = train.append(test) # Found at https://www.kaggle.com/marcellonegro/prudential-life-insurance-assessment/xgb-offset0501/run/137585/code # create any new variables data_comb['Product_Info_2_char'] = data_comb.Product_Info_2.str[0] data_comb['Product_Info_2_num'] = data_comb.Product_Info_2.str[1] # factorize categorical variables data_comb['Product_Info_2'] = pd.factorize(data_comb['Product_Info_2'])[0] data_comb['Product_Info_2_char'] = pd.factorize(data_comb['Product_Info_2_char'])[0] data_comb['Product_Info_2_num'] = pd.factorize(data_comb['Product_Info_2_num'])[0] data_comb['BMI_Age'] = data_comb['BMI'] * data_comb['Ins_Age'] med_keyword_columns = data_comb.columns[data_comb.columns.str.startswith('Medical_Keyword_')] data_comb['Med_Keywords_Count'] = data_comb[med_keyword_columns].sum(axis=1) print('Encode missing values') data_comb.fillna(-1, inplace=True) # fix the dtype on the label column data_comb['Response'] = data_comb['Response'].astype(int) # split train and test train = data_comb[data_comb['Response']>0].copy() test = data_comb[data_comb['Response']<1].copy() target = train['Response'].values le = preprocessing.LabelEncoder() y = le.fit_transform(target) train.drop(['Id', 'Response', 'Medical_History_10','Medical_History_24'], axis=1, inplace=True) test.drop(['Id', 'Response', 'Medical_History_10','Medical_History_24'], axis=1, inplace=True) train = train.as_matrix() test = test.as_matrix() print('Construct labels for bumping') num_class = len(np.unique(target)) labels = np.zeros(shape=(train.shape[0],num_class-1)) labels[:, 0][target==1]=1 labels[:, 6][target<8]=1 for i in range(1, num_class-2): labels[:, i][target<i+2]=1 return train, test, target, labels
def processCabin():
    """
    Generate features from the Cabin variable

    Cabin numbers, when present, contain a single (or space-delimited list) cabin number
    that is composed of a letter and number with no space or other character between.
    This is a sparse variable: < 30% is populated
    """
    global df

    # Replace missing values with "U0"
    df['Cabin'][df.Cabin.isnull()] = 'U0'

    # create feature for the alphabetical part of the cabin number
    df['CabinLetter'] = df['Cabin'].map(lambda x: getCabinLetter(x))
    df['CabinLetter'] = pd.factorize(df['CabinLetter'])[0]

    # create binary features for each cabin letter
    if keep_binary:
        cletters = pd.get_dummies(df['CabinLetter']).rename(columns=lambda x: 'CabinLetter_' + str(x))
        df = pd.concat([df, cletters], axis=1)

    # create feature for the numerical part of the cabin number
    df['CabinNumber'] = df['Cabin'].map(lambda x: getCabinNumber(x)).astype(int) + 1

    # scale the number to process as a continuous feature
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['CabinNumber_scaled'] = scaler.fit_transform(df['CabinNumber'])
def dataTransform(filepath):
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    # Load the full dataset. The original snippet referenced an undefined frame named
    # `all`; reading it from `filepath` into `df_all` is assumed here.
    df_all = pd.read_csv(filepath)

    numeric_cols = ['time_in_hospital', 'num_lab_procedures', 'num_procedures',
                    'num_medications', 'number_outpatient', 'number_emergency',
                    'number_inpatient', 'number_diagnoses']
    df_all[numeric_cols] = df_all[numeric_cols].astype(float)
    df_all[numeric_cols] = df_all[numeric_cols].apply(lambda x: MinMaxScaler().fit_transform(x))
    x_num_all = df_all[numeric_cols].as_matrix()

    # categorical
    x_cat_all = df_all.drop(numeric_cols + ['readmitted'], axis=1)
    fac_x_cat_all = pd.DataFrame()
    cat_cols = list(x_cat_all.columns.values)
    for col in cat_cols:
        all_cur, _ = pd.factorize(x_cat_all[col])
        fac_x_cat_all[col] = all_cur
    fac_x_cat_all = fac_x_cat_all.as_matrix()

    x_all = np.hstack((x_num_all, fac_x_cat_all))
    y_all = df_all.readmitted
    return x_all, y_all
def processName():
    global df

    # how many different names do they have?
    df['Names'] = df['Name'].map(lambda x: len(re.split(' ', x)))

    # what is each person's title?
    df['Title'] = df['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])

    # group low-occurring, related titles together
    df['Title'][df.Title == 'Jonkheer'] = 'Master'
    df['Title'][df.Title.isin(['Ms', 'Mlle'])] = 'Miss'
    df['Title'][df.Title == 'Mme'] = 'Mrs'
    df['Title'][df.Title.isin(['Capt', 'Don', 'Major', 'Col', 'Sir'])] = 'Sir'
    df['Title'][df.Title.isin(['Dona', 'Lady', 'the Countess'])] = 'Lady'

    # Build binary features
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Title']).rename(columns=lambda x: 'Title_' + str(x))], axis=1)

    # process scaling
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Names_scaled'] = scaler.fit_transform(df['Names'])

    if keep_bins:
        df['Title_id'] = pd.factorize(df['Title'])[0] + 1

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Title_id_scaled'] = scaler.fit_transform(df['Title_id'])
def processAge():
    global df

    setMissingAges()

    # center the mean and scale to unit variance
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Age_scaled'] = scaler.fit_transform(df['Age'])

    # have a feature for children
    df['isChild'] = np.where(df.Age < 13, 1, 0)

    # bin into quartiles and create binary features
    df['Age_bin'] = pd.qcut(df['Age'], 4)
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Age_bin']).rename(columns=lambda x: 'Age_' + str(x))], axis=1)

    if keep_bins:
        df['Age_bin_id'] = pd.factorize(df['Age_bin'])[0] + 1

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Age_bin_id_scaled'] = scaler.fit_transform(df['Age_bin_id'])

    if not keep_strings:
        df.drop('Age_bin', axis=1, inplace=True)
def polyFeatures(X, X_test, polyOrder=2, verbose=True): ''' Given a set of matrices, we shall add a number of features, dependent upon ''' X_all = np.vstack((X, X_test)) def colStack( cols ): strs = map( lambda m: '-'.join(map(str, m)), zip( *(X_all[:,c] for c in cols)) ) return strs if verbose: print 'polynomial order: ', polyOrder orders = map(list, combinations(range(np.shape(X)[1]), polyOrder)) N = len(orders) if verbose: print 'Number of orders: ', N allLists = [] for i, cols in enumerate(orders): print i+1, 'of', N, cols vals = colStack( cols ) newCol, _ = pd.factorize( vals ) allLists.append(newCol) allLists = np.array(allLists) return allLists.T
def processFare():
    global df

    # replace missing values with the median fare. Currently the datasets only contain one missing Fare value
    df['Fare'][np.isnan(df['Fare'])] = df['Fare'].median()

    # zero values cause problems with our division interaction variables so set to 1/10th of the lowest fare
    df['Fare'][np.where(df['Fare'] == 0)[0]] = df['Fare'][df['Fare'].nonzero()[0]].min() / 10

    # bin into quartiles for binary features
    df['Fare_bin'] = pd.qcut(df['Fare'], 4)
    if keep_binary:
        df = pd.concat([df, pd.get_dummies(df['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))], axis=1)

    if keep_bins:
        df['Fare_bin_id'] = pd.factorize(df['Fare_bin'])[0] + 1

    # center and scale the fare to use as a continuous variable
    if keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Fare_scaled'] = scaler.fit_transform(df['Fare'])

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df['Fare_bin_id_scaled'] = scaler.fit_transform(df['Fare_bin_id'])

    if not keep_strings:
        df.drop('Fare_bin', axis=1, inplace=True)
def cleanData(train, test):
    target = train['target']
    toDrop = ['v22', 'v112', 'v125', 'v74', 'v1', 'v110', 'v47']
    print 'Drop features:', toDrop
    trainDrop = ['ID', 'target']
    trainDrop.extend(toDrop)
    testDrop = ['ID']
    testDrop.extend(toDrop)
    train = train.drop(trainDrop, axis=1)
    test = test.drop(testDrop, axis=1)
    # test = test.drop(['ID','v22'], axis=1)

    # Iterate over (column name, Series) pairs
    for (train_name, train_series), (test_name, test_series) in zip(train.iteritems(), test.iteritems()):
        if train_series.dtype == 'O':
            # for objects: factorize converts Object/String/Category values to 0-based int codes
            # (the code is -1 if the value is None!).
            # pd.factorize assigns each unique value in a series to a sequential, 0-based index,
            # and calculates which index each series entry belongs to.
            train[train_name], tmp_indexer = pd.factorize(train[train_name])
            test[test_name] = tmp_indexer.get_indexer(test[test_name])
        else:
            # for int or float: fill NaN with the training median
            tmp_len = len(train[train_series.isnull()])
            if tmp_len > 0:
                train.loc[train_series.isnull(), train_name] = train_series.median()  # train_series.mean()
            tmp_len = len(test[test_series.isnull()])
            if tmp_len > 0:
                test.loc[test_series.isnull(), test_name] = train_series.median()  # train_series.mean()
    return train, target, test
def unique_value_groups(ar, sort=True):
    """Group an array by its unique values.

    Parameters
    ----------
    ar : array-like
        Input array. This will be flattened if it is not already 1-D.
    sort : boolean, optional
        Whether or not to sort unique values.

    Returns
    -------
    values : np.ndarray
        Sorted, unique values as returned by `np.unique`.
    indices : list of lists of int
        Each element provides the integer indices in `ar` with values given
        by the corresponding value in `unique_values`.
    """
    inverse, values = pd.factorize(ar, sort=sort)
    groups = [[] for _ in range(len(values))]
    for n, g in enumerate(inverse):
        if g >= 0:
            # pandas uses -1 to mark NaN, but doesn't include them in values
            groups[g].append(n)
    return values, groups
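A minimal usage sketch (illustrative, not part of the original source) showing what unique_value_groups returns for a small array:

import numpy as np
import pandas as pd

ar = np.array(['b', 'a', 'b', 'c'])
values, groups = unique_value_groups(ar, sort=True)
print(values)   # ['a' 'b' 'c']        -- sorted unique values
print(groups)   # [[1], [0, 2], [3]]   -- positions in `ar` for each unique value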
def setMissingAges(df):
    # Grab all the features that can be included in a Random Forest Regressor
    age_df = df[['Age', 'Embarked', 'Fare', 'Parch', 'SibSp', 'Title', 'Pclass']]

    # Split into sets with known and unknown Age values
    knownAge = age_df.loc[(df.Age.notnull())]
    unknownAge = age_df.loc[(df.Age.isnull())]

    # All age values are stored in a target array
    y = knownAge.values[:, 0]

    # All the other values are stored in the feature array
    X = knownAge.values[:, 1::]

    # Create and fit a model
    rtr = RandomForestRegressor(n_estimators=2000, n_jobs=-1)
    rtr.fit(X, y)

    # Use the fitted model to predict the missing values
    predictedAges = rtr.predict(unknownAge.values[:, 1::])

    # Assign those predictions to the full data set
    df.loc[(df.Age.isnull()), 'AgeFill'] = predictedAges.astype(int)

    k = df['Age'].notnull()
    df['Age'][k] = pd.factorize(pd.cut(df[df['Age'].notnull()]['Age'], 8))[0]
    df['Age'][~k] = df['Age'].max() + 1
    # all[u'Age_bin'] = pd.factorize(pd.qcut(all[u'Age'], 8))[0]
    # df['Fare'] =   (incomplete in the original; the right-hand side is missing)
    return df
def calc_MI_cate_feat_target(column, target, num_bins): vals, tmp_indexer = pd.factorize(column, na_sentinel=-1) p_neg = 0.238801 p_pos = 0.761199 max_cate = np.max(vals) densitys, bin_edges = np.histogram(vals, density=True) #print densitys #print 'start' final_mi = 0 for level in xrange(-1, max_cate+1): p_cate_pos = np.sum((vals == level) & (target == 1)) / float(column.shape[0]) p_cate_neg = np.sum((vals == level) & (target == 0)) / float(column.shape[0]) p_cate = np.sum((vals == level)) / float(column.shape[0]) if p_cate_pos == 0 or p_cate_neg == 0: continue final_mi += p_cate_pos * np.log2(p_cate_pos / (p_cate * p_pos)) final_mi += p_cate_neg * np.log2(p_cate_neg / (p_cate * p_neg)) #print '%d, %f' %(level, final_mi) return final_mi
def factorize_data(self, x, cols, in_place=False):
    """Replace columns in cols with integer factor codes.

    Parameters
    ----------
    x : np.ndarray
        Matrix with categorical data
    cols: tuple <int>
        Index of columns with categorical data

    Returns
    -------
    d : np.ndarray
        Matrix with categorical data replaced by integer codes, together with a
        dict mapping each column index to its (codes, labels) pair
    """
    if in_place:
        data = x
    else:
        data = np.copy(x)

    factors_labels = {}
    for col in cols:
        factors, labels = pd.factorize(data[:, col])
        factors_labels[col] = (factors, labels)
        data[:, col] = factors
    return data, factors_labels
def group_sums(x, group, use_bincount=True):
    """simple bincount version, again

    group : array, integer
        assumed to be consecutive integers

    no dtype checking because I want to raise in that case

    uses loop over columns of x

    for comparison, simple python loop
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2 and use_bincount:
        raise ValueError('not implemented yet')

    if use_bincount:
        # re-label groups or bincount takes too much memory
        if np.max(group) > 2 * x.shape[0]:
            group = pd.factorize(group)[0]
        return np.array([np.bincount(group, weights=x[:, col])
                         for col in range(x.shape[1])])
    else:
        uniques = np.unique(group)
        result = np.zeros([len(uniques)] + list(x.shape[1:]))
        for ii, cat in enumerate(uniques):
            result[ii] = x[group == cat].sum(0)
        return result
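A small illustrative call (not part of the original module); with use_bincount=True the result has one row per column of x and one entry per group label:

import numpy as np

x = np.array([[1.0, 2.0],
              [3.0, 4.0],
              [5.0, 6.0]])
group = np.array([0, 0, 1])  # consecutive integer group labels
print(group_sums(x, group))
# [[4. 5.]
#  [6. 6.]]   -- sums of column 0 per group, then sums of column 1 per group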
def processName(df,keep_binary=False,keep_bins=False,keep_scaled=False): """ Parameters: keep_binary:include 'Title_Mr' 'Title_Mrs'... keey_scaled&&keep_bins:include 'Names_scaled' 'Title_id_scaled' Note: the string feature 'Name' can be deleted """ # how many different names do they have? this feature 'Names' df['Names']=df['Name'].map(lambda x:len(re.split('\\(',x))) #what is each person's title? df['Title']=df['Name'].map(lambda x:re.compile(", (.*?)\.").findall(x)[0]) #group low-occuring,related titles together df['Title'][df.Title.isin(['Mr','Don','Major','Capt','Jonkheer','Rev','Col','Sir','Dona'])] = 'Mr' df['Title'][df.Title.isin(['Master'])] = 'Master' df['Title'][df.Title.isin(['Countess','Mme','Mrs','Lady','the Countess'])] = 'Mrs' df['Title'][df.Title.isin(['Mlle','Ms','Miss'])] = 'Miss' df['Title'][(df.Title.isin(['Dr']))&(df['Sex']=='male')]='Mr' df['Title'][(df.Title.isin(['Dr']))&(df['Sex']=='female')]='Mrs' df['Title'][df.Title.isnull()][df['Sex']=='male']='Master' df['Title'][df.Title.isnull()][df['Sex']=='female']='Miss' #build binary features if keep_binary: df=pd.concat([df,pd.get_dummies(df['Title']).rename(columns=lambda x:'Title_'+str(x))],axis=1) #process_scaled if keep_scaled: scaler=preprocessing.StandardScaler() df['Names_scaled']=scaler.fit_transform(df['Names']) if keep_bins: df['Title_id']=pd.factorize(df['Title'])[0]+1 if keep_bins and keep_scaled: scaler=preprocessing.StandardScaler() df['Title_id_scaled']=scaler.fit_transform(df['Title_id']) del df['Name'] return df
def test_factorize_empty(self, data):
    labels, uniques = pd.factorize(data[:0])
    expected_labels = np.array([], dtype=np.intp)
    expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

    tm.assert_numpy_array_equal(labels, expected_labels)
    self.assert_extension_array_equal(uniques, expected_uniques)
def __init__(self, data_file):
    self.dataFrame = pd.read_csv(data_file)
    for k in self.dataFrame.columns[1:]:
        self.dataFrame[k], _ = pd.factorize(self.dataFrame[k])
    self.classes = np.array(sorted(pd.Categorical(self.dataFrame['class']).categories))
    self.features = self.dataFrame.columns[self.dataFrame.columns != 'class']
def extract_puid_feats(self, df_data):
    """
    Extract product uid feature.
    """
    df_feat = pd.DataFrame()
    df_feat['p_uid'] = pd.factorize(df_data['product_uid'])[0]
    saveit(df_feat, 'df_puid_feats')
def factorise(df, na_sentinel):
    df = df.copy()  # Don't modify in place.
    for column_name in df.columns:
        factorised, uniques = pd.factorize(
            df[column_name], na_sentinel=na_sentinel)
        df[column_name] = factorised
    return df
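A short, assumed usage example of the factorise helper above; with na_sentinel=-1, missing values map to -1 while every other value gets a 0-based code:

import numpy as np
import pandas as pd

df = pd.DataFrame({'city': ['NY', 'LA', np.nan, 'NY']})
print(factorise(df, na_sentinel=-1)['city'].tolist())  # [0, 1, -1, 0]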
def convert(data):
    number = preprocessing.LabelEncoder()
    for i in data.columns:
        if data[i].dtype == 'object':
            data[i] = pd.factorize(data[i])[0]
    data = data.fillna(-9999)
    return data
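A minimal sketch of how convert behaves on illustrative data: object columns are replaced by factor codes (missing strings become -1), and remaining numeric NaNs are filled with -9999:

import numpy as np
import pandas as pd
from sklearn import preprocessing

raw = pd.DataFrame({'color': ['red', 'blue', None, 'red'],
                    'size': [1.0, np.nan, 3.0, 4.0]})
out = convert(raw)
print(out['color'].tolist())  # [0, 1, -1, 0]
print(out['size'].tolist())   # [1.0, -9999.0, 3.0, 4.0]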
def joint_factorize(train, test, column_name):
    joint_factors = pd.factorize(list(train[column_name]) + list(test[column_name]))
    train_length = len(train[column_name])
    range_train = range(train_length)
    train[column_name] = joint_factors[0][range_train]
    range_test = range(train_length, len(joint_factors[0]))
    test[column_name] = joint_factors[0][range_test]
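An assumed example of joint_factorize: because train and test are factorized together, the same category gets the same code in both frames, and categories seen only in test still get a valid code:

import pandas as pd

train = pd.DataFrame({'city': ['NY', 'LA', 'NY']})
test = pd.DataFrame({'city': ['LA', 'SF']})
joint_factorize(train, test, 'city')
print(train['city'].tolist())  # [0, 1, 0]
print(test['city'].tolist())   # [1, 2]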
def printDecisionTreeForBelow3(self): from sklearn.ensemble import RandomForestClassifier import pandas as pd import numpy as np from sklearn.cross_validation import train_test_split data = self.dataManager.loadData(["QueryName","TimeStamp","Sid","Aid","Country","IsFirst","Browser","Os","Continent"],transformFields=True) features = data.columns[1:4] result = data.groupby('Aid').apply( lambda group: (group.Sid.nunique()<=3) ) below = result[result == True] data['IsBelow'] = data['Aid'].isin(below.index) y, _ = pd.factorize(data['IsBelow']) X_train, X_test, y_train, y_test = train_test_split(data[features], y, test_size=0.33, random_state=42) from sklearn.metrics import accuracy_score from sklearn import svm #clf = svm.SVC(); clf = RandomForestClassifier(n_jobs=2) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) accuracy_score(y_test, y_pred) from sklearn import tree i_tree = 0 for tree_in_forest in clf.estimators_: with open('tree_' + str(i_tree) + '.dot', 'w') as my_file: my_file = tree.export_graphviz(tree_in_forest, out_file = my_file) i_tree = i_tree + 1
def runModel(X_train, Y_train, X_eval, var = "Ca", seed=42): #25% in Test Sample, 75% in train and cv X_train, Y_train, X_test, Y_test, X_train_location_labels = generateTrainTestSplit(X_train, Y_train, seed) labels = np.array(pd.factorize(X_train_location_labels)[0]) #n_folds = 10 n_folds = 10 labels2 = (labels * (n_folds+1)/labels.max()) labels2[-1]=n_folds #Model is Ridge Regression cv_iterator = LeaveOneLabelOut(labels2) #test = [0.00001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000] #modelCV = RidgeCV(test, normalize=True, score_func="mse", scoring="mse", cv=cv_iterator ) #modelCV.fit(X_train, Y_train[var]) if var == "Ca": prediction, test_score = CaModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator) elif var == "P": prediction, test_score = PModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator) elif var == "pH": prediction, test_score = pHModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator) elif var == "SOC": prediction, test_score = SOCModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator) elif var == "Sand": prediction, test_score = SandModel(X_train, Y_train, X_test, Y_test, X_eval, cv_iterator) else: print "Error" return prediction, test_score
def processTicket(): """ Generate features from the Ticket variable """ global df df['TicketPrefix'] = df['Ticket'].map( lambda x : getTicketPrefix(x.upper())) df['TicketPrefix'] = df['TicketPrefix'].map( lambda x: re.sub('[\.?\/?]', '', x) ) df['TicketPrefix'] = df['TicketPrefix'].map( lambda x: re.sub('STON', 'SOTON', x) ) #print len(df['TicketPrefix'].unique()), "ticket codes:", np.sort(df['TicketPrefix'].unique()) df['TicketPrefixId'] = pd.factorize(df['TicketPrefix'])[0] # create binary features for each cabin letters if keep_binary: prefixes = pd.get_dummies(df['TicketPrefix']).rename(columns=lambda x: 'TicketPrefix_' + str(x)) df = pd.concat([df, prefixes], axis=1) df.drop(['TicketPrefix'], axis=1, inplace=True) df['TicketNumber'] = df['Ticket'].map( lambda x: getTicketNumber(x) ) df['TicketNumberDigits'] = df['TicketNumber'].map( lambda x: len(x) ).astype(np.int) df['TicketNumberStart'] = df['TicketNumber'].map( lambda x: x[0:1] ).astype(np.int) #print np.sort(df.TicketNumberStart.unique()) df['TicketNumber'] = df.TicketNumber.astype(np.int) #print np.sort(df['TicketNumber']) if keep_scaled: scaler = preprocessing.StandardScaler() df['TicketNumber_scaled'] = scaler.fit_transform(df['TicketNumber'])
#def data_plotter(data): if __name__ == '__main__': with open( 'C://Users//k_mathin//PycharmProjects//Masters//ciena_trials//Kamal//data//vodafone_data_clusters_filtered.pkl', 'rb') as f: data_set = pickle.load(f) data = [] for d in data_set['data']: data.append(d) data = np.asarray(data) #data = data[:,:15] print(data.shape[0]) label_data = np.asarray(data_set['osid']) labels, levels = pd.factorize(label_data) shelves = np.asarray(data_set['shelf']) cluster_num = levels.shape[0] print(cluster_num) clusters = kshape(zscore(data, axis=1), cluster_num) #clusters = kshape(data,cluster_num) y_pred = [] for i in range(0, data.shape[0]): for j in range(0, cluster_num): if i in clusters[j][1]: y_pred.append(j) continue conf = conf_mat(labels, y_pred) print(conf_mat(labels, y_pred)) print("done")
],
)

# impute with the mean (numeric columns only)

# +
# convert User_Score to a numeric column
df["User_Score"] = df["User_Score"].astype("float")
# convert Year_of_Release to a string column
df["Year_of_Release"] = df["Year_of_Release"].astype("str")
# -

# label encoding
cate_cols = df.select_dtypes(
    include=["object", "category", "bool"]).columns.to_list()
for col in cate_cols:
    df[col], uni = pd.factorize(df[col])

# +
train = df.iloc[:train.shape[0]]
test = df.iloc[train.shape[0]:].reset_index(drop=True)

# target variables
sales_cols = [
    "NA_Sales",
    "EU_Sales",
    "JP_Sales",
    "Other_Sales",
    "Global_Sales",
]

train_drop_sales = train.drop(sales_cols, axis=1)
2    58
dtype: int64
'''

categorize_map = {v: k for k, v in enumerate(cities.unique())}

# encoding the object as an enumerated type (categorical variable).
as_categories = cities.map(categorize_map)
print(as_categories.apply(sys.getsizeof))
'''
0    24
1    28
2    28
dtype: int64
'''

'''
Notice immediately that memory usage is just about cut in half compared to when
the full strings are used with object dtype.

Note: I used sys.getsizeof() to show the memory occupied by each individual value
in the Series. Keep in mind these are Python objects that have some overhead in
the first place. (sys.getsizeof('') will return 49 bytes.)

There is also colors.memory_usage(), which sums up the memory usage and relies on
the .nbytes attribute of the underlying NumPy array. Don't get too bogged down in
these details: what is important is the relative memory usage that results from
type conversion, as you'll see next.
'''

# Another way to do this same thing is with pandas' pd.factorize(colors):
# pandas.factorize encodes input values as an enumerated type or categorical variable
print(pd.factorize(cities)[0])  # [0 1 2]
# https://realpython.com/python-pandas-tricks/#reader-comments
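As a closely related alternative (a sketch, not taken from the linked article), converting the Series to pandas' category dtype also stores integer codes, though it sorts the categories, so the codes can differ from factorize's order-of-appearance codes:

import pandas as pd

cities = pd.Series(['New York', 'Chicago', 'Houston'])  # illustrative values
codes, uniques = pd.factorize(cities)
print(codes)          # [0 1 2]        -- numbered in order of appearance
print(list(uniques))  # ['New York', 'Chicago', 'Houston']

as_cat = cities.astype('category')
print(as_cat.cat.codes.tolist())  # [2, 0, 1] -- category dtype sorts its categories first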
dataset = df.copy() dataset.pop('visit_date') dataset.pop('min_visitors') dataset.pop('median_visitors') dataset.pop('max_visitors') dataset.pop('count_observations') # adding date to dataset s = pd.Series(date) df1 = pd.DataFrame({'date':s}) dataset=dataset.join(df1) #adding store id s = pd.Series(ids) labels, levels = pd.factorize(s) df1 = pd.DataFrame({'air_store_id':(labels)}) dataset.pop('air_store_id') dataset=dataset.join(df1) # In[6]: train_dataset = dataset.sample(frac=0.8,random_state=0) test_dataset = dataset.drop(train_dataset.index) # In[7]:
def load_data(path_train=DATA_TRAIN_PATH, path_test=DATA_TEST_PATH, path_brand_phne=BRAND_PATH, events_path=EVENTS_PATH, app_events_path=APP_EVENTS_PATH, app_labels_path=APP_LABELS_PATH, labels_category_path=LABELS_CATEGORY): # Phone brand brand = pd.read_csv(path_brand_phne) brand.drop_duplicates('device_id', keep='first', inplace=True) brand['phone_brand'] = pd.factorize(brand['phone_brand'], sort=True)[0] # add new feature : the number of occurence (popularity) of each phne brand dict_phone = dict(brand.phone_brand.value_counts()) brand['phone_brand_occurence'] = brand.phone_brand.apply(dict_phone.get) brand['device_model'] = pd.factorize(brand['device_model'], sort=True)[0] # add new feature : the number of occurence (popularity) of each phne brand dict_device = dict(brand.device_model.value_counts()) brand['device_model_occurence'] = brand.device_model.apply(dict_device.get) train_loader = pd.read_csv(path_train) train = train_loader.drop(['age', 'gender'], axis=1) train['group'] = pd.factorize(train['group'], sort=True)[0] train = pd.merge(train, brand, how='left', on='device_id', left_index=True) gc.collect() #installed app appevents = pd.read_csv(app_events_path, header=0, nrows=10000) events = pd.read_csv(events_path, header=0) appencoder = LabelEncoder().fit(appevents.app_id) appevents['app'] = appencoder.transform(appevents.app_id) instaled_app = pd.merge(appevents, events.loc[:, ['event_id', 'device_id']], how='left', on='event_id') #instaled_app.reset_index(inplace=True) gc.collect() #label data applabels = pd.read_csv(app_labels_path) labels_ctegory = pd.read_csv(labels_category_path) #factorize the labels category labels_ctegory.category = pd.factorize(labels_ctegory.category)[0] #merge the app labels and labels category applabels = pd.merge(applabels, labels_ctegory, on='label_id', how='left') gc.collect() #add new features: the number of occurance (popularity ) of each label id dict_label = dict(applabels.label_id.value_counts()) applabels['label-occurence'] = applabels.label_id.apply(dict_label.get) gc.collect() #select only those in application events applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())] #transforme the app_id with the encoder already defined applabels['app'] = appencoder.transform(applabels.app_id) # perform a one hot encoding for the label id labelencoder = LabelEncoder().fit(applabels.label_id) applabels['label'] = labelencoder.transform(applabels.label_id) gc.collect() #merge the installed_app and the applabels so by then to merge them with the train data label_features = pd.merge( instaled_app.loc[:, ['device_id', 'app']], applabels.loc[:, ['app', 'label', 'label-occurence', 'category']], on='app', how='left') #label_features.reset_index(inplace=True) # merge the train data with new set to mark For each device which apps it has installed #train = pd.merge(train , instaled_app , how = 'left',on='device_id') # merge the train data with label_features to mark For each device the label of the app used. 
train = pd.merge(train, label_features, how='left', on='device_id') train.fillna(-1, inplace=True) #add features: # 1 : #number of same app used per device: frame = pd.DataFrame(train.loc[:, ['device_id', 'app']].groupby( ['device_id', 'app']).size(), columns=['nbr_same_app']).reset_index() gc.collect() train = pd.merge(train, frame, on=['device_id', 'app'], how='left') #rectify the 1 in the number of same app for devices with no events train.loc[train.app == -1.0, 'nbr_same_app'] = -1 # 2 : #number of same label used per device: frame = pd.DataFrame(train.loc[:, ['device_id', 'label']].groupby( ['device_id', 'label']).size(), columns=['nbr_same_label']).reset_index() gc.collect() train = pd.merge(train, frame, on=['device_id', 'label'], how='left') #rectify the 1 in the number of same app for devices with no events train.loc[train.app == -1.0, 'nbr_same_label'] = -1 # 3 : #number of same category used per device: frame = pd.DataFrame(train.loc[:, ['device_id', 'category']].groupby( ['device_id', 'category']).size(), columns=['nbr_same_category']).reset_index() gc.collect() train = pd.merge(train, frame, on=['device_id', 'category'], how='left') #rectify the 1 in the number of same app for devices with no events train.loc[train.app == -1.0, 'nbr_same_category'] = -1 #4 : number of occurence of each device dict_device = dict(train.loc[:, ['device_id', 'app']].groupby( ['device_id'])['app'].agg(np.size)) train['device_occur'] = train.device_id.apply(dict_device.get) # 5 sum of labels dict_device = dict(train.loc[:, ['device_id', 'nbr_same_label']].groupby( ['device_id'])['nbr_same_label'].agg(np.sum)) train['sum_of_labels'] = train.device_id.apply(dict_device.get) #6 sum of app dict_device = dict(train.loc[:, ['device_id', 'nbr_same_app']].groupby( ['device_id'])['nbr_same_app'].agg(np.sum)) train['sum_of_app'] = train.device_id.apply(dict_device.get) #7 sum of category dict_device = dict(train.loc[:, ['device_id', 'nbr_same_category']].groupby([ 'device_id' ])['nbr_same_category'].agg(np.sum)) train['sum_of_category'] = train.device_id.apply(dict_device.get) #done from adding new features , drop the deplicated device_ids train.drop_duplicates('device_id', keep='first', inplace=True) train.reset_index(drop=True, inplace=True) train.loc[train.app != -1, ['app', 'label', 'category']] = 1 train.drop([ 'label-occurence', 'nbr_same_app', 'nbr_same_label', 'nbr_same_category' ], axis=1, inplace=True) # target target = train.group #drop the device id and target from the train data train.drop(['device_id', 'group'], axis=1, inplace=True) test_loader = pd.read_csv(path_test) test = pd.merge(test_loader, brand, how='left', on='device_id', left_index=True) # merge the test data with new set to mark For each device which apps it has installed #test = pd.merge(test , instaled_app , how = 'left',on='device_id') # merge the train data with label_features to mark For each device the label of the app used. 
test = pd.merge(test, label_features, how='left', on='device_id') test.fillna(-1, inplace=True) # the same for the test set , we add the previous features , i proceeded in this manner (each one , test and train , alone) for the fact of memory #add features: # 1 : #number of times of the same app used per device: frame = pd.DataFrame(test.loc[:, ['device_id', 'app']].groupby( ['device_id', 'app']).size(), columns=['nbr_same_app']).reset_index() gc.collect() test = pd.merge(test, frame, on=['device_id', 'app'], how='left') #rectify the 1 in the number of same app for devices with no events test.loc[test.app == -1.0, 'nbr_same_app'] = -1 # 2 : #number of times of the same label used per device: frame = pd.DataFrame(test.loc[:, ['device_id', 'label']].groupby( ['device_id', 'label']).size(), columns=['nbr_same_label']).reset_index() gc.collect() test = pd.merge(test, frame, on=['device_id', 'label'], how='left') #rectify the 1 in the number of times of the same app for devices with no events test.loc[test.app == -1.0, 'nbr_same_label'] = -1 # 3 : #number of times of the same category used per device: frame = pd.DataFrame(test.loc[:, ['device_id', 'category']].groupby( ['device_id', 'category']).size(), columns=['nbr_same_category']).reset_index() gc.collect() test = pd.merge(test, frame, on=['device_id', 'category'], how='left') #rectify the 1 in the number of times of the same app for devices with no events test.loc[test.app == -1.0, 'nbr_same_category'] = -1 #4 : number of occurence of each device dict_device = dict(test.groupby(['device_id'])['app'].agg(np.size)) test['device_occur'] = test.device_id.apply(dict_device.get) # 5 sum of labels dict_device = dict(test.loc[:, ['device_id', 'nbr_same_label']].groupby( ['device_id'])['nbr_same_label'].agg(np.sum)) test['sum_of_labels'] = test.device_id.apply(dict_device.get) #6 sum of app dict_device = dict(test.loc[:, ['device_id', 'nbr_same_app']].groupby( ['device_id'])['nbr_same_app'].agg(np.sum)) test['sum_of_app'] = test.device_id.apply(dict_device.get) #7 sum of category dict_device = dict(test.loc[:, ['device_id', 'nbr_same_category']].groupby( ['device_id'])['nbr_same_category'].agg(np.sum)) test['sum_of_category'] = test.device_id.apply(dict_device.get) test.drop_duplicates('device_id', keep='first', inplace=True) test.reset_index(drop=True, inplace=True) test.loc[test.app != -1, ['app', 'label', 'category']] = 1 test.drop([ 'device_id', 'label-occurence', 'nbr_same_app', 'nbr_same_label', 'nbr_same_category' ], axis=1, inplace=True) return train, test, target
def distance_matrix(data, numeric_distance="euclidean", categorical_distance="jaccard"): """ Compute the pairwise distance attribute by attribute in order to account for different variables type: - Continuous - Categorical For ordinal values, provide a numerical representation taking the order into account. Categorical variables are transformed into a set of binary ones. If both continuous and categorical distance are provided, a Gower-like distance is computed and the numeric variables are all normalized in the process. If there are missing values, the mean is computed for numerical attributes and the mode for categorical ones. Note: If weighted-hamming distance is chosen, the computation time increases a lot since it is not coded in C like other distance metrics provided by scipy. @params: - data = pandas dataframe to compute distances on. - numeric_distances = the metric to apply to continuous attributes. "euclidean" and "cityblock" available. Default = "euclidean" - categorical_distances = the metric to apply to binary attributes. "jaccard", "hamming", "weighted-hamming" and "euclidean" available. Default = "jaccard" @returns: - the distance matrix """ possible_continuous_distances = ["euclidean", "cityblock"] possible_binary_distances = [ "euclidean", "jaccard", "hamming", "weighted-hamming" ] number_of_variables = data.shape[1] number_of_observations = data.shape[0] # Get the type of each attribute (Numeric or categorical) is_numeric = [ all(isinstance(n, numbers.Number) for n in data.iloc[:, i]) for i, x in enumerate(data) ] is_all_numeric = sum(is_numeric) == len(is_numeric) is_all_categorical = sum(is_numeric) == 0 is_mixed_type = not is_all_categorical and not is_all_numeric # Check the content of the distances parameter if numeric_distance not in possible_continuous_distances: print("The continuous distance " + numeric_distance + " is not supported.") return None elif categorical_distance not in possible_binary_distances: print("The binary distance " + categorical_distance + " is not supported.") return None # Separate the data frame into categorical and numeric attributes and normalize numeric data if is_mixed_type: number_of_numeric_var = sum(is_numeric) number_of_categorical_var = number_of_variables - number_of_numeric_var data_numeric = data.iloc[:, is_numeric] data_numeric = (data_numeric - data_numeric.mean()) / ( data_numeric.max() - data_numeric.min()) data_categorical = data.iloc[:, [not x for x in is_numeric]] # Replace missing values with column mean for numeric values and mode for categorical ones. 
With the mode, it # triggers a warning: "SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame" # but the value are properly replaced if is_mixed_type: data_numeric.fillna(data_numeric.mean(), inplace=True) for x in data_categorical: data_categorical[x].fillna(data_categorical[x].mode()[0], inplace=True) elif is_all_numeric: data.fillna(data.mean(), inplace=True) else: for x in data: data[x].fillna(data[x].mode()[0], inplace=True) # "Dummifies" categorical variables in place if not is_all_numeric and not (categorical_distance == 'hamming' or categorical_distance == 'weighted-hamming'): if is_mixed_type: data_categorical = pd.get_dummies(data_categorical) else: data = pd.get_dummies(data) elif not is_all_numeric and categorical_distance == 'hamming': if is_mixed_type: data_categorical = pd.DataFrame([ pd.factorize(data_categorical[x])[0] for x in data_categorical ]).transpose() else: data = pd.DataFrame([pd.factorize(data[x])[0] for x in data]).transpose() if is_all_numeric: result_matrix = cdist(data, data, metric=numeric_distance) elif is_all_categorical: if categorical_distance == "weighted-hamming": result_matrix = weighted_hamming(data) else: result_matrix = cdist(data, data, metric=categorical_distance) else: result_numeric = cdist(data_numeric, data_numeric, metric=numeric_distance) if categorical_distance == "weighted-hamming": result_categorical = weighted_hamming(data_categorical) else: result_categorical = cdist(data_categorical, data_categorical, metric=categorical_distance) result_matrix = np.array([[ 1.0 * (result_numeric[i, j] * number_of_numeric_var + result_categorical[i, j] * number_of_categorical_var) / number_of_variables for j in range(number_of_observations) ] for i in range(number_of_observations)]) # Fill the diagonal with NaN values np.fill_diagonal(result_matrix, np.nan) return pd.DataFrame(result_matrix)
train_test.describe() ## Apply log transformation for feats in skewed_feats: train_test[feats] = train_test[feats] + 1 train_test[feats] = np.log(train_test[feats]) train_test.describe() #Identify categorical features features = train.columns cats = [feat for feat in features if 'cat' in feat] print(cats) # factorize categorical features for feat in cats: train_test[feat] = pd.factorize(train_test[feat], sort=True)[0] train_test['cat100'][:10] # Split back into test and train x_train = train_test.iloc[:ntrain, :] x_test = train_test.iloc[ntrain:, :] train = x_train.copy() test = x_test.copy() train.shape train.head() ## Transform target into np log of loss train_labels = np.log(np.array(train_loader['loss'])) train_ids = train_loader['id'].values.astype(np.int32)
def read_data(files, per_chromosome=False, chromosomes=[None], binarize=True, enable_collapse_strands=True, drop_ambiguous=True, outfile=None, verbose=True): ''' Reads all specified files and converts them to a sparse matrix in anndata format. Parameters: files: List of files to containing the individual cell assays. per_chromosome: boolean, whether to output an individual anndata file per_chromosome. Chromosomes are automatically inferred from the first file or can be specified in chromosomes. chromosomes: list of strings, which chromosomes to generate an output for. binarize: boolean, whether to return a binary methylation rate enable_collapse_strands: boolean, whether to sum reads from neighboring methylation sites drop_ambiguous: boolean, whether to drop sites with 0.5 methylation rate drop_rate_columns: boolean, whether columns met_reads and nonmet_reads should be removed outfile: str, Prefix name of output file verbose: boolean, whether verbose log output should be printed Returns: pd.DataFrame with added rate column ''' # Get cellnames from input files cellnames = [ re.sub('\\.csv|\\.txt|\\.tsv|\\.gz', '', os.path.basename(file)) for file in files ] # Find which chromosomes to read if per chromosome if per_chromosome: if chromosomes[0] is None: # Read first file to find which chromosomes exist chromosomes = pd.unique( pd.read_csv(files[0], sep='\t', usecols=[0], skiprows=1, header=None).iloc[:, 0]) # Read Files and generate anndata formatted files for chromosome in chromosomes: allmet = read_cells(files, chromosome) allmet = [ calculate_met_rate(met, binarize=binarize, enable_collapse_strands=enable_collapse_strands, drop_ambiguous=drop_ambiguous) for met in allmet ] if verbose: print('Concatenating...') allmet = pd.concat(allmet, keys=cellnames, copy=False) allmet.sort_values(['chr', 'location'], inplace=True) allmet.met_rate = allmet.met_rate + 1 print('Constructing reference index...') allmet['ind'] = make_genomic_index(allmet.location.values) rowcoord, rownames = pd.factorize(allmet.index.get_level_values( 0)) # Numerical representation of the index objects print('Constructing sparse matrix...') obs = pd.DataFrame(index=rownames) var = allmet.drop_duplicates(subset='ind')[['chr', 'location' ]].reset_index(drop=True) metmat = sparse.coo_matrix( (allmet.met_rate, (rowcoord, allmet['ind'])), dtype=np.int64, shape=(obs.shape[0], var.shape[0])) metmat = metmat.tocsc() print('Constructing anndata object...') a = anndata.AnnData(X=metmat, obs=obs, var=var, dtype=np.int32) if outfile is not None: print('Writing h5ad file') if chromosome is not None: chr_outfile = re.sub('$', '_chr_%s.h5ad' % chromosome, re.sub('.h5ad', '', outfile)) else: chr_outfile = outfile a.write(chr_outfile) a.file.close() if not per_chromosome: return a
from sklearn.ensemble import RandomForestRegressor import xgboost as xgb from scipy.stats import pearsonr #Read Allstate dataset allstate = pd.read_csv("train.csv") test = pd.read_csv("test.csv") allstate.head() test.head() #Factorize categorical variables feature_col = list(allstate.columns[1:-1]) cats = [name for name in feature_col if 'cat' in name] for name in cats: allstate[name] = pd.factorize(allstate[name], sort=True)[0] test[name] = pd.factorize(test[name], sort=True)[0] #Data Exploration len(cats) #Number of categorical variables len(feature_col) - len(cats) #Number of numerical variables allstate.isnull().sum().sum() #Check for missing value allstate['loss'].describe() #Range of target variable is huge #EDA on loss sns.distplot(allstate["loss"]) #We should scaling targer variable plt.ylabel("Density") sns.distplot(np.log(allstate["loss"])) #Ideal normal distribution plt.xlabel("Log(loss)") plt.ylabel("Density")
def train_test(num_rows=None): print("Loading datasets...") # load datasets train_df = pd.read_csv('../input/train.tsv', sep='\t', nrows=num_rows) test_df = pd.read_csv('../input/test.tsv', sep='\t', nrows=num_rows) print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df))) #testのtargetをnanにしときます test_df['visitors'] = np.nan # merge df = train_df.append(test_df[['datetime', 'park', 'visitors']]).reset_index() del train_df, test_df gc.collect() # 日付をdatetime型へ変換 df['datetime'] = pd.to_datetime(df['datetime']) # 日本の祝日データを追加 df['japanese_holiday'] = getJapaneseHolidays(df['datetime']).replace(2, 1) # 連休数のファクターを生成 holidays = df.groupby('datetime')['japanese_holiday'].mean().replace(2, 1) holidays = fillHolidays(holidays).replace(2, 1) # 休日の谷間の平日を休日にする df['num_holidays'] = df['datetime'].map(getNumHolidays(holidays)) # 季節性の特徴量を追加 df['day'] = df['datetime'].dt.day.astype(object) df['month'] = df['datetime'].dt.month.astype(object) df['weekday'] = df['datetime'].dt.weekday.astype(object) df['weekofyear'] = df['datetime'].dt.weekofyear.astype(object) # df['day_month'] = df['day'].astype(str)+'_'+df['month'].astype(str) # df['day_weekday'] = df['day'].astype(str)+'_'+df['weekday'].astype(str) # df['day_weekofyear'] = df['day'].astype(str)+'_'+df['weekofyear'].astype(str) df['month_weekday'] = df['month'].astype(str) + '_' + df['weekday'].astype( str) df['month_weekofyear'] = df['month'].astype( str) + '_' + df['weekofyear'].astype(str) # df['weekday_weekofyear'] = df['weekday'].astype(str)+'_'+df['weekofyear'].astype(str) df['new_years_day'] = getNewYearsDay(df['datetime']) df['golden_week'] = getGoldenWeek(df['datetime']) # df['park_day'] = df['park'].astype(str)+'_'+df['day'].astype(str) df['park_month'] = df['park'].astype(str) + '_' + df['month'].astype(str) df['park_weekday'] = df['park'].astype(str) + '_' + df['weekday'].astype( str) df['park_japanese_holiday'] = df['park'].astype( str) + '_' + df['japanese_holiday'].astype(str) # df['park_weekofyear'] = df['park'].astype(str)+'_'+df['weekofyear'].astype(str) df['park_num_holiday'] = df['park'].astype( str) + '_' + df['num_holidays'].astype(str) df['park_new_years_day'] = df['park'].astype( str) + '_' + df['new_years_day'].astype(str) df['park_golden_week'] = df['park'].astype( str) + '_' + df['golden_week'].astype(str) # categorical変数を変換 df_res, cat_cols = one_hot_encoder(df, nan_as_category=False) # stratify & mearge用 df_res['park'] = df['park'] df_res['weekofyear'] = df['weekofyear'].astype(int) df_res['weekday'] = df['weekday'].astype(int) df_res['year'] = df['datetime'].dt.year.astype(int) df_res['month'] = df['datetime'].dt.month.astype(int) df_res['park_month'], _ = pd.factorize(df['park_month']) df_res['park_japanese_holiday'], _ = pd.factorize( df['park_japanese_holiday']) # df_res['ISESHIMA_summit'] = ((df['park']=='伊勢志摩国立公園')&df['japanese_holiday']&('2016-5-27'>df['datetime'])&(df['datetime']>'2015-6-5')).astype(int) # 2016年伊勢島サミット開催決定後の休日フラグ return df_res
def scvi( adata: AnnData, n_hidden: int = 128, n_latent: int = 10, n_layers: int = 1, dispersion: str = "gene", n_epochs: int = 400, lr: int = 1e-3, train_size: int = 1.0, batch_key: Optional[str] = None, use_highly_variable_genes: bool = True, subset_genes: Optional[Sequence[Union[int, str]]] = None, linear_decoder: bool = False, copy: bool = False, use_cuda: bool = True, return_posterior: bool = True, trainer_kwargs: dict = {}, model_kwargs: dict = {}, ) -> Optional[AnnData]: """\ SCVI [Lopez18]_. Fits scVI model onto raw count data given an anndata object scVI uses stochastic optimization and deep neural networks to aggregate information across similar cells and genes and to approximate the distributions that underlie observed expression values, while accounting for batch effects and limited sensitivity. To use a linear-decoded Variational AutoEncoder model (implementation of [Svensson20]_.), set linear_decoded = True. Compared to standard VAE, this model is less powerful, but can be used to inspect which genes contribute to variation in the dataset. It may also be used for all scVI tasks, like differential expression, batch correction, imputation, etc. However, batch correction may be less powerful as it assumes a linear model. .. note:: More information and bug reports `here <https://github.com/YosefLab/scVI>`__. Parameters ---------- adata An anndata file with `X` attribute of unnormalized count data n_hidden Number of nodes per hidden layer n_latent Dimensionality of the latent space n_layers Number of hidden layers used for encoder and decoder NNs dispersion One of the following * `'gene'` - dispersion parameter of NB is constant per gene across cells * `'gene-batch'` - dispersion can differ between different batches * `'gene-label'` - dispersion can differ between different labels * `'gene-cell'` - dispersion can differ for every gene in every cell n_epochs Number of epochs to train lr Learning rate train_size The train size, either a float between 0 and 1 or an integer for the number of training samples to use batch_key Column name in anndata.obs for batches. If None, no batch correction is performed If not None, batch correction is performed per batch category use_highly_variable_genes If true, uses only the genes in anndata.var["highly_variable"] subset_genes Optional list of indices or gene names to subset anndata. If not None, use_highly_variable_genes is ignored linear_decoder If true, uses LDVAE model, which is an implementation of [Svensson20]_. copy If true, a copy of anndata is returned return_posterior If true, posterior object is returned use_cuda If true, uses cuda trainer_kwargs Extra arguments for UnsupervisedTrainer model_kwargs Extra arguments for VAE or LDVAE model Returns ------- If `copy` is true, anndata is returned. If `return_posterior` is true, the posterior object is returned If both `copy` and `return_posterior` are true, a tuple of anndata and the posterior are returned in that order. `adata.obsm['X_scvi']` stores the latent representations `adata.obsm['X_scvi_denoised']` stores the normalized mean of the negative binomial `adata.obsm['X_scvi_sample_rate']` stores the mean of the negative binomial If linear_decoder is true: `adata.uns['ldvae_loadings']` stores the per-gene weights in the linear decoder as a genes by n_latent matrix. 
""" try: from scvi.models import VAE, LDVAE from scvi.inference import UnsupervisedTrainer from scvi.dataset import AnnDatasetFromAnnData except ImportError: raise ImportError( "Please install scvi package from https://github.com/YosefLab/scVI" ) # check if observations are unnormalized using first 10 # code from: https://github.com/theislab/dca/blob/89eee4ed01dd969b3d46e0c815382806fbfc2526/dca/io.py#L63-L69 if len(adata) > 10: X_subset = adata.X[:10] else: X_subset = adata.X norm_error = ( 'Make sure that the dataset (adata.X) contains unnormalized count data.' ) if sp.sparse.issparse(X_subset): assert (X_subset.astype(int) != X_subset).nnz == 0, norm_error else: assert np.all(X_subset.astype(int) == X_subset), norm_error if subset_genes is not None: adata_subset = adata[:, subset_genes] elif use_highly_variable_genes and "highly_variable" in adata.var: adata_subset = adata[:, adata.var["highly_variable"]] else: adata_subset = adata if batch_key is not None: codes, uniques = pd.factorize(adata_subset.obs[batch_key]) adata_subset.obs['_tmp_scvi_batch'] = codes n_batches = len(uniques) else: n_batches = 0 dataset = AnnDatasetFromAnnData(adata_subset.copy(), batch_label='_tmp_scvi_batch') if linear_decoder: vae = LDVAE( n_input=dataset.nb_genes, n_batch=n_batches, n_labels=dataset.n_labels, n_hidden=n_hidden, n_latent=n_latent, n_layers_encoder=n_layers, dispersion=dispersion, **model_kwargs, ) else: vae = VAE( dataset.nb_genes, n_batch=n_batches, n_labels=dataset.n_labels, n_hidden=n_hidden, n_latent=n_latent, n_layers=n_layers, dispersion=dispersion, **model_kwargs, ) trainer = UnsupervisedTrainer( model=vae, gene_dataset=dataset, use_cuda=use_cuda, train_size=train_size, **trainer_kwargs, ) trainer.train(n_epochs=n_epochs, lr=lr) full = trainer.create_posterior( trainer.model, dataset, indices=np.arange(len(dataset)) ) latent, batch_indices, labels = full.sequential().get_latent() if copy: adata = adata.copy() adata.obsm['X_scvi'] = latent adata.obsm['X_scvi_denoised'] = full.sequential().get_sample_scale() adata.obsm['X_scvi_sample_rate'] = full.sequential().imputation() if linear_decoder: loadings = vae.get_loadings() df = pd.DataFrame(loadings, index=adata_subset.var_names) adata.uns['ldvae_loadings'] = df if copy and return_posterior: return adata, full elif copy: return adata elif return_posterior: return full
from sklearn import preprocessing import matplotlib.pyplot as plt import os import pickle from sklearn.model_selection import GridSearchCV from time import time from scipy.stats import randint as sp_randint #=================================================================== # Load Data os.chdir(os.getenv("HOME") + "/Desktop/Dropbox/CS229/Project") d = pd.read_table("./data/svz_data.txt", sep=",") d = d.drop("Unnamed: 0", axis=1) y = pd.factorize(d.Age)[0] X = d.iloc[:, 7:] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2, shuffle=True) #=================================================================== # Basic rf on full dataset classifier = RandomForestClassifier(n_estimators=200, n_jobs=3, random_state=0) classifier.fit(X_train, y_train) acc_train = classifier.score(X_train, y_train) # 0.72 acc_test = classifier.score(X_test, y_test) # 0.71 pred = classifier.predict(X_test) #===================================================================
# importing packages import pandas as pd import numpy as np # preprocessing the data data = pd.read_csv('spam.csv', encoding='latin-1') data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1) # encoding Data data['label'] = pd.factorize(data['label'])[0] # module for removing unwanted words import re import nltk nltk.download("stopwords") from nltk.corpus import stopwords # for stemming words from nltk.stem.porter import PorterStemmer temp = [] for row in data.itertuples(): # to keep a - z letters and 0 - 9 rev = re.sub("[^0-9a-zA-Z]", " ", row[2]) rev = rev.lower() rev = rev.split() ps = PorterStemmer() rev = [ ps.stem(word) for word in rev if not word in set(stopwords.words("english")) ] rev = " ".join(rev) temp.append(rev) data['msg'] = temp
from typing import List
import datetime as dt

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from matplotlib.axes import Axes


def casePlot(case_index: List[int], start: List[dt.date], end: List[dt.date],
             hue: List[str] = None, hue_lines: List[str] = None,
             hue_lines_colors: List[str] = None, na_values=dt.date.today(),
             ax: Axes = None):
    if ax is None:
        fig, ax = plt.subplots()
    if type(start) != list:
        start = list(start)
    if type(end) != list:
        end = list(end)
    if type(case_index) != list:
        case_index = list(case_index)

    # combine dates and case_index
    date = start + end
    case_index = case_index + case_index

    # Plot - get dots
    if hue is None:
        sns.scatterplot(x=date, y=case_index, color='blue', ax=ax)
    else:
        if type(hue) != list and hue is not None:
            hue = list(hue)
        hue = hue + hue
        sns.scatterplot(x=date, y=case_index, hue=hue, ax=ax)

    # Lines in plt
    isUnfinished = pd.isnull(end)
    end__ = [
        pd.Timestamp(na_values) if unfinished else e
        for e, unfinished in zip(end, isUnfinished)
    ]
    if hue_lines is None:
        plt.hlines(case_index, xmin=start, xmax=end__)
    # if hue_lines non-empty
    elif hue_lines is not None:
        if hue_lines_colors is None:
            hue_lines_colors = [
                "#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"
            ]
            for i in range(10):
                hue_lines_colors = hue_lines_colors + hue_lines_colors
        cols = hue_lines_colors
        mapIntToCol = dict(enumerate(cols)).get
        hue_line_colors = list(map(mapIntToCol, pd.factorize(hue_lines)[0]))
        hue_lines_unique = uniqueList(hue_lines)  # uniqueList: helper defined elsewhere
        plt.hlines(case_index, xmin=start, xmax=end__, colors=hue_line_colors,
                   zorder=1)
        # legend
        handles_lines = [
            mlines.Line2D([], [], color=hue_lines_colors[i], marker='_',
                          markersize=15, label=label)
            for i, label in enumerate(hue_lines_unique)
        ]
        if hue is not None:
            ax.legend(handles=ax.legend_.legendHandles + handles_lines)
        else:
            ax.legend(handles=handles_lines)

    ax.set(xlabel='Year', ylabel='Case Index')
    plt.gca().invert_yaxis()
    plt.yticks(ticks=case_index, labels=case_index)
def convert_data(df):
    # use .loc instead of chained indexing so the assignments are applied reliably
    df.loc[df['Sex'] == 'male', 'Sex'] = 1
    df.loc[df['Sex'] == 'female', 'Sex'] = 0
    df['fare_bin'] = pd.qcut(df.Fare, 4)
    df['fare_id'] = pd.factorize(df.fare_bin)[0] + 1
    df['embarked_id'] = pd.factorize(df.Embarked)[0] + 1
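# Illustrative aside (invented fares, not Titanic data): a sketch of the
# qcut/factorize combination used in convert_data. pd.qcut builds fare quartiles and
# pd.factorize maps each bin to an integer id; the +1 makes the ids 1-based.
import pandas as pd

fares = pd.Series([7.25, 71.28, 8.05, 53.10, 13.00, 30.00, 512.33, 26.00])
fare_bin = pd.qcut(fares, 4)
fare_id = pd.factorize(fare_bin)[0] + 1
print(pd.DataFrame({"fare": fares, "fare_bin": fare_bin, "fare_id": fare_id}))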
def evaluate(predictor, args): train_dir = args.train_dir train_file = args.filename test_file = train_file.replace('train', 'test', 1) target = args.target training_job_name = args.training_job_name s3_output = args.s3_output presets = args.presets dataset_name = train_file.split('_')[0] logging.info(dataset_name) test_data = TabularDataset(os.path.join(train_dir, test_file)) u = urlparse(s3_output, allow_fragments=False) bucket = u.netloc logging.info(bucket) prefix = u.path.strip('/') logging.info(prefix) s3 = boto3.client('s3') y_test = test_data[target] test_data_nolab = test_data.drop(labels=[target], axis=1) y_pred = predictor.predict(test_data_nolab) y_pred_df = pd.DataFrame.from_dict({'True': y_test, 'Predicted': y_pred}) pred_file = f'{dataset_name}_test_predictions.csv' y_pred_df.to_csv(pred_file, index=False, header=True) leaderboard = predictor.leaderboard() lead_file = f'{dataset_name}_leaderboard.csv' leaderboard.to_csv(lead_file) perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True) #del perf['confusion_matrix'] perf_file = f'{dataset_name}_model_performance.txt' with open(perf_file, 'w') as f: print(json.dumps(perf, indent=4, default=pd.DataFrame.to_json), file=f) summary = predictor.fit_summary() summ_file = f'{dataset_name}_fit_summary.txt' with open(summ_file, 'w') as f: print(summary, file=f) y_prob = predictor.predict_proba(test_data_nolab) y_prob = y_prob.iloc[:,-1] y_test_enc, uniques = pd.factorize(y_test) # Label Encoding fig = plt.figure(figsize=(14,4)) plt.subplot(1,3,1) plot_roc_curve(y_test_enc, y_prob) plt.subplot(1,3,2) plot_pr_curve(y_test_enc, y_prob) plt.subplot(1,3,3) plot_conf_mtx(y_test_enc, y_prob, 0.5) eval_file = f'{dataset_name}_eval.png' plt.savefig(eval_file) plt.close(fig) # # Feature importance # featimp = predictor.feature_importance(test_data) # fig, ax = plt.subplots(figsize=(12,5)) # plot = sns.barplot(x=featimp.index, y=featimp.values) # ax.set_title('Feature Importance') # plot.set_xticklabels(plot.get_xticklabels(), rotation='vertical') # featimp_imgfile = f'{dataset_name}_featimp.png' # featimp_csvfile = f'{dataset_name}_featimp.csv' # fig.savefig(featimp_imgfile) # featimp.to_csv(featimp_csvfile) # plt.close(fig) # Cleanup data in order to avoid disk space issues predictor.save_space() predictor.delete_models(models_to_keep='best', dry_run=False) files_to_upload = [pred_file, lead_file, perf_file, summ_file, eval_file] for file in files_to_upload: s3.upload_file(file, bucket, os.path.join(prefix, training_job_name.replace('mxnet-training', 'autogluon', 1), file))
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
print(len(df))
train, test = df[df['is_train'] == True], df[df['is_train'] == False]
features = df.columns[:4]
#print(train[features].values.tolist())
#print(test.loc[2])  # [4.9, 3, 1.4, 0.2]

# model
clf = RandomForestClassifier(n_estimators=100, n_jobs=2)
y, _ = pd.factorize(train['species'])
#print(pd.factorize(train['species']))
#y2, _ = pd.factorize(test['species'])

# train
clf.fit(train[features], y)
aa = clf.predict([[6.5, 3.0, 5.2, 2.0]])
print(aa)
preds = iris.target_names[clf.predict(test[features])]
#print(preds)
y2 = pd.crosstab(test['species'], preds, rownames=['actual'], colnames=['preds'])
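# Illustrative aside (hard-coded labels, separate from the iris script above):
# pd.factorize returns both the integer codes and the unique labels, so predictions
# made on the codes can be mapped back to names with `uniques` (or, as above, with
# iris.target_names).
import pandas as pd

species = pd.Series(["setosa", "versicolor", "setosa", "virginica"])
codes, uniques = pd.factorize(species)
print(codes)           # [0 1 0 2]
print(uniques[codes])  # round-trips back to the original labels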
import pandas as pd
from sklearn.metrics import f1_score

# extract accident_id, 1st_road_class, speed_limit, road_surface_conditions, target
accident_columns = [
    "accident_id", "1st_road_class", "speed_limit", "road_surface_conditions", "time"
]
vehicle_columns = ["accident_id", "Vehicle_Type", "Age_of_Driver"]
# note: age of driver isn't always available

accidents_train = pd.read_csv("data/train.csv", usecols=accident_columns + ["target"])
accidents_unseen = pd.read_csv("data/test.csv", usecols=accident_columns)
vehicles = pd.read_csv("data/vehicles.csv", usecols=vehicle_columns)

# split features and target
target = pd.factorize(accidents_train["target"])[0]
train = accidents_train.drop(axis=1, labels=["target"])

# --------------------- preprocessing --------------------------#
# join and drop empty values (driver age)
print(accidents_unseen.shape)
train = train.merge(vehicles, on='accident_id',
                    suffixes=('_accidents', '_vehicles')).fillna(999)
unseen = accidents_unseen.merge(vehicles, on='accident_id',
                                suffixes=('_accidents', '_vehicles')).fillna(999)
print(train.shape)

# create 24 hour buckets (one possible approach is sketched just below)
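# Illustrative sketch (an assumption, not necessarily the original author's approach):
# the snippet stops before the 24 hour buckets are built. One plausible way to bucket
# an "HH:MM" time string is to keep just the hour, giving 24 integer buckets; the
# values below are invented and the parsing is a guess.
import pandas as pd

times = pd.Series(["07:45", "17:30", "23:10", "07:05"])
hour_bucket = times.str.slice(0, 2).astype(int)  # one bucket per hour, 0-23
print(hour_bucket.tolist())                      # [7, 17, 23, 7]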
def prepare_data(df1): df=df1.copy() df['DOB1']=df['DOB'].map(lambda x: datetime(1900+int(x[6:]),int(x[3:5]),int(x[:2])) if len(str(x))>=8 else datetime(1997,1,1)) df['Lead_Creation_Date1']=df['Lead_Creation_Date'].map(lambda x: datetime(2000+int(x[6:]),int(x[3:5]),int(x[:2])) if len(str(x))>=8 else datetime(1997,1,1)) # for i in range(12): # df['FEAT_ID_%d'%i],_=pd.factorize(df.index.map(lambda x: x[3+i])) df['FEAT_GENDER'],_=pd.factorize(df['Gender']) df['FEAT_DOB_YEAR'],_=pd.factorize(df['DOB1'].dt.year) df['FEAT_DOB_MONTH'],_=pd.factorize(df['DOB1'].dt.month) df['FEAT_LCD_MONTH'],_=pd.factorize(df['Lead_Creation_Date1'].dt.month) df['FEAT_LCD_WEEK'],_=pd.factorize(df['Lead_Creation_Date1'].dt.week) df['FEAT_LCD_DOW'],_=pd.factorize(df['Lead_Creation_Date1'].dt.dayofweek) df['FEAT_DOB_MONTH'],_=pd.factorize(df['DOB1'].dt.month) df['FEAT_DOB_LCD_MONTH_DIFF'],_=pd.factorize(df['Lead_Creation_Date1'].dt.month-df['DOB1'].dt.month) df['FEAT_DOB_LCD_WEEK_DIFF'],_=pd.factorize(np.clip(df.Lead_Creation_Date1.dt.week-df.DOB1.dt.week,-10,10)) df['FEAT_DOB_LCD_DAY_DIFF'],_=pd.factorize(np.clip(df.Lead_Creation_Date1.dt.dayofyear-df.DOB1.dt.dayofyear,-10,10)) for i in range(6): df['FEAT_CITY1_%d'%i],_=pd.factorize(df['City_Code'].map(lambda x: str(x)[:i+1])) df['FEAT_CITY2'],_=pd.factorize(df['City_Category']) for i in range(4): df['FEAT_EMP1_%d'%i],_=pd.factorize(df['Employer_Code'].map(lambda x: str(x)[:-i-1])) df['FEAT_EMP2'],_=pd.factorize(df['Employer_Category1']) df['FEAT_EMP3'],_=pd.factorize(df['Employer_Category2']) df['FEAT_INCOME1']=np.clip(np.log1p(df['Monthly_Income']),0,10).astype(np.int64) df['CONT_FEAT_INCOME1']=np.clip(df['Monthly_Income'].fillna(-99999),-99999,9500) df['FEAT_INCOME2']=df['Monthly_Income'].astype(np.int64)%10 df['FEAT_BANK1'],_=pd.factorize(df['Customer_Existing_Primary_Bank_Code']) df['FEAT_BANK2'],_=pd.factorize(df['Customer_Existing_Primary_Bank_Code'].map(lambda x: str(x)[:-1])) df['FEAT_BANK3'],_=pd.factorize(df['Primary_Bank_Type']) df['FEAT_CONTACT1'],_=pd.factorize(df['Contacted']) df['FEAT_CONTACT2'],_=pd.factorize(df['Source']) df['FEAT_CONTACT3'],_=pd.factorize(df['Source'].map(lambda x: x[:3])) df['FEAT_CONTACT4'],_=pd.factorize(df['Source'].map(lambda x: x[3])) df['FEAT_CONTACT5'],_=pd.factorize(df['Source_Category']) df['FEAT_EMI1'],_=pd.factorize(df['Existing_EMI']==0) df['FEAT_EMI2'],_=pd.factorize(pd.cut(df['Existing_EMI'].fillna(-99999),[-99999,0,100,300,600,1200,2400,3600,10000000])) df['FEAT_EMI3'],_=pd.factorize(df['EMI']==0) df['FEAT_EMI4'],_=pd.factorize(pd.cut(df['EMI'].fillna(-99999),[-99999,0,100,300,600,1200,2400,3600,10000000])) df['FEAT_LOAN1'],_=pd.factorize(df['Loan_Amount'].isnull()) df['FEAT_LOAN2'],_=pd.factorize(pd.cut(df['Loan_Amount'].fillna(-1),[-100,-1,0,5000,10000,20000,30000,50000,100000,10000000])) df['CONT_FEAT_LOAN1']=np.clip(df['Loan_Amount'].fillna(-99999),-99999,100000) df['CONT_FEAT_LOAN2']=df[['Loan_Amount','Loan_Period','Existing_EMI']].apply(lambda x: x[0]/(x[1]*x[2]) if x[1]>0 and x[2]>0 else -99999,axis=1).fillna(-99999) df['CONT_FEAT_LOAN3']=df['Interest_Rate'].fillna(-99999) df['CONT_FEAT_EMI1']=df['Existing_EMI'].fillna(-99999) df['CONT_FEAT_EMI2']=df['EMI'].fillna(-99999) df['FEAT_LOAN3'],_=pd.factorize(df['Loan_Period'].fillna(-1)) df['FEAT_LOAN4'],_=pd.factorize((df['Interest_Rate'].fillna(-10)/5).astype(np.int64)) df['FEAT_VAR1'],_=pd.factorize(df['Var1']) for col in [x for x in df.columns if x[:4] in ('FEAT')]: a=df[col].value_counts() df[col.replace('FEAT','COUNT')]=df[col].map(a) if a.min()<=100: 
a=a[a>100] a1=df[col].astype('category') a2=df[col].map(a).fillna(-99999).astype('category') df[col]=a1 df[col+'_CLEAN']=a2 else: df[col]=df[col].astype('category') return df[[x for x in df.columns if x[:4] in ('FEAT','Appr','CONT','COUN')]]
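# Illustrative aside (invented EMI values): a stand-alone sketch of the cut+factorize
# binning used in prepare_data above. Numeric columns such as Existing_EMI are first
# bucketed with pd.cut, and the resulting interval labels are factorized into compact
# integer codes.
import pandas as pd

emi = pd.Series([50.0, 250.0, 800.0, 5000.0])
bins = pd.cut(emi, [-99999, 0, 100, 300, 600, 1200, 2400, 3600, 10000000])
codes, uniques = pd.factorize(bins)
print(codes)    # [0 1 2 3] - one compact integer per observed bucket
print(uniques)  # the observed intervals, in order of appearance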
text_data_file = open(file_name, "rb") category_text_data = pickle.load(text_data_file) #print("Control point 4") #print(process.memory_info()[0]) category_merged_df = pd.merge(merged_df, category_text_data, how="inner", on=["id"]) #print("Control point 5") #print(process.memory_info()[0]) category_merged_df['teacher_prefix_code'] = pd.factorize( category_merged_df['teacher_prefix'])[0] category_merged_df['project_grade_category_code'] = pd.factorize( category_merged_df['project_grade_category'])[0] category_merged_df['school_state_code'] = pd.factorize( category_merged_df['school_state'])[0] #print("Control point 6") #print(process.memory_info()[0]) columns_list = [ "teacher_prefix_code", "project_grade_category_code", "month", "quarter", "teacher_number_of_previously_posted_projects", "total_price", "school_state_code" ] if True: #include_text_data:
y = data['No_Val_Available_8'].values.T
# y = data['No_Val_Available_8'].values.reshape([1, -1]).T

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.8, shuffle=True, random_state=1)

# scatter of the training split, colored by the factorized target
plt.scatter(X_train['DOM:'], X_train['Original:'],
            c=pd.factorize(y_train)[0], cmap=plt.cm.coolwarm)
plt.title("Training data community values vs price")
plt.show()

# Import a support vector classifier
from sklearn.svm import LinearSVC
# Instantiate this model
c_select = select.copy() c_select[j] = i X = x[c_select].values Y = circleradviz(X, anchor) score = metrics.silhouette_score(Y, y, metric='sqeuclidean') if (best_score < score): best_score = score best_select = c_select return (best_score, best_select) classes = np.unique(y) colors = matplotlib.pyplot.cm.rainbow(np.linspace(0, 1, len(classes))) cm = matplotlib.colors.ListedColormap(colors) col = x.columns.values cy = pd.factorize(y)[0] #number of genes m = 6 (score, select) = GeneSelection(m, x, y) X = x[select].values anchor = DimensionAnchor(m) Y = circleradviz(X, anchor) best_score = metrics.silhouette_score(Y, y, metric='sqeuclidean') print('Score: ', best_score) plt.figure(figsize=(6, 6)) cy = pd.factorize(y)[0] t = np.linspace(0, 2 * np.pi, 100)
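# Illustrative aside (invented points and labels): the gene-selection loop above scores
# a 2-D embedding against the class labels with silhouette_score; pd.factorize supplies
# integer labels when y holds strings.
import numpy as np
import pandas as pd
from sklearn import metrics

Y = np.array([[0.0, 0.1], [0.1, 0.0], [1.0, 1.1], [1.1, 1.0]])
y = pd.Series(["tumor", "tumor", "normal", "normal"])
print(metrics.silhouette_score(Y, pd.factorize(y)[0], metric="sqeuclidean"))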
import os
import wave

import pandas as pd

length = []


# load a wave data
def load_wave_data(audio_dir, file_name):
    file_path = os.path.join(audio_dir, file_name)
    # x, fs = librosa.load(file_path, sr=16000)
    wf = wave.open(audio_dir + "/" + file_name + ".wav", "r")
    length.append(float(wf.getnframes()) / wf.getframerate())
    r = float(wf.getnframes()) / wf.getframerate()
    return r


meta_data = pd.read_table("sample_submit.tsv")
labels, uniques = pd.factorize(meta_data['target'])
meta_data['target'] = labels
print(meta_data)
data_size = meta_data.shape

# arrange target label and its name
class_dict = meta_data["target"].unique()
# print(class_dict)

# get training dataset and target dataset
x = list(meta_data.loc[:, "fileName"])
y = list(meta_data.loc[:, "target"])
for i in range(len(y)):
    # do not name this variable "len": that would shadow the builtin for the rest of the script
    wav_len = load_wave_data("test", x[i])
with open("testLength.csv", "w", newline="") as f:
def prepare_data(path, config): thresh = config.get('Evaluation', 'FilterThresh') data = pd.read_csv(path + '/abundance.tsv', index_col=0, sep='\t', header=None) labels = np.genfromtxt(path + '/labels.txt', dtype=np.str_, delimiter=',') core_filt_thresh = float(thresh) opp_filt_thresh = 0.0 data = data.transpose() sums = data.sum(axis=1) data = data.divide(sums, axis=0) labels, label_set = pd.factorize(labels) pos_set = data.iloc[np.where(labels == 1)] neg_set = data.iloc[np.where(labels == 0)] core = filter_data(data, labels, core_filt_thresh, opp_filt_thresh) data = core features = list(data.columns.values) print("There are %d raw features..." % (len(features))) features_df = get_feature_df(features) print("Building tree structure...") try: g = pickle.load( open(path + "/PopPhy-tree-" + str(core_filt_thresh) + "-core.pkl", 'rb')) print("Found tree file...") except: print("Tree file not found...") print("Contsructing tree..") g = Graph() g.build_graph() g.prune_graph(features_df) pickle.dump( g, open(path + "/PopPhy-tree-" + str(core_filt_thresh) + "-core.pkl", 'wb')) print("Populating trees...") results = Parallel(n_jobs=num_cores)( delayed(generate_maps)(x, g, features_df) for x in data.values) my_maps = np.array(np.take(results, 1, 1).tolist()) counts = np.count_nonzero(my_maps, axis=0) my_benchmark = np.array(np.take(results, 0, 1).tolist()) my_benchmark_tree = np.array(np.take(results, 2, 1).tolist()) tree_features = g.graph_vector_features() my_benchmark_df = pd.DataFrame(index=tree_features, data=np.transpose(my_benchmark_tree)) my_benchmark_df = my_benchmark_df.groupby(my_benchmark_df.index).mean() tree_features = my_benchmark_df.index my_benchmark_tree = np.transpose(my_benchmark_df.values) num_tree_features = len(tree_features) print("There are %d tree features..." % (num_tree_features)) return my_maps, my_benchmark, my_benchmark_tree, features, tree_features, labels, label_set, g, features_df
data = [trace2,trace1] fig = go.Figure(data=data,layout=layout) py.iplot(fig) # Plot plot_radar(dat_rad,1,"Churn - Customers") plot_radar(dat_rad,0,"Non Churn - Customers") ###### Correlation matrix plt.figure(figsize=(12, 6)) telcom.drop(['customerID'],axis=1, inplace=True) corr = telcom.apply(lambda x: pd.factorize(x)[0]).corr() ax = sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, linewidths=.2, cmap="YlGnBu") # Other Correlation matrix correlation = telcom.corr() matrix_cols = correlation.columns.tolist() #tick labels corr_array = np.array(correlation) #convert to array #Plotting trace = go.Heatmap(z = corr_array, x = matrix_cols, y = matrix_cols, colorscale = "Viridis", colorbar = dict(title = "Pearson Correlation coefficient",
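# Illustrative aside (toy data; the column names below are invented): the heatmap above
# factorizes every column before calling .corr(). The same trick on a tiny frame is
# shown here; factorize codes are arbitrary label ids, so Pearson correlations over
# nominal columns are only a rough screening tool.
import pandas as pd

toy = pd.DataFrame({
    "contract": ["monthly", "yearly", "monthly", "two_year"],
    "churn":    ["yes", "no", "yes", "no"],
})
print(toy.apply(lambda col: pd.factorize(col)[0]).corr())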
import pandas as pd
import numpy as np

data = pd.read_csv("final.csv", encoding="ISO-8859-1")
data.head()
data = data.drop(['Id', 'Company', 'Educational details', 'jobdescription',
                  'jobid', 'numberofpositions', 'payrate'], axis=1)

feature_cols = ['Education', 'Experience', 'jobtitle', 'loc_1']
X = data[feature_cols]
label = ['industry']
y = data[label]

data['loc_1'] = pd.factorize(data.loc_1)[0]
data['industry'] = pd.factorize(data.industry)[0]
data['Education'] = pd.factorize(data.Education)[0]
data['jobtitle'] = pd.factorize(data.jobtitle)[0]
data.head()

X = data.iloc[:, 0:4].values
y = data.iloc[:, 4].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
def hash_array(vals, encoding="utf8", hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. Parameters ---------- vals : ndarray, Categorical encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. .. versionadded:: 0.20.0 Returns ------- 1d uint64 numpy array of hash values, same length as the vals """ if not hasattr(vals, "dtype"): raise TypeError("must pass a ndarray-like") dtype = vals.dtype if hash_key is None: hash_key = _default_hash_key # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke). if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) elif is_extension_array_dtype(dtype): vals, _ = vals._values_for_factorize() dtype = vals.dtype # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(dtype, np.complex128): return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals)) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. elif isinstance(dtype, np.bool): vals = vals.astype("u8") elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view("i8").astype("u8", copy=False) elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: vals = vals.view("u{}".format(vals.dtype.itemsize)).astype("u8") else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: from pandas import factorize, Categorical, Index codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = hashing.hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types vals = hashing.hash_object_array( vals.astype(str).astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xBF58476D1CE4E5B9) vals ^= vals >> 27 vals *= np.uint64(0x94D049BB133111EB) vals ^= vals >> 31 return vals
def MICAProcess(data, state_column, current_column, max_states, batch_size, window_size, sampleRate, stateDict, expected_input_shape, expected_output_shape): """ Function that turns data into a minmax scaled, batched dataset ready for input into a deep learning model, with options for train test splitting. Also includes "max state consideration" - adds a number of "empty" states to one hot encoding to allow for consistent model input size across different state models """ # One hot encoding for state column # Temp fix for missing states num_states = max_states # AAAHHH HORRIBLE CODE!!!!! # if 'Viterbi' in data.columns: # if type(data[state_column][0]) == type('string'): # temp_data = pd.DataFrame([[0, list(stateDict.keys())[i], 0, list(stateDict.keys())[i], 0, 0] for i in range(max_states)], columns=data.columns) # else: # # Holy HELL this is a bad idea. # temp_data = pd.DataFrame([[0, i, 0, i, 0, 0] for i in range(max_states)], columns=data.columns) # else: # temp_data = pd.DataFrame([[0,i,0,0,0] for i in range(max_states)], columns=data.columns) # # data = data.append(temp_data) states, _ = pd.factorize(data[state_column], sort=True) data[state_column] = states cat_labels = pd.get_dummies(data[state_column], prefix="state_") """ # Max state consideration - add 0 rows to make up for max states for missing_state in range(num_states, max_states): cat_labels["state" + str(missing_state)] = 0 """ # Replace labels column with new cat_labels dataframe data.drop([state_column], axis=1, inplace=True) xdata = data ydata = cat_labels # Truncate data for batching dataSize = len(xdata) // (batch_size * window_size) * (batch_size * window_size) xdata, ydata = xdata[ current_column][:dataSize].values, ydata[:dataSize].values # Minmax scaling minmax = MinMaxScaler() xdata = minmax.fit_transform(xdata.reshape(-1, 1)) # Reshaping dat afor use in batched input xdata = xdata.reshape(-1, batch_size, *expected_input_shape) ydata = ydata.reshape(-1, batch_size, *expected_output_shape) return PPData(x_data=xdata, y_data=ydata, scaler=minmax, sampleRate=sampleRate, batch_size=batch_size, window_size=window_size, max_states=max_states, stateDict=stateDict)
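# Illustrative aside (state names invented): a sketch of the encoding used in
# MICAProcess. factorize with sort=True gives codes that are deterministic for a given
# set of labels, and get_dummies then one-hot encodes those codes.
import pandas as pd

states = pd.Series(["idle", "run", "idle", "stop"])
codes, _ = pd.factorize(states, sort=True)
one_hot = pd.get_dummies(codes, prefix="state_")
print(one_hot)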
if args.data is None:
    print("Please specify raw or latent for data flag")
else:
    dataset = args.data

svm_accuracy = []
svm_roc_auc = []
svm_precision = []
svm_recall = []
svm_f_score = []
svm_pred = []
svm_prob = []
svm_mcc = []

fp = pd.read_csv("diabimmune_metadata_allcountries_allergy_noQuotes.csv", index_col=3)
allergy = fp["allergy"]
allergy = pd.factorize(allergy)
subject = fp["subjectID"]
labels = allergy[1]
allergy = allergy[0]

subject_data = {'ID': subject, 'label': allergy}
split_df = pd.DataFrame(data=subject_data).groupby("ID").median()
split_sub = split_df.index.values
# .as_matrix() was removed from pandas; .to_numpy() is the current equivalent
split_lab = np.array(split_df[["label"]].to_numpy()).reshape(-1)
print(len(split_sub))
print(len(split_lab))

if dataset == "latent":
def main(): train = read_train_data(path=os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train/')) test = read_test_data(path=os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test/')) if adoption_shuffle: train['AdoptionSpeed'] = random.sample(train['AdoptionSpeed'].values.tolist(), len(train)) if densenet_predict: dnet_model = densenet_model(weight_path=os.path.join(input.__path__[0], 'densenet-keras/DenseNet-BC-121-32-no-top.h5')) train_feats = predict_using_img(dnet_model, train, img_path=os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train_images/')) test_feats = predict_using_img(dnet_model, test, img_path=os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test_images/')) train_feats.to_pickle('densenet_train_predict.pkl') test_feats.to_pickle('densenet_test_predict.pkl') else: with open('./densenet_train_predict.pkl', 'rb') as f: train_feats = pickle.load(f) with open('./densenet_test_predict.pkl', 'rb') as f: test_feats = pickle.load(f) all_ids = pd.concat([train, test], axis=0, ignore_index=True, sort=False)[['PetID']] svd_col = adopt_svd(train_feats, test_feats) img_features = pd.concat([all_ids, svd_col], axis=1) labels_breed = pd.read_csv(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/breed_labels.csv')) labels_color = pd.read_csv(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/color_labels.csv')) labels_state = pd.read_csv(os.path.join(input.__path__[0], 'my_state_labels/my_state_labels.csv')) train_image_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train_images/*.jpg'))) train_metadata_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train_metadata/*.json'))) train_sentiment_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/train_sentiment/*.json'))) test_image_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test_images/*.jpg'))) test_metadata_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test_metadata/*.json'))) test_sentiment_files = sorted(glob.glob(os.path.join(input.__path__[0], 'petfinder-adoption-prediction/test_sentiment/*.json'))) # Metadata: train_df_metadata = pd.DataFrame(train_metadata_files) train_df_metadata.columns = ['metadata_filename'] train_df_sentiment = pd.DataFrame(train_sentiment_files) train_df_sentiment.columns = ['sentiment_filename'] # Metadata: test_df_metadata = pd.DataFrame(test_metadata_files) test_df_metadata.columns = ['metadata_filename'] test_df_sentiment = pd.DataFrame(test_sentiment_files) test_df_sentiment.columns = ['sentiment_filename'] train_pet_ids = train.PetID.unique() test_pet_ids = test.PetID.unique() if exe_extract_additional_feature: dfs_train = Parallel(n_jobs=12, verbose=1)( delayed(extract_additional_features)(i, mode='train') for i in train_pet_ids) dfs_test = Parallel(n_jobs=12, verbose=1)( delayed(extract_additional_features)(i, mode='test') for i in test_pet_ids) train_dfs_sentiment = [x[0] for x in dfs_train if isinstance(x[0], pd.DataFrame)] train_dfs_metadata = [x[1] for x in dfs_train if isinstance(x[1], pd.DataFrame)] train_dfs_sentiment = pd.concat(train_dfs_sentiment, ignore_index=True, sort=False) train_dfs_metadata = pd.concat(train_dfs_metadata, ignore_index=True, sort=False) test_dfs_sentiment = [x[0] for x in dfs_test if isinstance(x[0], pd.DataFrame)] test_dfs_metadata = [x[1] for x in dfs_test if isinstance(x[1], 
pd.DataFrame)] test_dfs_sentiment = pd.concat(test_dfs_sentiment, ignore_index=True, sort=False) test_dfs_metadata = pd.concat(test_dfs_metadata, ignore_index=True, sort=False) train_dfs_metadata.to_pickle('train_dfs_metadata.pkl') train_dfs_sentiment.to_pickle('train_dfs_sentiment.pkl') test_dfs_metadata.to_pickle('test_dfs_metadata.pkl') test_dfs_sentiment.to_pickle('test_dfs_sentiment.pkl') else: with open('./train_dfs_metadata.pkl', 'rb') as f: train_dfs_metadata = pickle.load(f) with open('./train_dfs_sentiment.pkl', 'rb') as f: train_dfs_sentiment = pickle.load(f) with open('./test_dfs_metadata.pkl', 'rb') as f: test_dfs_metadata = pickle.load(f) with open('./test_dfs_sentiment.pkl', 'rb') as f: test_dfs_sentiment = pickle.load(f) # ### group extracted features by PetID: train_proc = agg_feature(train, train_dfs_metadata, train_dfs_sentiment) test_proc = agg_feature(test, test_dfs_metadata, test_dfs_sentiment) train_proc = merge_labels_breed(train_proc, labels_breed) test_proc = merge_labels_breed(test_proc, labels_breed) train_proc, test_proc = merge_labels_state(train_proc, test_proc, labels_state) train_proc = fill_and_drop_feature(train_proc) test_proc = fill_and_drop_feature(test_proc) train_proc = add_feature(train_proc) test_proc = add_feature(test_proc) X = pd.concat([train_proc, test_proc], ignore_index=True, sort=False) X_temp = X.copy() text_columns = ['Description', 'metadata_annots_top_desc', 'sentiment_entities'] categorical_columns = ['main_breed_BreedName', 'second_breed_BreedName'] to_drop_columns = ['PetID', 'Name', 'RescuerID'] rescuer_count = X.groupby(['RescuerID'])['PetID'].count().reset_index() rescuer_count.columns = ['RescuerID', 'RescuerID_COUNT'] X_temp = X_temp.merge(rescuer_count, how='left', on='RescuerID') for i in categorical_columns: try: X_temp.loc[:, i] = pd.factorize(X_temp.loc[:, i])[0] except: pass X_text = X_temp[text_columns] for i in X_text.columns: X_text.loc[:, i] = X_text.loc[:, i].fillna('none') X_temp['Length_Description'] = X_text['Description'].map(len) X_temp['Length_metadata_annots_top_desc'] = X_text['metadata_annots_top_desc'].map(len) X_temp['Lengths_sentiment_entities'] = X_text['sentiment_entities'].map(len) X_temp = parse_tfidf(X_temp, X_text) X_temp = X_temp.merge(img_features, how='left', on='PetID') agg_train_imgs = agg_img_feature(train_image_files) agg_test_imgs = agg_img_feature(test_image_files) agg_imgs = pd.concat([agg_train_imgs, agg_test_imgs], axis=0).reset_index(drop=True) X_temp = X_temp.merge(agg_imgs, how='left', on='PetID') # ### Drop ID, name and rescuerID X_temp = X_temp.drop(to_drop_columns, axis=1) X_train = X_temp.loc[np.isfinite(X_temp.AdoptionSpeed), :] X_test = X_temp.loc[~np.isfinite(X_temp.AdoptionSpeed), :] X_test = X_test.drop(['AdoptionSpeed'], axis=1) assert X_train.shape[0] == train.shape[0] assert X_test.shape[0] == test.shape[0] train_cols = X_train.columns.tolist() train_cols.remove('AdoptionSpeed') test_cols = X_test.columns.tolist() assert np.all(train_cols == test_cols) X_train_non_null = X_train.fillna(-1) X_test_non_null = X_test.fillna(-1) X_train_non_null.isnull().any().any(), X_test_non_null.isnull().any().any() xgb_params = { 'eval_metric': 'rmse', 'object':'reg:squarederror', 'seed': 1337, 'eta': 0.0123, 'subsample': 0.8, 'colsample_bytree': 0.85, 'tree_method': 'gpu_hist', 'device': 'gpu', 'silent': 1, } X_train_non_null = fill_and_drop_feature_end(X_train_non_null) X_test_non_null = fill_and_drop_feature_end(X_test_non_null) X_train_non_null.to_csv('./X_train.csv') model, oof_train, 
oof_test, feature_score = run_xgb(xgb_params, X_train_non_null, X_test_non_null) optR = OptimizedRounder() optR.fit(oof_train, X_train['AdoptionSpeed'].values) coefficients = optR.coefficients() valid_pred = optR.predict(oof_train, coefficients) qwk = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, valid_pred) print("QWK = ", qwk) coefficients_ = coefficients.copy() coefficients_[0] = 1.66 coefficients_[1] = 2.13 coefficients_[3] = 2.85 train_predictions = optR.predict(oof_train, coefficients_).astype(np.int8) test_predictions = optR.predict(oof_test.mean(axis=1), coefficients_).astype(np.int8) valid_pred = optR.predict(oof_train, coefficients_) qwk_change = quadratic_weighted_kappa(X_train['AdoptionSpeed'].values, valid_pred) print("QWK_change = ", qwk_change) submission = pd.DataFrame({'PetID': test['PetID'].values, 'AdoptionSpeed': test_predictions}) submission.to_csv('submission.csv', index=False) str_metric_score = 'qwk' + '_0' + str(int(qwk * 100000)) storage_process(submission, str_metric_score, qwk, qwk_change, feature_score)