def fit(self, dframe):
    """
    Fit label encoder to pandas columns.
    Access individual column classes via indexing `self.all_classes_`
    Access individual column encoders via indexing `self.all_encoders_`
    """
    # if columns are provided, iterate through and get `classes_`
    if self.columns is not None:
        # ndarray to hold LabelEncoder().classes_ for each
        # column; should match the shape of specified `columns`
        self.all_classes_ = np.ndarray(shape=self.columns.shape, dtype=object)
        self.all_encoders_ = np.ndarray(shape=self.columns.shape, dtype=object)
        for idx, column in enumerate(self.columns):
            # fit LabelEncoder to get `classes_` for the column
            le = LabelEncoder()
            le.fit(dframe.loc[:, column].values)
            # append the `classes_` to our ndarray container
            self.all_classes_[idx] = (column, np.array(le.classes_.tolist(), dtype=object))
            # append this column's encoder
            self.all_encoders_[idx] = le
    else:
        # no columns specified; assume all are to be encoded
        self.columns = dframe.columns
        self.all_classes_ = np.ndarray(shape=self.columns.shape, dtype=object)
        self.all_encoders_ = np.ndarray(shape=self.columns.shape, dtype=object)
        for idx, column in enumerate(self.columns):
            le = LabelEncoder()
            le.fit(dframe.loc[:, column].values)
            self.all_classes_[idx] = (column, np.array(le.classes_.tolist(), dtype=object))
            self.all_encoders_[idx] = le
    return self
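# A minimal usage sketch for the fit() above (not from the original source). It assumes the
# method belongs to a wrapper class, hypothetically named MultiColumnLabelEncoder, whose
# __init__ stores `columns` and which imports numpy, pandas and LabelEncoder as above.
import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": ["S", "M", "S"]})
mcle = MultiColumnLabelEncoder(columns=pd.Index(["color", "size"]))  # hypothetical class name
mcle.fit(df)
# per-column classes and encoders are then accessible by position
print(mcle.all_classes_[0])                          # ('color', array(['blue', 'red'], dtype=object))
print(mcle.all_encoders_[1].transform(["M", "S"]))   # e.g. [0 1]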
def encode(df, dump=fromPickle):
    """
    Takes in: dataframe from clean_col
    Returns: a dataframe that LabelEncodes the categorical variables
    """
    encoders = dict()
    for col in lblColumns:
        if col not in final_cols:
            continue
        le = LabelEncoder()
        if dump:
            fName = "%s/%s.npy" % (modelPath, col)
            if os.path.isfile(fName):
                le.classes_ = np.load(fName)
            else:
                le.fit(df[col])
                np.save(fName, le.classes_)
        else:
            le.fit(df[col])
        encoders[col] = le
        df[col] = le.transform(df[col])
    # Order columns with logprice as the last column
    df = df[final_cols]
    df = df.reset_index().drop('index', axis=1)
    return df
def test_vote_soft():
    from functools import reduce  # reduce is not a builtin in Python 3

    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")
    # zip returns an iterator in Python 3; materialise it before indexing
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    #train_attr = reduce(lambda a, b: a + b, train_probs)
    test_attr = reduce(lambda a, b: a + b, test_probs)
    pred = test_attr.idxmax(1)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
def customEncode(df):
    global labelencoder
    le = LabelEncoder()
    le.fit(df['OutcomeType'])
    df['OutcomeType'] = le.transform(df['OutcomeType'])
    labelencoder = le
    return df
def test_hard_vote():
    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")
    # zip returns an iterator in Python 3; materialise it before indexing
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    #train_probs = probs[0]
    test_probs = probs[1]
    print(len(test_probs))
    preds = [x.idxmax(1) for x in test_probs]
    pred = np.zeros(len(preds[0]), dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        pred[i] = max(set(votes), key=votes.count)
        print(pred[i])
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
def clean_device(df: pd.DataFrame):
    df['device_type'] = df['device_type'].map(lambda x: str(x).lower())
    unique = np.unique(df.device_type.values)
    max_num_devices = len(unique)
    print("Unique Devices : ", unique)
    print("Num Unique Devices : ", max_num_devices)
    print()
    devices = df['device_type'].values
    if os.path.exists('data/devices.pkl'):
        with open('data/devices.pkl', 'rb') as f:
            encoder = pickle.load(f)
    else:
        encoder = LabelEncoder()
        encoder.fit(devices)
        with open('data/devices.pkl', 'wb') as f:
            pickle.dump(encoder, f)
    # encode the values
    devices = encoder.transform(devices)
    return df, devices, max_num_devices
def to_numeric(self, columns=[]):
    le = LabelEncoder()
    for i, c in enumerate(columns):
        le.fit(self.M[:, c])
        self.M[:, c] = le.transform(self.M[:, c])
    self.M = self.M.astype(float)  # np.float was removed in NumPy 1.24; use the builtin
    return self
def convert_columns_to_binary(self):
    """Converts all columns with two elements into a binary column."""
    # creating panel
    panel = pd.concat([self.train, self.test], ignore_index=True)
    change = False
    # converting two-element columns to binary column
    for colname in self.train.columns:
        if len(np.unique(self.train[colname].values.astype("str"))) == 2:
            if not all(np.unique(self.train[colname].values.astype("str")) == ["0", "1"]):
                label = LabelEncoder()
                label.fit(list(panel[colname].values.astype("str")))
                panel[colname] = label.transform(list(panel[colname].values.astype("str")))
                change = True
                print("Column %s converted to binary" % (colname))
    if not change:
        print("\nNo binary columns in data")
    self.train, self.test = panel.loc[0:len(self.train) - 1, ], panel.loc[len(self.train):len(panel) - 1, ]
    print("")
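# A standalone sketch (not from the original source) of the idea above: any column whose
# values take exactly two distinct levels can be mapped to 0/1 with a LabelEncoder.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

frame = pd.DataFrame({"gender": ["male", "female", "female", "male"]})
if frame["gender"].nunique() == 2:
    frame["gender"] = LabelEncoder().fit_transform(frame["gender"].astype(str))
print(frame["gender"].tolist())  # [1, 0, 0, 1]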
def clean_country(df: pd.DataFrame):
    df['country'] = df['country'].map(lambda x: str(x).upper())
    unique = np.unique(df.country.values)
    max_num_countries = len(unique)
    print("Unique Countries : ", unique)
    print("Num Unique Countries : ", max_num_countries)
    print()
    countries = df['country'].values
    if os.path.exists('data/countries.pkl'):
        with open('data/countries.pkl', 'rb') as f:
            encoder = pickle.load(f)
    else:
        encoder = LabelEncoder()
        encoder.fit(countries)
        with open('data/countries.pkl', 'wb') as f:
            pickle.dump(encoder, f)
    # encode the values
    countries = encoder.transform(countries)
    return df, countries, max_num_countries
def select_with_forest(X, y, n_trees=10, threshold=0.01):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import ExtraTreesClassifier
    import pandas as pd
    import numpy as np
    # encode labels (str -> int):
    le = LabelEncoder()
    X = X.copy()
    for col in X.columns:
        le.fit(X[col].unique())
        X[col] = le.transform(X[col])
    # train the classifier:
    forest = ExtraTreesClassifier(criterion="entropy", n_estimators=n_trees)
    forest.fit(X, y)
    print('number of selected features: ', np.sum(forest.feature_importances_ >= threshold))
    # select important features:
    importances = pd.DataFrame()
    importances['predictor name'] = X.columns.tolist()
    importances['importance'] = forest.feature_importances_
    importances = importances.sort_values(by='importance', ascending=False)
    #X2 = forest.transform(X, threshold)
    #labels2 = X.columns[list(forest.feature_importances_ >= threshold)]
    #X2 = pd.DataFrame(X2)
    #X2.columns = labels2
    return importances  #X2
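# A minimal usage sketch for select_with_forest (the toy data below is made up for illustration).
import pandas as pd

toy_X = pd.DataFrame({
    "colour": ["red", "blue", "red", "green", "blue", "red"],
    "size": ["S", "M", "L", "S", "M", "L"],
})
toy_y = [0, 1, 0, 1, 1, 0]
ranking = select_with_forest(toy_X, toy_y, n_trees=50, threshold=0.05)
print(ranking)  # DataFrame of predictors sorted by ExtraTrees importance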
class OutputLabelColumn(BaseEstimator, TransformerMixin):
    '''
    Take a string or key categorical column and transform it to integer labels.
    '''

    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._labeler = LabelEncoder()

    def fit(self, X, y=None):
        '''
        Fit the label encoding.
        '''
        handle_none = list(map(str, X))
        self._labeler.fit(handle_none)
        return self

    def transform(self, X):
        '''
        Transform a column of data into integer labels.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        handle_none = list(map(str, X))
        return self._labeler.transform(handle_none).astype(np.int32)
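# A short usage sketch for OutputLabelColumn (not from the original source). Mapping values
# through str() first means None/NaN become the literal strings 'None'/'nan' and get codes too.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

col = ["cat", "dog", None, "cat"]
enc = OutputLabelColumn().fit(col)
print(enc.transform(col))  # e.g. [1 2 0 1] -- 'None' sorts before 'cat' and 'dog'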
def make_encoder(test, train):
    '''
    creates a general label encoder for every unique value of all categorical variables
    '''
    # load in the whole dataset
    # df1 = pd.read_csv('ssvout/0.ssv', na_values='None')
    # df1 = df1.append(pd.read_csv('ssvout/5000.ssv', na_values='None'), ignore_index=True)
    # df1 = df1.append(pd.read_csv('ssvout/10000.ssv', na_values='None'), ignore_index=True)
    # df1 = df1.append(pd.read_csv('ssvout/15000.ssv', na_values='None'), ignore_index=True)
    # df1 = df1.append(pd.read_csv('ssvout/20000.ssv', na_values='None'), ignore_index=True)
    # scrub the data
    # df1 = scrub(df1)
    # initialize the encoder and a list to store all categorical values
    df1 = pd.concat([train, test], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    encoder = LabelEncoder()
    values = []
    cols = []
    for col in df1:
        cols.append(col)
        for value in df1[col]:
            values.append(value)
    encoder.fit(values)
    with open('./encoder.pkl', 'wb') as f:  # the Python 2 file() builtin no longer exists
        pickle.dump(encoder, f)
def fit(self, X, y=None):
    if self.categorical:
        # Need to one hot encode labels
        label_encoder = LabelEncoder()
        one_hot = OneHotEncoder()
        label_encoder.fit(y)
        one_hot.fit(list(map(lambda x: [x], label_encoder.transform(y))))
        self.stack_encoder = lambda x: one_hot.transform(
            list(map(lambda x: [x], label_encoder.transform(x)))).toarray()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1 - self.hold_out_percent)
    predictions = []
    for (name, clf) in self.base_classifiers:
        print("Ensemble currently fitting:", name)
        clf.fit(X_train, y_train)
        if self.categorical:
            predictions.append(self.stack_encoder(clf.predict(X_test)))
        else:
            predictions.append(list(map(lambda x: [x], clf.predict(X_test))))
    predictions = np.hstack(predictions)
    print("Fitting stack classifier")
    self.stack_classifier[1].fit(predictions, y_test)
    if self.refit_base:
        for (name, clf) in self.base_classifiers:
            print("Ensemble currently refitting:", name)
            clf.fit(X, y)
    return self
def encode_categorical(data_dir,traindata,testdata, parameters): ''' Function to encode the categorical features as numerical features using a LabelEncoder. Important is to encode the test and train file in the same manner ''' #Unless pre-picked parameters to work on are given we work on these parameters. if parameters == None: names_parameters=['funder','installer','wpt_name','basin','subvillage','region','lga','ward','public_meeting','recorded_by' ,'scheme_management','scheme_name','permit','extraction_type','extraction_type_group','extraction_type_class','management','management_group','payment', 'payment_type','water_quality','quality_group','quantity','quantity_group','source','source_type','source_class', 'waterpoint_type','waterpoint_type_group'] else: names_parameters=parameters #Get the shape of the data (testrows, testcolumns)=testdata.shape (trainrows, traincolumns)=traindata.shape #For all of the specified features: for feature in names_parameters: #We initialize an encoder. le = LabelEncoder() #We add the test and train data together fitdata=np.append(traindata[feature].values,testdata[feature].values) #Because the mapping to numbers has to be the same in both, thus we fit them together. le.fit(fitdata) #We however want to work with the train and test data separate so we transform them separately train_cat=le.transform(traindata[feature]) test_cat = le.transform(testdata[feature]) #We initialize empty arrays for the train and test data and store these to the relevant files. newtestdata=np.zeros((testrows,1), dtype=int) newtestdata[:, 0]=np.array(test_cat) store_data(newtestdata, train=False,labels=(feature +'_num'), one=True) newtraindata=np.zeros((trainrows,1), dtype=int) newtraindata[:, 0]=np.array(train_cat) store_data(newtraindata, train=True,labels=(feature +'_num'), one=True)
def preprocess():
    train = pd.read_csv("../data/train.csv")
    test = pd.read_csv("../data/test.csv")

    train['Date'] = pd.to_datetime(pd.Series(train['Original_Quote_Date']))
    train = train.drop('Original_Quote_Date', axis=1)
    test['Date'] = pd.to_datetime(pd.Series(test['Original_Quote_Date']))
    test = test.drop('Original_Quote_Date', axis=1)

    train['Year'] = train['Date'].apply(lambda x: x.year)
    train['Month'] = train['Date'].apply(lambda x: x.month)
    train['weekday'] = train['Date'].apply(lambda x: x.dayofweek)
    test['Year'] = test['Date'].apply(lambda x: x.year)
    test['Month'] = test['Date'].apply(lambda x: x.month)
    test['weekday'] = test['Date'].apply(lambda x: x.dayofweek)

    train = train.drop('Date', axis=1)
    test = test.drop('Date', axis=1)

    for f in train.columns:
        if train[f].dtype == 'object':
            lbl = LabelEncoder()
            # watch how to handle missing value labeling
            lbl.fit(list(train[f].values) + list(test[f].values))
            train[f] = lbl.transform(list(train[f].values))
            test[f] = lbl.transform(list(test[f].values))

    train = train.fillna(-1)
    test = test.fillna(-1)
    return train, test
def main():
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    enc = LabelEncoder()
    joined = pd.concat((train['Product_Info_2'], test['Product_Info_2']), axis=0)
    enc.fit(joined)
    train['Product_Info_2'] = enc.transform(train['Product_Info_2'])
    test['Product_Info_2'] = enc.transform(test['Product_Info_2'])

    X_train = train.drop('Response', axis=1).values
    y_train = train['Response'].values
    X_test = test.values

    mdl = xgb.XGBRegressor(learning_rate=0.05, n_estimators=200, subsample=0.5,
                           max_depth=6, silent=False)
    mdl.fit(X_train, y_train)

    preds = mdl.predict(X_test)
    preds = [min(max(1, int(round(pred))), 8) for pred in preds]

    sub = pd.DataFrame({'Id': test['Id'], 'Response': preds})
    sub.to_csv('submissions/xgb.csv', index=False)
def __init__(self, filename='train.json'):
    self.filename_tr = filename
    # Read JSON data using pandas
    # columns are: id, cuisine, ingredients
    data = pd.read_json(filename)

    # Label Encoders
    labels = LabelEncoder()
    labels.fit(data.cuisine)
    self.classes = labels.classes_
    self.class_encode = labels.transform
    self.class_decode = labels.inverse_transform

    # Get numerical labels for ytrain
    y_train = labels.transform(data.cuisine)

    # Vectorization of ingredients using WordNet lemmatization & Tfidf
    data['ingredients_clean_string'] = [' , '.join(z).strip() for z in data['ingredients']]
    data['ingredients_string'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line))
                                            for line in lists]).strip()
                                  for lists in data['ingredients']]
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1), max_df=0.57,
                                 analyzer='word', token_pattern=r'\w+')
    x_train = vectorizer.fit_transform(data.ingredients_string).todense()
    ingred_dict = vectorizer.vocabulary_

    self.vectorizer = vectorizer
    self.y_train = y_train
    self.x_train = x_train
    self.tsdata = pd.DataFrame()
def create_partitions(work_dir_path): import cPickle import numpy as np from sklearn.preprocessing import LabelEncoder from sklearn.cross_validation import StratifiedKFold # load data from one feature file npz = np.load("%s/features/features.ssd.npz" % (work_dir_path)) labels = npz["labels"] npz.close() # create labelencoder if not (os.path.exists("%s/labelencoder.pkl" % (work_dir_path)) and SKIP_EXISTING_FILES): le = LabelEncoder() le.fit(labels) with open("%s/labelencoder.pkl" % (work_dir_path), 'wb') as fid: cPickle.dump(le, fid) # create partitions if not (os.path.exists("%s/stratified_%dfold.pkl" % (work_dir_path, NUM_FOLDS)) and SKIP_EXISTING_FILES): cv = StratifiedKFold(le.transform(labels), n_folds=NUM_FOLDS, shuffle=False) with open("%s/stratified_%dfold.pkl" % (work_dir_path, NUM_FOLDS), 'wb') as fid: cPickle.dump(cv, fid)
def __init__(self, filename): with open(filename, "r") as f: lines = re.split(r'\n', f.read())[0:-1] data = [] target = [] for line in lines: fields = re.split(r',', line) data.append(fields[:-1]) target.append(fields[-1]) npdata = np.array(data) nptarget = np.array(target) # Scikit-learn requires numeric values. le = LabelEncoder() le.fit(nptarget) self.__target = le.transform(nptarget) nrows, ncols = npdata.shape self.__data = np.zeros((nrows, ncols), dtype = np.int64) for ix in xrange(ncols): col = npdata[:, ix] le.fit(col) self.__data[:, ix] = le.transform(col) # Most classifier results are skewed if categorical features are mapped # to integer-valued features. Use a one-hot encoding. oe = OneHotEncoder() oe.fit(self.__data) self.__data = oe.transform(self.__data).toarray()
def load_data():
    train_list = []
    for line in open('../data/train_clean.json', 'r'):
        train_list.append(json.loads(line))
    train = pd.DataFrame(train_list)
    #train_work = train[names[-1]]
    test_list = []
    for line in open('../data/test_clean.json', 'r'):
        test_list.append(json.loads(line))
    test = pd.DataFrame(test_list)

    print('--- NLP on major, simply cut the first word')
    le = LabelEncoder()
    print(len(set(train['major'])))
    train['major'] = train['major'].apply(
        lambda x: " ".join(jieba.cut(x, cut_all=False)).split()[0]
        if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    test['major'] = test['major'].apply(
        lambda x: " ".join(jieba.cut(x, cut_all=False)).split()[0]
        if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    print(len(set(train['major'])))
    le.fit(list(train['major']) + list(test['major']))
    train['major'] = le.transform(train['major'])
    test['major'] = le.transform(test['major'])

    le = LabelEncoder()
    train['gender'] = le.fit_transform(train['gender'])
    names = train.columns
    le = LabelEncoder()
    test['gender'] = le.fit_transform(test['gender'])

    del train['_id']
    del test['_id']
    train = train.fillna(0)
    test = test.fillna(0)
    #test['age'] = test['age'].apply(lambda x: int(x.replace(u'岁', '').encode('ascii')))
    return train, test
def _fit(self, df, logger):
    col_le_dict = {}
    for colname, col in df.items():  # DataFrame.iteritems() was removed from pandas; items() is the replacement
        le = LabelEncoder()
        le.fit(col)
        col_le_dict[colname] = le
    return col_le_dict
def data_processing(train, test, features):
    # train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # features += ['hour', 'dark', 'StreetNo']
    print("Filling NAs")
    # print(train.mode())
    train = train.fillna(train.median().iloc[0])
    test = test.fillna(test.median().iloc[0])
    print("Label Encoder")
    le = LabelEncoder()
    for col in features:
        le.fit(list(train[col]) + list(test[col]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
    le.fit(list(train[target]))
    train[target] = le.transform(train[target])
    print("Standard Scaler")
    scaler = StandardScaler()
    for col in features:
        scaler.fit(list(train[col]))
        train[col] = scaler.transform(train[col])
        test[col] = scaler.transform(test[col])
    return train, test, features
def label_encoder(data):
    for f in data.columns:
        if data[f].dtypes == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(data[f].values))
            data[f] = lbl.transform(list(data[f].values))
    return data
class EncodingText():
    """
    LabelEncoder.fit deduplicates the vocabulary and maps each of its words to an integer.
    LabelEncoder.transform then converts every word of the corpus into the integer its
    vocabulary entry was assigned.
    """

    def __init__(self, vocabulary):
        from sklearn.preprocessing import LabelEncoder
        self.le = LabelEncoder()
        self.vocabulary = vocabulary

    def fit(self, X, y=None):
        self.le.fit(self.vocabulary)
        return self

    def transform(self, X):
        return [self.le.transform(x.split()) for x in X]
        #return [self.getSparseM(x) for x in X]

    def getSparseM(self, x):
        from scipy.sparse import coo_matrix
        import numpy as np
        sent = x.split()
        ind = self.le.transform(sent)
        a = coo_matrix((np.ones([len(sent)]), (ind, range(len(sent)))),
                       shape=(len(self.vocabulary), len(sent)))
        return a
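# A tiny demonstration of EncodingText (not from the original source): every token in the
# input strings must already appear in the supplied vocabulary, otherwise transform raises.
vocab = ["i", "like", "cats", "dogs"]
enc = EncodingText(vocab).fit(None)
print(enc.transform(["i like cats", "i like dogs"]))
# -> [array([2, 3, 0]), array([2, 3, 1])]  (indices follow the sorted vocabulary)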
def process_data(train,test,features,features_non_numeric): train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0) test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0) train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x) test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x) train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12) test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12) train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and x[11:13] >= 18 and x[11:13] < 6) else 0) test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and x[11:13] >= 18 and x[11:13] < 6) else 0) features += ['hour','dark','StreetNo'] print "Filling N/As: " + str(datetime.datetime.now()) train = train.fillna(train.mode().iloc[0]) test = test.fillna(test.mode().iloc[0]) # Pre-processing non-numberic values print "Label Encoder: " + str(datetime.datetime.now()) le = LabelEncoder() for col in features: # print col le.fit(list(train[col])+list(test[col])) train[col] = le.transform(train[col]) test[col] = le.transform(test[col]) # Xgb requires goal to be numeric... le.fit(list(train[goal])) train[goal] = le.transform(train[goal]) # Neural Network, Stochastic Gradient Descent is sensitive to feature scaling, so it is highly recommended to scale your data. print "Standard Scaler: " + str(datetime.datetime.now()) scaler = StandardScaler() for col in set(features): # - set(features_non_numeric): # print col scaler.fit(list(train[col])+list(test[col])) train[col] = scaler.transform(train[col]) test[col] = scaler.transform(test[col]) return (train,test,features)
def main(self, args=None):
    self.data = self.parser.parse_args(args=args)
    assert not self.data.update_klass
    best = load_json(self.data.params_fname)
    if isinstance(best, list):
        best = best[0]
    best = clean_params(best)
    print(self.data.params_fname, self.data.training_set)
    corpus, labels = read_data_labels(self.data.training_set)
    le = LabelEncoder()
    le.fit(labels)
    y = le.transform(labels)
    t = TextModel(corpus, **best)
    X = [t[x] for x in corpus]
    hy = [None for x in y]
    for tr, ts in KFold(n_splits=self.data.kratio, shuffle=True,
                        random_state=self.data.seed).split(X):
        c = SVC(model=t)
        c.fit([X[x] for x in tr], [y[x] for x in tr])
        _ = c.decision_function([X[x] for x in ts])
        [hy.__setitem__(k, v) for k, v in zip(ts, _)]
    i = 0
    with open(self.get_output(), 'w') as fpt:
        for tweet in tweet_iterator(self.data.training_set):
            tweet['decision_function'] = hy[i].tolist()
            i += 1
            fpt.write(json.dumps(tweet) + "\n")
    return hy
def main():
    org_type = pd.read_csv('../input/application_train.csv',
                           usecols=['ORGANIZATION_TYPE'], nrows=None)
    print(org_type.shape)
    print(org_type.nunique())
    print(org_type.head())

    lbl = LabelEncoder()
    # LabelEncoder expects a 1-d array, so pass the column rather than the DataFrame
    lbl.fit(org_type['ORGANIZATION_TYPE'])
    org_type_label = lbl.transform(org_type['ORGANIZATION_TYPE'])
    print(type(org_type_label))
    print(org_type_label.shape)
    print(org_type_label[:5])

    model = Sequential([
        Dense(32, input_dim=784),  # `Dense(32, units=784)` passes units twice; 784 is presumably the intended input size
        Activation('relu'),
        Dense(10),
        Activation('softmax'),
    ])
    model.compile(loss='mean_absolute_error', optimizer='adam')
def encode_categorical_data(train, test, fill_missing=False):
    '''
    encoding is an extremely slow process,
    so only use the training data to train the encoder
    '''
    le = LabelEncoder()
    ## this step creates separate train and test dataFrame
    if fill_missing:
        train = train.fillna(value='missing')
        test = test.fillna(value='missing')
    counter = 0
    start_time = time.time()
    for col, dtype in zip(train.columns, train.dtypes):
        if dtype == 'object':
            le.fit(pd.concat([train[col], test[col]], axis=0))
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])
            counter += 1
            if counter % 20 == 0:
                print('{} out of {} is processed using {} seconds...'.format(
                    str(counter), str(train.shape[1]), round((time.time() - start_time), 0)))
    end_time = time.time()
    print('encoding process takes ', round((end_time - start_time)), 'seconds')
    ## train and test are newly created
    return train, test
def encode_dataset(train, test, meta, target_model='xgb'):
    y_train = train[meta['target']]
    train = train.drop([meta['target']], axis=1)
    assert train.shape[1] == test.shape[1]
    for i in range(train.shape[1]):
        assert train.columns[i] == test.columns[i]
    train_obs = len(train)
    # stack train and test so the encoders see every level once
    all_data = pd.concat([train, test], axis=0)
    for i, f in enumerate(meta['cols'].keys()):
        print(i, f, meta['cols'][f])
        if meta['cols'][f] == 'CAT':
            all_data[f] = all_data[f].astype('str')
            encoder = LabelEncoder()
            encoder.fit(np.unique(all_data[f].unique().tolist()))
            if target_model == 'xgb':
                all_data[f] = encoder.transform(all_data[f])
            else:
                all_data[f] = encoder.transform(all_data[f]).astype(int)
        elif meta['cols'][f] == 'NUM':
            all_data[f] = all_data[f].fillna(-1)
        elif meta['cols'][f] == 'REM':
            all_data = all_data.drop(f, axis=1)
        else:
            raise Exception(str(meta['cols'][f]) + ":unknown mapping")
    assert train_obs == len(y_train)
    return all_data, y_train
def label_encode_train_test_sets (train, test) : " Label encode 'supplier' and 'bracket_pricing' features for both train and test set " test_suppliers = np.sort(pd.unique(test.supplier.ravel())) print ("Test suppliers shape & elements: ", test_suppliers.shape, test_suppliers) train_suppliers = np.sort(pd.unique(train.supplier.ravel())) print ("Train suppliers shape & elements: ", train_suppliers.shape, train_suppliers) ## Merge 'supplier' for both datasets first because we want encoding to be consistent across both # http://docs.scipy.org/doc/numpy/reference/generated/numpy.sort.html supplier_ids = [] supplier_ids.extend(train_suppliers) supplier_ids.extend(test_suppliers) supplier_ids = np.sort(np.unique(supplier_ids)) print ("Merged supplier_ids.shape: ", supplier_ids.shape) # print ("supplier_ids.elements: ", supplier_ids) ## Perform label encoding fit on the merged array and then individually transform for train and test sets print ("Performing label encoding on supplier column...") label_e = LabelEncoder() label_e.fit(supplier_ids) train['supplier'] = label_e.transform(train['supplier']) test['supplier'] = label_e.transform(test['supplier']) ## Perform label encoding on 'bracket_pricing' print ("Performing label encoding on bracket_pricing column...") train['bracket_pricing'] = label_e.fit_transform(train['bracket_pricing']) test['bracket_pricing'] = label_e.fit_transform(test['bracket_pricing']) return train, test
def t4_prepare_columns(train, test, good_columns_extra=None): good_columns = [ # 'bond_lengths_mean_y', # 'bond_lengths_median_y', # 'bond_lengths_std_y', # 'bond_lengths_mean_x', 'molecule_atom_index_0_dist_min', 'molecule_atom_index_0_dist_max', 'molecule_atom_index_1_dist_min', 'molecule_atom_index_0_dist_mean', 'molecule_atom_index_0_dist_std', 'dist', 'dist_lin', 'subtype', 'molecule_atom_index_1_dist_std', 'molecule_atom_index_1_dist_max', 'molecule_atom_index_1_dist_mean', 'molecule_atom_index_0_dist_max_diff', 'molecule_atom_index_0_dist_max_div', 'molecule_atom_index_0_dist_std_diff', 'molecule_atom_index_0_dist_std_div', 'atom_0_couples_count', 'molecule_atom_index_0_dist_min_div', 'molecule_atom_index_1_dist_std_diff', 'molecule_atom_index_0_dist_mean_div', 'atom_1_couples_count', 'molecule_atom_index_0_dist_mean_diff', 'molecule_couples', 'atom_index_1', 'molecule_dist_mean', 'molecule_atom_index_1_dist_max_diff', 'molecule_atom_index_0_y_1_std', 'molecule_atom_index_1_dist_mean_diff', 'molecule_atom_index_1_dist_std_div', 'molecule_atom_index_1_dist_mean_div', 'molecule_atom_index_1_dist_min_diff', 'molecule_atom_index_1_dist_min_div', 'molecule_atom_index_1_dist_max_div', 'molecule_atom_index_0_z_1_std', 'y_0', 'molecule_type_dist_std_diff', 'molecule_atom_1_dist_min_diff', 'molecule_atom_index_0_x_1_std', 'molecule_dist_min', 'molecule_atom_index_0_dist_min_diff', 'molecule_atom_index_0_y_1_mean_diff', 'molecule_type_dist_min', 'molecule_atom_1_dist_min_div', 'atom_index_0', 'molecule_dist_max', 'molecule_atom_1_dist_std_diff', 'molecule_type_dist_max', 'molecule_atom_index_0_y_1_max_diff', 'molecule_type_0_dist_std_diff', 'molecule_type_dist_mean_diff', 'molecule_atom_1_dist_mean', 'molecule_atom_index_0_y_1_mean_div', 'molecule_type_dist_mean_div', 'type', # Crane 'dist_C_0_a0', 'dist_C_1_a0', 'dist_C_2_a0', 'dist_C_3_a0', 'dist_C_4_a0', 'dist_F_0_a0', 'dist_F_1_a0', 'dist_F_2_a0', 'dist_F_3_a0', 'dist_F_4_a0', 'dist_H_0_a0', 'dist_H_1_a0', 'dist_H_2_a0', 'dist_H_3_a0', 'dist_H_4_a0', 'dist_N_0_a0', 'dist_N_1_a0', 'dist_N_2_a0', 'dist_N_3_a0', 'dist_N_4_a0', 'dist_O_0_a0', 'dist_O_1_a0', 'dist_O_2_a0', 'dist_O_3_a0', 'dist_O_4_a0', 'EN_a0', 'rad_a0', 'n_bonds_a0', 'bond_lengths_mean_a0', 'bond_lengths_std_a0', 'bond_lengths_median_a0', 'dist_C_0_a1', 'dist_C_1_a1', 'dist_C_2_a1', 'dist_C_3_a1', 'dist_C_4_a1', 'dist_F_0_a1', 'dist_F_1_a1', 'dist_F_2_a1', 'dist_F_3_a1', 'dist_F_4_a1', 'dist_H_0_a1', 'dist_H_1_a1', 'dist_H_2_a1', 'dist_H_3_a1', 'dist_H_4_a1', 'dist_N_0_a1', 'dist_N_1_a1', 'dist_N_2_a1', 'dist_N_3_a1', 'dist_N_4_a1', 'dist_O_0_a1', 'dist_O_1_a1', 'dist_O_2_a1', 'dist_O_3_a1', 'dist_O_4_a1', 'EN_a1', 'rad_a1', 'n_bonds_a1', 'bond_lengths_mean_a1', 'bond_lengths_std_a1', 'bond_lengths_median_a1', # Criskiev 'atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8', 'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0', 'd_8_1', 'd_8_2', 'd_8_3', 'd_9_0', 'd_9_1', 'd_9_2', 'd_9_3', # Criskiev extra # 'd_1_0_log', 'd_2_0_log', 'd_2_1_log', 'd_3_0_log', 'd_3_1_log', 'd_3_2_log', 'd_4_0_log', 'd_4_1_log', # 'd_4_2_log', 'd_4_3_log', 'd_5_0_log', 'd_5_1_log', 'd_5_2_log', 'd_5_3_log', 'd_6_0_log', 'd_6_1_log', # 'd_6_2_log', 'd_6_3_log', 'd_7_0_log', 'd_7_1_log', 'd_7_2_log', 'd_7_3_log', 'd_8_0_log', 'd_8_1_log', # 'd_8_2_log', 'd_8_3_log', 'd_9_0_log', 'd_9_1_log', 'd_9_2_log', 'd_9_3', # # 
'd_1_0_recp', 'd_2_0_recp', 'd_2_1_recp', 'd_3_0_recp', 'd_3_1_recp', 'd_3_2_recp', 'd_4_0_recp', 'd_4_1_recp', # 'd_4_2_recp', 'd_4_3_recp', 'd_5_0_recp', 'd_5_1_recp', 'd_5_2_recp', 'd_5_3_recp', 'd_6_0_recp', 'd_6_1_recp', # 'd_6_2_recp', 'd_6_3_recp', 'd_7_0_recp', 'd_7_1_recp', 'd_7_2_recp', 'd_7_3_recp', 'd_8_0_recp', 'd_8_1_recp', # 'd_8_2_recp', 'd_8_3_recp', 'd_9_0_recp', 'd_9_1_recp', 'd_9_2_recp', 'd_9_3' ] good_columns += (good_columns_extra if good_columns_extra is not None else []) labels = {} for f in ['atom_1', 'type_0', 'type']: if f in good_columns: lbl = LabelEncoder() lbl.fit(list(train[f].values) + list(test[f].values)) train[f] = lbl.transform(list(train[f].values)) test[f] = lbl.transform(list(test[f].values)) labels[f] = lbl X = train[good_columns].copy() X_test = test[good_columns].copy() return X, X_test, labels
def get_tag_dict(raw):
    label_encoder = LabelEncoder()
    label_encoder.fit(raw['tag'])
    return label_encoder
def myfunc(myDataset, shape, lang): myDataset = Remove3Row(myDataset) print(myDataset.head()) X = myDataset.Text Y = myDataset.Label encoder = LabelEncoder() encoder.fit(Y) Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=1, stratify=Y) Xtrain, Xval, Ytrain, Yval = train_test_split(Xtrain, Ytrain, test_size=0.25, random_state=1, stratify=Ytrain) Ytrain = encoder.transform(Ytrain) Ytest = encoder.transform(Ytest) Yval = encoder.transform(Yval) Ytrain = np_utils.to_categorical(Ytrain) Ytest = np_utils.to_categorical(Ytest) Yval = np_utils.to_categorical(Yval) tokenizer = Tokenizer(num_words=5000) tokenizer.fit_on_texts(Xtrain) Xtrain = tokenizer.texts_to_sequences(Xtrain) Xtest = tokenizer.texts_to_sequences(Xtest) Xval = tokenizer.texts_to_sequences(Xval) vocab_size = len( tokenizer.word_index) + 1 # Adding 1 because of reserved 0 index maxlen = 100 Xtrain = pad_sequences(Xtrain, padding='post', maxlen=maxlen) Xtest = pad_sequences(Xtest, padding='post', maxlen=maxlen) Xval = pad_sequences(Xval, padding='post', maxlen=maxlen) from keras.models import Sequential from keras import layers embedding_dim = 50 model = Sequential() model.add( layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen)) model.add(GRU(256)) model.add(Dropout(0.1)) model.add(layers.Dense(shape, activation='sigmoid')) model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) model.summary() history = model.fit(Xtrain, Ytrain, epochs=10, verbose=True, validation_data=(Xval, Yval)) f = open("report.csv", "a") loss, accuracy = model.evaluate(Xtrain, Ytrain, verbose=True) print("Training Accuracy: {:.4f}".format(accuracy)) f.write(lang + ",{:.4f}".format(accuracy)) loss, accuracy = model.evaluate(Xval, Yval, verbose=True) print("Validation Accuracy: {:.4f}".format(accuracy)) f.write(",{:.4f} \n".format(accuracy)) f.close() plot_history(history) Ypred = model.predict(Xtest) from sklearn.metrics import confusion_matrix, classification_report matrix = confusion_matrix(Ytest.argmax(axis=1), Ypred.argmax(axis=1)) classification_Report = classification_report(Ytest.argmax(axis=1), Ypred.argmax(axis=1), output_dict=True) df = pd.DataFrame(classification_Report).transpose() df.to_csv(lang + "ClassificationReport.csv")
    return indices

#choose which size dataset and number of classes to use
X_train = pickle.load(open("../generate_train_test/final_dataset/X_train.p", 'rb'))
X_test = pickle.load(open("../generate_train_test/final_dataset/X_test.p", 'rb'))
y_train = pickle.load(open("../generate_train_test/final_dataset/y_train.p", 'rb'))
y_test = pickle.load(open("../generate_train_test/final_dataset/y_test.p", 'rb'))

#split into train and validation
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=0.25, random_state=0)

start_time = time.time()

#numerically encode the classes
le = LabelEncoder()
le.fit(y_train)
le_y_train = le.transform(y_train)
le_y_validate = le.transform(y_validate)
le_y_test = le.transform(y_test)

#create TFIDF features from text
tf_transformer = TfidfVectorizer(stop_words='english').fit(X_train)
X_train_tfidf = tf_transformer.transform(X_train)
X_validate_tfidf = tf_transformer.transform(X_validate)
#X_test_tfidf = tf_transformer.transform(X_test)

time_elapsed = time.time() - start_time
print("TFIDF: " + str(time_elapsed / 60) + " minutes")

start_time = time.time()
def preprocessing_features(df_train, df_test, process_continuous): to_delete_features = ['default', 'pdays'] continuous_features = [ 'age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed' ] categorical_ordered_features = [ 'education', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome' ] categorical_unordered_features = ['job', 'marital'] unknown_present_features = [ 'job', 'marital', 'education', 'housing', 'loan' ] all_present_features = [ 'age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'contact', 'month', 'day_of_week', 'poutcome' ] ### Delete Features for feat in to_delete_features: print "\n--------- deleting feature --------- ", feat del df_train[feat] del df_test[feat] ### Normalization or Standardization of Continuous Features if process_continuous == "Standardize": print "\n--------- Standardizing Continuous Features (Mean=0, Standard Deviation=1) --------- " standardization = StandardScaler() standardization.fit(df_train[continuous_features]) print "Mean: ", standardization.mean_ print "Variance: ", standardization.var_ df_train[continuous_features] = standardization.transform( df_train[continuous_features]) df_test[continuous_features] = standardization.transform( df_test[continuous_features]) elif process_continuous == "Normalize": print "\n--------- Normalizing Continuous Features (Min=0, Max=1) --------- " min_max_scaling = MinMaxScaler() min_max_scaling.fit(df_train[continuous_features]) print min_max_scaling.data_min_ print min_max_scaling.data_max_ df_train[continuous_features] = min_max_scaling.transform( df_train[continuous_features]) df_test[continuous_features] = min_max_scaling.transform( df_test[continuous_features]) ### Label Categorical Ordered Features -- Features used for Imputation (All Present) label_dict = { 'education': { 'illiterate': 0, 'basic.4y': 4, 'basic.6y': 6, 'basic.9y': 9, 'high.school': 11, 'professional.course': 13, 'university.degree': 14 }, 'housing': { 'no': 0, 'yes': 1 }, 'loan': { 'no': 0, 'yes': 1 }, 'contact': { 'telephone': 0, 'cellular': 1 }, 'month': { 'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12 }, 'day_of_week': { 'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5, 'sat': 6, 'sun': 7 }, 'poutcome': { 'nonexistent': 0, 'failure': 1, 'success': 2 } } for feat in categorical_ordered_features: if feat not in unknown_present_features: print "\n--------- Labelling feature Before Imputation --------- ", feat df_train = df_train.replace({feat: label_dict[feat]}) df_test = df_test.replace({feat: label_dict[feat]}) print "Labelled as: ", label_dict[feat] ### Imputation using SVM df_train_impute = df_train.loc[:, df_train.columns.isin(all_present_features)] df_test_impute = df_test.loc[:, df_test.columns.isin(all_present_features)] for feat in unknown_present_features: print "\nFilling Unkowns for Feature: ", feat train_impute = df_train[feat] test_impute = df_test[feat] train_impute_no_unknowns = train_impute[train_impute != 'unknown'] train_impute_unknowns = train_impute[train_impute == 'unknown'] test_impute_unknowns = test_impute[test_impute == 'unknown'] df_train_impute_train_features = df_train_impute.loc[ train_impute_no_unknowns.index] df_train_impute_test_features = df_train_impute.loc[ train_impute_unknowns.index] df_test_impute_test_features = df_test_impute.loc[ test_impute_unknowns.index] svm_model = SVC() 
svm_model.fit(df_train_impute_train_features, train_impute_no_unknowns) df_train.loc[df_train_impute_test_features.index, feat] = svm_model.predict(df_train_impute_test_features) print "Train Filled with: ", df_train.loc[ df_train_impute_test_features.index, feat].value_counts() df_test.loc[df_test_impute_test_features.index, feat] = svm_model.predict(df_test_impute_test_features) print "Test Filled with: ", df_test.loc[ df_test_impute_test_features.index, feat].value_counts() ### Label Categorical Ordered Features -- Features Imputated (Unkowns were Present) for feat in categorical_ordered_features: if feat in unknown_present_features: print "\n--------- Labelling feature After Imputation --------- ", feat df_train = df_train.replace({feat: label_dict[feat]}) df_test = df_test.replace({feat: label_dict[feat]}) print "Labelled as: ", label_dict[feat] ### One hot encoding Categorical Un-ordered Features for feat in categorical_unordered_features: print "\n--------- One Hot Encoding feature --------- ", feat label_encoder = LabelEncoder() label_encoder.fit(df_train[feat]) df_train[feat] = label_encoder.transform(df_train[feat]) df_test[feat] = label_encoder.transform(df_test[feat]) one_hot_encoder = OneHotEncoder(sparse=False) one_hot_encoder.fit(df_train[categorical_unordered_features]) one_hot_encoded_array_train = one_hot_encoder.transform( df_train[categorical_unordered_features]) one_hot_encoded_df_train = pd.DataFrame(one_hot_encoded_array_train, index=df_train.index) one_hot_encoded_array_test = one_hot_encoder.transform( df_test[categorical_unordered_features]) one_hot_encoded_df_test = pd.DataFrame(one_hot_encoded_array_test, index=df_test.index) df_train = pd.concat( [df_train, one_hot_encoded_df_train], axis=1) #concatenate old columns with new one hot encoded columns df_test = pd.concat( [df_test, one_hot_encoded_df_test], axis=1) #concatenate old columns with new one hot encoded columns df_train = df_train.drop( categorical_unordered_features, axis=1) #Delete columns which were one hot encoded df_test = df_test.drop(categorical_unordered_features, axis=1) #Delete columns which were one hot encoded ### Return pre-processed df return df_train, df_test
del train1
gc.collect()
print('click', data['click'].unique())
data = data.fillna(-1)

encoder = ['adid', 'advert_id', 'advert_industry_inner', 'advert_name',
           'app_cate_id', 'app_id', 'campaign_id', 'carrier', 'city', ]
col_encoder = LabelEncoder()
for feat in encoder:
    col_encoder.fit(data[feat])
    data[feat] = col_encoder.transform(data[feat])

minv = np.int32(0)
maxv = np.int32(5)
totalLength = 1  # two classes: depending on the loss function, decide whether the labels need to be one-hot encoded
batch_size = 10000
graph = tf.Graph()
for i, t in enumerate(train['type'].unique()): plt.subplot(2, 4, i + 1) plt.scatter(train.loc[train['type'] == t, good_columns[0]], train.loc[train['type'] == t, 'scalar_coupling_constant'], label=t) plt.title(f'{good_columns[0]} vs target \n for {t} type') fig, ax = plt.subplots(figsize=(20, 10)) for i, t in enumerate(train['type'].unique()): plt.subplot(2, 4, i + 1) plt.hist(train.loc[train['type'] == t, good_columns[0]], label='train') plt.hist(test.loc[test['type'] == t, good_columns[0]], label='test') plt.title(f'{good_columns[0]} distribution \n for {t} type') for f in ['atom_1', 'type_0', 'type']: if f in good_columns: lbl = LabelEncoder() lbl.fit(list(train[f].values) + list(test[f].values)) train[f] = lbl.transform(list(train[f].values)) test[f] = lbl.transform(list(test[f].values)) X = train[good_columns].copy() y = train['scalar_coupling_constant'] X_test = test[good_columns].copy() del train, test gc.collect() n_fold = 3 folds = KFold(n_splits=n_fold, shuffle=True, random_state=11) params = { 'num_leaves': 128, 'min_child_samples': 79, 'objective': 'regression', 'max_depth': 9, 'learning_rate': 0.2,
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

dataset = load_iris()
X = dataset.data
Y = dataset.target

LE = LabelEncoder()
LE.fit(Y)
e_Y = LE.transform(Y)
Y = np_utils.to_categorical(e_Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)

model = Sequential()
model.add(Dense(7, input_dim=4, activation='sigmoid'))
model.add(Dense(3, activation='softmax'))
# categorical_crossentropy matches the 3-class one-hot targets; binary_crossentropy would
# mis-report accuracy here
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

import matplotlib.pyplot as plt
history = model.fit(X_train, Y_train, epochs=20, batch_size=5, validation_split=0.2)
def sensitivity_specificity_support(y_true, y_pred, labels=None, pos_label=1, average=None, warn_for=('sensitivity', 'specificity'), sample_weight=None): """Compute sensitivity, specificity, and support for each class The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of true positives and ``fn`` the number of false negatives. The sensitivity quantifies the ability to avoid false negatives_[1]. The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number of true negatives and ``fn`` the number of false negatives. The specificity quantifies the ability to avoid false positives_[1]. The support is the number of occurrences of each class in ``y_true``. If ``pos_label is None`` and in binary classification, this function returns the average sensitivity and specificity if ``average`` is one of ``'weighted'``. Parameters ---------- y_true : ndarray, shape (n_samples, ) Ground truth (correct) target values. y_pred : ndarray, shape (n_samples, ) Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. For multilabel targets, labels are column indices. By default, all labels in ``y_true`` and ``y_pred`` are used in sorted order. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str or None, optional (default=None) If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). warn_for : tuple or set, for internal use This determines which warnings will be made in the case that this function is being used to return only one of its metrics. sample_weight : ndarray, shape (n_samples, ) Sample weights. Returns ------- sensitivity : float (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) specificity : float (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) support : int (if ``average`` = None) or ndarray, \ shape (n_unique_labels, ) The number of occurrences of each label in ``y_true``. References ---------- .. 
[1] `Wikipedia entry for the Sensitivity and specificity <https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_ Examples -------- >>> import numpy as np >>> from imblearn.metrics import sensitivity_specificity_support >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig']) >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog']) >>> sensitivity_specificity_support(y_true, y_pred, average='macro') (0.33333333333333331, 0.66666666666666663, None) >>> sensitivity_specificity_support(y_true, y_pred, average='micro') (0.33333333333333331, 0.66666666666666663, None) >>> sensitivity_specificity_support(y_true, y_pred, average='weighted') (0.33333333333333331, 0.66666666666666663, None) """ average_options = (None, 'micro', 'macro', 'weighted', 'samples') if average not in average_options and average != 'binary': raise ValueError('average has to be one of ' + str(average_options)) y_type, y_true, y_pred = _check_targets(y_true, y_pred) present_labels = unique_labels(y_true, y_pred) if average == 'binary': if y_type == 'binary': if pos_label not in present_labels: if len(present_labels) < 2: # Only negative labels return (0., 0., 0) else: raise ValueError("pos_label=%r is not a valid label: %r" % (pos_label, present_labels)) labels = [pos_label] else: raise ValueError("Target is %s but average='binary'. Please " "choose another average setting." % y_type) elif pos_label not in (None, 1): warnings.warn("Note that pos_label (set to %r) is ignored when " "average != 'binary' (got %r). You may use " "labels=[pos_label] to specify a single positive class." % (pos_label, average), UserWarning) if labels is None: labels = present_labels n_labels = None else: n_labels = len(labels) labels = np.hstack( [labels, np.setdiff1d( present_labels, labels, assume_unique=True)]) # Calculate tp_sum, pred_sum, true_sum ### if y_type.startswith('multilabel'): raise ValueError('imblearn does not support multilabel') elif average == 'samples': raise ValueError("Sample-based precision, recall, fscore is " "not meaningful outside multilabel " "classification. See the accuracy_score instead.") else: le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) sorted_labels = le.classes_ # labels are now from 0 to len(labels) - 1 -> use bincount tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None if len(tp_bins): tp_sum = np.bincount( tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: # Pathological case true_sum = pred_sum = tp_sum = np.zeros(len(labels)) if len(y_pred): pred_sum = np.bincount( y_pred, weights=sample_weight, minlength=len(labels)) if len(y_true): true_sum = np.bincount( y_true, weights=sample_weight, minlength=len(labels)) # Compute the true negative tn_sum = y_true.size - (pred_sum + true_sum - tp_sum) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] pred_sum = pred_sum[indices] tn_sum = tn_sum[indices] if average == 'micro': tp_sum = np.array([tp_sum.sum()]) pred_sum = np.array([pred_sum.sum()]) true_sum = np.array([true_sum.sum()]) tn_sum = np.array([tn_sum.sum()]) # Finally, we have all our sufficient statistics. Divide! # with np.errstate(divide='ignore', invalid='ignore'): # Divide, and on zero-division, set scores to 0 and warn: # Oddly, we may get an "invalid" rather than a "divide" error # here. 
specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum, 'specificity', 'predicted', average, warn_for) sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true', average, warn_for) # Average the results if average == 'weighted': weights = true_sum if weights.sum() == 0: return 0, 0, None elif average == 'samples': weights = sample_weight else: weights = None if average is not None: assert average != 'binary' or len(specificity) == 1 specificity = np.average(specificity, weights=weights) sensitivity = np.average(sensitivity, weights=weights) true_sum = None # return no support return sensitivity, specificity, true_sum
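# A hedged, standalone check of the definitions used above (not part of imblearn itself):
# for a binary problem, sensitivity = tp / (tp + fn) and specificity = tn / (tn + fp),
# which can be recovered directly from sklearn's confusion matrix.
import numpy as np
from sklearn.metrics import confusion_matrix

y_true = np.array([0, 0, 0, 1, 1, 1, 1])
y_pred = np.array([0, 1, 0, 1, 1, 0, 1])
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
sensitivity = tp / (tp + fn)   # 3 / 4 = 0.75
specificity = tn / (tn + fp)   # 2 / 3 ≈ 0.667
print(sensitivity, specificity)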
y, docs = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split("\t")
    docs.append(content[0])
    y.append(content[1])

#%%
#==============================================================================
# Encode class values as integers
#==============================================================================
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_y)

#%%
#==============================================================================
# Define plot_history function
#==============================================================================
def plot_history(history):
    loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' not in s]
    val_loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' in s]
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

dataset = pd.read_csv('NBA_2014_games.csv', parse_dates=["Date"])

encoding = LabelEncoder()
encoding.fit(dataset["Home/Neutral"].values)
home_teams = encoding.transform(dataset["Home/Neutral"].values)  # converts the string team names into integers
encoding.fit(dataset["Visitor/Neutral"].values)
visitor_teams = encoding.transform(dataset["Visitor/Neutral"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T  # combine the two vectors column-wise

onehot = OneHotEncoder()
X_teams_expanded = onehot.fit_transform(X_teams).todense()
print(X_teams_expanded)
def geometric_mean_score(y_true, y_pred, labels=None, pos_label=1, average='multiclass', sample_weight=None, correction=0.0): """Compute the geometric mean The geometric mean (G-mean) is the root of the product of class-wise sensitivity. This measure tries to maximize the accuracy on each of the classes while keeping these accuracies balanced. For binary classification G-mean is the squared root of the product of the sensitivity and specificity. For multi-class problems it is a higher root of the product of sensitivity for each class. For compatibility with other imbalance performance measures, G-mean can be calculated for each class separately on a one-vs-rest basis when ``average != 'multiclass'``. The best value is 1 and the worst value is 0. Traditionally if at least one class is unrecognized by the classifier, G-mean resolves to zero. To alleviate this property, for highly multi-class the sensitivity of unrecognized classes can be "corrected" to be a user specified value (instead of zero). This option works only if ``average == 'multiclass'``. Parameters ---------- y_true : ndarray, shape (n_samples, ) Ground truth (correct) target values. y_pred : ndarray, shape (n_samples, ) Estimated targets as returned by a classifier. labels : list, optional The set of labels to include when ``average != 'binary'``, and their order if ``average is None``. Labels present in the data can be excluded, for example to calculate a multiclass average ignoring a majority negative class, while labels not present in the data will result in 0 components in a macro average. pos_label : str or int, optional (default=1) The class to report if ``average='binary'`` and the data is binary. If the data are multiclass, this will be ignored; setting ``labels=[pos_label]`` and ``average != 'binary'`` will report scores for that label only. average : str or None, optional (default=``'multiclass'``) If ``None``, the scores for each class are returned. Otherwise, this determines the type of averaging performed on the data: ``'binary'``: Only report results for the class specified by ``pos_label``. This is applicable only if targets (``y_{true,pred}``) are binary. ``'micro'``: Calculate metrics globally by counting the total true positives, false negatives and false positives. ``'macro'``: Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account. ``'weighted'``: Calculate metrics for each label, and find their average, weighted by support (the number of true instances for each label). This alters 'macro' to account for label imbalance; it can result in an F-score that is not between precision and recall. ``'samples'``: Calculate metrics for each instance, and find their average (only meaningful for multilabel classification where this differs from :func:`accuracy_score`). sample_weight : ndarray, shape (n_samples, ) Sample weights. correction: float, optional (default=0.0) Substitutes sensitivity of unrecognized classes from zero to a given value. Returns ------- geometric_mean : float Notes ----- See :ref:`sphx_glr_auto_examples_evaluation_plot_metrics.py`. References ---------- .. [1] Kubat, M. and Matwin, S. "Addressing the curse of imbalanced training sets: one-sided selection" ICML (1997) .. [2] Barandela, R., Sánchez, J. S., Garcıa, V., & Rangel, E. "Strategies for learning in class imbalance problems", Pattern Recognition, 36(3), (2003), pp 849-851. 
Examples -------- >>> from imblearn.metrics import geometric_mean_score >>> y_true = [0, 1, 2, 0, 1, 2] >>> y_pred = [0, 2, 1, 0, 0, 1] >>> geometric_mean_score(y_true, y_pred) 0.0 >>> geometric_mean_score(y_true, y_pred, correction=0.001) 0.010000000000000004 >>> geometric_mean_score(y_true, y_pred, average='macro') 0.47140452079103168 >>> geometric_mean_score(y_true, y_pred, average='micro') 0.47140452079103168 >>> geometric_mean_score(y_true, y_pred, average='weighted') 0.47140452079103168 >>> geometric_mean_score(y_true, y_pred, average=None) array([ 0.8660254, 0. , 0. ]) """ if average is None or average != 'multiclass': sen, spe, _ = sensitivity_specificity_support( y_true, y_pred, labels=labels, pos_label=pos_label, average=average, warn_for=('specificity', 'specificity'), sample_weight=sample_weight) LOGGER.debug('The sensitivity and specificity are : %s - %s' % (sen, spe)) return np.sqrt(sen * spe) else: present_labels = unique_labels(y_true, y_pred) if labels is None: labels = present_labels n_labels = None else: n_labels = len(labels) labels = np.hstack([labels, np.setdiff1d(present_labels, labels, assume_unique=True)]) le = LabelEncoder() le.fit(labels) y_true = le.transform(y_true) y_pred = le.transform(y_pred) sorted_labels = le.classes_ # labels are now from 0 to len(labels) - 1 -> use bincount tp = y_true == y_pred tp_bins = y_true[tp] if sample_weight is not None: tp_bins_weights = np.asarray(sample_weight)[tp] else: tp_bins_weights = None if len(tp_bins): tp_sum = np.bincount(tp_bins, weights=tp_bins_weights, minlength=len(labels)) else: # Pathological case true_sum = tp_sum = np.zeros(len(labels)) if len(y_true): true_sum = np.bincount(y_true, weights=sample_weight, minlength=len(labels)) # Retain only selected labels indices = np.searchsorted(sorted_labels, labels[:n_labels]) tp_sum = tp_sum[indices] true_sum = true_sum[indices] recall = _prf_divide(tp_sum, true_sum, "recall", "true", None, "recall") recall[recall == 0] = correction gmean = sp.stats.gmean(recall) # old version of scipy return MaskedConstant instead of 0.0 if isinstance(gmean, np.ma.core.MaskedConstant): return 0.0 return gmean
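# A hedged sketch (not from imblearn itself) of the multiclass G-mean idea above: the score is
# the geometric mean of the per-class recalls, so one unrecognised class drives it to zero
# unless a correction is substituted.
import numpy as np
from scipy.stats import gmean
from sklearn.metrics import recall_score

y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
per_class_recall = recall_score(y_true, y_pred, average=None)  # [1., 0., 0.]
print(gmean(per_class_recall))                                           # 0.0
print(gmean(np.where(per_class_recall == 0, 0.001, per_class_recall)))   # ~0.01 with correction=0.001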
def data_preprocess(df_train_, df_test_, delete_cols = [], drop_mask = []): print 'data_preprocess', '-' * 100 train = df_train_.copy() test = df_test_.copy() train['TransactionAmt_to_mean_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('mean') train['TransactionAmt_to_mean_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('mean') train['TransactionAmt_to_std_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('std') train['TransactionAmt_to_std_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('std') test['TransactionAmt_to_mean_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('mean') test['TransactionAmt_to_mean_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('mean') test['TransactionAmt_to_std_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('std') test['TransactionAmt_to_std_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('std') train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean') train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean') train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std') train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std') test['id_02_to_mean_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('mean') test['id_02_to_mean_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('mean') test['id_02_to_std_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('std') test['id_02_to_std_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('std') train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean') train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean') train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std') train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std') test['D15_to_mean_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('mean') test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean') test['D15_to_std_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('std') test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std') train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean') train['D15_to_mean_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('mean') train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std') train['D15_to_std_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('std') test['D15_to_mean_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('mean') test['D15_to_mean_addr2'] = test['D15'] / test.groupby(['addr2'])['D15'].transform('mean') test['D15_to_std_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('std') test['D15_to_std_addr2'] = test['D15'] / test.groupby(['addr2'])['D15'].transform('std') train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['P_emaildomain'].str.split('.', expand=True) train[['R_emaildomain_1', 
'R_emaildomain_2', 'R_emaildomain_3']] = train['R_emaildomain'].str.split('.', expand=True) test[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = test['P_emaildomain'].str.split('.', expand=True) test[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = test['R_emaildomain'].str.split('.', expand=True) many_null_cols = [col for col in train.columns if train[col].isna().sum() * 1.0 / train.shape[0] > 0.9] many_null_cols_test = [col for col in test.columns if test[col].isna().sum() * 1.0 / test.shape[0] > 0.9] big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna = False, normalize = True).values[0] > 0.9] big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna = False, normalize = True).values[0] > 0.9] one_value_cols = [col for col in train.columns if train[col].nunique() <= 1] one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1] cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test + one_value_cols + one_value_cols_test)) cols_to_drop.remove('isFraud') print len(cols_to_drop) train = train.drop(cols_to_drop, axis = 1) test = test.drop(cols_to_drop, axis = 1) cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain', 'R_emaildomain', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9', 'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3'] for col in cat_cols: if col in train.columns: le = LabelEncoder() le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values)) train[col] = le.transform(list(train[col].astype(str).values)) test[col] = le.transform(list(test[col].astype(str).values)) X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1) y = train.sort_values('TransactionDT')['isFraud'] X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1) del train test = test[["TransactionDT", 'TransactionID']] # by https://www.kaggle.com/dimartinot def clean_inf_nan(df): return df.replace([np.inf, -np.inf], np.nan) # Cleaning infinite values to NaN X = clean_inf_nan(X) X_test = clean_inf_nan(X_test) return X, y, X_test
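The categorical loop above fits each LabelEncoder on the union of the train and test string values, which avoids "unseen label" errors at transform time. A hedged helper version of that step (the name `encode_train_test` is illustrative, not from the source):

from sklearn.preprocessing import LabelEncoder

def encode_train_test(train, test, col):
    # fit on the combined string values of both frames, then transform each in place
    le = LabelEncoder()
    combined = list(train[col].astype(str).values) + list(test[col].astype(str).values)
    le.fit(combined)
    train[col] = le.transform(train[col].astype(str).values)
    test[col] = le.transform(test[col].astype(str).values)
    return le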
# create 5 models, 1 for each action
# feed the data to all models and take the action with the highest score
training_data = pd.read_csv(
    r'/home/tmo/Projects/dat19_footballrobots/06_Data_Science/labeled_data.csv')

# replace the categorical columns with integer codes
training_data['decision'] = training_data['decision'].astype('category')
training_data['player_id'] = training_data['player_id'].astype('category')
training_data['mate_id'] = training_data['mate_id'].astype('category')
training_data['team_id'] = training_data['team_id'].astype('category')

le = LabelEncoder()
le.fit(training_data['decision'])
training_data['decision'] = le.transform(training_data['decision'])
le.fit(training_data['player_id'])
training_data['player_id'] = le.transform(training_data['player_id'])
le.fit(training_data['mate_id'])
training_data['mate_id'] = le.transform(training_data['mate_id'])
le.fit(training_data['team_id'])
training_data['team_id'] = le.transform(training_data['team_id'])

# split up data & labels
features_training, labels_training = training_data.iloc[:, 0:-1], training_data.iloc[:, -1:]
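Because the single `le` above is refit for every column, only the last fit ('team_id') remains recoverable afterwards. If each column may need to be decoded later, a per-column dictionary of encoders is safer; the sketch below is an illustrative alternative, not part of the original script.

from sklearn.preprocessing import LabelEncoder

encoders = {}
for col in ['decision', 'player_id', 'mate_id', 'team_id']:
    enc = LabelEncoder()
    training_data[col] = enc.fit_transform(training_data[col])
    encoders[col] = enc  # keep the fitted encoder so each column can be inverse_transformed later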
#If training image contains exactly one face if len(face_bounding_boxes) == 1: face_enc = face_recognition.face_encodings(face)[0] # Add face encoding for current image with corresponding label (name) to the training data encodings.append(face_enc) names.append(person) else: print(person + "/" + person_img + " was skipped and can't be used for training") # normalize input vectors in_encoder = Normalizer(norm='l2') encodings = in_encoder.transform(encodings) # label encode targets out_encoder = LabelEncoder() out_encoder.fit(names) names = out_encoder.transform(names) # Create and train the SVC classifier clf = svm.SVC(gamma='scale', probability=True) #clf = svm.SVC(kernel='linear', probability=True) clf.fit(encodings, names) # Load the test image with unknown faces into a numpy array test_image = face_recognition.load_image_file('test/test.jpg') # Find all the faces in the test image using the default HOG-based model face_locations = face_recognition.face_locations(test_image) no = len(face_locations) print("Number of faces detected: ", no)
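A hedged sketch of how the trained classifier might be applied to the faces detected in the test image; this continuation is illustrative and not taken from the source.

# encode each detected face, normalize it the same way as the training data,
# then map the SVC prediction back to a person name
test_encodings = face_recognition.face_encodings(test_image, known_face_locations=face_locations)
for enc in test_encodings:
    enc = in_encoder.transform([enc])
    pred = clf.predict(enc)
    name = out_encoder.inverse_transform(pred)[0]
    print("Predicted:", name)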
import sys
import pickle as pkl

from sklearn.preprocessing import LabelEncoder
from Decaf_feature import DecafFeature

dataset = {}
if len(sys.argv) == 1:
    print("Usage: classifier.py dataset_path")
else:
    print("Loading", sys.argv[1])
    with open(sys.argv[1], 'rb') as f:
        dataset = pkl.load(f)

# encode the class names as integers
le = LabelEncoder()
all_labels = list(dataset.keys())
le.fit(all_labels)

# fraction of samples used for training
_percent = 0.5
x_train = []
x_test = []
y_train = []
y_test = []
for k, v in dataset.items():
    print("Processing", k)
    X = []
    Y = []
    for item in v:
        Y.append(k)
        X.append(item)
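One way to finish the per-class split implied by `_percent` (an illustrative sketch, not code from the source): take the first fraction of each class for training and the rest for testing, encoding the labels with `le`.

    # inside the `for k, v in dataset.items():` loop, after X and Y are filled
    n_train = int(len(X) * _percent)
    x_train.extend(X[:n_train])
    y_train.extend(le.transform(Y[:n_train]))
    x_test.extend(X[n_train:])
    y_test.extend(le.transform(Y[n_train:]))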
#train_set=train_set[colname]
# reduce the output to 'normal' and 'attack'
test_labels = test_set['label'].copy()
test_labels[test_labels != 'normal'] = 'attack'

# First, transform all categorical attributes into numeric attributes.
# preprocessing nominal features
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

le = LabelEncoder()
# preprocess the 'service' attribute (large number of unique values)
# Methodology: compute the ratio of 'attack' records for each service, then
# replace the service name with that ratio.
service_encoder = le.fit(test_set['service'])  # fit() returns the fitted encoder itself
for element in service_encoder.classes_:
    N = test_set[test_set['service'] == element].shape[0]  # rows whose 'service' equals `element`
    Np = test_set[(test_set['service'] == element) & (test_set['label'] != 'normal')].shape[0]  # of those, rows labelled as attacks
    A = Np / N
    test_set.loc[test_set['service'] == element, 'service'] = A

# preprocess the 'flag' and 'protocol_type' attributes (few unique values)
a = pd.get_dummies(test_set[['flag', 'protocol_type']])
test_set = pd.concat([test_set, a], axis=1)
test_set = test_set.drop(['Unnamed: 0', 'flag', 'protocol_type'], axis=1)
#train_set['flag']=le.fit_transform(train_set['flag'])

# exclude the 'label' attribute from the set
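The 'service' encoding above is effectively a form of target encoding: each service name is replaced by the fraction of its records labelled as attacks. A minimal reusable sketch of the same idea (the function name is illustrative):

def attack_ratio_encode(df, col, label_col='label', normal_value='normal'):
    # replace each category in `col` with the share of its rows that are attacks
    for value in df[col].unique():
        mask = df[col] == value
        n_total = mask.sum()
        n_attack = (mask & (df[label_col] != normal_value)).sum()
        df.loc[mask, col] = n_attack / n_total
    return df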
# s = pd.Series(np.random.randn(100).cumsum()) # s.plot(linestyle='--', marker='.', color="r", grid=True) # plt.show() iris_tf = pd.read_csv("./iris.csv") X = iris_tf[['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']].values y = iris_tf['Species'].values # print(iris_tf.info()) # print(iris_tf.describe()) # print(iris_tf['Species'].value_counts()) # print(iris_tf.head()) enc = LabelEncoder() label_encoder = enc.fit(y) y = label_encoder.transform(y) fig = plt.figure(figsize=(6, 6)) ax = fig.add_subplot(211) label_dict = {0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'} colors = ['blue', 'red', 'green'] markers = ['s', 'o', '^'] for lab, c, m in zip(range(3), colors, markers): ax.scatter( X[y == lab, 2], X[y == lab, 3], c=c, # color
def add_cat(data, col): enc = LabelEncoder() enc.fit(data[col]) data[col + "Cat"] = enc.transform(data[col])
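A short usage sketch for add_cat; the demo frame and column name below are made up for illustration.

import pandas as pd

df_demo = pd.DataFrame({"color": ["red", "blue", "red"]})
add_cat(df_demo, "color")
print(df_demo["colorCat"].tolist())  # -> [1, 0, 1] (classes are ordered alphabetically)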
set(df.columns) - set([dep_var])) # List of all the column names (not the target class) # Copy the dataframe for a hacky solution to provide the TabularList.from_df() method with a full dataset data = df.copy() # Rip out the target classification column and save it df_y = df[dep_var] del df[dep_var] # Encode and transform the data # Since the target classification is categorical, we need to have a way to interpret the neural network's output. # In this case, we are assigning each possible output with an integer 0..n-1, another option would be to use one-hot # encoding here. This may be explored further. encoder = LabelEncoder() encoder.fit(df_y) data_y = encoder.transform(df_y) # Normalize the x data and rename for consistency data_x = (df - df.mean()) / (df.max() - df.min()) data_x = data_x.values # Set up the metrics we want to collect. I wanted TP,TN,FP,FN but that wasn't available. Recall and precision are still # extremely helpful for evaluating the model metrics = [accuracy, Recall(), Precision()] # Keep track of which fold we are on fold_num = 1 total_folds = 10 # Get the indices for the fold and train on that fold
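A hedged sketch of one way to produce the fold indices referred to above (StratifiedKFold and the seed are assumptions; the original may rely on fastai's own splitting utilities):

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=total_folds, shuffle=True, random_state=42)
for train_idx, valid_idx in skf.split(data_x, data_y):
    # data_x[train_idx] / data_y[train_idx] form this fold's training set,
    # data_x[valid_idx] / data_y[valid_idx] its validation set
    fold_num += 1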
from deap import creator, base, tools, algorithms
from scoop import futures
import random
import numpy
from scipy import interpolate
import matplotlib.pyplot as plt

# Read in data from CSV: the Telco Customer Churn set is loaded here; the UCI
# Bank Marketing set (https://archive.ics.uci.edu/ml/datasets/Bank+Marketing)
# is left as a commented-out alternative.
dfData = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.xls', sep=',')
#dfData=pd.read_csv('bank-additional-full.csv', sep=';')

# Encode the classification labels to numbers
# Get classes and one-hot encoded feature vectors
le = LabelEncoder()
le.fit(dfData['Churn'])
allClasses = le.transform(dfData['Churn'])
allFeatures = dfData.drop(['Churn'], axis=1)

# Form training, test, and validation sets
X_trainAndTest, X_validation, y_trainAndTest, y_validation = train_test_split(
    allFeatures, allClasses, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_trainAndTest,
                                                    y_trainAndTest,
                                                    test_size=0.20,
                                                    random_state=42)

# Feature subset fitness function
def getFitness(individual, X_train, X_test, y_train, y_test):
def main(): print('Using Keras version: ', keras.__version__) usage = 'usage: %prog [options]' parser = argparse.ArgumentParser(usage) parser.add_argument( '-t', '--train_model', dest='train_model', help= 'Option to train model or simply make diagnostic plots (0=False, 1=True)', default=1, type=int) parser.add_argument('-s', '--suff', dest='suffix', help='Option to choose suffix for training', default='', type=str) parser.add_argument('-p', '--para', dest='hyp_param_scan', help='Option to run hyper-parameter scan', default=0, type=int) parser.add_argument( '-i', '--inputs_file_path', dest='inputs_file_path', help= 'Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.', default='', type=str) args = parser.parse_args() do_model_fit = args.train_model suffix = args.suffix # Create instance of the input files directory #inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/' inputs_file_path = '' hyp_param_scan = args.hyp_param_scan # Set model hyper-parameters weights = 'BalanceYields' # 'BalanceYields' or 'BalanceNonWeighted' optimizer = 'Nadam' validation_split = 0.25 # hyper-parameter scan results if weights == 'BalanceNonWeighted': learn_rate = 0.0005 epochs = 200 batch_size = 200 if weights == 'BalanceYields': learn_rate = 0.0001 epochs = 600 batch_size = 500 #epochs = 10 #batch_size=200 # Create instance of output directory where all results are saved. output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights) check_dir(output_directory) hyperparam_file = os.path.join(output_directory, 'additional_model_hyper_params.txt') additional_hyperparams = open(hyperparam_file, 'w') additional_hyperparams.write("optimizer: " + optimizer + "\n") additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n") additional_hyperparams.write("epochs: " + str(epochs) + "\n") additional_hyperparams.write("validation_split: " + str(validation_split) + "\n") additional_hyperparams.write("weights: " + weights + "\n") # Create plots subdirectory plots_dir = os.path.join(output_directory, 'plots/') input_var_jsonFile = open('input_variables.json', 'r') #selection_criteria = '( 1.>0. )' selection_criteria = '( ( fabs(weight) < 10 ) )' # Load Variables from .json variable_list = json.load(input_var_jsonFile, encoding="utf-8").items() # Create list of headers for dataset .csv column_headers = [] for key, var in variable_list: column_headers.append(key) column_headers.append('weight') #column_headers.append('kinWeight') #column_headers.append('weight_NLO_node') column_headers.append('unweighted') column_headers.append('target') column_headers.append('key') column_headers.append('classweight') column_headers.append('process_ID') # Load ttree into .csv including all variables listed in column_headers print('<train-DNN> Input file path: ', inputs_file_path) outputdataframe_name = '%s/output_dataframe.csv' % (output_directory) if os.path.isfile(outputdataframe_name): data = pandas.read_csv(outputdataframe_name) print('<train-DNN> Loading data .csv from: %s . . . . ' % (outputdataframe_name)) else: print('<train-DNN> Creating new data .csv @: %s . . . . ' % (inputs_file_path)) data = load_data(inputs_file_path, column_headers, selection_criteria) # Change sentinal value to speed up training. data = data.mask(data < -25., -9.) data = data.mask(data == np.inf, -9.) data = data.mask(data == -np.inf, -9.) data = data.mask(data == np.nan, -9.) 
data_inf = data.isin([np.inf, -np.inf]) data_nan = data.isin([np.nan]) count_inf = np.isinf(data_inf).values.sum() count_nan = np.isinf(data_nan).values.sum() if count_inf > 0: print "WARNING ---> It contained " + str( count_inf) + " infinite values" if count_nan > 0: print "WARNING ---> It contained " + str(count_nan) + " NaN values" data.to_csv(outputdataframe_name, index=False) data = pandas.read_csv(outputdataframe_name) print('<main> data columns: ', (data.columns.values.tolist())) n = len(data) nHH = len(data.iloc[data.target.values == 1]) nbckg = len(data.iloc[data.target.values == 0]) print("Total (train+validation) length of HH = %i, bckg = %i" % (nHH, nbckg)) # Make instance of plotter tool Plotter = plotter() # Create statistically independant training/testing data traindataset, valdataset = train_test_split(data, test_size=0.25) valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False) print('<train-DNN> Training dataset shape: ', traindataset.shape) print('<train-DNN> Validation dataset shape: ', valdataset.shape) # Event weights weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'weight'] #weights_for_HH_NLO = traindataset.loc[traindataset['process_ID']=='HH', 'weight_NLO_node'] weights_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg', 'weight'] weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'weight'] weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'weight'] weights_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD', 'weight'] weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'weight'] weights_for_TTGsJets = traindataset.loc[traindataset['process_ID'] == 'TTGsJets', 'weight'] weights_for_WGsJets = traindataset.loc[traindataset['process_ID'] == 'WGsJets', 'weight'] weights_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW', 'weight'] HHsum_weighted = sum(weights_for_HH) Hggsum_weighted = sum(weights_for_Hgg) DiPhotonsum_weighted = sum(weights_for_DiPhoton) GJetsum_weighted = sum(weights_for_GJet) QCDsum_weighted = sum(weights_for_QCD) DYsum_weighted = sum(weights_for_DY) TTGsJetssum_weighted = sum(weights_for_TTGsJets) WGsJetssum_weighted = sum(weights_for_WGsJets) WWsum_weighted = sum(weights_for_WW) bckgsum_weighted = Hggsum_weighted + DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH', 'unweighted'] nevents_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg', 'unweighted'] nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] == 'DiPhoton', 'unweighted'] nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet', 'unweighted'] nevents_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD', 'unweighted'] nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY', 'unweighted'] nevents_for_TTGsJets = traindataset.loc[traindataset['process_ID'] == 'TTGsJets', 'unweighted'] nevents_for_WGsJets = traindataset.loc[traindataset['process_ID'] == 'WGsJets', 'unweighted'] nevents_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW', 'unweighted'] HHsum_unweighted = sum(nevents_for_HH) Hggsum_unweighted = sum(nevents_for_Hgg) DiPhotonsum_unweighted = sum(nevents_for_DiPhoton) GJetsum_unweighted = sum(nevents_for_GJet) QCDsum_unweighted = sum(nevents_for_QCD) DYsum_unweighted = sum(nevents_for_DY) TTGsJetssum_unweighted = 
sum(nevents_for_TTGsJets) WGsJetssum_unweighted = sum(nevents_for_WGsJets) WWsum_unweighted = sum(nevents_for_WW) bckgsum_unweighted = Hggsum_unweighted + DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted #HHsum_weighted= HHsum_weighted*2. HHsum_unweighted = HHsum_unweighted * 2. if weights == 'BalanceYields': print('HHsum_weighted= ', HHsum_weighted) print('Hggsum_weighted= ', Hggsum_weighted) print('DiPhotonsum_weighted= ', DiPhotonsum_weighted) print('GJetsum_weighted= ', GJetsum_weighted) print('QCDsum_weighted= ', QCDsum_weighted) print('DYsum_weighted= ', DYsum_weighted) print('TTGsJetssum_weighted= ', TTGsJetssum_weighted) print('WGsJetssum_weighted= ', WGsJetssum_weighted) print('WWsum_weighted= ', WWsum_weighted) print('bckgsum_weighted= ', bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = HHsum_unweighted / HHsum_weighted traindataset.loc[traindataset['process_ID'] == 'Hgg', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'QCD', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'TTGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'WGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) traindataset.loc[traindataset['process_ID'] == 'WW', ['classweight']] = (HHsum_unweighted / bckgsum_weighted) if weights == 'BalanceNonWeighted': print('HHsum_unweighted= ', HHsum_unweighted) print('Hggsum_unweighted= ', Hggsum_unweighted) print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted) print('GJetsum_unweighted= ', GJetsum_unweighted) print('QCDsum_unweighted= ', QCDsum_unweighted) print('DYsum_unweighted= ', DYsum_unweighted) print('TTGsJetssum_unweighted= ', TTGsJetssum_unweighted) print('WGsJetssum_unweighted= ', WGsJetssum_unweighted) print('WWsum_unweighted= ', WWsum_unweighted) print('bckgsum_unweighted= ', bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'HH', ['classweight']] = 1. 
traindataset.loc[traindataset['process_ID'] == 'Hgg', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'DiPhoton', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'GJet', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'QCD', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'DY', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'TTGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'WGsJets', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) traindataset.loc[traindataset['process_ID'] == 'WW', ['classweight']] = (HHsum_unweighted / bckgsum_unweighted) # Remove column headers that aren't input variables training_columns = column_headers[:-6] print('<train-DNN> Training features: ', training_columns) column_order_txt = '%s/column_order.txt' % (output_directory) column_order_file = open(column_order_txt, "wb") for tc_i in training_columns: line = tc_i + "\n" pickle.dump(str(line), column_order_file) num_variables = len(training_columns) # Extract training and testing data X_train = traindataset[training_columns].values X_test = valdataset[training_columns].values # Extract labels data Y_train = traindataset['target'].values Y_test = valdataset['target'].values # Create dataframe containing input features only (for correlation matrix) train_df = data.iloc[:traindataset.shape[0]] # Event weights if wanted #train_weights = traindataset['weight'].values*traindataset['weight_NLO_node'].values #test_weights = valdataset['weight'].values*valdataset['weight_NLO_node'].values #train_weights = abs(traindataset['weight'].values)*abs(traindataset['weight_NLO_node'].values) #test_weights = abs(valdataset['weight'].values)*abs(valdataset['weight_NLO_node'].values) #train_weights = abs(traindataset['weight'].values)*abs(traindataset['kinWeight'].values)*abs(traindataset['weight_NLO_node'].values) #test_weights = abs(valdataset['weight'].values)*abs(valdataset['kinWeight'].values)*abs(valdataset['weight_NLO_node'].values) train_weights = abs(traindataset['weight'].values) test_weights = abs(valdataset['weight'].values) # Weights applied during training. 
if weights == 'BalanceYields': #trainingweights = traindataset.loc[:,'classweight']*traindataset.loc[:,'weight']*traindataset.loc[:,'weight_NLO_node'] #trainingweights = traindataset.loc[:,'classweight'].abs()*traindataset.loc[:,'weight'].abs()*traindataset.loc[:,'weight_NLO_node'].abs() #trainingweights = traindataset.loc[:,'classweight'].abs() * traindataset.loc[:,'weight'].abs() * traindataset.loc[:,'kinWeight'].abs() * traindataset.loc[:,'weight_NLO_node'].abs() trainingweights = traindataset.loc[:, 'classweight'].abs( ) * traindataset.loc[:, 'weight'].abs() if weights == 'BalanceNonWeighted': trainingweights = traindataset.loc[:, 'classweight'] trainingweights = np.array(trainingweights) ## Input Variable Correlation plot correlation_plot_file_name = 'correlation_plot' Plotter.correlation_matrix(train_df) Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name + '.png') Plotter.save_plots(dir=plots_dir, filename=correlation_plot_file_name + '.pdf') # Fit label encoder to Y_train newencoder = LabelEncoder() newencoder.fit(Y_train) # Transform to encoded array encoded_Y = newencoder.transform(Y_train) encoded_Y_test = newencoder.transform(Y_test) if do_model_fit == 1: print('<train-BinaryDNN> Training new model . . . . ') histories = [] labels = [] if hyp_param_scan == 1: print('Begin at local time: ', time.localtime()) hyp_param_scan_name = 'hyp_param_scan_results.txt' hyp_param_scan_results = open(hyp_param_scan_name, 'a') time_str = str(time.localtime()) + '\n' hyp_param_scan_results.write(time_str) hyp_param_scan_results.write(weights) learn_rates = [0.00001, 0.0001] epochs = [150, 200] batch_size = [400, 500] param_grid = dict(learn_rate=learn_rates, epochs=epochs, batch_size=batch_size) model = KerasClassifier(build_fn=gscv_model, verbose=0) grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1) grid_result = grid.fit(X_train, Y_train, shuffle=True, sample_weight=trainingweights) print("Best score: %f , best params: %s" % (grid_result.best_score_, grid_result.best_params_)) hyp_param_scan_results.write( "Best score: %f , best params: %s\n" % (grid_result.best_score_, grid_result.best_params_)) means = grid_result.cv_results_['mean_test_score'] stds = grid_result.cv_results_['std_test_score'] params = grid_result.cv_results_['params'] for mean, stdev, param in zip(means, stds, params): print("Mean (stdev) test score: %f (%f) with parameters: %r" % (mean, stdev, param)) hyp_param_scan_results.write( "Mean (stdev) test score: %f (%f) with parameters: %r\n" % (mean, stdev, param)) exit() else: # Define model for analysis early_stopping_monitor = EarlyStopping(patience=100, monitor='val_loss', min_delta=0.0001, verbose=1) #model = baseline_model(num_variables, learn_rate=learn_rate) model = new_model(num_variables, learn_rate=learn_rate) # Fit the model # Batch size = examples before updating weights (larger = faster training) # Epoch = One pass over data (useful for periodic logging and evaluation) #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train)) history = model.fit(X_train, Y_train, validation_split=validation_split, epochs=epochs, batch_size=batch_size, verbose=1, shuffle=True, sample_weight=trainingweights, callbacks=[early_stopping_monitor]) histories.append(history) labels.append(optimizer) # Make plot of loss function evolution #Plotter.plot_training_progress_acc(histories, labels) #acc_progress_filename = 'DNN_acc_wrt_epoch' #Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename+'.png') 
#Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename+'.pdf') Plotter.history_plot(history, label='loss') Plotter.save_plots(dir=plots_dir, filename='history_loss.png') Plotter.save_plots(dir=plots_dir, filename='history_loss.pdf') Plotter.history_plot(history, label='acc') Plotter.save_plots(dir=plots_dir, filename='history_accuracy.png') Plotter.save_plots(dir=plots_dir, filename='history_accuracy.pdf') else: model_name = os.path.join(output_directory, 'model.h5') model = load_trained_model(model_name) # Node probabilities for training sample events result_probs = model.predict(np.array(X_train)) result_classes = model.predict_classes(np.array(X_train)) # Node probabilities for testing sample events result_probs_test = model.predict(np.array(X_test)) result_classes_test = model.predict_classes(np.array(X_test)) # Store model in file model_output_name = os.path.join(output_directory, 'model.h5') model.save(model_output_name) weights_output_name = os.path.join(output_directory, 'model_weights.h5') model.save_weights(weights_output_name) model_json = model.to_json() model_json_name = os.path.join(output_directory, 'model_serialised.json') with open(model_json_name, 'w') as json_file: json_file.write(model_json) model.summary() model_schematic_name = os.path.join(output_directory, 'model_schematic.png') #plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True) print('================') print('Training event labels: ', len(Y_train)) print('Training event probs', len(result_probs)) print('Training event weights: ', len(train_weights)) print('Testing events: ', len(Y_test)) print('Testing event probs', len(result_probs_test)) print('Testing event weights: ', len(test_weights)) print('================') # Initialise output directory. Plotter.plots_directory = plots_dir Plotter.output_directory = output_directory Plotter.ROC(model, X_test, Y_test, X_train, Y_train) Plotter.save_plots(dir=plots_dir, filename='ROC.png') Plotter.save_plots(dir=plots_dir, filename='ROC.pdf')
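Note that model.predict_classes (used above) is only available on Sequential models in older Keras releases and has since been removed. A hedged equivalent for a single-output binary model, assuming a 0.5 decision threshold:

result_probs = model.predict(np.array(X_train))
result_classes = (result_probs > 0.5).astype(int)
result_probs_test = model.predict(np.array(X_test))
result_classes_test = (result_probs_test > 0.5).astype(int)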
le = None

# grab all image paths in the current config.TRAIN split
print("[INFO] processing '{}'...".format(config.TRAIN))
p = os.path.sep.join([config.BASE_PATH, config.TRAIN])
imagePaths = list(paths.list_images(p))

# randomly shuffle the image paths and then extract the class
# labels from the file paths
random.shuffle(imagePaths)
labels = [p.split(os.path.sep)[-2] for p in imagePaths]

# if the label encoder is None, create it
if le is None:
    le = LabelEncoder()
    le.fit(labels)

# open the output CSV file for writing
csvPath = os.path.sep.join(
    [config.BASE_CSV_PATH, "{}.csv".format(config.TRAIN)])

# write the field names of the CSV; note that the length of the header depends
# on the CNN output size (here 7 x 7 x 1024 features per image)
cols = ["feat_{}".format(i) for i in range(0, 7 * 7 * 1024)]
fieldnames = ["class"] + cols
with open(csvPath, 'w', newline='') as outcsv:
    writer = csv.writer(outcsv)
    writer.writerow(fieldnames)

# loop over the images in batches
for (b, i) in enumerate(range(0, len(imagePaths), config.BATCH_SIZE)):
    # extract the batch of images and labels, then initialize the
opt_tau = thresholds[opt_idx] # Accuracy at maximal F1 score opt_acc = accuracy_score(identical, distances < opt_tau) # Plot F1 score and accuracy as function of distance threshold plt.plot(thresholds, f1_scores, label='F1 score') plt.plot(thresholds, acc_scores, label='Accuracy') plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold') plt.title("Accuracy at threshold " + str(opt_tau) + " = " + str(opt_acc)) plt.xlabel('Distance threshold') plt.legend() targets = np.array([m.name for m in metadata]) encoder = LabelEncoder() encoder.fit(targets) # Numerical encoding of identities y = encoder.transform(targets) train_idx = np.arange(metadata.shape[0]) % 2 != 0 test_idx = np.arange(metadata.shape[0]) % 2 == 0 # 50 train examples of 10 identities (5 examples each) X_train = embedded[train_idx] # 50 test examples of 10 identities (5 examples each) X_test = embedded[test_idx] y_train = y[train_idx] y_test = y[test_idx]
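A hedged sketch of a simple classifier on the face embeddings prepared above; the choice of a 1-nearest-neighbour model is illustrative and not taken from the source.

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
knn.fit(X_train, y_train)
print('KNN accuracy:', accuracy_score(y_test, knn.predict(X_test)))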
prefix="combine_model") if __name__ == '__main__': # load dataset management file and gene count matrix df = pd.read_csv(os.path.join(DATASET_PATH, "dataset.tsv"), header=0, sep='\t', index_col=0) sorted_cm = pd.read_csv(os.path.join(DATASET_PATH, "cm_final.tsv"), header=0, sep='\t', index_col=0) # convert categorical string label to numerical label label_encoder = LabelEncoder() label_encoder.fit(list(set(sorted_cm.loc[:, LABEL_COLUMN]))) class_list = label_encoder.classes_ # split training data and test data by individual samples df["sample"] = df.index.map(find_sample_name) df_sample_label = df.groupby("sample").first() train_index, test_index, _, _ = train_test_split( df_sample_label.index, df_sample_label.label, train_size=train_ratio, stratify=df_sample_label.label) # subset training and test gene count matrix train_cm = sorted_cm[df["sample"].isin(train_index)] test_cm = sorted_cm[df["sample"].isin(test_index)] # calculate number of genes cm_shape = train_cm.shape[1] - ADDITIONAL_COLUMN if cross_validation:
ages = titanic_reduced.iloc[:, 1]  # get the age column
ages = np.array(ages)
mean_age = np.mean(ages[~np.isnan(ages)])  # compute the mean age
ages[np.isnan(ages)] = mean_age  # replace NaN with the mean age
titanic_reduced["age"] = ages  # replace the data in the dataframe
print("\nPassenger data")
print(titanic_reduced.loc[12])

# convert sex male/female to 0/1
enc = LabelEncoder()
label_encoder = enc.fit(titanic_reduced["sex"])
#print("Categorical classes:", label_encoder.classes_)
integer_classes = label_encoder.transform(label_encoder.classes_)
#print("Integer classes:", integer_classes)
titanic_reduced["sex"] = label_encoder.transform(titanic_reduced["sex"])
#titanic_X[:, 2] = t

print('\nEncoding of categorical values\n')
print("Passenger data")
print(titanic_reduced.loc[12])

# ONE HOT ENCODING
# First, convert classes to 0..(N-1) integers using label_encoder
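A hedged sketch of the one-hot step announced above; this is one possible approach using pandas.get_dummies, while the original may use sklearn's OneHotEncoder instead.

# expand the integer-coded "sex" column into indicator columns
sex_onehot = pd.get_dummies(titanic_reduced["sex"], prefix="sex")
titanic_reduced = pd.concat([titanic_reduced.drop(columns=["sex"]), sex_onehot], axis=1)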
def create_dataset(train_path, test_path=None, valid_size=0.1, batch_size=32):
    X_train, X_val, y_train, y_val = utils.read_data(
        data_path=train_path, valid_size=valid_size)

    # fit the label encoder on the training labels and persist it for inference
    enc = LabelEncoder()
    enc.fit(y_train)
    print(enc.classes_)
    with open("models/label_enc.pkl", "wb") as f:
        pickle.dump(enc, f)

    y_train = enc.transform(y_train)
    y_train = to_categorical(y_train)
    train_gen = image.ImageDataGenerator(
        preprocessing_function=preprocess_input,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
    train_gen.fit(X_train)

    y_val = enc.transform(y_val)
    y_val = to_categorical(y_val)
    valid_gen = image.ImageDataGenerator(
        preprocessing_function=preprocess_input)
    valid_gen.fit(X_val)

    print("train: {}".format(len(y_train)))
    print("valid: {}".format(len(y_val)))

    if test_path is not None:
        X_test, y_test = utils.read_data(
            data_path=test_path, valid_size=0)
        y_test = enc.transform(y_test)
        y_test = to_categorical(y_test)
        test_gen = image.ImageDataGenerator(
            preprocessing_function=preprocess_input)
        test_gen.fit(X_test)

        train_gen = train_gen.flow(X_train, y_train, batch_size)
        valid_gen = valid_gen.flow(X_val, y_val, batch_size)
        test_gen = test_gen.flow(X_test, y_test, batch_size)
        print("test: {}".format(len(y_test)))
        return train_gen, valid_gen, test_gen

    # no test set: only return the train and validation iterators
    train_gen = train_gen.flow(X_train, y_train, batch_size)
    valid_gen = valid_gen.flow(X_val, y_val, batch_size)
    return train_gen, valid_gen
class AudioDataGenerator: def __init__(self, audio_representation: AudioRepresentation, kept_labels: List[str]): self._converter = AudioRepresentationConverterFactory.create_converter( audio_representation) self._encoder = LabelEncoder() self._num_classes = len(kept_labels) self._encoder.fit(kept_labels) def _read_wavfile(self, sample_filepath): file_data = wavfile.read(sample_filepath) samples = file_data[1] sr = file_data[0] if len(samples) >= sr: samples = samples else: samples = np.pad( samples, pad_width=(sr - len(samples), 0), mode="constant", constant_values=(0, 0), ) return sr, samples def get_data_shape(self, sample_filepath: Path): converted_sample = self._converter.convert_audio_signal( [self._read_wavfile(sample_filepath)])[0] return converted_sample.shape def flow(self, samples: List[Tuple[Path, str]], batch_size: int): random.shuffle(samples) while True: for chunk in chunks(samples, batch_size): files = [self._read_wavfile(path) for path, _ in chunk] converted = self._converter.convert_audio_signal(files) labels = [label for _, label in chunk] X = np.concatenate([converted]) y = to_categorical(self._encoder.transform(labels), self._num_classes) yield X, y def flow_in_memory(self, samples: List[Tuple[Path, str]], batch_size: int): random.shuffle(samples) data = [] for chunk in chunks(samples, batch_size): files = [self._read_wavfile(path) for path, _ in chunk] converted = self._converter.convert_audio_signal(files) labels = [label for _, label in chunk] data.append(( np.concatenate([converted]), to_categorical(self._encoder.transform(labels), num_classes=self._num_classes), )) while True: for chunk in data: yield chunk
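A hedged usage sketch of the generator class above; the representation object, label list, sample list, model and batch size below are illustrative placeholders.

gen = AudioDataGenerator(audio_representation, kept_labels=["yes", "no", "stop"])
batch_iter = gen.flow(train_samples, batch_size=32)  # train_samples: List[Tuple[Path, str]]
X_batch, y_batch = next(batch_iter)                  # one batch of converted audio and one-hot labels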
numpy.random.seed(0) dataframe = pandas.read_csv("iris.csv", header=None) dataset = dataframe.values X = dataset[:, 0:4].astype(float) Y = dataset[:, 4] # Preprocess the labels # LabelEncoder from scikit-learn turns each text label # (e.g "Iris-setosa", "Iris-versicolor") into a vector # In this case, each of the three labels are just assigned # a number from 0-2. encoder = LabelEncoder() encoder.fit(Y) encoded_Y = encoder.transform(Y) # to_categorical converts the numbered labels into a one-hot vector dummy_y = np_utils.to_categorical(encoded_Y) def baseline_model(): model = Sequential() model.add(Dense(4, input_dim=4, init='normal', activation='relu')) model.add(Dense(3, init='normal', activation='sigmoid')) model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) return model
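A hedged sketch of how such a model is commonly evaluated with scikit-learn's cross-validation tools; the fold count, seed, epoch count and batch size below are assumptions, not taken from the source.

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold, cross_val_score

estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean() * 100, results.std() * 100))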
def main(): epochs = 200 sequence_len = 6 glimse_size= [40 , 40 , 40] std = 0.2 h_g = h_l = 512 hidden_size = 512 num_classes = 2 batch_size = 10 learning_rate = 1e-4 data_path = "/home/faltay/Dataset_Binary_Mirrored_Rot" dementia_labels_path = "/home/faltay/3DCNN/Labels_Binary.pkl" # load preprocessed dementia types save_model_path = "/home/faltay/Glimpse/Saved_Models_200" plot_path = "/home/faltay/Glimpse/Saved_Models_200/Plots/" # image transformation img_x = 224 img_y = 160 begin_frame, end_frame, skip_frame = 70, 190, 1 print("[INFO] Starting ...") # Detect devices use_cuda = T.cuda.is_available() # check if GPU exists T.cuda.set_device(1) device = T.device("cuda" if use_cuda else "cpu") # use CPU or GPU # init tensorboard, a Plugin used for visualize the loss and acc logs_dir = './logs/' if not os.path.exists(logs_dir): os.makedirs(logs_dir) tensorboard_dir = logs_dir configure(tensorboard_dir) #model = neuro_dram_net.NeuroDram_network() model = original_model.RecurrentAttention(glimse_size, h_g, h_l, std, hidden_size, num_classes) # Whole line added by FATIH # Total number of parameters pytorch_total_params = sum(p.numel() for p in model.parameters()) # Total number of trainable parameters pytorch_total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad) model.to(device) # Added by FATIH optimizer = optim.AdamW(model.parameters(), lr=learning_rate, amsgrad=False) #optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.1, amsgrad=False) print("[INFO] Data is now loading ...") # Load data # load dementia types names with open(dementia_labels_path, 'rb') as f: action_names = pickle.load(f) # load dementia types names # convert labels -> category le = LabelEncoder() le.fit(action_names) # show how many classes there are list(le.classes_) # convert category -> 1-hot action_category = le.transform(action_names).reshape(-1, 1) enc = OneHotEncoder() enc.fit(action_category) dementia_type = [] fnames = os.listdir(data_path) all_names = [] for f in fnames: if f != ".ipynb_checkpoints": loc1 = f.find('_') test = (f[0: loc1]) # Temporarily we do not classify the unknown patients if test != "4" and test != "3": dementia_type.append(f[0: loc1]) all_names.append(f) # list all data files all_X_list = all_names # all video file names all_y_list = labels2cat(le, dementia_type) # all video labels # train, test, val split so no patients are repeated patients = [] labels = [] prev = [] index = 0 for pat in all_X_list: prev.append(pat[2:10]) for pat in all_X_list: if pat[2:10] not in patients: index = prev.index(pat[2:10]) patients.append(pat[2:10]) labels.append(dementia_type[index]) ## Undersampling process i = 0 for lab in labels: if lab == "0": undersampling = np.random.choice([2,5], p=[0.9, 0.1]) if undersampling ==2: del labels[i] del patients[i] i = i + 1 i = 0 for lab in labels: if lab == "0": undersampling = np.random.choice([2,5], p=[0.4, 0.6]) if undersampling ==2: del labels[i] del patients[i] i = i + 1 train_list_prev, val_list, train_label_prev, val_label = train_test_split(patients, labels, test_size=0.20, random_state=42) train_list,test_list , train_label, test_label = train_test_split(train_list_prev, train_label_prev, test_size=0.15, random_state=42) # Training set (To match patients with scanners) train_list_def = [] train_label_def = [] for pat in train_list: for scan in all_X_list: if pat == scan[2:10]: train_list_def.append(scan) for scan in train_list_def: index = all_X_list.index(scan) 
train_label_def.append(dementia_type[index]) train_list = train_list_def train_label = np.array(train_label_def).astype(np.int) # Validation set val_list_def = [] val_label_def = [] for pat in val_list: for scan in all_X_list: if pat == scan[2:10]: val_list_def.append(scan) for scan in val_list_def: index = all_X_list.index(scan) val_label_def.append(dementia_type[index]) val_list = val_list_def val_label = np.array(val_label_def).astype(np.int) # Test set test_list_def = [] test_label_def = [] for pat in test_list: for scan in all_X_list: if pat == scan[2:10]: test_list_def.append(scan) for scan in test_list_def: index = all_X_list.index(scan) test_label_def.append(dementia_type[index]) test_list = test_list_def test_label = np.array(test_label_def).astype(np.int) # print("Size Dataset: ",len(all_X_list)) # print("Train Dataset: ",len(train_list)) # print("Test Dataset: ",len(test_list)) # print("Val Dataset: ",len(val_list)) save(save_model_path+'/train_list.npy', train_list) save(save_model_path+'/test_list.npy', test_list) save(save_model_path+'/train_label.npy', train_label) save(save_model_path+'/test_label.npy', test_label) save(save_model_path+'/val_list.npy', val_list) save(save_model_path+'/val_label.npy', val_label) # image transformation transform = transforms.Compose([transforms.Resize([img_x, img_y]), transforms.ToTensor(), transforms.Normalize(mean=[0.5], std=[0.5])]) selected_frames = np.arange(begin_frame, end_frame, skip_frame).tolist() dataset_train = brain_data_loader.BrainDataset(data_path, train_list, train_label, selected_frames, transform = transform) dataset_validation = brain_data_loader.BrainDataset(data_path, val_list, val_label, selected_frames, transform = transform) dataset_test = brain_data_loader.BrainDataset(data_path, test_list, test_label, selected_frames, transform = transform) # # Dealing with class imbalances def make_weights_for_balanced_classes(images, nclasses): count = [0] * nclasses for item in images: count[item[1]] += 1 print(count, end="\r") weight_per_class = [0.] * nclasses N = float(sum(count)) print("") for i in range(nclasses): weight_per_class[i] = N/float(count[i]) print(weight_per_class[i], end="\r") weight = [0] * len(images) print("") for idx, val in enumerate(images): weight[idx] = weight_per_class[val[1]] print(weight[idx], end="\r") return weight weights = make_weights_for_balanced_classes(dataset_train, num_classes) weights = torch.DoubleTensor(weights) samp = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights),replacement=True) # load dementia types names params_train = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 4, 'pin_memory': True, 'sampler':samp} if use_cuda else {} #params_train = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 4, 'pin_memory': True} if use_cuda else {} params_val = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4, 'pin_memory': True} if use_cuda else {} train_loader = data.DataLoader(dataset_train, **params_train) valid_loader = data.DataLoader(dataset_validation, **params_val) test_loader = data.DataLoader(dataset_test, **params_val) print("[INFO] Data is loaded. 
") # start training for epoch in range(epochs): # train, test model train_loss, train_acc, imgs, locs = train(model, train_loader, optimizer, sequence_len, epoch, device) with torch.no_grad(): valid_acc = validation(model, valid_loader, optimizer, sequence_len, epoch, device,save_model_path, test=False) # print messages msg1 = "[Training] train loss: {:.3f} - train acc: {:.3f} " msg2 = " [Validation] val acc: {:.3f}" msg = msg1 + msg2 print(msg.format(train_loss, train_acc, valid_acc)) if train_acc > 98: print("Entro ") pickle.dump(imgs, open(plot_path + str(train_acc) + "train_g_{}.p".format(epoch),"wb")) pickle.dump(locs, open(plot_path + str(train_acc) +"train_l_{}.p".format(epoch),"wb")) # Test test_acc = validation(model, test_loader, optimizer , sequence_len, epoch, device, save_model_path,test=True)