def oneHotEncoding():
    print("-----------Try to ONE HOT ENCODING-----------------")
    setToCompare = 'abcdefghijklmnopqrstuvwxyz '
    ctoi = dict((c, i) for i, c in enumerate(setToCompare))
    itoc = dict((i, c) for i, c in enumerate(setToCompare))
    # integer encode input data
    integer_encoded = [ctoi[char] for char in musicdata]
    print(integer_encoded)
    # one hot encode
    onehot = list()
    for value in integer_encoded:
        letter = [0 for _ in range(len(setToCompare))]
        letter[value] = 1
        onehot.append(letter)
    print(onehot)
    # invert encoding
    inverted = itoc[np.argmax(onehot[0])]
    print(inverted)
    print("--------------------------ENCODING IN PROGRESS----------------------")
    labelencoder = LE()
    X1 = X
    Y1 = Y
    X1[:, 0] = labelencoder.fit_transform(X1[:, 0])
    # was `OHE([0])`, which passed [0] as the (unrelated) first positional argument
    onehotencoder = OHE(categorical_features=[0])  # requires scikit-learn < 0.20
    X1 = onehotencoder.fit_transform(X1)
    labelencoderY = LE()
    Y1 = labelencoderY.fit_transform(Y1)
    print(X1)
    print(Y1)
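# Note: OneHotEncoder's `categorical_features` argument (used above) was removed in
# scikit-learn 0.22. A minimal sketch of the modern equivalent, assuming X1 is a
# 2-D array whose first column is the only categorical one (the toy X1 below is
# illustrative, not from the original snippet):
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X1 = np.array([['red', 1.0], ['blue', 2.0], ['red', 3.0]], dtype=object)

# One-hot encode column 0 and pass the remaining columns through unchanged.
ct = ColumnTransformer(
    [('onehot', OneHotEncoder(sparse_output=False), [0])],  # use sparse=False on scikit-learn < 1.2
    remainder='passthrough')
X1_encoded = ct.fit_transform(X1)
print(X1_encoded)  # one-hot columns for column 0, then the numeric column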
def __init__(self, target_building, method, load_from_file=1):
    self.time_range = (None, None)
    if not load_from_file:
        # data features
        ids, self.fd = get_data_features(target_building,
                                         self.time_range[0],
                                         self.time_range[1])
        print('%d data streams loaded' % len(ids))
        # labels
        res = get_labels(target_building)
        label = [res[srcid] for srcid in ids]
        le = LE()
        self.label = le.fit_transform(label)
        res = [self.fd, self.label]
        with open('./%s_fs.pkl' % (target_building), 'wb') as wf:
            pk.dump(res, wf)
    else:
        with open('./%s_fs.pkl' % (target_building), 'rb') as rf:
            res = pk.load(rf)
        self.fd, self.label = res[0], res[1]
    print('# of classes:', len(set(self.label)))
    print('data features for %s with dim:' % target_building,
          np.concatenate(self.fd, axis=1).shape)
    self.method = method
    self.building = target_building
def partition_data(self, args):
    method, j = args
    if method == "vi":
        dp = BayesianGaussianMixture(
            n_components=self.K,
            weight_concentration_prior=self.alpha,
            max_iter=1,
            init_params='kmeans',
            weight_concentration_prior_type='dirichlet_process')
        dp.fit(self.X[self.U[j]])
        Z = dp.predict(self.X[self.U[j]]).astype(int)
        Z_star = dp.predict(self.X_star).astype(int)
    # was a second `if`, whose else branch silently overwrote the "vi" result with random labels
    elif method == "gmm":
        Z, Z_star = self.uncollapsed_dp_partition_alt(j)
    elif method == "kmean":
        km = KMeans(n_clusters=self.K)
        Z = km.fit_predict(self.X[self.U[j]]).astype(int)
        # was `self.X_star[self.U[j]]`; predict on all of X_star, as the other branches do
        Z_star = km.predict(self.X_star).astype(int)
    else:
        Z = np.random.choice(self.K, size=self.N_minibatch, replace=True)
        Z_star = np.random.choice(np.unique(Z), size=self.N_star, replace=True)
    # relabel cluster ids to a contiguous 0..K-1 range
    le = LE()
    le.fit(np.hstack((Z, Z_star)))
    Z = le.transform(Z)
    Z_star = le.transform(Z_star)
    if method == "vi":  # & (self.vi_partition):
        # resample any test assignment that landed in a cluster unseen in Z
        Z_diff = np.setdiff1d(Z_star, Z)
        if Z_diff.size > 0:
            idx = np.hstack([np.where(Z_star == k) for k in Z_diff]).flatten()
            unique_Z = np.unique(Z)
            post_Z = dp.predict_proba(self.X_star[idx])[:, unique_Z]
            Z_star[idx] = [np.random.choice(unique_Z, p=post_Z_i / post_Z_i.sum())
                           for post_Z_i in post_Z]
            assert np.setdiff1d(Z_star, Z).size == 0
    return (Z, Z_star)
def __init__(self, articles_folder=None, labels_file=None):
    super().__init__(articles_folder, labels_file)
    self.clean()
    self.sentences = [
        "[CLS] " + sentence + " [SEP]" for sentence in self.sentences
    ]
    self.le = LE()
    self.labels = self.le.fit_transform(self.gold_labels)
def labele(tbl, cols='all'):
    from sklearn.preprocessing import LabelEncoder as LE
    if cols == 'all':
        cols = tbl.columns
    le = LE()
    for ac in cols:  # was `tbl.columns`, which ignored the `cols` argument
        tbl.loc[:, ac] = le.fit(tbl[ac]).transform(tbl[ac])
        # might have to return le
    return tbl
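# The trailing "might have to return le" comment above is a real concern: without
# the fitted encoders, the integer codes cannot be mapped back to labels. A minimal
# sketch of a variant that also returns the encoders (the function name and the
# `encoders` dict are illustrative additions, not part of the original code):
from sklearn.preprocessing import LabelEncoder as LE

def labele_with_encoders(tbl, cols='all'):
    """Label-encode `cols` in place and return the fitted encoders."""
    if cols == 'all':
        cols = tbl.columns
    encoders = {}
    for ac in cols:
        le = LE()  # a fresh encoder per column, so each mapping stays recoverable
        tbl.loc[:, ac] = le.fit_transform(tbl[ac])
        encoders[ac] = le
    return tbl, encoders

# Usage: encoders['some_col'].inverse_transform(codes) recovers the original labels.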
def main():
    features = [
        'time1', 'id1', 'ppid1', 'pid1', 'exe1',
        'time2', 'id2', 'ppid2', 'pid2', 'exe2',
        'time3', 'id3', 'ppid3', 'pid3', 'exe3'
    ]
    try:
        fmatrix = pd.read_csv('feature_matrix.csv', header=0)
    except IOError:
        # no cached matrix yet: rebuild it from the per-run feature files
        fmatrix = pd.DataFrame({})
        for ftype in ['benign', 'malicious']:
            for i in range(1, 50):
                fvfile = 'fv_' + ftype + str(i) + '.out'
                timefile = 'times_' + ftype + str(i) + '.txt'
                fv = pd.read_csv(fvfile, header=0)
                with open(timefile) as t:
                    times = re.findall(r'(start|end)=(\d+)', t.read())
                start = int(times[0][1])
                end = int(times[1][1])
                if ftype == 'benign':
                    fv['label'] = [0] * len(fv)
                else:
                    fv['label'] = fv['time1'].apply(payload, args=(start, end))
                fmatrix = pd.concat((fmatrix, fv), ignore_index=True)
        for f in features:
            fmatrix[f] = LE().fit_transform(fmatrix[f])
        fmatrix.to_csv('feature_matrix.csv', index=False)
    # clf = xgb.XGBClassifier()
    clf = RandomForestClassifier()
    le = {f: LE() for f in features}  # note: built but never used below
    train, test = train_test_split(fmatrix)
    clf.fit(train[features], train['label'])
    ypred = clf.predict_proba(test[features])
    fpr, tpr, _ = roc_curve(test['label'], ypred[:, 1])
    layout = Layout(title='ROC curve for random forest with all features',
                    xaxis=dict(title='False positive rate'),
                    yaxis=dict(title='True positive rate'),
                    showlegend=True)
    fig = Figure(data=[
        Scatter(x=fpr, y=tpr, mode='lines', name='AUC %f' % auc(fpr, tpr))
    ],
                 layout=layout)
    py.plot(fig, filename='rf feature roc', auto_open=False)
def partition_data(self, j):
    dp = BayesianGaussianMixture(
        n_components=int(self.alpha * np.log(self.N)),
        weight_concentration_prior=self.alpha,
        init_params='kmeans',
        weight_concentration_prior_type='dirichlet_process')
    Z = dp.fit_predict(self.X[self.U[j]])
    # relabel cluster ids to a contiguous 0..K-1 range
    le = LE()
    Z = le.fit_transform(Z)
    Z_count = np.bincount(Z)
    assert Z.max() + 1 == Z_count.size
    self.K[j] = int(Z_count.size)
    self.marginal_LL_k[j] = {k: 0 for k in range(int(self.K[j]))}
    return (Z, Z_count)
def __init__(self, faqslist, type):
    self.faqslist = faqslist
    self.stemmer = LancasterStemmer()
    self.le = LE()
    self.vectorizers = {
        "tfidf": TfidfVectorGenerator(),
        "doc2vec": Doc2VecGenerator(),
        "bert": BertGenerator(),
        "sent2vec": Sent2VecGenerator()
    }
    self.build_model(type)
def __init__(self, faqslist):
    self.faqslist = faqslist
    self.stemmer = LancasterStemmer()
    self.le = LE()
    self.vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
    dataframeslist = [
        pd.read_csv(csvfile).dropna() for csvfile in self.faqslist
    ]
    self.data = pd.concat(dataframeslist, ignore_index=True)
    self.questions = self.data['Question'].values
    self.build_model()
def test_data_prep(csv_file):
    train_data = pd.read_csv(os.path.abspath(csv_file))
    train_data = train_data.set_index('PassengerId')
    train_data = train_data.drop(['Cabin', 'Name', 'Ticket'], axis=1)
    train_data['Age'] = train_data['Age'].fillna(value=train_data['Age'].mean())
    # was `train_data['Age'].fillna(...)`, which overwrote Fare with Age values
    train_data['Fare'] = train_data['Fare'].fillna(value=train_data['Fare'].mean())
    train_data = train_data.dropna(axis=0)
    le = LE()
    train_data['Sex'] = le.fit_transform(train_data['Sex'])
    train_data['Embarked'] = le.fit_transform(train_data['Embarked'])
    train_data['Age'] = (tf.keras.utils.normalize(np.array(train_data['Age']),
                                                  order=2)).reshape(-1, 1)
    train_data['Fare'] = (tf.keras.utils.normalize(np.array(train_data['Fare']),
                                                   order=2)).reshape(-1, 1)
    return (train_data)
def _load_dataset():
    df_train = pd.read_csv(ENROLL_TRAIN)
    df_train['dataset'] = 'train'
    df_test = pd.read_csv(ENROLL_TEST)
    df_test['dataset'] = 'test'
    df = pd.concat([df_train, df_test])
    truth_df = pd.read_csv(TRUTH_TRAIN, names=[
        'enrollment_id',
        'target',
    ]).replace(1.0, 'dropout').replace(0.0, 'continue')
    df = df.merge(truth_df, how='left', on='enrollment_id')
    df['target'] = df['target'].fillna('testcase')
    df['course_num'] = LE().fit_transform(df['course_id'])
    return df
def _class_q(y_true=0, y_pred=0):
    labels_en = LE()
    labels_en.fit(y_true)
    y_true = labels_en.transform(y_true)
    # was `y_pred = labels_en.fit(y_pred)`, which replaced y_pred with the encoder object
    y_pred = labels_en.transform(y_pred)
    try:
        # accuracy_score has no pos_label argument; the original call always raised
        a = accuracy_score(y_pred=y_pred, y_true=y_true)
    except Exception:
        a = np.nan
    try:
        fpr, tpr, thresholds = roc_curve(y_true, y_pred)
        b = auc(fpr, tpr)
    except Exception:
        b = np.nan
    try:
        c = average_precision_score(y_true, y_pred)
    except Exception:
        c = np.nan
    try:
        d = balanced_accuracy_score(y_pred=y_pred, y_true=y_true)
    except Exception:
        d = np.nan
    # e = roc_auc_score(y_true, y_pred, 'micro')
    erg = np.zeros((1, 4))
    erg[0, 0] = a
    erg[0, 1] = b
    erg[0, 2] = c
    erg[0, 3] = d
    res_class = pd.DataFrame(data=erg,
                             columns=[
                                 'accuracy_score', 'auc',
                                 'average_precision_score',
                                 'balanced_accuracy_score'
                             ])
    return res_class
def train_data_prep(csv_file):
    train_data = pd.read_csv(os.path.abspath(csv_file))
    train_data = train_data.set_index('PassengerId')
    train_data = train_data.drop(['Cabin', 'Name', 'Ticket'], axis=1)
    train_data['Age'] = train_data['Age'].fillna(value=train_data['Age'].mean())
    # was `train_data['Age'].fillna(...)`, which overwrote Fare with Age values
    train_data['Fare'] = train_data['Fare'].fillna(value=train_data['Fare'].mean())
    train_data = train_data.dropna(axis=0)
    le = LE()
    train_data['Sex'] = le.fit_transform(train_data['Sex'])
    train_data['Embarked'] = le.fit_transform(train_data['Embarked'])
    train_data['Age'] = (tf.keras.utils.normalize(np.array(train_data['Age']),
                                                  order=2)).reshape(-1, 1)
    train_data['Fare'] = (tf.keras.utils.normalize(np.array(train_data['Fare']),
                                                   order=2)).reshape(-1, 1)
    # train_data['Pclass'] = (tf.keras.utils.normalize(np.array(train_data['Pclass']), order=2)).reshape(-1, 1)
    # train_data['Sex'] = (tf.keras.utils.normalize(np.array(train_data['Sex']), order=2)).reshape(-1, 1)
    # train_data['Embarked'] = (tf.keras.utils.normalize(np.array(train_data['Embarked']), order=2)).reshape(-1, 1)
    # train_data['Parch'] = (tf.keras.utils.normalize(np.array(train_data['Parch']), order=2)).reshape(-1, 1)
    X = train_data.drop('Survived', axis=1)
    y = train_data['Survived']
    return (train_test_split(X, y, test_size=0.10, random_state=42))
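# Note: train_data_prep and test_data_prep each fit fresh LabelEncoders on their
# own file, so the same category can receive different integer codes if the two
# CSVs differ. A minimal sketch of the safer pattern; fit_encoders/apply_encoders
# and the `encoders` dict are illustrative names, not part of the original code:
from sklearn.preprocessing import LabelEncoder as LE

def fit_encoders(train_df, cols=('Sex', 'Embarked')):
    """Fit one LabelEncoder per categorical column on the training data only."""
    return {c: LE().fit(train_df[c]) for c in cols}

def apply_encoders(df, encoders):
    """Reuse the training-time encoders so integer codes stay consistent."""
    for c, le in encoders.items():
        df[c] = le.transform(df[c])
    return df

# encoders = fit_encoders(train_df)
# train_df = apply_encoders(train_df, encoders)
# test_df = apply_encoders(test_df, encoders)  # unseen labels raise ValueError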
def train_model(self):
    le = LE()
    jobs = pd.read_excel("train.xls")
    jobs['Location'] = le.fit_transform(jobs['Location'].values)  # re-encode city names
    # re-encode education level
    jobs['Education'] = jobs['Education'].replace('中专', 0)  # technical secondary school
    jobs['Education'] = jobs['Education'].replace('高中', 1)  # high school
    jobs['Education'] = jobs['Education'].replace('大专', 2)  # junior college
    jobs['Education'] = jobs['Education'].replace('本科', 3)  # bachelor's degree
    jobs['Education'] = jobs['Education'].replace('硕士', 4)  # master's degree
    ## feature selection
    X = jobs[['Location', 'Education', 'Experience']]
    ## target
    y = jobs['Salary']
    ## fit the model
    model = LR()
    model.fit(X, y)
    # raw strings keep the backslashes in the Windows paths from being read as escapes
    joblib.dump(le, r"G:\practical-training-main\machine_learning\model\le.model")
    joblib.dump(model, r"G:\practical-training-main\machine_learning\model\lr.model")
def __init__(
        self,
        data_path,
        var_dict,
        n_bins_range=range(2, 21),
        batch_size=512,
        n_epoch=20,
        embedding_dim=16,
        lr=0.001,
        weight_decay=0.0,
        verbose=False,
        cv=10,
        n_init_bins_list=[5, 10, 15, 20],
        co_occur_cutoff=1,
):
    self.data = pd.read_csv(data_path)
    self.var_dict = var_dict
    self.n_bins_range = n_bins_range
    self.n_init_bins_list = n_init_bins_list
    self.semantic_binning = SemanticBinning(self.var_dict,
                                            batch_size=batch_size,
                                            n_epoch=n_epoch,
                                            embedding_dim=embedding_dim,
                                            lr=lr,
                                            weight_decay=weight_decay,
                                            verbose=verbose,
                                            co_occur_cutoff=co_occur_cutoff)
    self.cv = cv
    self.y = LE().fit_transform(self.data[var_dict['class_var']])
    self.lr_params = [0.5, 1.0]
    self.dt_params = [3, 4, 5]
    self.rf_params = [10, 20, 30]
if EXPORT_MODELS:
    if not os.path.isdir(model_dir + '/feature_extractors'):
        os.makedirs(model_dir + '/feature_extractors')
    if not os.path.isdir(model_dir + '/pca'):
        os.makedirs(model_dir + '/pca')
    if not os.path.isdir(model_dir + '/models'):
        os.makedirs(model_dir + '/models')

if args.toy:
    data = pd.read_csv(train_path, sep=',', na_values=None,
                       na_filter=False).sample(10000)  # toy
else:
    data = pd.read_csv(train_path, sep=',', na_values=None, na_filter=False)

if not TESTING:
    if IMPORT_MODELS and os.path.exists(model_dir + "/feature_extractors/le.pkl"):
        label_le = load_model(model_dir + "/feature_extractors/le.pkl")
    else:
        label_le = LE().fit(data.click)
        if EXPORT_MODELS:
            with open(model_dir + "/feature_extractors/le.pkl", "wb") as f:
                # export the fitted label encoder
                pickle.dump(label_le, f, pickle.HIGHEST_PROTOCOL)
    label = label_le.transform(data.click)
    # don't let the answer become a feature, or the model sees the label directly
    del data['click']

# feature selection / dimensionality reduction is delegated to the SVD step
selected_col = ['spaceType', 'spaceId', 'adType', 'os', 'deviceType',
                'campaignId', 'advertiserId']
data = data[selected_col]

if IMPORT_MODELS and os.path.exists(model_dir + "/feature_extractors/dv.pkl"):
    dv = load_model(model_dir + "/feature_extractors/dv.pkl")
else:
    # this step needs plenty of RAM (more than 8 GB is certainly enough)
    dv = DictVectorizer(sparse=False).fit(data.T.to_dict().values())
    if EXPORT_MODELS:
# In[8]:

train_data = train_data.dropna(axis=0)

# In[9]:

pd.isna(train_data).sum()

# In[10]:

train_data[0:10]

# In[11]:

le = LE()
train_data['Sex'] = le.fit_transform(train_data['Sex'])
train_data['Embarked'] = le.fit_transform(train_data['Embarked'])

# In[12]:

train_data[0:10]

# In[13]:

train_data.describe()

# In[14]:

train_data['Age'] = (tf.keras.utils.normalize(np.array(train_data['Age']),
                                              order=2)).reshape(-1, 1)
def data_transform(sl_data, tk_data, tt_data, us_data):
    '''
    Merge serviceline data with ticket data
    Take distinct values in tags, group names, request types
    Create dummy variables for each
    '''
    # @local Variables
    # CATEG = ['
    le = LE()
    # CATEG = ['device_type', 'plan_var']
    try:
        db_conn = ms.connect("localhost", "root", "root", "churn_model")
        print('MySQL connection:\t\t\t[OK]')
    except:
        print('MySQL connection:\t\t\t[Fail]')
    try:
        db_curs = db_conn.cursor()
        print('Cursor:\t\t\t[OK]')
    except:
        print('Cursor:\t\t\t[Fail]')
    sl_data = sl_data[[
        'sl_uuid', 'churn_flag', 'device_switch', 'customer_life',
        'device_type', 'multiline_flag', 'plan_var', 'device_life'
    ]]
    tk_data = tk_data[['sl_uuid', 'num_tickets', 'first_time', 'full_time']]
    tt_data = tt_data[[
        'sl_uuid', 'group_name', 'request_type', 'ticket_class',
        'ticket_type', 'tags', 'diff_date'
    ]]
    us_data = us_data[['sl_uuid', 'voice_dt', 'data_dt']]

    # ---------PROCESS STEP:- MERGE DATA:: service line + number of tickets
    model_data = pd.merge(sl_data, tk_data, on=['sl_uuid'], how='left')
    model_data = pd.merge(model_data, us_data, on=['sl_uuid'], how='left')
    model_data = model_data.dropna(subset=['voice_dt'], how='all')
    print(model_data.columns)

    # ---------PROCESS STEP:- Get distinct values of dummy variables
    device_name = sl_data['device_type'].unique().tolist()
    plan_name = sl_data['plan_var'].unique().tolist()
    group_name = tt_data['group_name'].unique().tolist()
    request_type = tt_data['request_type'].unique().tolist()
    ticket_class = tt_data['ticket_class'].unique().tolist()
    ticket_type = tt_data['ticket_type'].unique().tolist()
    tags = tt_data['tags'].tolist()
    tags_distinct = []
    for i in tags:
        split_strings = i.split(' ')
        for j in split_strings:
            if j not in tags_distinct:
                tags_distinct.append(j)

    # @substep: write the unique values to files
    group_file = open('Dummy variables/Group_name.txt', 'w')
    request_file = open('Dummy variables/Request_type.txt', 'w')
    tclass_file = open('Dummy variables/Ticket_class.txt', 'w')
    tag_file = open('Dummy variables/Tag_name.txt', 'w')
    ttype_file = open('Dummy variables/Ticket_type.txt', 'w')
    for g in group_name:
        group_file.write(str(g) + '')
        group_file.write('\n')
    for r in request_type:
        request_file.write(str(r) + '')
        request_file.write('\n')
    for tc in ticket_class:
        tclass_file.write(str(tc) + '')
        tclass_file.write('\n')
    for t in tags_distinct:
        tag_file.write(str(t) + '')
        tag_file.write('\n')
    for tt in ticket_type:
        ttype_file.write(str(tt) + '')
        ttype_file.write('\n')
    print("Writing to files:\t\t\t[OK]")
    group_file.close()
    tclass_file.close()
    tag_file.close()
    ttype_file.close()
    request_file.close()

    dummy_var_header = group_name + request_type + ticket_class + ticket_type
    dummy_var_header2 = device_name + plan_name
    dummy = {}
    sl_uuid = model_data['sl_uuid'].unique()

    # Create dataframes for ticket type and then use that to merge with serviceline data
    group_df = pd.get_dummies(tt_data['group_name'])
    group_df['sl_uuid'] = tt_data['sl_uuid']
    group_df = group_df.groupby(['sl_uuid']).sum()
    group_df['sl_uuid'] = group_df.index.tolist()
    print(tt_data.head())
    print(group_df.head())
    request_df = pd.get_dummies(tt_data['request_type'])
    request_df['sl_uuid'] = tt_data['sl_uuid']
    request_df = request_df.groupby(['sl_uuid']).sum()
    request_df['sl_uuid'] = request_df.index.tolist()
    class_df = pd.get_dummies(tt_data['ticket_class'])
    class_df['sl_uuid'] = tt_data['sl_uuid']
    class_df = class_df.groupby(['sl_uuid']).sum()
    class_df['sl_uuid'] = class_df.index.tolist()
    type_df = pd.get_dummies(tt_data['ticket_type'])
    type_df['sl_uuid'] = tt_data['sl_uuid']
    type_df = type_df.groupby(['sl_uuid']).sum()
    type_df['sl_uuid'] = type_df.index.tolist()
    device_df = pd.get_dummies(sl_data['device_type'])
    device_df['sl_uuid'] = model_data['sl_uuid']
    plan_df = pd.get_dummies(sl_data['plan_var'])
    plan_df['sl_uuid'] = model_data['sl_uuid']
    print(len(model_data))

    # merge
    model_df = pd.merge(model_data[['sl_uuid']], group_df, on=['sl_uuid'], how='left')
    model_df.fillna(0, inplace=True)
    model_df = pd.merge(model_df, request_df, on=['sl_uuid'], how='left')
    model_df.fillna(0, inplace=True)
    model_df = pd.merge(model_df, class_df, on=['sl_uuid'], how='left')
    model_df.fillna(0, inplace=True)
    model_df = pd.merge(model_df, type_df, on=['sl_uuid'], how='left')
    model_df.fillna(0, inplace=True)
    model_df = pd.merge(model_df, device_df, on=['sl_uuid'], how='left')
    print(model_df.columns.tolist())
    model_df = pd.merge(model_df, plan_df, on=['sl_uuid'], how='left')
    print(model_df.columns.tolist())
    model_df = model_df.dropna()
    print(len(model_df))
    '''
    for i in sl_uuid:
        subset = tt_data[tt_data['sl_uuid'] == i]
        subset2 = sl_data[sl_data['sl_uuid'] == i]
        dummy_indiv = []
        if len(subset) > 0:
            #@comment - has some tickets and can be worked on creating dummy
            #group_name
            for x in group_name:
                if x in subset['group_name'].tolist():
                    dummy_indiv.append(subset['group_name'].tolist().count(x))
                else:
                    dummy_indiv.append(0)
            for x in request_type:
                if x in subset['request_type'].tolist():
                    dummy_indiv.append(subset['request_type'].tolist().count(x))
                else:
                    dummy_indiv.append(0)
            for x in ticket_class:
                if x in subset['ticket_class'].tolist():
                    dummy_indiv.append(subset['ticket_class'].tolist().count(x))
                else:
                    dummy_indiv.append(0)
            for x in ticket_type:
                if x in subset['ticket_type'].tolist():
                    dummy_indiv.append(subset['ticket_type'].tolist().count(x))
                else:
                    dummy_indiv.append(0)
    '''
    '''
            sl_tags = []
            temp = subset['tags'].tolist()
            for y in temp:
                sl_tags += y.split(' ')
            sl_tags = list(set(sl_tags))
            for x in tags_distinct:
                if x in sl_tags:
                    dummy_indiv.append(1)
                else:
                    dummy_indiv.append(0)
    '''
    '''
            dummy[i] = dummy_indiv
        else:
            dummy[i] = [0] * len(dummy_var_header)
        if len(subset2) > 0:
            print "entering subset2"
            dummy_indiv = []
            for x in device_name:
                if x in subset2['device_type'].tolist():
                    dummy_indiv.append(subset2['device_type'].tolist().count(x))
                else:
                    dummy_indiv.append(0)
            for x in plan_name:
                if x in subset2['plan_var'].tolist():
                    dummy_indiv.append(subset2['plan_var'].tolist().count(x))
                else:
                    dummy_indiv.append(0)
            dummy[i] += dummy_indiv
        else:
            dummy[i] += [0] * len(dummy_var_header2)
        print sl_uuid.tolist().index(i)
        print dummy[i]
    '''
    '''
    model_df = pd.DataFrame.from_dict(dummy, 'index')
    model_df.columns = dummy_var_header
    '''
    model_df = pd.merge(model_df, model_data, on=['sl_uuid'], how='left')
    col = model_df.columns.tolist()
    col.remove('device_type')
    col.remove('plan_var')
    model_df = model_df[col].copy()
    model_df['num_tickets'] = model_df['num_tickets'].fillna(0)
    col = model_df.columns.tolist()
    # col.remove('customer_life')

    # @sql write: to local db -> model dataset
    try:
        db_curs.execute('DROP TABLE model_data;')
    except:
        print('Table not present')
    '''
    model_df = model_data
    model_df['num_tickets'] = model_df['num_tickets'].fillna(0)
    col = model_df.columns.tolist()
    col.remove('num_tickets')
    model_df = model_df[['device_switch', 'churn_flag', 'sl_uuid']]
    pk.dump(model_df, open('/home/rsrash1990/Ravi Files/Work Projects/Churn Model/Pickle data/model_data.pk', 'w'))
    '''
    # model_df.to_sql(con=db_conn, name='model_data', if_exists='append', flavor='mysql')
    # print model_df.head()
    # print model_df.describe()
    '''
    for i in model_df.columns:
        temp = model_df[i].tolist()
        print (i + '\t:' + str(sum(x is None for x in temp)))
    '''
    # model = model_df[['sl_uuid', 'churn_flag']]
    pk.dump(
        model_df,
        open(
            '/home/rsrash1990/Ravi Files/Work Projects/Churn Model/Pickle data/model_df.pk',
            'wb'))  # 'wb': pickle needs a binary handle in Python 3
    return model_df
def preprocessing(dataset):
    for col in dataset.columns:
        le = LE()
        dataset[col] = le.fit_transform(dataset[col])
    return dataset
def __init__(self,
             target_building,
             target_srcids,
             fold,
             rounds,
             use_all_metadata=False,
             source_building=None):
    super(ActiveLearningInterface, self).__init__(
        target_building=target_building,
        target_srcids=target_srcids
    )
    srcids = [point['srcid'] for point
              in LabeledMetadata.objects(building=target_building)]
    pt_type = [LabeledMetadata.objects(srcid=srcid).first().point_tagset
               for srcid in srcids]
    if use_all_metadata:
        pt_name = []
        for srcid in srcids:
            raw_metadata = RawMetadata.objects(srcid=srcid).first().metadata
            sentence = '\n'.join([raw_metadata.get(metadata_type, '')
                                  for metadata_type in ['VendorGivenName',
                                                        'BACnetName',
                                                        'BACnetDescription']])
            pt_name.append(sentence)
    else:
        pt_name = [RawMetadata.objects(srcid=srcid).first()
                   .metadata['VendorGivenName'] for srcid in srcids]
    fn = get_name_features(pt_name)
    le = LE()
    try:
        le.fit(pt_type)
    except:
        pdb.set_trace()
    transfer_fn = []
    transfer_label = []
    if source_building:
        srcids = [point['srcid'] for point
                  in LabeledMetadata.objects(building=source_building)]
        source_type = [LabeledMetadata.objects(srcid=srcid).first().point_tagset
                       for srcid in srcids]
        source_name = [RawMetadata.objects(srcid=srcid).first()
                       .metadata['VendorGivenName'] for srcid in srcids]
        fn_all = get_name_features(pt_name + source_name)
        fn = fn_all[:len(pt_name), :]
        transfer_fn = fn_all[len(pt_name):, :]
        try:
            le.fit(pt_type + source_type)
            transfer_label = le.transform(source_type)
        except:
            pdb.set_trace()
        print('%d instances loaded from transferred bldg: %s'
              % (len(transfer_label), source_building))
    try:
        label = le.transform(pt_type)
    except:
        pdb.set_trace()
    # print('# of classes is %d' % len(np.unique(label)))
    print('running active learning by Hong on building %s' % target_building)
    print('%d instances loaded' % len(pt_name))
    self.learner = active_learning(
        fold,
        rounds,
        # 2 * len(np.unique(label)),
        28,
        fn,
        label,
        transfer_fn,
        transfer_label
    )
q1, q3 = np.percentile(df['fractal_dimension_worst_log'], [25, 75])
iqr = q3 - q1
lower_bound = q1 - (1.5 * iqr)
upper_bound = q3 + (1.5 * iqr)
median = np.median(df['fractal_dimension_worst_log'])
count = 0
for i in range(len(df['fractal_dimension_worst_log'])):
    if (df.iloc[i]['fractal_dimension_worst_log'] > upper_bound) or (
            df.iloc[i]['fractal_dimension_worst_log'] < lower_bound):
        count = count + 1
# print((count / len(df['fractal_dimension_worst_log'])) * 100)

# sns.countplot(x='diagnosis', data=df)

from sklearn.preprocessing import LabelEncoder as LE

df['diagnosis_1'] = LE().fit_transform(df['diagnosis'])
# print("\n\n", df.head(15))

# duplicate 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst'
# entries in the original drop list removed
df.drop([
    'radius_mean', 'Unnamed: 32', 'texture_mean', 'perimeter_mean',
    'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'fractal_dimension_mean', 'radius_se', 'texture_se',
    'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se',
    'concavity_se', 'concave points_se', 'symmetry_se',
    'fractal_dimension_se', 'radius_worst', 'compactness_worst',
    'concavity_worst', 'concave points_worst', 'symmetry_worst',
    'fractal_dimension_worst'
],
        axis=1,
        inplace=True)
import re
import tensorflow as tf
from tensorflow.contrib import learn  # TF 1.x only; removed in TensorFlow 2

import warnings
warnings.filterwarnings('ignore')

# Library for tensorflow logging
import logging
logging.getLogger().setLevel(logging.INFO)

df = pd.read_csv("../input/train.csv")

# Selecting features
features = ["bone_length", "rotting_flesh", "hair_length", "color", "has_soul"]
X = df[features]
y = df["type"]

# Encoding type (Ghost, Ghoul, Goblin) and color
from sklearn.preprocessing import LabelEncoder as LE
letype = LE()
y = letype.fit_transform(y)
lecolor = LE()
X["color"] = lecolor.fit_transform(X["color"])

# splitting function used for cross validation
# (sklearn.cross_validation was removed in scikit-learn 0.20)
from sklearn.cross_validation import train_test_split

# current test size = 0 to permit the usage of whole training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0,
                                                    random_state=9)

# define a network with a single hidden RELU layer of 15 hidden units
tf_clf_dnn = learn.DNNClassifier(hidden_units=[15], n_classes=3)
tf_clf_dnn.fit(X_train, y_train, max_steps=5500)

from sklearn.metrics import accuracy_score as as_
# print(as_(y_test, tf_clf_dnn.predict(X_test)))
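# Note: sklearn.cross_validation and tensorflow.contrib are gone from current
# releases. A minimal sketch of the same single-hidden-layer network on modern
# APIs, reusing X and y from above (the epoch count and the 0.2 test split are
# illustrative choices, not from the original snippet):
import tensorflow as tf
from sklearn.model_selection import train_test_split  # replaces sklearn.cross_validation

# recent scikit-learn rejects test_size=0.0, so hold out a small split instead
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=9)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(15, activation='relu'),    # single hidden ReLU layer
    tf.keras.layers.Dense(3, activation='softmax'),  # Ghost / Ghoul / Goblin
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=50, verbose=0)
print(model.evaluate(X_test, y_test, verbose=0))  # [loss, accuracy]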
data[i, 0] = data[i, 0].lower()
# print(data[:, 0])
'''
# Encoding loop
# data_enc = LE()
data_enc = np.full(8, None)
for i in [0, 5, 6, 7]:
    data_enc[i] = LE()
    data[:, i] = data_enc[i].fit_transform(data[:, i])
ohe_enc = OHE(categorical_features=[0])
# data = ohe_enc.fit_transform(data).toarray()
# data = data.astype(int)
print(data, pd.DataFrame(data))
'''
# Encoding
level_enc = LE()
activitve_enc = LE()
study_enc = LE()
biography_enc = LE()
data[:, 0] = level_enc.fit_transform(data[:, 0])
data[:, 5] = activitve_enc.fit_transform(data[:, 5])
data[:, 6] = study_enc.fit_transform(data[:, 6])
data[:, 7] = biography_enc.fit_transform(data[:, 7])
print(data, pd.DataFrame(data))

# export the fitted encoders
pickle.dump(level_enc, open('model/level_enc_model.sav', 'wb'))
pickle.dump(activitve_enc, open('model/activitve_enc_model.sav', 'wb'))
pickle.dump(study_enc, open('model/study_enc_model.sav', 'wb'))
pickle.dump(biography_enc, open('model/biography_enc_model.sav', 'wb'))

# normalization
def __init__(self, faqslist, type='tfidf'):
    self.faqslist = faqslist
    self.stemmer = LancasterStemmer()
    self.le = LE()
    self.classifier = None
    self.build_model(type)
# importing the dataset
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values

# taking care of missing data
# (Imputer was removed in scikit-learn 0.22; SimpleImputer replaces it)
from sklearn.preprocessing import Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encoding categorical data
from sklearn.preprocessing import LabelEncoder as LE
from sklearn.preprocessing import OneHotEncoder as OE
labelencoder_x = LE()
X[:, 0] = labelencoder_x.fit_transform(X[:, 0])
onehotencoder = OE(categorical_features=[0])  # requires scikit-learn < 0.20
X = onehotencoder.fit_transform(X).toarray()
labelencoder_y = LE()
Y = labelencoder_y.fit_transform(Y)

# splitting the dataset into a training set and a test set
from sklearn.model_selection import train_test_split
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)

# feature scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
X_Train = sc_x.fit_transform(X_Train)
X_Test = sc_x.transform(X_Test)
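# Note: on scikit-learn >= 0.22 the Imputer import above fails outright and
# OneHotEncoder no longer accepts categorical_features. A minimal sketch of the
# same preprocessing on the current API, assuming the same Data.csv layout
# (categorical column 0, numeric columns 1-2, target in column 3):
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :-1].values
Y = LabelEncoder().fit_transform(dataset.iloc[:, 3].values)

# one-hot encode column 0, mean-impute the numeric columns
ct = ColumnTransformer([
    ('onehot', OneHotEncoder(sparse_output=False), [0]),  # use sparse=False on scikit-learn < 1.2
    ('impute', SimpleImputer(strategy='mean'), [1, 2]),
])
X = ct.fit_transform(X)

X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.2,
                                                    random_state=0)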
def __init__(self,
             target_building,
             target_srcids,
             source_buildings,
             config={},
             load_from_file=1):
    super(BuildingAdapterInterface, self).__init__(
        target_building=target_building,
        source_buildings=source_buildings,
        target_srcids=target_srcids)

    # gather the source/target data and name features, labels
    # TODO: handle multiple source buildings
    self.stop_predict_flag = False
    if 'source_time_ranges' in config:
        self.source_time_ranges = config['source_time_ranges']
        assert len(self.source_time_ranges) == len(source_buildings)
    else:
        self.source_time_ranges = [(None, None)] * len(source_buildings)
    if 'target_time_range' in config:
        self.target_time_range = config['target_time_range']
    else:
        self.target_time_range = (None, None)
    if 'threshold' in config:
        self.threshold = config['threshold']
    else:
        self.threshold = 0.5

    source_building = source_buildings[0]
    if not load_from_file:
        # data features
        source_ids, train_fd = get_data_features(source_building,
                                                 self.source_time_ranges[0][0],
                                                 self.source_time_ranges[0][1])
        target_ids, test_fd = get_data_features(target_building,
                                                self.target_time_range[0],
                                                self.target_time_range[1])

        # name features, labels
        source_res = get_namefeatures_labels(source_building)
        train_label = [source_res[srcid][1] for srcid in source_ids]
        self.target_res = get_namefeatures_labels(target_building)
        test_fn = np.asarray(
            [self.target_res[tgtid][0] for tgtid in target_ids])
        test_label = [self.target_res[tgtid][1] for tgtid in target_ids]

        # find the label intersection
        intersect = list(set(test_label) & set(train_label))
        print('intersected tagsets:', intersect)

        # preserve the intersection, get ids for indexing data feature matrices
        if intersect:
            train_filtered = [[i, j] for i, j in enumerate(train_label)
                              if j in intersect]
            train_id, train_label = [list(x) for x in zip(*train_filtered)]
            test_filtered = [[i, j, k]
                             for i, (j, k) in enumerate(zip(test_label, target_ids))
                             if j in intersect]
            self.test_id, test_label, self.test_srcids = [
                list(x) for x in zip(*test_filtered)
            ]
        else:
            raise ValueError('no common labels!')

        self.train_fd = train_fd[train_id, :]
        self.test_fd = test_fd[self.test_id, :]
        self.test_fn = test_fn[self.test_id, :]
        print('%d training examples left' % len(self.train_fd))
        print('%d testing examples left' % len(self.test_fd))

        self.le = LE()
        self.le.fit(intersect)
        self.train_label = self.le.transform(train_label)
        self.test_label = self.le.transform(test_label)

        res = [
            self.train_fd, self.test_fd, self.train_label, self.test_label,
            self.test_fn, self.test_srcids, self.target_res, self.le
        ]
        with open('./%s-%s.pkl' % (source_building, target_building), 'wb') as wf:
            pk.dump(res, wf)
    else:
        print('loading from prestored file')
        with open('./%s-%s.pkl' % (source_building, target_building), 'rb') as rf:
            res = pk.load(rf)
        self.train_fd, self.test_fd, self.train_label, self.test_label, \
            self.test_fn, self.test_srcids, self.target_res, self.le = \
            res[0], res[1], res[2], res[3], res[4], res[5], res[6], res[7]

    print('# of classes:', len(set(self.train_label)))
    print('data features for %s with dim:' % source_building, self.train_fd.shape)
    print('data features for %s with dim:' % target_building, self.test_fd.shape)

    self.learner = transfer_learning(self.train_fd,
                                     self.test_fd,
                                     self.train_label,
                                     self.test_label,
                                     self.test_fn,
                                     threshold=self.threshold)
    self.run_auto()
# perform one hot encoding on categorical data of X
target_cols = [
    'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'native_country'
]
print('\n dataset = pandas.get_dummies(dataset)')
X = pd.get_dummies(X, columns=target_cols)
print('\n X.shape: after 1 hot')
print(X.shape)

# perform categorical encoding of Y since its values are a binary set
print('\n Y.head(10)')
print(Y.head(10))
encoder = LE()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
new_Y = pd.DataFrame(encoded_Y)
print('\n new_Y.head(10)')
print(new_Y.head(10))

# convert X and new_Y to numpy arrays
X = X.values
Y = new_Y.values

# min-max scale X to [0, 1]
scaler = MMS(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)
def patient_education_level():
    df = pd.concat([
        pd.read_csv(g.FILE_TRAIN_OUT, usecols=['education_level']),
        pd.read_csv(g.FILE_TEST_OUT, usecols=['education_level']),
    ])
    return LE().fit_transform(df.education_level).reshape((-1, 1))
l = []
l.append(np.round(item * 100, decimals=2))
print('Correlation between this group and attrition is {}%'.format(l[0]))

# Feature Engineering and Preparation for Machine Learning
# --------------------------------------------------------
#
# Now that I've got a feel for my data, I'll engage in a little feature engineering and preparation for classification.
#
# NOTE: Subsequent to preparing this kernel I learned that I could use the pandas.get_dummies() method to accomplish something very similar, with the advantage of being native to pandas, which I always use anyway. I won't make the change to this kernel, though.

# In[16]:

from sklearn.preprocessing import LabelEncoder as LE

data['Attrition'] = LE().fit_transform(data['Attrition'])
data['Department'] = LE().fit_transform(data['Department'])
data['EducationField'] = LE().fit_transform(data['EducationField'])
data['Gender'] = LE().fit_transform(data['Gender'])
data['JobRole'] = LE().fit_transform(data['JobRole'])
data['MaritalStatus'] = LE().fit_transform(data['MaritalStatus'])
data['Over18'] = LE().fit_transform(data['Over18'])
data['OverTime'] = LE().fit_transform(data['OverTime'])
data['BusinessTravel'] = LE().fit_transform(data['BusinessTravel'])

del data['left']
del data['OT']
del data['EmployeeNumber']
del data['EmployeeCount']

# Now let's see if we can extract some goodness with a clustering algorithm. Something I have noticed with HR datasets is that employee behavior seems to exhibit a fair amount of clustering.
#
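# As the NOTE above says, the same preparation can stay inside pandas. A minimal
# sketch of the get_dummies() route for the same frame, assuming the usual
# 'Yes'/'No' coding of Attrition in this dataset:
import pandas as pd

cat_cols = ['Department', 'EducationField', 'Gender', 'JobRole',
            'MaritalStatus', 'Over18', 'OverTime', 'BusinessTravel']

# keep Attrition a single 0/1 target instead of a pair of dummy columns
data['Attrition'] = (data['Attrition'] == 'Yes').astype(int)
data = pd.get_dummies(data, columns=cat_cols)  # one indicator column per category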
def show_course_activity(course_cd=9):
    with open(TEMPFILE, 'rb') as f:
        log_df = pickle.load(f)

    # Load logs
    enr_df = load_enroll()
    ref_course_df = pd.read_csv(REF_COURSE_CODE)
    truth_df = load_truth().fillna(-1)
    enr_df = enr_df.merge(truth_df, how='left', on='enrollment_id')
    enr_df = enr_df.merge(ref_course_df, how='left', on='course_id')
    enr_df = enr_df[enr_df['course_cd'] == course_cd]
    log_df = log_df[log_df['course_cd'] == course_cd]
    obj_df = pd.read_csv(OBJECT,
                         usecols=['module_id', 'category'
                                  ]).rename(columns={'module_id': 'object'})
    log_df = log_df.merge(obj_df, how='left', on='object')

    # Encode object ids
    log_df = log_df.sort_values('time')  # DataFrame.sort() was removed in pandas 0.20
    obj_time = log_df.groupby('object').head(1).reset_index()[[
        'object', 'time'
    ]]
    obj_encoder = LE()
    obj_time_ls = obj_encoder.fit(obj_time['object'])
    uniq_obj = len(log_df['object'].unique())
    uniq_obj_names = sorted(obj_df['category'].unique())

    true_enr_id = enr_df[enr_df['target'] == 1].head(50)
    false_enr_id = enr_df[enr_df['target'] == 0].head(50)
    f, ax_list = plt.subplots(50, 2, figsize=(10, 13), sharex=True)

    # For top50 dropout enrollment
    for i, (idx, row) in enumerate(true_enr_id.iterrows()):
        enr_id = row['enrollment_id']
        df = log_df[log_df['enrollment_id'] == enr_id]
        ax = ax_list[i, 0]
        sns.set_palette('husl')
        for category_name in uniq_obj_names:
            selected_df = df[df['category'] == category_name]
            ax.plot(selected_df['time'].map(parse_date),
                    obj_encoder.transform(selected_df['object']), '.')
        ax.set_ylim((0, uniq_obj))
        _change_tick_fontsize(ax, 8)
        dateFmt = mpl.dates.DateFormatter('%Y-%m-%d')
        ax.xaxis.set_major_formatter(dateFmt)
        daysLoc = mpl.dates.DayLocator()
        hoursLoc = mpl.dates.HourLocator(interval=6)
        ax.xaxis.set_major_locator(daysLoc)
        ax.xaxis.set_minor_locator(hoursLoc)
        for ticklabel in ax.xaxis.get_ticklabels():
            ticklabel.set_rotation(80)

    # For top50 continue enrollment
    for i, (idx, row) in enumerate(false_enr_id.iterrows()):
        enr_id = row['enrollment_id']
        df = log_df[log_df['enrollment_id'] == enr_id]
        ax = ax_list[i, 1]
        sns.set_palette('husl')
        for category_name in uniq_obj_names:
            selected_df = df[df['category'] == category_name]
            ax.plot(selected_df['time'].map(parse_date),
                    obj_encoder.transform(selected_df['object']), '.')
        ax.set_ylim((0, uniq_obj))
        _change_tick_fontsize(ax, 8)
        dateFmt = mpl.dates.DateFormatter('%Y-%m-%d')
        ax.xaxis.set_major_formatter(dateFmt)
        daysLoc = mpl.dates.DayLocator()
        hoursLoc = mpl.dates.HourLocator(interval=6)
        ax.xaxis.set_major_locator(daysLoc)
        ax.xaxis.set_minor_locator(hoursLoc)
        for ticklabel in ax.xaxis.get_ticklabels():
            ticklabel.set_rotation(80)

    plt.tight_layout()
    plt.subplots_adjust(top=0.962, hspace=0.09)
    plt.savefig(OUTPUT_PATH)