def data():
    """Load the ASTRAL train/test CSVs and select features with an L1 LinearSVC.

    For each file, every column after the first is taken as a feature and
    the ``class`` column supplies the labels. The scaler, the label encoder
    and the feature selector are all fitted on the training file only and
    then applied unchanged to the test file, so both splits share one
    feature scale, one label-to-index mapping and one one-hot width.

    Returns:
        tuple: ``(X_train1, X_test1, y_train, y_test)`` -- the selected
        training and test feature matrices, followed by the labels as
        returned by ``np_utils.to_categorical``.

    Raises:
        ValueError: from ``LabelEncoder.transform`` if the test file
            contains a class that never occurs in the training file.
    """
    PATH = "/ASTRAL186/train1/astral_train.csv"
    dataset = pd.read_csv(PATH)
    col1 = dataset.columns.values.tolist()[1:]
    print(len(col1))  # 2460 per the original author's note
    X_train = np.array(dataset[col1])
    # One encoder for both splits: two independent fits would assign
    # different integer ids whenever the test file lacks a class.
    encoder = preprocessing.LabelEncoder().fit(dataset['class'])
    y_train = encoder.transform(dataset['class'])
    print(len(y_train))  # 5273 per the original author's note
    scale = StandardScaler().fit(X_train)
    X_train = scale.transform(X_train)

    PATH_ = "/ASTRAL186/test1/astral_test.csv"
    dataset_ = pd.read_csv(PATH_)
    col1_ = dataset_.columns.values.tolist()[1:]
    print(len(col1_))  # 2460 per the original author's note
    X_test = np.array(dataset_[col1_])
    y_test = encoder.transform(dataset_['class'])
    print(len(y_test))  # 1319 per the original author's note
    # Reuse the training-set statistics. Re-fitting the scaler on the test
    # data would put the two splits on different scales, while the selector
    # below (and any model trained afterwards) only ever sees the train scale.
    X_test = scale.transform(X_test)

    # Feature selection with an L1-penalized linear model.
    # Alternative tried by the original author:
    #   LogisticRegression(penalty='l1', C=0.1, solver='liblinear',
    #                      random_state=0)
    # Original experiment notes, kept verbatim (they appear to pair a
    # dataset and threshold with the resulting feature count):
    #   LogisticRegression: edd+1.25mean(676)/tg+1.25mean(584)/
    #       astral+1.25mean(549)/astral_train+1.0*mean(867)
    #   LinearSVC: dd+1.5mean(584)/le+mean(554)/astral_train1+1.25mean(794)
    clf = LinearSVC(penalty='l1', C=0.1, dual=False, random_state=0)
    clf.fit(X_train, y_train)
    # Per-feature importance: L1 norm of the coefficients across classes.
    importance = np.linalg.norm(clf.coef_, axis=0, ord=1)
    mean = np.mean(importance)
    model = SelectFromModel(clf, prefit=True, threshold=1.25 * mean)
    X_train1 = model.transform(X_train)
    print(X_train1.shape[1])  # 867/794 per the original author's note
    X_test1 = model.transform(X_test)
    print(X_test1.shape[1])  # 867/794 per the original author's note

    # Pass the class count explicitly (positionally, because the keyword
    # name differs between Keras versions) so both label matrices have the
    # same width even when the test split is missing the highest class id.
    n_classes = len(encoder.classes_)
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)
    return X_train1, X_test1, y_train, y_test
def read_xy(datapath):
    """Read a CSV file and return standardized features with encoded labels.

    Every column after the first is used as a feature; labels are read from
    the ``class`` column. The number of features and the number of rows are
    printed along the way.

    NOTE(review): this file defines ``read_xy`` a second time further down;
    the later definition rebinds the name once the module is loaded.

    Args:
        datapath: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        tuple: ``(X, y, f_dim)`` -- the standardized feature matrix, the
        labels after ``np_utils.to_categorical`` (presumably Keras' one-hot
        encoding), and the number of feature columns.
    """
    frame = pd.read_csv(datapath)

    # Drop the first column name; everything after it is a feature.
    feature_names = frame.columns.tolist()[1:]
    print(len(feature_names))
    raw_features = np.array(frame[feature_names])

    # Map the class names to integer ids.
    label_ids = preprocessing.LabelEncoder().fit_transform(frame['class'])
    print(len(label_ids))

    # Standardize with a scaler fitted on this same matrix.
    scaled = StandardScaler().fit(raw_features).transform(raw_features)
    n_features = scaled.shape[1]

    return scaled, np_utils.to_categorical(label_ids), n_features
def read_xy(PATH):
    """Read a CSV file, standardize it and keep the L1-selected features.

    Every column after the first is used as a feature; labels are read from
    the ``class`` column. An L1-penalized logistic regression is fitted on
    the standardized features, and ``SelectFromModel`` then discards the
    features whose importance falls below twice the mean importance.

    Args:
        PATH: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        tuple: ``(X, y, f_dim)`` -- the selected feature columns, the labels
        after ``np_utils.to_categorical`` (presumably Keras' one-hot
        encoding), and the number of selected features.
    """
    # Load the raw table with pandas.
    frame = pd.read_csv(PATH)

    # Header names minus the first column are the feature columns.
    feature_names = frame.columns.tolist()[1:]
    print(len(feature_names))  # feature dimensionality
    features = np.array(frame[feature_names])

    # Encode the class names as integer ids.
    label_ids = preprocessing.LabelEncoder().fit_transform(frame['class'])
    print(len(label_ids))

    # Standardize the feature matrix. (Original note: not needed for
    # probabilistic or tree-based models that do not rely on distances.)
    features = StandardScaler().fit(features).transform(features)

    # Model-based feature selection via SelectFromModel.
    # Original notes: a smaller C selects fewer features; with Lasso, a
    # larger alpha selects fewer features. The 'newton-cg', 'sag' and
    # 'lbfgs' solvers only support L2 regularization, 'liblinear' supports
    # both L1 and L2, and dual=True only supports the L2 penalty.
    # Alternatives the original author kept disabled: regressors with a
    # 1 x n_features coef_ (Lasso, LassoCV, Ridge, ElasticNet,
    # LinearRegression, scored with np.abs(coef_)), L2 LogisticRegression,
    # a custom LR(threshold=0.5, C=0.1), and LinearSVC with L1 or L2.
    selector_base = LogisticRegression(
        penalty='l1',
        C=0.1,
        solver='liblinear',
        random_state=0,
    )  # coef_ has shape n_classes x n_features
    selector_base.fit(features, label_ids)

    # Each class has one weight per feature; summing the absolute weights
    # over the classes gives that feature's importance score. This is the
    # column-wise L1 norm of coef_.
    importance = np.abs(selector_base.coef_).sum(axis=0)
    cutoff = 2.0 * importance.mean()

    # Original notes on `threshold`: it may be 'median', 'mean' or a scaled
    # form such as '1.25*mean'; if omitted, 1e-5 is used for an L1-penalized
    # estimator and the mean otherwise. A disabled variant fitted
    # SelectFromModel(estimator=clf) directly and printed its threshold_.
    selector = SelectFromModel(selector_base, prefit=True, threshold=cutoff)

    selected = selector.transform(features)
    n_selected = selected.shape[1]
    print(n_selected)

    encoded_labels = np_utils.to_categorical(label_ids)
    return selected, encoded_labels, n_selected