Example #1
0
def data():
    """Load the ASTRAL186 train/test CSVs and return L1-selected features.

    Both files are read from hard-coded paths.  In each file every column
    after the first is used as a feature and the ``class`` column as the
    label.  Features are standardized, an L1-penalized ``LinearSVC`` is
    fitted on the training set, and ``SelectFromModel`` keeps the features
    whose importance is at least 1.25 times the mean importance.

    All preprocessing state (scaler, label encoder, feature selector) is
    fitted on the training set only and then applied unchanged to the test
    set, so both sets share one feature scale and one label-to-index mapping.

    Returns:
        tuple: ``(X_train1, X_test1, y_train, y_test)`` -- the reduced
        training and test feature matrices followed by the one-hot encoded
        training and test labels.

    Raises:
        ValueError: raised by ``LabelEncoder.transform`` when the test file
            contains a class label that never occurs in the training file.
    """
    PATH = "/ASTRAL186/train1/astral_train.csv"
    dataset = pd.read_csv(PATH)
    col1 = dataset.columns.values.tolist()[1:]  # every column but the first
    print(len(col1))  # author's recorded value: 2460
    X_train = np.array(dataset[col1])
    label_encoder = preprocessing.LabelEncoder()
    y_train = label_encoder.fit_transform(dataset['class'])
    print(len(y_train))  # author's recorded value: 5273
    scale = StandardScaler().fit(X_train)
    X_train = scale.transform(X_train)

    PATH_ = "/ASTRAL186/test1/astral_test.csv"
    dataset_ = pd.read_csv(PATH_)
    col1_ = dataset_.columns.values.tolist()[1:]
    print(len(col1_))  # author's recorded value: 2460
    X_test = np.array(dataset_[col1_])
    # Reuse the training encoder so a given class maps to the same integer
    # in both sets; a fresh encoder would renumber whenever the test file
    # lacks (or adds) a class.
    y_test = label_encoder.transform(dataset_['class'])
    print(len(y_test))  # author's recorded value: 1319
    # Reuse the training mean/variance: re-fitting the scaler on the test
    # set would put the two sets on different scales and leak test-set
    # statistics into the evaluation.
    X_test = scale.transform(X_test)

    # Feature selection.
    # NOTE(review): the original carried experiment notes that look like
    # "dataset + threshold multiplier (selected feature count)":
    #   LogisticRegression(penalty='l1', C=0.1, solver='liblinear'):
    #     edd+1.25mean(676) / tg+1.25mean(584) / astral+1.25mean(549)
    #     / astral_train+1.0*mean(867)
    #   LinearSVC(penalty='l1', C=0.1, dual=False):
    #     dd+1.5mean(584) / le+mean(554) / astral_train1+1.25mean(794)
    clf = LinearSVC(penalty='l1', C=0.1, dual=False, random_state=0)
    clf.fit(X_train, y_train)
    # Per-feature importance: L1 norm of the coefficient column, i.e. the
    # sum of absolute weights over the rows of ``coef_``.
    importance = np.linalg.norm(clf.coef_, axis=0, ord=1)
    mean = np.mean(importance)
    model = SelectFromModel(clf, prefit=True, threshold=1.25 * mean)

    X_train1 = model.transform(X_train)
    print(X_train1.shape[1])  # author's recorded value: 867/794
    X_test1 = model.transform(X_test)
    print(X_test1.shape[1])  # author's recorded value: 867/794

    # Pin the one-hot width to the number of training classes so both label
    # matrices have the same number of columns.  Passed positionally because
    # the keyword name differs between Keras releases.
    n_classes = len(label_encoder.classes_)
    y_train = np_utils.to_categorical(y_train, n_classes)
    y_test = np_utils.to_categorical(y_test, n_classes)

    return X_train1, X_test1, y_train, y_test
Example #2
0
def read_xy(datapath):
    """Read a labelled CSV and return standardized features and one-hot labels.

    Every column after the first is used as a feature; the ``class`` column
    supplies the labels.  The number of feature columns and the number of
    rows are printed as a side effect.

    Args:
        datapath: location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        tuple: ``(X, y, f_dim)`` -- the standardized feature matrix, the
        one-hot encoded labels, and the number of feature columns.
    """
    frame = pd.read_csv(datapath)

    # Skip the first column; everything after it is a feature.
    feature_names = frame.columns.values.tolist()[1:]
    print(len(feature_names))
    features = np.array(frame[feature_names])

    # Map the class names to consecutive integers.
    encoder = preprocessing.LabelEncoder()
    labels = encoder.fit_transform(frame['class'])
    print(len(labels))

    # Zero-mean / unit-variance scaling, fitted on this same data.
    features = StandardScaler().fit_transform(features)
    n_features = features.shape[1]

    one_hot = np_utils.to_categorical(labels)
    return features, one_hot, n_features
Example #3
0
def read_xy(PATH, *, threshold_scale=2.0):
    """Read a labelled CSV, standardize it and keep the L1-selected features.

    Every column after the first is used as a feature; the ``class`` column
    supplies the labels.  An L1-penalized logistic regression is fitted on
    the standardized features and ``SelectFromModel`` keeps the features
    whose importance is at least ``threshold_scale`` times the mean
    importance.  The feature count before selection, the row count and the
    feature count after selection are printed as a side effect.

    Args:
        PATH: location of the CSV file, passed straight to ``pd.read_csv``.
        threshold_scale: multiplier applied to the mean feature importance
            to obtain the selection threshold.  Defaults to ``2.0``, the
            value that used to be hard-coded.

    Returns:
        tuple: ``(X, y, f_dim)`` -- the reduced feature matrix, the one-hot
        encoded labels, and the number of selected features.
    """
    dataset = pd.read_csv(PATH)
    col1 = dataset.columns.values.tolist()[1:]  # feature columns
    print(len(col1))  # feature dimensionality before selection
    X_train = np.array(dataset[col1])
    y_train = preprocessing.LabelEncoder().fit_transform(dataset['class'])
    print(len(y_train))

    # Standardize the feature matrix.  (Original note: not needed for
    # probabilistic or tree-based models that do not depend on distances.)
    scale = StandardScaler().fit(X_train)
    X_train = scale.transform(X_train)

    # Feature selection with a penalized linear model as the base estimator
    # for SelectFromModel.  Original notes:
    #   * a smaller C selects fewer features (for Lasso, a larger alpha
    #     selects fewer features);
    #   * the 'newton-cg', 'sag' and 'lbfgs' solvers support only L2
    #     regularization, 'liblinear' supports both L1 and L2;
    #   * with dual=True only the L2 penalty is supported.
    # Alternatives the author experimented with: regressors whose coef_ is
    # 1 x n_features (Lasso, LassoCV, Ridge, ElasticNet, LinearRegression,
    # importance = np.abs(coef_)), L2 logistic regression, and LinearSVC
    # with an L1 or L2 penalty.
    clf = LogisticRegression(penalty='l1',
                             C=0.1,
                             solver='liblinear',
                             random_state=0)
    clf.fit(X_train, y_train)

    # coef_ holds one weight per (row, feature); the importance of a feature
    # is the sum of its absolute weights over all rows, i.e. the L1 norm of
    # the column (equivalent to np.sum(np.abs(clf.coef_), axis=0)).
    importance = np.linalg.norm(clf.coef_, axis=0, ord=1)
    mean = np.mean(importance)

    # Original note on SelectFromModel's ``threshold``: it also accepts
    # strings such as "median", "mean" or "1.25*mean"; when omitted, an
    # L1-penalized estimator gets 1e-5 and anything else gets the mean.
    model = SelectFromModel(clf, prefit=True, threshold=threshold_scale * mean)

    X_train = model.transform(X_train)
    f_dim = X_train.shape[1]
    print(f_dim)
    y_train = np_utils.to_categorical(y_train)
    return X_train, y_train, f_dim