Example #1
File: svm.py Project: lkprof/sema
def svm():
    #load data
    start_time = time.time()
    x_train, y_train = load_svmlight_file("12trainset")
    x_train = x_train.todense()
    x_test, y_test = load_svmlight_file("12testdata")
    x_test = x_test.todense()
    sk = SelectKBest(f_classif, k=9).fit(x_train, y_train)
    x_new=sk.transform(x_train)
    x_newtest=sk.transform(x_test)
    print(sk.scores_)
    print(x_new.shape)
    print(sk.get_support())
    #classifier
    clf = SVC(C=2, gamma=2)
    ovrclf = OneVsRestClassifier(clf, n_jobs=-1)
    ovrclf.fit(x_train,y_train)
    y_pred=ovrclf.predict(x_test)
    # write result
    with open("result.txt","w") as fw:
        for st in y_pred.tolist():
            fw.write(str(st)+'\n')
    print(np.array(y_pred).shape)

    target_names=['0','1','2','3']
    #result
    #sum_y = np.sum((np.array(y_pred)-np.array(y_test))**2)
    #print(classification_report(y_test,y_pred,target_names=target_names))
    #print("sougouVal: ",float(sum_y)/y_pred.shape[0])
    print(time.time()-start_time)
def add_feature_auc(X_train, y_train, X_val, y_val, classifier, cross_val, n_features):
	"""
	Purpose: add features to model one at a time (best-to-worst) and calculate performance
	Inputs:	X_train: training features
			y_train: training labels
			X_val: validation features
			y_val: validation labels
			classifier: sklearn classifier object
			cross_val: sklearn cross-validation object
			n_features: the maximum number of features to try
	"""
	cv_aucs = pd.Series(index=range(1, n_features+1), dtype=float)	#Empty series for outputs
	cv_stds = pd.Series(index=range(1, n_features+1), dtype=float)
	train_aucs = pd.Series(index=range(1, n_features+1), dtype=float)
	val_aucs = pd.Series(index=range(1, n_features+1), dtype=float)

	for i in range(1, n_features+1):										#Iterate over the number of features
		selector = SelectKBest(score_func = f_classif, k=i)					#Initialize selector for the i best features
		selector = selector.fit(X_train, y_train)							#Fit selector
		X_train_new = pd.DataFrame(selector.transform(X_train)) 			#Take i best training features for training
		X_val_new = pd.DataFrame(selector.transform(X_val)) 				#Take i best training features for validation
		mean_cv_auc, std_cv_auc, train_auc, val_auc = train_val_auc(		#Calculate performance
			X_train_new, y_train, X_val_new, y_val, classifier, cross_val)
		cv_aucs[i] = mean_cv_auc											#Store performance metrics
		cv_stds[i] = std_cv_auc
		train_aucs[i] = train_auc
		val_aucs[i] = val_auc

	df = pd.DataFrame(cv_aucs, columns=['cv_auc'])	#Create performance dataframe for output
	df['cv_std'] = cv_stds
	df['train_auc'] = train_aucs
	df['val_auc'] = val_aucs
	return df
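The helper train_val_auc called above is not shown in this example. A minimal sketch of what it might look like, assuming it should return the mean and standard deviation of cross-validated AUC plus the train and validation AUC, and that the classifier exposes predict_proba for binary labels:

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

def train_val_auc(X_train, y_train, X_val, y_val, classifier, cross_val):
    # Cross-validated AUC on the training data
    cv_scores = cross_val_score(classifier, X_train, y_train,
                                scoring='roc_auc', cv=cross_val)
    # Fit once on the full training set, then score train and validation sets
    classifier.fit(X_train, y_train)
    train_auc = roc_auc_score(y_train, classifier.predict_proba(X_train)[:, 1])
    val_auc = roc_auc_score(y_val, classifier.predict_proba(X_val)[:, 1])
    return cv_scores.mean(), cv_scores.std(), train_auc, val_auc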
Example #3
def choseFeature(TrainX, TrainY, TestX):
	cF = SelectKBest(chi2, k=100)
	cF.fit(TrainX, TrainY)
	check = cF.get_support()
	newTrainX = cF.transform(TrainX)
	newTestX = cF.transform(TestX)
	return (newTrainX, newTestX)
Example #4
File: B.py Project: Alexoner/mooc
def feature_selection(X_train,X_test,y_train):
    '''
    Select the best features using a feature selection method (chi-square or PMI),
    or simply return train and test unchanged to keep all features.
    :param X_train: A dictionary with the following structure
            { instance_id: [f1_count, f2_count, ...],
              ...
            }
    :param X_test: A dictionary with the following structure
            { instance_id: [f1_count, f2_count, ...],
              ...
            }
    :param y_train: A dictionary with the following structure
            { instance_id : sense_id }
    :return:
    '''



    # implement your code here
    train_keys = list(X_train.keys())
    test_keys = list(X_test.keys())
    _y_train = [y_train[key] for key in train_keys]
    n_features = len(X_train[train_keys[0]])
    selectorKBest = SelectKBest(chi2, k=int(n_features * 0.90)).fit(
        [X_train[key] for key in train_keys], _y_train)
    X_train_selected = selectorKBest.transform([X_train[key] for key in train_keys])
    X_test_selected = selectorKBest.transform([X_test[key] for key in test_keys])
    X_train_new = {key: X_train_selected[i] for i, key in enumerate(train_keys)}
    X_test_new = {key: X_test_selected[i] for i, key in enumerate(test_keys)}
    #return X_train_new, X_test_new
    # or return all feature (no feature selection):
    return X_train_new, X_test_new
def feature_selection(features_train,labels_train,features_test,K):
    fs = SelectKBest(f_classif, k=K)
    fs.fit(features_train,labels_train)

    features_train_new = fs.transform(features_train)
    features_test_new = fs.transform(features_test)
    
    return features_train_new,features_test_new
Example #6
def chisq(train_X, train_y, test_X, kN):
	start_time = time.time()
	ch2 = SelectKBest(chi2, k = kN)
	ch2.fit(train_X, train_y)
	train_X_ch2 = ch2.transform(train_X)
	test_X_ch2 = ch2.transform(test_X)
	print("--- %s seconds ---" % (time.time() - start_time))
	return ch2, train_X_ch2, test_X_ch2
def PredictPhysiology_AllFeatures(hdf_location, power_feature_filename, coherence_feature_filename, phys_filename, num_k_best):
	# Read in behavioral data
	BlockAB_behavior = StressBehavior(hdf_location)
	# Get classification of trial types
	trial_types = np.ravel(BlockAB_behavior.stress_type[BlockAB_behavior.state_time[BlockAB_behavior.ind_check_reward_states]])

	# Load physiology data
	ibi, pupil = TrialClassificationWithPhysiology(phys_filename, trial_types, plot_results = True)
	phys_mat = np.hstack((ibi,pupil))

	# Load neural power features
	power_mat = dict()
	coherence_mat = dict()
	sp.io.loadmat(power_feature_filename, power_mat)
	power_feat_keys = [key for key in power_mat.keys() if key[0]!='_']
	num_chan, num_conditions = power_mat[power_feat_keys[0]].shape
	num_trials = len(power_feat_keys)
	power_feat_mat = np.zeros([num_trials,num_chan*num_conditions])
	# Create power feature matrix
	for i, key in enumerate(power_feat_keys):
		power_feat_mat[i,:] = power_mat[key].flatten()

	# Create coherence feature matrix
	sp.io.loadmat(coherence_feature_filename, coherence_mat)
	coherence_feat_keys = [key for key in coherence_mat.keys() if key[0]!='_']
	num_chan_pairs, num_coh_conditions = coherence_mat[coherence_feat_keys[0]].shape
	coherence_feat_mat = np.zeros([num_trials,num_chan_pairs*num_coh_conditions])
	for i, key in enumerate(coherence_feat_keys):
		coherence_feat_mat[i,:] = coherence_mat[key].flatten()

	# matrix is in (trials) x (neural_features)
	feat_mat = np.hstack((power_feat_mat, coherence_feat_mat))
	# Split data into train and test sets
	X_train, X_test, y_train, y_test = train_test_split(feat_mat,phys_mat,test_size = 0.9, random_state = 0)

	linear_regress = linear_model.LinearRegression()
	linear_regress.fit(X_train,y_train)
	phys_pred_all = linear_regress.predict(X_test)

	#phys_pred_all_err = np.linalg.norm(phys_pred_all - y_test)
	phys_pred_all_err = np.abs(phys_pred_all - y_test)
	# Select top num_k_best of original neural features
	selection_k = SelectKBest(k=num_k_best)
	selection_k.fit(feat_mat,trial_types)
	X_selection_k = selection_k.transform(X_train)
	X_test_selection_k = selection_k.transform(X_test)
	linear_regress_k = linear_model.LinearRegression()
	linear_regress_k.fit(X_selection_k,y_train)
	phys_pred_k = linear_regress_k.predict(X_test_selection_k)

	#phys_pred_k_err = np.linalg.norm(phys_pred_k - y_test)
	phys_pred_k_err = np.abs(phys_pred_k - y_test)
	#plt.scatter(phys_mat[:,0], phys_mat[:,1], marker = 'o', color = 'k', label = 'original')
	#plt.scatter(phys_pred_k[:,0], phys_pred_k[:,1], marker = 'o', color = 'm', label = 'estimate with K best features')
	#plt.legend()
	#plt.show()

	return linear_regress, phys_pred_k_err, phys_pred_all_err, phys_mat
def kfold_CV(adj_matrix, folds, max_cycle_order, num_features = -1):
  num_folds = len(folds)
  accuracy_fold_data = list()
  false_positive_rate_fold_data = list()
  time_fold_data = list()
  for fold_index in range(num_folds):
    print("Fold %d:" % (fold_index + 1))

    #get data
    train_points = pipeline.join_folds(folds, fold_index)
    test_points = folds[fold_index]   
    train_test_overlap = False

    train_row_indices, train_col_indices = zip(*train_points)
    test_row_indices, test_col_indices = zip(*test_points)
    train_labels = adj_matrix[train_row_indices, train_col_indices].A[0] #array of signs of training edges
    test_labels = adj_matrix[test_row_indices, test_col_indices].A[0] #array of signs of test edges

    #construct matrix using just training edges
    train_matrix = sp.csr_matrix((train_labels, (train_row_indices, train_col_indices)), shape = adj_matrix.shape)
    train_matrix = (train_matrix + train_matrix.transpose()).sign() #make symmetric

    #Compute feature products
    #This dominates the training time, so report time for only this part for experiments
    before_train = time.time()
    feature_products = hoc.extract_edge_features(train_matrix, max_cycle_order)

    #get features and labels corresponding to each data point
    train_data = np.asarray([hoc.extract_features_for_edge(feature_products, tr_point) for tr_point in train_points])
    test_data = np.asarray([hoc.extract_features_for_edge(feature_products, te_point) for te_point in test_points])
    after_train = time.time()
    model_time = after_train - before_train

    #if, for experimental reasons, we don't want to train on all the features instead
    #as a diagnostic for what the model is actually learning and why
    if num_features > 0: #perform feature selection
      feat_sel = SelectKBest(f_classif, k=num_features)
      feat_sel.fit(train_data, train_labels)
      train_data = feat_sel.transform(train_data)
      test_data = feat_sel.transform(test_data)
    elif num_features == 0: #train on random features
      print "train data: random matrix of shape ", train_data.shape
      train_data = np.random.random(train_data.shape)

    #train logistic regression classifier
    clf = LogisticRegression()
    clf.fit(train_data, train_labels)

    #Evaluate
    test_preds = clf.predict(test_data)

    acc, fpr = pipeline.evaluate(test_preds, test_labels)
    accuracy_fold_data.append(acc)
    false_positive_rate_fold_data.append(fpr)
    print "HOC feature extraction time for one fold: ", model_time
    time_fold_data.append(model_time)

  return accuracy_fold_data, false_positive_rate_fold_data, time_fold_data
class NaiveBayesClassifier(object):
    '''
    Multinomial naive Bayes text classifier using TF-IDF features
    and chi-squared (SelectKBest) feature selection.
    '''
        
    def __init__(self):
        
        self.classifier = MultinomialNB()
        #self.model = None
        
    def trainClassifier(self, trainingDocs, labels):
        self.trainingDocs = trainingDocs
        self.labels = labels
        
        self.count_vect = CountVectorizer(stop_words='english')
        X_train_counts = self.count_vect.fit_transform(self.trainingDocs)
        self.tf_transformer = TfidfTransformer(use_idf=True,sublinear_tf=True).fit(X_train_counts)
        X_train_tf = self.tf_transformer.transform(X_train_counts)
        
        self.ch2 = SelectKBest(chi2)
        X_train = self.ch2.fit_transform(X_train_tf, self.labels)
        
        #self.classifier.fit(X_train_tf, self.labels)
        self.classifier.fit(X_train, self.labels)
        
    def classify(self, docs_new):
        X_new_counts = self.count_vect.transform(docs_new)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        #predicted = self.model.predict(X_new_tfidf)
        #self.predicted = self.classifier.predict(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        #for doc, category in zip(docs_new, self.predicted):
        #    print '%r => %s' % (doc,category)
        return self.predicted
    
    def calculate_score(self, doc_new):
        doc_list = [doc_new]
        #doc_list.append(doc_new)
        X_new_counts = self.count_vect.transform(doc_list)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        X_test = self.ch2.transform(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        return self.predicted
        #predicted_prob_all = self.classifier.predict_proba(X_test)
        #predicted_prob = [max(pr) for pr in predicted_prob_all]
        #return predicted_prob
    
    def score(self,docs_test,labels):
        X_new_counts = self.count_vect.transform(docs_test)
        X_new_tfidf = self.tf_transformer.transform(X_new_counts)
        
        X_test = self.ch2.transform(X_new_tfidf)
        #self.predicted = self.classifier.predict(X_new_tfidf)
        self.predicted = self.classifier.predict(X_test)
        accuracy = np.mean(self.predicted == labels)
        #accuracy = self.classifier.score(X_new_tfidf, labels)
        return accuracy
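A minimal usage sketch for the class above, with made-up documents and labels (hypothetical data; the default SelectKBest k of 10 means the toy vocabulary needs at least 10 non-stopword terms):

nb = NaiveBayesClassifier()
train_docs = ["the market rallied today", "stocks fell sharply on earnings",
              "the team won the game", "the striker scored twice tonight"]
train_labels = ["finance", "finance", "sports", "sports"]
nb.trainClassifier(train_docs, train_labels)
print(nb.classify(["shares dropped after weak earnings"]))          # predicted labels
print(nb.score(["the team scored late in the game"], ["sports"]))   # accuracy on a tiny test set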
Example #10
def select_features(train_X, train_y, test_X, k):
    if k == 'all':
        return train_X, test_X

    selector = SelectKBest(chi2, k=k)
    selector.fit(train_X, train_y)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
Example #11
 def _select_features(self, n):
     '''Reduce X to the n best features that represent Y'''
     logging.info('Reducing X from %d features to %d.' %(self.X.shape[1],n))
     if n >= self.X.shape[1]:
         logging.warning('Requested n is greater than or equal to the number of features; skipping selection.')
     else:
         sk = SelectKBest(k=n)
         self.X = sk.fit_transform(self.X[:,1:], self.Y[:,1]) # XXX: This will look ahead to cv/test data
         self.X_submit = sk.transform(self.X_submit[:,1:])
Example #12
def SelKBest1Final(X_train, X_test, y_train, y_test):
    fX_train = copy.copy(X_train)
    fX_test = copy.copy(X_test)
    fy_train = copy.copy(y_train)
    fy_test = copy.copy(y_test)
    skb = SelectKBest(f_classif, k=3)
    skb.fit(fX_train, fy_train)
    fX_train = skb.transform(fX_train)
    fX_test = skb.transform(fX_test)    
    return fX_train, fX_test, fy_train, fy_test
Example #13
def SelKBest_base_final(X_train, X_test, y_train, y_test, k=10):
    '''Leave the copying alone here'''
    fX_train = copy.copy(X_train)
    fX_test = copy.copy(X_test)
    fy_train = copy.copy(y_train)
    fy_test = copy.copy(y_test)
    skb = SelectKBest(f_classif, k=k)
    skb.fit(fX_train, fy_train)
    fX_train = skb.transform(fX_train)
    fX_test = skb.transform(fX_test)  
    return fX_train, fX_test, fy_train, fy_test
Example #14
def pre_process_pruned_tfidf(*args):
  x_train = args[0]
  y_train = args[1]
  x_test = args[2]
  y_test = args[3]
  x_train = [x.vectorized for x in x_train]
  x_test = [x.vectorized for x in x_test]
  k = int(round(K_BEST_RATE * len(x_train[0])))
  k_best = SelectKBest(chi2, k=k)
  k_best.fit(x_train, y_train)
  train_transformed = k_best.transform(x_train)
  test_transformed = k_best.transform(x_test)
  return train_transformed, y_train, test_transformed, y_test
Example #15
def feature_selection(X_train,X_test,y_train, language):
    '''
    Select the best features using a feature selection method (chi-square or PMI),
    or simply return train and test unchanged to keep all features.
    :param X_train: A dictionary with the following structure
            { instance_id: [f1_count, f2_count, ...],
              ...
            }
    :param X_test: A dictionary with the following structure
            { instance_id: [f1_count, f2_count, ...],
              ...
            }
    :param y_train: A dictionary with the following structure
            { instance_id : sense_id }
    :return:
    '''

    # implement your code here

    #return X_train_new, X_test_new
    # or return all feature (no feature selection):

    if language != 'English':
        X = [] # our list of word counts for an instance
        Y = [] # the corresponding sense id for each instance

        for key, value in X_train.items():
            if y_train[key] != 'U':
                Y.append(y_train[key])
                X.append(value)

        #print len(X), len(X[0])
        num_feats = int(0.9 * len(X[0]))
        feature_selector = SelectKBest(chi2, k=num_feats)
        feature_selector.fit(X, Y)

        X_train_final = {}
        X_test_final = {}

        for instance_id in X_train:
            X_train_final[instance_id] = feature_selector.transform([X_train[instance_id]])[0]
            #print len(X_train[instance_id])

        for instance_id in X_test:
            X_test_final[instance_id] = feature_selector.transform([X_test[instance_id]])[0]

        return X_train_final, X_test_final

    else:
        # given return statement
        return X_train, X_test
def itemB():
    train_dataset = load_nebulosa_train()
    # remove missing values
    # print(train_dataset)
    train_dataset = train_dataset[~np.isnan(train_dataset).any(axis=1)]
    train_dataset = train_dataset[:, 2:]

    train_target = train_dataset[:, -1]
    train_dataset = train_dataset[:, :-2]

    # train_dataset = normalize(train_dataset, axis=0)

    test_dataset = load_nebulosa_test()
    # remove missing values
    test_dataset = test_dataset[~np.isnan(test_dataset).any(axis=1)]
    test_dataset = test_dataset[:, 2:]

    test_target = test_dataset[:, -1]
    test_dataset = test_dataset[:, :-2]
    # print(test_dataset)
    # test_dataset = normalize(test_dataset, axis=1)
    # print(test_dataset)

    kbest = SelectKBest(f_classif, k=3).fit(train_dataset, train_target)
    train_dataset = kbest.transform(train_dataset)
    test_dataset = kbest.transform(test_dataset)

    # print(train_dataset)

    n_train_samples = train_dataset.shape[0]
    n_train_features = train_dataset.shape[1]
    # print("Nebulosa Train dataset: %d amostras(%d características)" % (n_train_samples, n_train_features))

    n_test_samples = test_dataset.shape[0]
    n_test_features = test_dataset.shape[1]
    # print("Nebulosa Test dataset: %d amostras(%d características)" % (n_test_samples, n_test_features))

    nn = KNeighborsClassifier(n_neighbors=1)
    nn.fit(train_dataset, train_target)
    nn_target_pred_test = nn.predict(test_dataset)

    nn_accuracy_test = accuracy_score(test_target, nn_target_pred_test)
    print("NN: Acurácia (Teste): %.2f" % (nn_accuracy_test))

    nc = NearestCentroid(metric="euclidean")
    nc.fit(train_dataset, train_target)
    nc_target_pred_test = nc.predict(test_dataset)

    nc_accuracy_test = accuracy_score(test_target, nc_target_pred_test)
    print("Rocchio: Acurácia (Teste): %.2f" % (nc_accuracy_test))
Example #17
 def feature_selection_tech(predictors, responses, test_predictors, selectFeatTech):
     if selectFeatTech == 0:
         t = int(predictors.shape[1] * 0.40)
         t = 40
         model = SelectKBest(chi2, k=t).fit(predictors, responses)
         predictors_new = model.transform(predictors)
         predictors_test_new = model.transform(test_predictors)
         indices = model.get_support(indices=True)
     if selectFeatTech == 1:
         randomized_logistic = RandomizedLogisticRegression()
         model = randomized_logistic.fit(predictors, responses)
         predictors_new = model.transform(predictors)
         predictors_test_new = model.transform(test_predictors)
         indices = model.get_support(indices=True)
     return predictors_new, predictors_test_new, indices
Example #18
    def _SelectKBest(self, X, y):

        print('Selecting K Best from whole image')

        from sklearn.feature_selection import SelectKBest, f_classif

        # ### Define the dimension reduction to be used.
        # Here we use a classical univariate feature selection based on F-test,
        # namely Anova. The number of features to be selected is self.k_features
        feature_selection = SelectKBest(f_classif, k=self.k_features)

        feature_selection.fit(X, y)

        scores = f_classif(X, y)[0]
        mask_k_best = np.zeros(scores.shape, dtype=bool)
        mask_k_best[np.argsort(scores, kind="mergesort")[-self.k_features:]]\
            = 1
        import nibabel
        mask_brain_img = nibabel.load(self.mask_non_brain).get_data()
        mask_brain = mask_brain_img.flatten().astype(bool)

        roi = np.zeros(mask_brain.flatten().shape)
        roi[mask_brain] = mask_k_best
        roi = roi.reshape(mask_brain_img.shape)

        img = nibabel.Nifti1Image(roi, np.eye(4))
        img.to_filename('/tmp/best.nii.gz')

        print('SelectKBest data reduction from: %s' % str(X.shape))
        X = feature_selection.transform(X)
        print('SelectKBest data reduction to: %s' % str(X.shape))

        self.feature_reduction_method = feature_selection

        return X
def classify(clf, chapter_contents_train, y_train, chapter_contents_test,k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)
    # X_train_array = X_train.toarray()
    # print "tfidf vector length: ", len(X_train_array) #dbg
    # print "X_train_array[0] length: ", len(X_train_array[0]) #dbg

    # use only the best k features according to chi-sq selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)

    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names,chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]

    # train the classifier
    clf.fit(X_train, y_train)

    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)

    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
Example #20
class BagOfWords(Feature):
	
	def name(self):
		return "BagOfWords with mn=" + str(self._mn) + ", mx=" + str(self._mx) + ", analyzertype=" + self._analyzertype + ", numFeatures=" + str(self._numFeatures)
		
	def __init__(self,numFeatures, mn=1, mx=2, analyzertype='word'):
		self._tokenizer = Tokenizer()	
		self._vectorizer = TfidfVectorizer(ngram_range=(mn,mx), analyzer=analyzertype)
		self._initialized = False
		self._mn = mn
		self._mx = mx
		self._analyzertype = analyzertype
		self._numFeatures = numFeatures
		self._ch2 = SelectKBest(chi2, k=numFeatures)

	def extract_all(self, sentences,train,labels):
		sentences = self.preprocess_all(sentences)
		if not self._initialized:
			matrix = self._vectorizer.fit_transform(sentences)
			self._initialized = True
		else:
			matrix = self._vectorizer.transform(sentences)
		#print matrix.todense()
		if self._numFeatures < matrix.shape[1]:
			if train:
				matrix = self._ch2.fit_transform(matrix, labels)
			else:
				matrix = self._ch2.transform(matrix)
		return matrix
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names
    
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape
    
    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape
    
    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)
    
    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()
    
    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
        
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
def crossvalidate(nrep, nfold, sparseArrayRowNorm, y_all, clf, accuMeasure, selection):
    nsample=sparseArrayRowNorm[0].shape[0]
    scaler = StandardScaler(with_mean=False)
    #scaler = MinMaxScaler()
    testsize=int(nsample/nfold)
    cvIdx=[1]*(nsample-testsize)+[2]*testsize
    random.seed(100)
    aucRes=[]
    for nn in range(nrep):
        #print nn
        random.shuffle(cvIdx)
        Y_train=y_all[np.where(np.array(cvIdx)==1)[0]]
        Y_test=y_all[np.where(np.array(cvIdx)==2)[0]]
        X_train_all=[]
        X_test_all=[]
        for ii in xrange(len(sparseArrayRowNorm)):
            varSelector = SelectKBest(f_classif, k=min(int(nsample*0.7), sparseArrayRowNorm[ii].shape[1]))
            X_train=sparseArrayRowNorm[ii][np.where(np.array(cvIdx)==1)[0],:]
            X_train =varSelector.fit_transform(X_train, Y_train)
            X_train_all=X_train_all+[X_train]
            X_test=sparseArrayRowNorm[ii][np.where(np.array(cvIdx)==2)[0],:]
            X_test= varSelector.transform(X_test)
            X_test_all=X_test_all+[X_test]
        X_train=hstack(X_train_all,format='csr')
        X_test=hstack(X_test_all,format='csr')
        del X_train_all
        del X_test_all
        aucRes.append(sigle_fit(clf, X_train, Y_train, X_test, Y_test, accuMeasure))
    print np.array(aucRes).mean()
    return np.array(aucRes).mean()
def preprocess(article_file, lable_file, k):

    features = pickle.load(open(article_file, "rb"))
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and comparable) to numerical labels
    lables = pickle.load(open(lable_file, "rb"))
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)
    # print le.inverse_transform([0])

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector : SelectPercentile
    # selector = SelectPercentile(f_classif, percentile=30)
    # selector.fit(features_train_transformed, lables)

    # selector : SelectKBest
    selector = SelectKBest(k=k)
    selector.fit(features_train_transformed, lables)

    # selector : chi2
    # selector = SelectPercentile(score_func=chi2)
    # selector.fit(features_train_transformed, lables)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
def string_selection():
    # get data
    vectorizer = CountVectorizer(decode_error='ignore')
    ch2 = SelectKBest(chi2, k=100)

    # get data
    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(train_data['string-data'],
                                                                         train_data['target'], test_size=0.2,
                                                                         random_state=1)

    # feature extraction
    x_train = vectorizer.fit_transform(x_train)
    feature_names = vectorizer.get_feature_names()

    x_train = ch2.fit_transform(x_train, y_train)
    feature_names = [feature_names[i] for i in ch2.get_support(indices=True)]
    print(ch2.scores_)
    print(ch2.get_support(indices=True))
    print(feature_names)
    x_test = vectorizer.transform(x_test)
    x_test = ch2.transform(x_test)

    # # build the model
    model = MultinomialNB().fit(x_train, y_train)
    #
    # # valid the model
    predicted = model.predict(x_test)
    print (metrics.accuracy_score(y_test, predicted))
Example #25
 def univariate_features_selection(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     selector = SelectKBest(chi2, k=10)
     selector = selector.fit(x, y)
     selected_features = self.features[selector.get_support()]
     print(selected_features)
     x = selector.transform(x)
     return x
Example #26
def do_training():
    global X_train, X_test, feature_names, ch2
    print("Extracting features from the training data using a sparse vectorizer")
    t0 = time()
    if opts.use_hashing:
        vectorizer = HashingVectorizer(stop_words='english', non_negative=True,
                                       n_features=opts.n_features)
        X_train = vectorizer.transform(data_train_data)
    else:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.25,
                                     stop_words='english')
        X_train = vectorizer.fit_transform(data_train_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test data using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test_data)
    duration = time() - t0
    #print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    # mapping from integer feature name to original token string
    if opts.use_hashing:
        feature_names = None
    else:
        feature_names = vectorizer.get_feature_names()

    if True:#opts.select_chi2:
        print("Extracting %d best features by a chi-squared test" % 20000)
        t0 = time()
        ch2 = SelectKBest(chi2, k=20000)
        X_train = ch2.fit_transform(X_train, y_train)
        X_test = ch2.transform(X_test)
        if feature_names:
            # keep selected feature names
            feature_names = [feature_names[i] for i
                             in ch2.get_support(indices=True)]
        print("done in %fs" % (time() - t0))
        print()
    
    if feature_names:
        feature_names = np.asarray(feature_names)

    results = []

    #for penalty in ["l2", "l1"]:
    penalty = 'l2'
    print('=' * 80)
    print("%s penalty" % penalty.upper())
    # Train Liblinear model
    clf = LinearSVC(loss='l2', penalty=penalty,dual=False, tol=1e-3)
    results.append(benchmark(clf))
        
    joblib.dump(vectorizer, 'vectorizer.pkl', compress=9)
    joblib.dump(ch2, 'feature_selector.pkl', compress=9)
    joblib.dump(clf, 'linearsvc_classifier.pkl', compress=9)
def gridSearchCV_test():
    ch2 = SelectKBest(chi2, k=20)

    # get data
    train_data = db_tool.get_new_train_data()
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(train_data['permission-data'],
                                                                         train_data['target'], test_size=0.2,
                                                                         random_state=1)

    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)

    param_grid = [
        {'alpha': [1, 0.4, 10], 'fit_prior': [True, False]},
        {'alpha': [0, 9, 0.4], 'fit_prior': [True]}
    ]
    clf = grid_search.GridSearchCV(MultinomialNB(), param_grid)
    # # build the model
    clf.fit(X_train, y_train)

    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))

    predicted = clf.predict(X_test)
    print (metrics.accuracy_score(y_test, predicted))
    print(metrics.classification_report(y_test, predicted))
def try_all_k_best(max=13):
    for k in range(1,max+1):
        data = featureFormat(my_dataset, features_list, sort_keys = True)
        labels, features = targetFeatureSplit(data)
        features_train, features_test, labels_train, labels_test = \
            train_test_split(features, labels, test_size=0.3, random_state=42)
        selector = SelectKBest(k=k)
        features_train = selector.fit_transform(features_train, labels_train)
        features_test = selector.transform(features_test)
        choices.append(selector.transform(np.array(features_list[1:]).reshape(1, -1)))
        lr_cv = LogisticRegressionCV()
        lr_cv.fit(features_train, labels_train)
        pred.append(lr_cv.predict(features_test))
        acc.append(accuracy_score(labels_test, pred[k-1]))
        prec.append(precision_score(labels_test, pred[k-1]))
        reca.append(recall_score(labels_test, pred[k-1]))     
 def inner(*args, **kwargs):
     X, y = func(*args, **kwargs)
     global q4_slct
     if q4_slct is None:
         q4_slct = SelectKBest(k=200).fit(X, y)
     X = q4_slct.transform(X)
     return X, y
Example #30
def tfidf_classify(user):
    train_set, y, src, test_set = extract_data(user.id)
    if not train_set:
        return []
    # Analyse using tf-idf
    # vector = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    vector = HashingVectorizer(n_features=1000, non_negative=True, stop_words='english')
    # List of topic extracted from text
    # feature_names = vector.get_feature_names()
    # print feature_names
    xtrain = vector.transform(train_set)
    xtest = vector.transform(test_set)

    # Select sample using chi-square
    ch2 = SelectKBest(chi2)
    xtrain = ch2.fit_transform(xtrain, y)
    xtest = ch2.transform(xtest)

    # Predict testing set
    # classifier = DecisionTreeClassifier()
    classifier = KNeighborsClassifier(n_neighbors=4)
    classifier = classifier.fit(xtrain, y)
    result = classifier.predict(xtest)
    final = []
    for i in xrange(len(result)):
        if result[i]:
            final.append(src[i])
    print len(final)
    return final
def select_features(X_train, y_train, X_test):
    fs = SelectKBest(score_func=chi2, k='all')
    fs.fit(X_train, y_train)
    X_train_fs = fs.transform(X_train)
    X_test_fs = fs.transform(X_test)
    return X_train_fs, X_test_fs, fs
Example #32
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
chi2_selector = SelectKBest(chi2, k=3000000)
X_kbest_train = chi2_selector.fit_transform(x_train_normalized, y_train)

from sklearn.svm import SVC
clf = SVC(C=1000, kernel='linear', random_state=0)
clf.fit(X_kbest_train, np.ravel(y_train))

########################valid

x_val_counts = count_vect.transform(X_val)
x_val_tfidf = tfidf_transformer.transform(x_val_counts)
x_val_normalized = normalizer_tranformer.transform(x_val_tfidf)
X_kbest_val = chi2_selector.transform(x_val_normalized)

val_predict_sck_svm = clf.predict(X_kbest_val)
np_y_val_predict_sck_svm = np.array(val_predict_sck_svm).reshape(
    len(val_predict_sck_svm), 1)

from sklearn.metrics import accuracy_score
accuracy_score(y_val, np_y_val_predict_sck_svm)

#######################final train before submission

count_vect_final = CountVectorizer(ngram_range=(1, 3)).fit(training_x)
x_train_counts_final = count_vect_final.transform(training_x)

tfidf_transformer_final = TfidfTransformer().fit(x_train_counts_final)
x_train_tfidf_final = tfidf_transformer_final.transform(x_train_counts_final)
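The snippet above references count_vect, tfidf_transformer, normalizer_tranformer and x_train_normalized without defining them. A minimal sketch of how they might be built, mirroring the count-vectorizer + TF-IDF pattern of the final-training block just above, and assuming x_train is the raw training text and normalizer_tranformer is an sklearn Normalizer:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer

# Hypothetical reconstruction of the preprocessing assumed by the example
count_vect = CountVectorizer(ngram_range=(1, 3)).fit(x_train)
x_train_counts = count_vect.transform(x_train)
tfidf_transformer = TfidfTransformer().fit(x_train_counts)
x_train_tfidf = tfidf_transformer.transform(x_train_counts)
normalizer_tranformer = Normalizer().fit(x_train_tfidf)
x_train_normalized = normalizer_tranformer.transform(x_train_tfidf)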
Example #33
def main():
    corpus, labels = get_data()  # Get data
    print("data number:", len(labels))
    train_corpus, development_corpus, test_corpus, train_labels, development_labels, test_labels = prepare_datasets(
        corpus,
        labels)  # Divided into training set, development set and test set
    print('train_corpus number:', len(train_corpus))
    print('development_corpus number:', len(development_corpus))
    print('test_corpus number:', len(test_corpus))

    # FeatureUnion is used for feature extraction.
    features_select = FeatureUnion([
        ('tf', CountVectorizer(ngram_range=(1, 1))),  # term frequency
        ('bow', CountVectorizer(
            ngram_range=(2, 2))),  # Bag-of-words model(Divide every two words)
        ('tfidf', TfidfVectorizer())
    ])  # TF-IDF
    # Feature extraction of all data set
    train_feature = features_select.fit_transform(train_corpus)
    development_feature = features_select.transform(development_corpus)
    test_feature = features_select.transform(test_corpus)

    # SVM Model
    svm = SGDClassifier(loss='hinge', n_iter_no_change=100)

    # Feature selection using development set
    k_list = [500, 1000, 2000, 5000]
    for k in k_list:
        fs_sentanalysis = SelectKBest(chi2, k=k).fit(train_feature,
                                                     train_labels)
        new_train_feature = fs_sentanalysis.transform(train_feature)
        new_development_feature = fs_sentanalysis.transform(
            development_feature)
        new_test_feature = fs_sentanalysis.transform(test_feature)
        print('k=', k, "SVM model")
        svm_model = svm.fit(new_train_feature, train_labels)
        label_pred = svm_model.predict(new_development_feature)
        score = classification_report(development_labels,
                                      label_pred,
                                      target_names=[
                                          'business', 'entertainment',
                                          'politics', 'sport', 'tech'
                                      ])
        print(score)

    # test model
    test_k = int(
        input("Please input the K value with the best training effect:"))
    print('The score of the model on the test set:')
    fs_sentanalysis = SelectKBest(chi2, k=test_k).fit(train_feature,
                                                      train_labels)
    new_train_feature = fs_sentanalysis.transform(train_feature)
    new_test_feature = fs_sentanalysis.transform(test_feature)
    svm_model = svm.fit(new_train_feature, train_labels)
    label_pred = svm_model.predict(new_test_feature)
    score_1 = classification_report(test_labels,
                                    label_pred,
                                    target_names=[
                                        'business', 'entertainment',
                                        'politics', 'sport', 'tech'
                                    ])
    print(score_1)
class Tfidf_transform(pl.Feature_transform):
    """create TF-IDF (term frequency - inverse document frequency) features. 

    Can use a chi-squared test to limit features. Assumes a string-based input feature that can be split.
    Uses scikit-learn based transformers internally

    Args:
        min_df (int): min document frequency (for sklearn vectorizer)

        max_df (float): max document frequency (for sklearn vectorizer)

        select_features (bool): use chi-squared test to select features

        topn_features (int): keep top features from chi-squared test

        stop_words (str): stop words (for sklearn vectorizer)

        target_feature (str): target feature for chi-squared test
    """
    def __init__(self,
                 min_df=10,
                 max_df=0.7,
                 select_features=False,
                 topn_features=50000,
                 stop_words=None,
                 target_feature=None):
        super(Tfidf_transform, self).__init__()
        self.min_df = min_df
        self.max_df = max_df
        self.select_features = select_features
        self.topn_features = topn_features
        self.stop_words = stop_words
        self.target_feature = target_feature
        self.ch2 = ""
        self.feature_names_support = []

    def getTokens(self, j):
        """basic method to get "document" string from feature
        """
        if self.input_feature in j:
            if isinstance(j[self.input_feature], list):
                return " ".join([
                    i if isinstance(i, basestring) else str(i)
                    for i in j[self.input_feature]
                ])
            elif isinstance(j[self.input_feature], basestring):
                return j[self.input_feature]
            else:
                return str(j[self.input_feature])
        else:
            return ""

    def get_models(self):
        return super(Tfidf_transform, self).get_models() + [
            (self.min_df, self.max_df, self.select_features,
             self.topn_features, self.stop_words, self.target_feature),
            self.vectorizer, self.tfidf_transformer, self.ch2, self.fnames,
            self.feature_names_support
        ]

    def set_models(self, models):
        models = super(Tfidf_transform, self).set_models(models)
        (self.min_df, self.max_df, self.select_features, self.topn_features,
         self.stop_words, self.target_feature) = models[0]
        self.vectorizer = models[1]
        self.tfidf_transformer = models[2]
        self.ch2 = models[3]
        self.fnames = models[4]
        self.feature_names_support = models[5]

    def fit(self, objs):
        """fit using sklean transforms

        vectorizer->tfidf->(optional) chi-squared test
        """
        docs = []
        target = []
        self.vectorizer = CountVectorizer(min_df=self.min_df,
                                          max_df=self.max_df,
                                          stop_words=self.stop_words)
        self.tfidf_transformer = TfidfTransformer()
        for j in objs:
            docs.append(self.getTokens(j))
            if self.target_feature:
                target.append(int(j[self.target_feature]))
        counts = self.vectorizer.fit_transform(docs)
        self.tfidf = self.tfidf_transformer.fit_transform(counts)
        self.fnames = self.vectorizer.get_feature_names()
        self.logger.info("%s base tfidf features %d", self.get_log_prefix(),
                         len(self.fnames))
        if self.select_features:
            self.ch2 = SelectKBest(chi2, k=self.topn_features)
            self.ch2.fit_transform(self.tfidf, target)
            self.feature_names_support = set(
                [self.fnames[i] for i in self.ch2.get_support(indices=True)])
            self.logger.info("%s selected tfidf features %d",
                             self.get_log_prefix(),
                             len(self.feature_names_support))

    def transform(self, j):
        """transform features into final tfidf features
        """
        docs = []
        docs.append(self.getTokens(j))
        counts = self.vectorizer.transform(docs)
        self.tfidf = self.tfidf_transformer.transform(counts)
        if self.select_features:
            self.ch2.transform(self.tfidf)
        doc_tfidf = {}
        for (col, val) in zip(self.tfidf[0].indices, self.tfidf[0].data):
            fname = self.fnames[col]
            if self.select_features:
                if fname in self.feature_names_support:
                    doc_tfidf[fname] = val
            else:
                doc_tfidf[fname] = val
        j[self.output_feature] = doc_tfidf
        return j
Example #35
import numpy as np
import pandas as pd
import scipy.stats as ss
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel

df = pd.DataFrame({
    'A': ss.norm.rvs(size=10),
    'B': ss.norm.rvs(size=10),
    'C': ss.norm.rvs(size=10),
    'D': np.random.randint(low=0, high=2, size=10)
})
print(df)

x = df.loc[:, ['A', 'B', 'C']]
y = df.loc[:, 'D']

skb = SelectKBest(k=2)  # filter approach: a score function can be specified; the default is the ANOVA F-test
skb.fit(x, y)
print(skb.transform(x))

rfe = RFE(estimator=SVR(kernel='linear'), n_features_to_select=2,
          step=1)  # wrapper approach: specify the estimator, how many features to keep, and how many to drop per iteration
print(rfe.fit_transform(x, y))

sfm = SelectFromModel(estimator=DecisionTreeRegressor(),
                      threshold=0.1)  # embedded approach: features whose importance falls below threshold are dropped
print(sfm.fit_transform(x, y))
Example #36
normalization = "minmax"
bal = "smote"
df = datapp.preprocess(data,
                       to_clf,
                       normalization=normalization,
                       ignore_classes=categoric,
                       as_df=True)
df = data
y: np.ndarray = df[to_clf].values
X: np.ndarray = df.drop(to_clf, axis=1).values
#%%
select = SelectKBest(f_classif, k=10).fit(X, y)
ind = select.get_support(indices=True)
col = df.columns[ind].tolist()

X_new = select.transform(X)
dfk = pd.DataFrame(X_new, columns=col)
#%%
bins = list(range(3, 12))
qdfs = []
cdfs = []
for b in bins:
    qdfs.append(eval.cut(dfk, b, ['class', 'id', 'gender'], cut="qcut"))
    cdfs.append(eval.cut(dfk, b, ['class', 'id', 'gender'], cut="cut"))
#%%
dummy_qdfs = []
dummy_cdfs = []
for i in range(len(bins)):
    dummy_qdfs.append(eval.dummy(qdfs[i], ['class', 'id', 'gender']))
    dummy_cdfs.append(eval.dummy(cdfs[i], ['class', 'id', 'gender']))
#%%
Example #37
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [
            feature_names[i] for i in ch2.get_support(indices=True)
        ]
    print("done in %fs" % (time() - t0))
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."
                            intercept_scaling=1,
                            class_weight=None,
                            random_state=None,
                            solver='liblinear',
                            max_iter=5000,
                            multi_class='ovr',
                            verbose=1,
                            warm_start=False,
                            n_jobs=256)

#create lr stacking features
#gender
from sklearn.feature_selection import SelectKBest, chi2, f_classif
ch2 = SelectKBest(chi2, k=100000)
x_train = ch2.fit_transform(X_train_tf, train.gender)
x_test = ch2.transform(X_test_tf)

print 'create gender prob features'
random_seed = 2016
x = x_train
y = [1 if i == 1 else 0 for i in train.gender]
skf = StratifiedKFold(y, n_folds=5, shuffle=True)

new_train = np.zeros((100000, 1))
new_test = np.zeros((100000, 1))

for i, (trainid, valid) in enumerate(skf):
    print 'fold' + str(i)
    train_x = x_train[trainid]
    train_y = train.gender[trainid]
    val_x = x_train[valid]
Example #39
def search(row):
	if(row['MemberCardLevel_Medium'] == 1):
		return 1
	elif(row['MemberCardLevel_High'] == 1):
		return 2
	else:
		return 0

#read data
data=[]
with open('dataWithLable.pkl', 'rb') as f:
    data = pickle.load(f)

#merge 3 type of carde level
X=data
X["cardlevel"]=X.apply(search, axis=1)
X=data.drop(['is_Churn', 'UUID', 'OnlineMemberId', 'MemberCardLevel_Low', 'MemberCardLevel_Medium', 'MemberCardLevel_High'], axis=1)
y=data['is_Churn']

# the k best features
selector= SelectKBest(k=5)
selector.fit(X, y)
GetSupport= selector.get_support(True)
TransX= selector.transform(X)
Scores= selector.scores_
print(Scores)  # scores for all features, in the same order as the columns
print(GetSupport)  # indices of the selected features

# the selected features
for i in range(X.shape[1]):
	if selector.get_support()[i]:
		print(X.columns.values.tolist()[i])
		train_attrs.append(attr)
		cnt += 1

	del train_attrs[0]

	# get y_train from train_attrs
	y_train = [[float(attr)] for attr in train_attrs]

	# chi-2 select features
	print "start feature selection"
	if (SELECTOR == 0):
		selector = SelectKBest(chi2, k = K_FOR_BEST)
	else:
		selector = SelectPercentile(score_func=chi2, percentile=SELECT_PERCENTILE)
	selector.fit(x_train, y_train)
	new_x_train = selector.transform(x_train)
	print "feature selection done"

	# convert y_train to svm-fit shape
	y_train = [attr[0] for attr in y_train]

	new_x_train, new_x_test, new_y_train, new_y_test = cross_validation.train_test_split(new_x_train, y_train, test_size=0.4, random_state=0)

	# regression
	# clf = svm.SVR(kernel='rbf', degree=3, gamma=1.9, coef0=0.0, tol=0.001, \
	# 	C=0.13, epsilon=0.1, shrinking=True, probability=False, cache_size=700, \
	# 	verbose=False, max_iter=-1, random_state=None)
	clf = LinearRegression()
	clf = clf.fit(new_x_train, new_y_train)

	# cross validation
#'ONS', 'year', 'admitted', 'All_Under_16', 'All_16_24', 'All_25_34',
#'All_35_44', 'All_45_54', 'All_55_64', 'All_65_74', 'All_75_Over',
#'income_m', 'income_f', 'ft', 'pt', 'Food', 'all_ethnic',

non_food_features = [
    'income', 'cycling', 'fp_rate', 'White',
    'Gypsy / Traveller / Irish Traveller', 'Mixed / Multiple Ethnic Groups',
    'Asian / Asian British: Indian', 'Asian / Asian British: Pakistani',
    'Asian / Asian British: Bangladeshi', 'Asian / Asian British: Chinese',
    'Asian / Asian British: Other Asian',
    'Black / African / Caribbean / Black British', 'Other Ethnic Group'
]

selector = SelectKBest(f_regression, k=7)  #Regresion selector
selector.fit(df[non_food_features], target)
selector.transform(df[non_food_features])
weight = -np.log10(selector.pvalues_)
plt.bar(range(len(non_food_features)), weight)
plt.xticks(range(len(non_food_features)),
           non_food_features,
           rotation="vertical")
plt.show()

food_feats1 = [
    'Bread, rice and cereals', 'Pasta products', 'Buns, cakes, biscuits etc',
    'Pastry (savoury)', 'Beef (fresh, chilled or frozen)',
    'Pork (fresh, chilled or frozen)', 'Lamb (fresh, chilled or frozen)',
    'Poultry (fresh, chilled or frozen)', 'Bacon and ham',
    'Other meat and meat preparations', 'Fish and fish products', 'Milk',
    'Cheese and curd', 'Eggs', 'Other milk products', 'Butter',
    'Margarine, other vegetable fats and peanut butter',
import pandas as pd
from sklearn.feature_selection import SelectKBest, SelectPercentile

#Exhaustive list of feature selection we can apply

dev = [6, 9, 12, 15, 18, 20, 25, 30]

test = pd.read_csv("../input/test_data_processed.csv")
train = pd.read_csv("../input/train_data_processed.csv")
target = pd.read_csv("../input/target_data_processed.csv")

# sp_25 = SelectPercentile(percentile=25) #Can be used
# sp_50 = SelectPercentile(percentile=50) #Can be used
kbest = SelectKBest(k=100)
#RFE can also be used but generally very time consuming when feature size is high

kbest.fit(train, target)

train_k = kbest.transform(train)
test_k = kbest.transform(test)
print train_k.shape

filename_train_k = "../input/train_k100" + ".csv"
filename_test_k = "../input/test_k100" + ".csv"

pd.DataFrame(train_k).to_csv(filename_train_k, index=False)
pd.DataFrame(test_k).to_csv(filename_test_k, index=False)
trans_features_test = vectorizer.transform(features_test)
myprint('Vectorized features')
myprint('> ' + str(trans_features_train.shape[0]) + ' abstracts in train set')
myprint('> ' + str(trans_features_train.shape[1]) +
        ' words per abstract in train set')
myprint('> ' + str(trans_features_test.shape[0]) + ' abstracts in test set')
myprint('> ' + str(trans_features_test.shape[1]) +
        ' words per abstract in test set')

### Reduce feature dimensionality
# Set a sensible upper bound of the number of words that is
# required to capture the difference between the two categories
feature_dim = 1000
selector = SelectKBest(chi2, k=feature_dim)
selector.fit(trans_features_train, labels_train)
trans_features_train = selector.transform(trans_features_train).toarray()
trans_features_test = selector.transform(trans_features_test).toarray()
myprint('Reduced dimensionality')
myprint('> ' + str(trans_features_train.shape[1]) + ' words per abstract')

### Classification with support vector machine
# Pro: effective in high-dimensional feature spaces (i.e. large dictionary)
#
# Idea:
# Given training vectors x_i in R^p for i=1,...,n in two classes
# and a vector y in {-1,1}^n, the goal of SVM is to find w in R^p
# and b in R, such that the prediction sign(w^T.phi(x)+b)
# is correct for most samples
#
# Problem:
# min_{w,b,z} 1/2 w^T w + C sum_i z_i
# subject to y_i (w^T phi(x_i) + b) >= 1 - z_i, z_i >= 0, i = 1,...,n
#get words selected as features
feature_names = vectorizer.get_feature_names()

n_train = len(X_train)
n_test = len(X_test)

X_train = X_train[:n_train]
y_train = y_train[:n_train]
X_test = X_test[:n_test]
y_test = y_test[:n_test] 

from sklearn.feature_selection import SelectKBest, chi2
#feature selection with chi2 statistics
ch2 = SelectKBest(chi2, k=7000)
X_train_new = ch2.fit_transform(X_train, y_train)
X_test_new = ch2.transform(X_test)
X_train_new.shape
X_test_new.shape

feature_names_ch2 = [feature_names[i] for i in ch2.get_support(indices=True)]


#first we do gridsearch over hyperparameters of SVM classifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
#the grids of parameters
parameters = {'kernel':['rbf'], 'C':[10, 100], 'gamma': [0.5, 0.75, 1]}
svc = svm.SVC(gamma="scale")
#define classifier
clf = GridSearchCV(svc, parameters, cv=5, n_jobs=4, verbose=True)
clf.fit(X_train, y_train)
                    y_cur = train_finalMerged[['Activity']].values.ravel()
                    X_cur = train_finalMerged[['MeanSM','StDevSM','MdnSM', 'belowPer25SM','belowPer75SM', 'TotPower_0.3_15','FirsDomFre_0.3_15','PowFirsDomFre_0.3_15','SecDomFre_0.3_15','PowSecDomFre_0.3_15','FirsDomFre_0.6_2.5','PowFirsDomFre_0.6_2.5','FirsDomFre_per_TotPower_0.3_15','MeanSM_s2','StDevSM_s2','MdnSM_s2', 'belowPer25SM_s2','belowPer75SM_s2', 'TotPower_0.3_15_s2','FirsDomFre_0.3_15_s2','PowFirsDomFre_0.3_15_s2','SecDomFre_0.3_15_s2','PowSecDomFre_0.3_15_s2','FirsDomFre_0.6_2.5_s2','PowFirsDomFre_0.6_2.5_s2','FirsDomFre_per_TotPower_0.3_15_s2']]
                    if First:
                        X_train = X_cur
                        y_train = y_cur
                        First = False
                    else:
                        X_train = np.concatenate((X_train,X_cur),axis=0)
                        y_train = np.concatenate((y_train,y_cur),axis=0)

            # print(X_test.shape,y_test.shape, X_train.shape, y_train.shape)
            # print (siteStr,X_test)

            # feature selection
            X_train_red = selector.fit_transform(X_train,y_train)
            X_test_red = selector.transform(X_test)

            # rfc = RandomForestClassifier(n_jobs=-1, n_estimators=35, criterion='gini',oob_score = False)
            #rfc = OneVsRestClassifier(SVC(kernel='rbf',C=100,gamma=0.1))
            # rfc = SVC(kernel='rbf', gamma=0.7,C=1,random_state=10)
            rfc = KNeighborsClassifier(n_neighbors=11,algorithm='auto')
            rfc.fit(X_train_red, y_train)
            y_pred = rfc.predict(X_test_red)

            cm = confusion_matrix(y_test, y_pred, labels = activities)
            cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            cm_comb[count,:,:] = cm_normalized

            pre_rec_fscore = precision_recall_fscore_support(y_test, y_pred, average=None,labels=activities)
            tmp[count,0,:],tmp[count,1,:],tmp[count,2,:] = pre_rec_fscore[0],pre_rec_fscore[1],pre_rec_fscore[2]
            count = count + 1
Example #46
class FBCSP():
    '''
    Two- or multi-class FBCSP
    '''
    def __init__(self, 
                bands, 
                smp_freq, 
                num_class=2, 
                order=5, 
                n_components=4, 
                n_features=8):
        self.bands = bands
        self.smp_freq = smp_freq
        self.n_components = n_components
        self.n_features = n_features
        self.cov_type = 'epoch'
        self.order = order
        self.n_classes = num_class
        self.classes = None
        if self.n_classes == 2:
            self.component_order = 'alternate'
        else:
            self.component_order = 'mutual_info'
        self.norm_trace = False
        self.spatial_transform = [None for i in range(len(bands))]
        self.selector = SelectKBest(score_func=mutual_info_classif, k=self.n_features)
        
    def __covariance(self, X):
#         covariance = np.dot(X, X.T)/X.shape[0]
#         return covariance
        return np.dot(X,X.T)/np.trace(np.dot(X,X.T))
   
    # spatialFilter returns the spatial filter for mean covariance matrices of two classes 
    def __get_spatial_filter(self, covs, sample_weights):
        eigen_vectors, eigen_values = self.__decompose_covs(covs, sample_weights)
        # CSP requires the eigenvalues and eigenvectors to be sorted in descending order
        sorted_index = self.__order_components(covs, sample_weights, eigen_vectors, eigen_values) 
        eigen_vectors = eigen_vectors[:, sorted_index]
        sp_filters = eigen_vectors.T
        pick_filters = sp_filters[:self.n_components]
        return pick_filters, sp_filters
    
    def __decompose_covs(self, covs, sample_weights):
        n_classes = len(covs)
        if n_classes == 2:
            eigen_values, eigen_vectors = linalg.eigh(covs[0], covs.sum(0))
        else:
            # The multiclass case is adapted from
            # http://github.com/alexandrebarachant/pyRiemann
            eigen_vectors, D = self.__ajd_pham(covs)
            eigen_vectors = self.__normalize_eigenvectors(eigen_vectors.T, covs, sample_weights)
            eigen_values = None
        return eigen_vectors, eigen_values
    
    def __calculate_covariance_matrices(self, data, y_class):
        '''The data is in the form of samples x channels x sampled_time_points'''
        if len(data.shape) != 3:
            raise ValueError('Input data must be 3-dimensional: samples x channels x time points')
        n_samples, n_channels, n_points = data.shape    
        
        if self.cov_type == 'concat':
            cov_estimator = self.__concat_cov
        elif self.cov_type == 'epoch':
            cov_estimator = self.__epoch_cov
            
        covs = []
        sample_weights = []
        self.classes = np.unique(y_class)
        n_classes = len(self.classes)
        for id_class in self.classes:
            cov, weight = cov_estimator(data[y_class == id_class])
            if self.norm_trace:
                cov /= np.trace(cov)
            covs.append(cov)
            sample_weights.append(weight)
        return np.stack(covs), np.array(sample_weights)
    
    def __concat_cov(self, X_class):
        '''The data is in the form of samples x channels x sampled_time_points'''
        '''Concatenate epochs before computing the covariance.'''
        n_samples, n_channels, n_points = X_class.shape
        X_class = np.transpose(X_class, [1, 0, 2])
        X_class = X_class.reshape(n_channels, -1)
        # The trace-normalized covariance is used because it gives better results
        cov = self.__covariance(X_class)
        # cov = np.cov(X_class) 
        weight = X_class.shape[0]
        return cov, weight

    def __epoch_cov(self, X_class):
        '''The data is in the form of samples x channels x sampled_time_points'''
        '''Mean of per-epoch covariances.'''
        cov = sum(self.__covariance(data) for data in X_class)
        cov /= len(X_class)
        weight = len(X_class)
        return cov, weight
    
    def __compute_mutual_info(self, covs, sample_weights, eigen_vectors):
        class_probas = sample_weights / sample_weights.sum()

        mutual_info = []
        for jj in range(eigen_vectors.shape[1]):
            aa, bb = 0, 0
            for (cov, prob) in zip(covs, class_probas):
                tmp = np.dot(np.dot(eigen_vectors[:, jj].T, cov),
                             eigen_vectors[:, jj])
                aa += prob * np.log(np.sqrt(tmp))
                bb += prob * (tmp ** 2 - 1)
            mi = - (aa + (3.0 / 16) * (bb ** 2))
            mutual_info.append(mi)

        return mutual_info

    def __normalize_eigenvectors(self, eigen_vectors, covs, sample_weights):
        # Here we apply a Euclidean mean. See pyRiemann for other metrics
        mean_cov = np.average(covs, axis=0, weights=sample_weights)

        for ii in range(eigen_vectors.shape[1]):
            tmp = np.dot(np.dot(eigen_vectors[:, ii].T, mean_cov),
                         eigen_vectors[:, ii])
            eigen_vectors[:, ii] /= np.sqrt(tmp)
        return eigen_vectors
    
    def __order_components(self, covs, sample_weights, eigen_vectors, eigen_values):
        n_classes = len(self.classes)
        ix = []
        if self.component_order == 'mutual_info' and n_classes > 2:
            mutual_info = self.__compute_mutual_info(covs, sample_weights, eigen_vectors)
            ix = np.argsort(mutual_info)[::-1]
        elif self.component_order == 'mutual_info' and n_classes == 2:
            ix = np.argsort(np.abs(eigen_values - 0.5))[::-1]
        elif self.component_order == 'alternate' and n_classes == 2:
            i = np.argsort(eigen_values)
            ix = np.empty_like(i)
            ix[1::2] = i[:len(i) // 2]
            ix[0::2] = i[len(i) // 2:][::-1]
        return ix
    
    def __ajd_pham(self, X, eps=1e-6, max_iter=15):
        '''Approximate joint diagonalization based on Pham's algorithm.
        This is a direct implementation of Pham's AJD algorithm [1].
        Parameters
        ----------
        X : ndarray, shape (n_epochs, n_channels, n_channels)
            A set of covariance matrices to diagonalize.
        eps : float, default 1e-6
            The tolerance for the stopping criterion.
        max_iter : int, default 15
            The maximum number of iterations to reach convergence.
        Returns
        -------
        V : ndarray, shape (n_channels, n_channels)
            The diagonalizer.
        D : ndarray, shape (n_epochs, n_channels, n_channels)
            The set of quasi diagonal matrices.
        References
        ----------
        .. [1] Pham, Dinh Tuan. 'Joint approximate diagonalization of positive
               definite Hermitian matrices.' SIAM Journal on Matrix Analysis and
               Applications 22, no. 4 (2001): 1136-1152.
        '''
        # Adapted from http://github.com/alexandrebarachant/pyRiemann
        n_epochs = X.shape[0]

        # Reshape input matrix
        A = np.concatenate(X, axis=0).T

        # Init variables
        n_times, n_m = A.shape
        V = np.eye(n_times)
        epsilon = n_times * (n_times - 1) * eps

        for it in range(max_iter):
            decr = 0
            for ii in range(1, n_times):
                for jj in range(ii):
                    Ii = np.arange(ii, n_m, n_times)
                    Ij = np.arange(jj, n_m, n_times)

                    c1 = A[ii, Ii]
                    c2 = A[jj, Ij]

                    g12 = np.mean(A[ii, Ij] / c1)
                    g21 = np.mean(A[ii, Ij] / c2)

                    omega21 = np.mean(c1 / c2)
                    omega12 = np.mean(c2 / c1)
                    omega = np.sqrt(omega12 * omega21)

                    tmp = np.sqrt(omega21 / omega12)
                    tmp1 = (tmp * g12 + g21) / (omega + 1)
                    tmp2 = (tmp * g12 - g21) / max(omega - 1, 1e-9)

                    h12 = tmp1 + tmp2
                    h21 = np.conj((tmp1 - tmp2) / tmp)

                    decr += n_epochs * (g12 * np.conj(h12) + g21 * h21) / 2.0

                    tmp = 1 + 1.j * 0.5 * np.imag(h12 * h21)
                    tmp = np.real(tmp + np.sqrt(tmp ** 2 - h12 * h21))
                    tau = np.array([[1, -h12 / tmp], [-h21 / tmp, 1]])

                    A[[ii, jj], :] = np.dot(tau, A[[ii, jj], :])
                    tmp = np.c_[A[:, Ii], A[:, Ij]]
                    tmp = np.reshape(tmp, (n_times * n_epochs, 2), order='F')
                    tmp = np.dot(tmp, tau.T)

                    tmp = np.reshape(tmp, (n_times, n_epochs * 2), order='F')
                    A[:, Ii] = tmp[:, :n_epochs]
                    A[:, Ij] = tmp[:, n_epochs:]
                    V[[ii, jj], :] = np.dot(tau, V[[ii, jj], :])
            if decr < epsilon:
                break
        D = np.reshape(A, (n_times, -1, n_times)).transpose(1, 0, 2)
        return V, D

    def __get_log_var_feats(self, spatial_filt, data):
        data_dot = np.dot(spatial_filt, data) # spatially filtered signals 
        data_var = np.var(data_dot, axis=1)
        # data_var = (data_dot**2).mean(axis=1)
        # We use log(var) instead of averaging
        data_log = np.log(data_var)
        return data_log
    
    def fit_transform(self, X, y):  
        if len(X.shape) != 3:
            raise ValueError('Input data must be 3-dimensional: samples x channels x time points')
                  
        n_samples = X.shape[0]
        X_transformed_var = np.zeros((len(self.bands), n_samples, self.n_components))

        for id_band, freq_band in enumerate(self.bands):
            # Compute band-pass filter of EEG signals
            X_filtered = butter_bandpass_filter(X, freq_band[0], freq_band[1], self.smp_freq, self.order)
            
            # Calculating covariance only on training set
            covs, sample_weights =  self.__calculate_covariance_matrices(X_filtered, y)
            spf_sel, spf_org = self.__get_spatial_filter(covs, sample_weights)
            self.spatial_transform[id_band] = spf_sel
                  
            # Calculate the variance of spatially filtered signals and then compute the logarithm 
            for sample in range(X_filtered.shape[0]):
                X_transformed_var[id_band, sample] = self.__get_log_var_feats(self.spatial_transform[id_band], X_filtered[sample,:,:])
        
        X_transformed_var = np.swapaxes(X_transformed_var, 0, 1)
        X_transformed_var = X_transformed_var.reshape(n_samples, -1)

        # select k best
        X_fbcsp = self.selector.fit_transform(X_transformed_var, y) 
        return X_fbcsp

    def transform(self, X):
        if len(X.shape) != 3:
            raise ValueError('Input data must be 3-dimensional: samples x channels x time points')
        
        n_samples = X.shape[0]
        X_transformed_var = np.zeros((len(self.bands), n_samples, self.n_components))

        for id_band, freq_band in enumerate(self.bands):
            X_filtered = butter_bandpass_filter(X, freq_band[0], freq_band[1], self.smp_freq, self.order)
            
            # Calculate the variance of spatially filtered signals and then compute the logarithm 
            for sample_te in range(X_filtered.shape[0]):
                X_transformed_var[id_band, sample_te] = self.__get_log_var_feats(self.spatial_transform[id_band], X_filtered[sample_te,:,:]) 

        X_transformed_var = np.swapaxes(X_transformed_var, 0, 1)
        X_transformed_var = X_transformed_var.reshape(n_samples, -1)
        # select k best
        X_fbcsp = self.selector.transform(X_transformed_var)  
        return X_fbcsp
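
# Hedged helper sketch: butter_bandpass_filter is not defined in this excerpt. FBCSP only
# assumes a function with the signature (data, lowcut, highcut, fs, order) that band-pass
# filters along the last (time) axis; one plausible implementation with scipy.signal:
from scipy.signal import butter, filtfilt

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    nyq = 0.5 * fs                                             # Nyquist frequency
    b, a = butter(order, [lowcut / nyq, highcut / nyq], btype='band')
    return filtfilt(b, a, data, axis=-1)                       # zero-phase filtering over time

# Illustrative usage of the class above (band edges and sampling rate are made up):
# fbcsp = FBCSP(bands=[(4, 8), (8, 12), (12, 30)], smp_freq=250, num_class=2)
# X_train_feat = fbcsp.fit_transform(X_train_eeg, y_train_eeg)
# X_test_feat = fbcsp.transform(X_test_eeg)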
Example #47
0
y = lrdf_condensed['draft_overall'].astype(float)

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

import matplotlib.pyplot as plt

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=1)

fs = SelectKBest(score_func=f_regression, k='all')
fs.fit(X_train, y_train)
X_train_fs = fs.transform(X_train)
X_test_fs = fs.transform(X_test)

score_rows = []
for i in range(len(fs.scores_)):
    col = X_train.columns[i]
    score = fs.scores_[i]
    score_rows.append({'col': col, 'score': score})
scores = pd.DataFrame(score_rows, columns=['col', 'score'])  # DataFrame.append was removed in pandas 2.0

plt.bar(scores['col'], scores['score'])
plt.show()

scores = scores.sort_values(by='score', ascending=False)
# The information yielded by this cell gives us feature selection

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingClassifier
from itertools import product
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris

iris = load_iris()

X = iris.data
y = iris.target
feature_names = iris.feature_names
# XSepalLength = X[:,0]
# YSepalWidth = X[:,1]

skb = SelectKBest(k='all')
skb.fit(X, y)
X_vec = skb.transform(X)

from sklearn.feature_selection import mutual_info_classif

feature_scores = mutual_info_classif(X_vec, y)

print('The two best attributes are {0}, {1}'.format(
    *sorted(zip(feature_scores, feature_names), reverse=True)))
# /\ Answer to task one | task two below \/
"""
+ Załaduj zbiór danych __iris__ korzystając z funkcji [load_iris](http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html)
+ Korzystając z funkcji [SelectKBest](http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html) oraz kryterium [mutual_info_classif](http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#sklearn.feature_selection.mutual_info_classif) wybierz najlepsze __dwa__ atrybuty 
"""

X = iris.data[:, [0, 2]]
y = iris.target
Example #49
0
X, X_t, y, y_t = train_test_split(X, y, test_size=0.4, random_state=42)

###############################################################################
#                                                                             #
#   SELECT K BEST                                                             #
#                                                                             #
###############################################################################

from sklearn.feature_selection import SelectKBest, f_classif

# ### Define the dimension reduction to be used.
# Here we use a classical univariate feature selection based on F-test,
# namely Anova. We set the number of features to be selected to 784
feature_selection = SelectKBest(f_classif, k=784)

# transform datasets from high-dimensional to k-dimensional
X = feature_selection.fit_transform(X, y)
X_t = feature_selection.transform(X_t)

# save output to csv files
import csv
with open('train.csv', 'w') as fp:
    a = csv.writer(fp, delimiter=',')
    for i in range(len(X)):
        a.writerow([y[i]] + list(X[i]))

with open('test.csv', 'w') as fp:
    a = csv.writer(fp, delimiter=',')
    for i in range(len(X_t)):
        a.writerow([y_t[i]] + list(X_t[i]))  # pair test rows with the test labels y_t, not the training labels
Example #50
0
def train_model(config, feature_vectors, labels, classifier_model = 'random_forest', scale = True, normalize = False, kBest = False):

    if(config.has("model")):
        classifier_model = config.get("model")
        scale = config.get("scale")
        normalize = config.get("normalize")
        kBest = config.get("k_best")

    print classifier_model
    print 'Scale:',
    print scale
    print 'Normalize:',
    print normalize
    print 'K-Best',
    print kBest

    classifier = dict()

    if(scale):
        scaler = StandardScaler()
        scaler.fit(feature_vectors)
        feature_vectors = scaler.transform(feature_vectors)
        classifier['scaler'] = scaler

    if(normalize):
        normalizer = Normalizer()
        normalizer.fit(feature_vectors)
        feature_vectors = normalizer.transform(feature_vectors)
        classifier['normalizer'] = normalizer

    if(kBest):
        kBest = SelectKBest(f_classif, k=20)
        kBest = kBest.fit(feature_vectors, labels)
        feature_vectors = kBest.transform(feature_vectors)
        classifier['k_best'] = kBest

    #print feature_vectors.shape
    if classifier_model == 'random_forest':
        model = RandomForestClassifier()
        model.fit(feature_vectors, labels)
    elif classifier_model == 'knn':
        k = 3
        model = neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
        model.fit(feature_vectors, labels)
    elif classifier_model == 'logistic_regression':
        model = LogisticRegression()
        model.fit(feature_vectors, labels)
    elif classifier_model == 'svm':
        model = svm.LinearSVC()
        model.fit(feature_vectors, labels)
    elif classifier_model == 'sgd':
        model = SGDClassifier(loss="modified_huber", penalty="l1")
        model.fit(feature_vectors, labels)
    elif classifier_model == 'nn':
        model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
        model.fit(feature_vectors, labels)
    elif classifier_model == 'dtree':
        model = tree.DecisionTreeClassifier()
        model.fit(feature_vectors, labels)
    elif classifier_model == 'gaussianNB':
        model = GaussianNB()
        model.fit(feature_vectors, labels)

    classifier['model'] = model
    return classifier
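
# Hedged companion sketch (not from the original project): train_model returns a dict of
# fitted preprocessing steps plus the model, so prediction has to replay those steps in the
# same order. The helper below assumes exactly the keys written by train_model above.
def apply_classifier(classifier, feature_vectors):
    if 'scaler' in classifier:
        feature_vectors = classifier['scaler'].transform(feature_vectors)
    if 'normalizer' in classifier:
        feature_vectors = classifier['normalizer'].transform(feature_vectors)
    if 'k_best' in classifier:
        feature_vectors = classifier['k_best'].transform(feature_vectors)
    return classifier['model'].predict(feature_vectors)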
Example #51
0
div = Div(text="""Your <a href="https://en.wikipedia.org/wiki/HTML">HTML</a>-supported text is initialized with the <b>text</b> argument.  The
remaining div arguments are <b>width</b> and <b>height</b>. For this example, those values
are <i>200</i> and <i>100</i> respectively.""",
width=200, height=100)
'''
stats = Paragraph(text='', width=800, height=200, name='Selected Features:')

#columns =['avg_dist', 'avg_rating_by_driver','avg_rating_of_driver','avg_surge','surge_pct','trips_in_first_30_days','luxury_car_user','weekday_pct','city_Astapor',"city_KingsLanding",'city_Winterfell','phone_Android','phone_no_phone']
#columns = ['luxury_car_user','avg_dist','city_Astapor',"city_KingsLanding",'phone_Android','phone_iPhone']

#df1 = pd.DataFrame(df, columns=columns)
#y = df['churn']
y = df[df.columns[:1]].values.ravel()
df1 = df.drop(df.columns[:1], axis=1)
selector = SelectKBest(chi2, k=5).fit(df1, y)
X_new = selector.transform(df1)
mask = selector.get_support()  #list of booleans
new_features = []  # The list of your K best features

for selected, feature in zip(mask, df.columns[1:].tolist()):  # avoid shadowing the built-in bool
    if selected:
        new_features.append(feature)

#print(new_features)

stats.text = str(new_features)

x_train_original, x_test_original, y_train_original, y_test_original = train_test_split(
    X_new, y, test_size=0.25)
#For standardizing data
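
# Hedged continuation sketch (the original excerpt ends above): the comment suggests the
# split gets standardized next; with scikit-learn that would typically look like this,
# fitting the scaler on the training split only.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x_train_original)
x_train_scaled = scaler.transform(x_train_original)
x_test_scaled = scaler.transform(x_test_original)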
Example #52
0
def feature_analysis(X_train, X_test, y_train, y_test, i, X_1k, y_1k):
    ''' Select the K best features from the corresponding model

    Parameters:
       X_train: NumPy array, with the selected training features
       X_test: NumPy array, with the selected testing features
       y_train: NumPy array, with the selected training classes
       y_test: NumPy array, with the selected testing classes
       i: int, the index of the supposed best classifier (from task 3.1)
       X_1k: NumPy array, just 1K rows of X_train (from task 3.2)
       y_1k: NumPy array, just 1K rows of y_train (from task 3.2)
    '''
    k_list = [5, 10, 20, 30, 40, 50]
    best_k_1 = []
    best_k_32 = []
    X_new_1k = []
    X_new_32k = []

    acc_list = []

    if i == 1:
        classifier = SVC(kernel='linear', max_iter=1000)
    elif i == 2:
        classifier = SVC(kernel='rbf', gamma=2.0, max_iter=1000)
    elif i == 3:
        classifier = RandomForestClassifier(n_estimators=10, max_depth=5)
    elif i == 4:
        classifier = MLPClassifier(alpha=0.05)
    else:
        classifier = AdaBoostClassifier()

    for k in k_list:
        curr = [k]
        selector = SelectKBest(f_classif, k=k)
        Xk_1k = selector.fit_transform(X_1k, y_1k)
        pp = selector.pvalues_.argsort()[:k]
        curr.extend(pp)
        best_k_1.append(curr)
        if k == 5:
            classifier.fit(Xk_1k, y_1k)
            predictor = classifier.predict(selector.transform(X_test))
            confusion = confusion_matrix(y_test, predictor)
            acc_list.append(accuracy(confusion))

    for k in k_list:
        curr = [k]
        selector = SelectKBest(f_classif, k=k)
        Xk_32k = selector.fit_transform(X_train, y_train)
        pp = selector.pvalues_.argsort()[:k]
        curr.extend(pp)
        best_k_32.append(curr)
        if k == 5:
            classifier.fit(Xk_32k, y_train)
            predictor = classifier.predict(selector.transform(X_test))
            confusion = confusion_matrix(y_test, predictor)
            acc_list.append(accuracy(confusion))

    with open('a1_3.3.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for i in best_k_32:
            writer.writerow(i)
        writer.writerow(acc_list)
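
# Hedged helper sketch: accuracy() is called above but not defined in this excerpt; any
# function that turns a confusion matrix into an accuracy score would do, for example:
import numpy as np

def accuracy(C):
    ''' Fraction of correct predictions given confusion matrix C (rows: true, cols: predicted). '''
    total = np.sum(C)
    return np.trace(C) / total if total > 0 else 0.0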
Example #53
0
	for N in range(1,MAX_NGRAM+1):
		grams = [terms[j:j+N] for j in range(len(terms)-N+1)]
		for gram in grams:
			phrase = " ".join(gram)
			if phrase in phrase_bit_nr:
				X_test[i,phrase_bit_nr[phrase]] = 1				

	Y_test[i] = test_y[i]

print("Selecting features...")

SKB = SelectKBest(chi2, k=FEATURES)
SKB.fit(X_train, Y_train)

selected_features = SKB.get_support(indices=True)
X_train = SKB.transform(X_train)
X_test = SKB.transform(X_test)

tm1 = MultiClassTsetlinMachine(c, T*100, s, clause_drop_p=drop_clause, number_of_state_bits=number_of_state_bits, number_of_gpus=n_gpus)

f = open("imdb_weighted_%.1f_%d_%d_%.2f_%d_aug.txt" % (s, clauses, T,  drop_clause, number_of_state_bits), "w+")

r_25 = 0
max = 0.0

for i in range(config.stop_train):
	start_training = time()
	tm1.fit(X_train, Y_train, epochs=1, incremental=True)
	stop_training = time()

	start_testing = time()
Example #54
0
class EnsembleSVM:
    def __init__(self,
                 n_estimators=50,
                 max_samples=1000,
                 max_features=2000,
                 n_randomized_search_iter=20,
                 random_state=123):

        random.seed(random_state)
        self.random_state = random_state
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.n_randomized_search_iter = n_randomized_search_iter

    def _prepare_classifier(self, params, n_jobs=1):

        X_train, y_train = params

        tuned_parameters = [{
            'kernel': ['rbf'],
            'gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1e+0, 1e+1, 1e+2, 1e+3, 1e+4],
            'C': [1e+0, 1e+1, 1e+2, 1e+3, 1e+4, 1e+5, 1e+6, 1e+7, 1e+8, 1e+9]
        }]

        clf = RandomizedSearchCV(svm.SVC(random_state=self.random_state),
                                 tuned_parameters[0],
                                 n_iter=self.n_randomized_search_iter,
                                 n_jobs=n_jobs,
                                 random_state=self.random_state)
        clf.fit(X_train, y_train)

        params = clf.best_params_
        clf = svm.SVC(kernel=params['kernel'],
                      C=params['C'],
                      gamma=params['gamma'],
                      probability=True,
                      random_state=self.random_state)
        clf.fit(X_train, y_train)

        return clf

    def fit(self, X, y):

        self.selector = SelectKBest(f_classif, k=self.max_features)
        self.selector.fit(X, y)

        X_train = self.selector.transform(X)
        y_train = y

        param_list = []
        idx = list(range(len(y_train)))  # materialize the indices so random.shuffle also works on Python 3
        for i in range(self.n_estimators):
            random.shuffle(idx)
            param_list.append((X_train[idx[:self.max_samples]],
                               y_train[idx[:self.max_samples]]))

        pool = ThreadPool(cpu_count())
        self.clf_list = pool.map(self._prepare_classifier, param_list)
        pool.close()
        pool.join()
        """
        X2=[]
        for clf in self.clf_list:
            P=clf.predict_proba(X_train)
            if len(X2)==0:
                X2=P[:, 0]
            else:
                X2=numpy.vstack((X2, P[:, 0]))
        X2=numpy.swapaxes(X2, 0, 1)
        print "X2:", X2.shape

        from sklearn.ensemble import RandomForestClassifier
        self.clf2=RandomForestClassifier(n_estimators=100)
        self.clf2.fit(X2, y_train)
        """

    def predict_proba(self, X):
        y_pred = self._predict_cover_proba(X)
        return [[float(x) / 100, 1 - float(x) / 100] for x in y_pred]

    def _predict_cover_proba(self, X):
        X_val = self.selector.transform(X)
        y_val_pred = [0] * len(X_val)
        for clf in self.clf_list:
            P = clf.predict_proba(X_val)
            for i in range(len(P)):
                y_val_pred[i] += P[i][0]
        return y_val_pred
        """
        X2=[]
        Xt=self.selector.transform(X)
        for clf in self.clf_list:
            P=clf.predict_proba(Xt)
            if len(X2)==0:
                X2=P[:, 0]
            else:
                X2=numpy.vstack((X2, P[:, 0]))
        X2=numpy.swapaxes(X2, 0, 1)
        print "X2 predict:", X2.shape

        return self.clf2.predict_proba(X2)[:,0]
        """

    def score(self, X, y):
        y_pred = self._predict_cover_proba(X)
        ok = 0
        for i in range(len(y)):
            p = float(y_pred[i]) / len(self.clf_list)
            if p > 0.5 and y[i] == 0: ok += 1
            elif p <= 0.5 and y[i] == 1: ok += 1

        return float(ok) / len(y)
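
# Hedged usage sketch (illustrative only, not from the original project): exercise the
# ensemble on a small synthetic problem, with the estimator count and feature budget
# reduced so the randomized SVM search finishes quickly. Assumes the module-level imports
# used by EnsembleSVM (svm, f_classif, RandomizedSearchCV, ThreadPool, cpu_count, random)
# are available, as in the original file.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=400, n_features=50, n_informative=10,
                                     random_state=0)
ens = EnsembleSVM(n_estimators=5, max_samples=200, max_features=20,
                  n_randomized_search_iter=5)
ens.fit(X_demo[:300], y_demo[:300])
print(ens.score(X_demo[300:], y_demo[300:]))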
Example #55
0
        # Create a scores array to get the individual categorical column.
        # Example:
        #  data = [39, 'State-gov', 77516, 'Bachelors', 13, 'Never-married', 'Adm-clerical',
        #         'Not-in-family', 'White', 'Male', 2174, 0, 40, 'United-States']
        #  scores = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
        #
        # Returns: [['State-gov']]
        # Build the scores array.
        scores = [0] * len(COLUMNS[:-1])
        # This column is the categorical column we want to extract.
        scores[i] = 1
        skb = SelectKBest(k=1)
        skb.scores_ = scores
        # Convert the categorical column to a numerical value
        lbn = LabelBinarizer()
        r = skb.transform(train_features)
        lbn.fit(r)
        # Create the pipeline to extract the categorical feature
        categorical_pipelines.append(('categorical-{}'.format(i),
                                      Pipeline([('SKB-{}'.format(i), skb),
                                                ('LBN-{}'.format(i), lbn)])))
# [END categorical-feature-conversion]

# [START create-pipeline]
# Create pipeline to extract the numerical features
skb = SelectKBest(k=6)
# From COLUMNS use the features that are numerical
skb.scores_ = [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
categorical_pipelines.append(('numerical', skb))

# Combine all the features using FeatureUnion
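
# Hedged continuation sketch (the original excerpt stops above): FeatureUnion can stitch the
# per-column pipelines and the numerical selector into a single transformer; the estimator
# below is illustrative, not necessarily what the original sample trains.
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.ensemble import RandomForestClassifier

preprocess = FeatureUnion(categorical_pipelines)
pipeline = Pipeline([('union', preprocess),
                     ('classifier', RandomForestClassifier())])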
Example #56
0
                      max_df=0.9,
                      use_idf=1,
                      smooth_idf=1,
                      sublinear_tf=1)
trn_term_doc = vec.fit_transform(train[column])
test_term_doc = vec.transform(test[column])

train_x = trn_term_doc.tocsr()
test_x = test_term_doc.tocsr()
train_y = (train["class"] - 1).astype(int)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
model1 = SelectKBest(chi2, k=10000)
train_x = model1.fit_transform(train_x, train_y)
test_x = model1.transform(test_x)

#################################


def stacking(clf, train_x, train_y, test_x, clf_name, class_num=1):
    train = np.zeros((train_x.shape[0], class_num))
    test = np.zeros((test_x.shape[0], class_num))
    test_pre = np.zeros((folds, test_x.shape[0], class_num))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(kf):
        tr_x = train_x[train_index]
        tr_y = train_y[train_index]
        te_x = train_x[test_index]
        te_y = train_y[test_index]
Example #57
0
pipeline = sklearn.pipeline.Pipeline(steps)
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
y_predictions = cv.predict(X_test)
report = classification_report(y_test, y_predictions)

# Print out the best value for the Decision tree classifier
print(report)
print(cv.best_params_)


# select features
k=10
feature_selector = SelectKBest(k=k)
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)
#feature_names = [ feature_names[i] for i in feature_selector.get_support(indices=True) ]
print("features selected: %d" % X_train.shape[1])

# train decision tree
dt = DTC(min_samples_split=2)
dt.fit(X_train, y_train)

# report accuracy
print("the decision tree has %d nodes" % dt.tree_.node_count)
print("train accuracy: %f" % dt.score(X_train, y_train))
print("test accuracy: %f" % dt.score(X_test, y_test))


############################# k nearest neighbor #############################
Example #58
0
# Chi-square test
# The classical chi-square test measures the dependence between a categorical
# independent variable and a categorical dependent variable.
# How the chi-square statistic is computed:
#   Suppose two categorical variables X and Y take values in {x1, x2} and {y1, y2};
#   their sample frequencies form a contingency table.
# Reference layout (2x2 table for independent samples), e.g. age vs. spending
# (youth -> high, middle-aged -> low):
#         y1    y2    total
#   x1     a     b    a+b
#   x2     c     d    c+d
#   total a+c   b+d   a+b+c+d
#   To test the hypothesis H1: "X and Y are related", an independence test checks whether
#   the two variables are associated and quantifies how reliable that judgment is.
#   Concretely, the statistic K^2 (K squared) is computed from the table:
#   K^2 = n (ad - bc)^2 / [(a+b)(c+d)(a+c)(b+d)],  where n = a+b+c+d is the sample size.
#   The larger K^2 is, the more plausible it is that "X and Y are related".
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2  # select the K best features and return the reduced data
from sklearn.datasets import load_iris
iris = load_iris()
# k Number of top features to select. The “all” option bypasses selection, for use in a parameter search.
selector = SelectKBest(chi2, k=4).fit(iris.data, iris.target)
data = selector.transform(iris.data)
print(data)
print(selector.scores_)
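
# Hedged worked example (hypothetical counts, not from the original snippet): evaluate the
# K^2 formula above on a 2x2 table and compare against scipy's chi-square test without the
# Yates continuity correction, which reproduces the plain formula.
import numpy as np
from scipy.stats import chi2_contingency

a, b, c, d = 30, 10, 20, 40                       # hypothetical cell counts
n = a + b + c + d
k2_manual = n * (a * d - b * c) ** 2 / ((a + b) * (c + d) * (a + c) * (b + d))
chi2_stat, p_value, dof, expected = chi2_contingency(np.array([[a, b], [c, d]]),
                                                     correction=False)
print(k2_manual, chi2_stat, p_value)              # the two statistics agree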
Example #59
0
print("n_samples: %d, n_features: %d" % X_test_counts.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train_counts, y_train)
    X_test = ch2.transform(X_test_counts)
    if feature_names:
        # keep selected feature names
        feature_names = [
            feature_names[i] for i in ch2.get_support(indices=True)
        ]
    print("done in %fs" % (time() - t0))
    print(feature_names)
    print(ch2.scores_)
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


def trim(s):
Example #60
0
l_svm_score = []
l_nb_score = []
l_blend_score = []

alpha = 1.
beta = 20.

#sss = StratifiedShuffleSplit(Y, 5, test_size=0.2, random_state=0)
sss = KFold(len(Y), n_folds=5, shuffle=True)
kbest = SelectKBest(chi2, k=300000)
for train_idx, val_idx in sss:
    x_train, y_train, x_val, y_val = X[train_idx], Y[train_idx], X[val_idx], Y[
        val_idx]
    x_train = kbest.fit_transform(x_train, y_train)
    x_val = kbest.transform(x_val)
    #
    clf_svm.fit(x_train, y_train)
    svm_predict_proba = getProbaSVM(clf_svm.decision_function(x_val))
    score_svm = accuracy_score(clf_svm.predict(x_val), y_val)
    l_svm_score.append(score_svm)
    print "l_svm_score"
    #
    clf_nb.fit(x_train, y_train)
    nb_predict_proba = clf_nb.predict_proba(x_val)
    score_nb = accuracy_score(clf_nb.predict(x_val), y_val)
    l_nb_score.append(score_nb)
    print "l_nb_score"
    #
    blend_mat = alpha * nb_predict_proba + beta * svm_predict_proba
    y_pred_blend = clf_svm.classes_[np.argmax(blend_mat, 1)]