Example #1
7
def select_features(X, y):
    """Return the column names chosen by both a chi2 and an F-test filter.

    Two ``SelectPercentile`` filters, each keeping the top 3 percent of the
    features, are fitted: the chi2 one on a scaled-then-binarized copy of
    ``X`` and the ``f_classif`` one on ``X`` itself.  What each filter keeps,
    and what both keep, is printed along the way.

    X: feature table exposing ``.columns`` (one name per feature).
    y: target values handed to both filters.

    Returns the list of column names kept by both filters.
    """
    from sklearn.feature_selection import SelectPercentile, chi2, f_classif
    from sklearn.preprocessing import Binarizer, scale

    percentile = 3

    def names_for(mask):
        # Column names whose position is flagged in the boolean mask.
        return [name for name, keep in zip(X.columns, mask) if keep]

    binarized = Binarizer().fit_transform(scale(X))
    chi2_mask = SelectPercentile(chi2, percentile=percentile).fit(binarized, y).get_support()
    f_mask = SelectPercentile(f_classif, percentile=percentile).fit(X, y).get_support()

    print('Chi2 selected {} features {}.'.format(chi2_mask.sum(),
                                                 names_for(chi2_mask)))
    print('F_classif selected {} features {}.'.format(f_mask.sum(),
                                                      names_for(f_mask)))

    # Keep only the features both filters agree on.
    both = chi2_mask & f_mask
    print('Chi2 & F_classif selected {} features'.format(both.sum()))
    features = names_for(both)
    print(features)
    return features
Example #2
0
def buildVectorizer(classes, examples, parameters):
    """Vectorize the training examples and optionally trim features with chi2.

    parameters: mapping of string options.  Recognised keys:
        "featureChoice"      -- handed to Vectorizer unchanged (default None)
        "doFeatureSelection" -- selection runs only when the value is the
                                string "True"
        "featureSelectPerc"  -- percentile to keep, converted with int()
                                (default 10)
        "tfidf"              -- handed to Vectorizer as True only when the
                                value is the string "True"

    Returns (vectorsTrimmed, vectorizer, featureSelector).  When feature
    selection is disabled the vectors are returned untrimmed and
    featureSelector is None.
    """
    featureChoice = parameters.get("featureChoice")
    doFeatureSelection = parameters.get("doFeatureSelection") == "True"
    featureSelectPerc = int(parameters.get("featureSelectPerc", 10))
    tfidf = parameters.get("tfidf") == "True"

    # print(...) with a single pre-formatted string emits the same text on
    # Python 2 and Python 3.
    print("Starting vectorizer...")
    vectorizer = Vectorizer(classes, examples, featureChoice, tfidf)
    vectors = vectorizer.getTrainingVectors()
    print("Vectors of size: %s" % (vectors.shape,))

    if not doFeatureSelection:
        return vectors, vectorizer, None

    print("Trimming training vectors...")
    from sklearn.feature_selection import SelectPercentile, chi2
    # percentile is passed by keyword: recent scikit-learn releases reject it
    # as a positional argument.
    featureSelector = SelectPercentile(chi2, percentile=featureSelectPerc)
    vectorsTrimmed = featureSelector.fit_transform(vectors, classes)
    vectorsTrimmed = coo_matrix(vectorsTrimmed)
    print("Trimmed training vectors of size: %s" % (vectorsTrimmed.shape,))

    return vectorsTrimmed, vectorizer, featureSelector
Example #3
0
def selectFeatures(features, labels, features_list):
    '''
    Select the features whose F-test score falls in the top 20 percent.

    features: numpy array with the features to be used to test sklearn models
    labels: numpy array with the real output
    features_list: a list of names of each feature, in column order

    Returns (l_rtn, df_rtn):
    l_rtn: names from features_list that the selector kept
    df_rtn: one-column DataFrame ("pValue_Max") indexed by feature name,
        holding -log10(p-value) divided by its maximum, sorted descending
    '''
    # feature selection
    selector = SelectPercentile(f_classif, percentile=20)
    selector.fit(features, labels)

    # filter names to be returned
    l_rtn = [name for name, kept in zip(features_list, selector.get_support())
             if kept]

    # calculate scores: smaller p-values give larger scores, scaled to max 1
    scores = -np.log10(selector.pvalues_)
    scores /= scores.max()
    df_rtn = pd.DataFrame(pd.Series(dict(zip(features_list, scores))))
    df_rtn.columns = ["pValue_Max"]
    # DataFrame.sort() no longer exists in pandas; sort_values is the
    # replacement.
    df_rtn = df_rtn.sort_values("pValue_Max", ascending=False)

    return l_rtn, df_rtn
def preprocess(words_file = "word_data.pkl", authors_file="email_authors.pkl"):
    """Load pickled texts and authors, then split, vectorize and trim them.

    Steps performed:
        -- loads the authors (labels) and the word data (features)
        -- splits into training/testing sets (10% testing, random_state=42)
        -- vectorizes the texts into a tf-idf matrix fitted on the training set
        -- keeps the top 10 percent of features by f_classif score

    Returns (features_train, features_test, labels_train, labels_test); the
    two feature matrices are dense arrays.
    """
    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    # Pickles are binary data, so open with "rb", and use `with` so the
    # handles are closed even when loading fails.
    with open(authors_file, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "rb") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data (single pre-formatted strings print identically on
    ### Python 2 and Python 3)
    print("no. of Enrique training emails: %s" % (sum(labels_train),))
    print("no. of Juan training emails: %s" % (len(labels_train) - sum(labels_train),))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #5
0
 def feature_select(self):
     """Run percentile feature selection for this task and store the result.

     Reads the label and feature data from ``self.results`` under the keys
     ``self.task.label`` and ``self.task.features``, keeps the top
     ``self.task.percentile`` percent of features by f_classif score, and
     stores a ``TransformResult`` under ``self.task.uuid``.
     """
     # The percentile comes from self.task like every other task attribute
     # used here; a bare `task` is not defined in this method.
     selector = SelectPercentile(f_classif, percentile=self.task.percentile)
     y = np.array(self.results[self.task.label].data)
     X = np.array(self.results[self.task.features].data)
     data = pd.DataFrame(selector.fit_transform(X, y))
     self.results[self.task.uuid] = TransformResult(self.task, 1.0, data)
Example #6
0
def select_features(X,y):
    """Keep the top 10 percent of the features of X by f_classif score.

    Progress messages are printed before fitting and before transforming.

    Returns (X_selected, selector): the reduced feature matrix and the
    fitted SelectPercentile instance.
    """
    selector = SelectPercentile(f_classif, percentile=10)
    # print("...") with one string emits the same text on Python 2 and 3.
    print("fit selector")
    selector.fit(X, y)
    print("transform features")
    X = selector.transform(X)
    return X, selector
Example #7
0
def get_user_feature(feature_type,behavior,num_feature=800):
    """Select features for one feature type and write train/test CSV files.

    The selector is fitted on the training features against the 'type'
    column of data/train_Y_<behavior>.csv, then applied to both the training
    and the test features.  Results are written to
    data/train_feature_<feature_type>_<behavior>.csv and
    data/test_feature_<feature_type>_<behavior>.csv.

    feature_type: 'cat_id' keeps 60 percent, 'brand_id' 15 percent and
        'seller_id' 20 percent of the features; any other value uses
        percentile 0.
        NOTE(review): percentile 0 presumably keeps no features -- confirm
        whether other feature types are ever passed.
    behavior: integer used in the input and output file names.
    num_feature: unused; only referenced by the disabled SelectKBest variant.
    """
    X_train = get_features(feature_type, behavior)
    train_index = X_train.index
    # Reduce the dimensionality of X
    Y = pd.read_csv('data/train_Y_%d.csv' % behavior, index_col='user_id')['type']
    # print("...") with one string emits the same text on Python 2 and 3.
    print('start selectKbest...')
    # select = SelectKBest(chi2,k=min(num_feature,X_train.shape[1]))
    percent = {'cat_id': 60, 'brand_id': 15, 'seller_id': 20}.get(feature_type, 0)
    select = SelectPercentile(f_classif, percentile=percent)
    select.fit(X_train, Y)
    X_train = select.transform(X_train)

    print('end select...')
    print('write %s features to train file' % feature_type)
    train_feature_file_name = 'data/train_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(X_train, index=train_index).to_csv(train_feature_file_name)

    # Reduce the corresponding test-set data using the same columns
    X_test = get_features(feature_type, behavior, is_train=False)
    test_index = X_test.index
    X_test = select.transform(X_test)
    # Write to file
    print('write %s features to test file' % feature_type)
    test_feature_file_name = 'data/test_feature_%s_%d.csv' % (feature_type, behavior)
    DataFrame(X_test, index=test_index).to_csv(test_feature_file_name)
    print('end....')
def test_select_percentile_classif():
    """Percentile selection on a simple classification problem.

    Checks that SelectPercentile and GenericUnivariateSelect in "percentile"
    mode produce the same reduced matrix, and that the support mask flags
    exactly the first five of the twenty generated features.
    """
    percentile = 25
    n_features = 20

    X, y = make_classification(
        n_samples=200,
        n_features=n_features,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    percentile_filter = SelectPercentile(f_classif, percentile=percentile)
    reduced = percentile_filter.fit(X, y).transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode="percentile", param=percentile)
    reduced_generic = generic_filter.fit(X, y).transform(X)
    assert_array_equal(reduced, reduced_generic)

    # Expected mask: ones for the first five features, zeros for the rest.
    expected = np.concatenate([np.ones(5), np.zeros(n_features - 5)])
    assert_array_equal(percentile_filter.get_support(), expected)
def final_feature_set_reduction(reduced_feature_file_name, final_file_name, train_label_file):
    """Keep the top 10 percent of features by chi2 score and write them to CSV.

    reduced_feature_file_name: CSV under data/ whose first column is skipped
        when building the feature matrix; it must contain a 'file_name'
        column, which is carried over to the output.
    final_file_name: name of the CSV written under data/.
    train_label_file: text file under data/ with one integer label per line.

    Returns None.
    """
    sorted_train_data = pd.read_csv('data/' + reduced_feature_file_name)
    X = sorted_train_data.iloc[:, 1:]

    # `with` closes the label file; the original handle was never closed.
    with open('data/' + train_label_file) as fip:
        y = [int(line.rstrip()) for line in fip]

    print("Final feature reduction: {:s}".format(reduced_feature_file_name))
    print("Training labels length: {:d}".format(len(y)))
    print("X Feature set dimensionality: {:d} {:d}".format(X.shape[0], X.shape[1]))
    print("In Feature set dimensionality: {:d} {:d}".format(sorted_train_data.shape[0], sorted_train_data.shape[1]))

    # find the top 10 percent of features by chi2 score, from ~1000 -> ~100
    # (percentile is passed by keyword: recent scikit-learn releases reject
    # it as a positional argument)
    fsp = SelectPercentile(chi2, percentile=10)
    X_new_10 = fsp.fit_transform(X, y)
    print("Final 10 Percent Dimensions: {:d} {:d}".format(X_new_10.shape[0], X_new_10.shape[1]))

    # Indices are relative to X, which starts at column 1 of the full table,
    # so shift by one to address columns of sorted_train_data.
    selected_names = fsp.get_support(indices=True) + 1

    data_trimmed = sorted_train_data.iloc[:, selected_names]
    data_fnames = pd.DataFrame(sorted_train_data['file_name'])
    data_reduced = data_fnames.join(data_trimmed)

    data_reduced.to_csv('data/' + final_file_name, index=False)
    print("Completed reduction in {:s}".format(final_file_name))
def test_boundary_case_ch2():
    """Boundary case: every chi2-based selector should keep only feature 0.

    The chi2 scores and p-values for the tiny data set are checked first;
    then each selector (FDR, k-best, percentile, FPR, FWE) is fitted in turn
    and must report the support mask [True, False].
    """
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Same order as the individual checks: fdr, kbest, percentile, fpr, fwe.
    selectors = [
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    ]
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))
def test_select_percentile_classif_sparse():
    """Percentile selection on a classification problem with sparse input.

    Checks that SelectPercentile and GenericUnivariateSelect agree on a CSR
    matrix, that the first five features are the ones selected, and that
    inverse_transform returns a sparse matrix of the original shape whose
    only non-zero entries sit in the selected columns.
    """
    percentile = 25
    n_features = 20

    dense_X, y = make_classification(
        n_samples=200,
        n_features=n_features,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(dense_X)

    percentile_filter = SelectPercentile(f_classif, percentile=percentile)
    reduced = percentile_filter.fit(X, y).transform(X)
    generic_filter = GenericUnivariateSelect(f_classif, mode="percentile", param=percentile)
    reduced_generic = generic_filter.fit(X, y).transform(X)
    assert_array_equal(reduced.toarray(), reduced_generic.toarray())

    support = percentile_filter.get_support()
    expected = np.concatenate([np.ones(5), np.zeros(n_features - 5)])
    assert_array_equal(support, expected)

    restored = percentile_filter.inverse_transform(reduced_generic)
    assert_true(sparse.issparse(restored))
    support_mask = safe_mask(restored, support)
    assert_equal(restored.shape, X.shape)
    assert_array_equal(restored[:, support_mask].toarray(), reduced.toarray())
    # Check other columns are empty
    assert_equal(restored.getnnz(), reduced.getnnz())
def preprocess(word_data, targets):
    """Vectorize texts, keep the top 5 percent of features, and split.

    word_data: texts handed to a sublinear tf-idf vectorizer.
    targets: labels used both for feature scoring and for the stratified
        80/20 train/test split.

    When the selector keeps no column at all, the full tf-idf matrix is used
    instead; otherwise the 30 best-scoring tokens are printed with their
    score and p-value.

    Returns (features_train, features_test, labels_train, labels_test).
    """
    print("\n### PREPROCESSING DATA ###")

    # vectorize
    print("-- Vectorization")
    tfidf = TfidfVectorizer(sublinear_tf=True)  # , stop_words='english'
    tfidf_matrix = tfidf.fit_transform(word_data)

    # feature selection
    print("-- Feature Selection")
    selector = SelectPercentile(percentile=5)
    data_selected = selector.fit_transform(tfidf_matrix, targets)

    if data_selected.shape[1] != 0:
        print("Top {} features were selected".format(data_selected.shape[1]))

        # print top features, best score first
        nr_features = 30
        best = selector.scores_.argsort()[::-1][:nr_features]
        tokens = np.asarray(tfidf.get_feature_names())[best]
        top_features = np.column_stack((tokens,
                                        selector.scores_[best],
                                        selector.pvalues_[best]))
        print("\nTop %i Features:" % nr_features)
        print(pd.DataFrame(top_features, columns=["token", "score", "p-val"]), "\n")
    else:
        # Nothing survived the selection: fall back to every feature.
        data_selected = tfidf_matrix

    split = train_test_split(data_selected, targets, test_size=0.2, stratify=targets)
    features_train, features_test, labels_train, labels_test = split

    return features_train, features_test, labels_train, labels_test
Example #13
0
def main():
    """Summarise per-column statistics, then select and export features.

    Reads ../data/train.csv (indexed by 'ID') and:
      * writes each column's variance and (absolute) correlation with TARGET
        to ../presentationDocs/corrs.csv, sorted by absolute correlation;
      * keeps the top 25 percent of features by f_classif score and writes
        them to ../data/main_data.csv, with TARGET in ../data/target.csv;
      * applies the same selection to ../data/test.csv and writes it to
        ../data/test_transform.csv.
    """
    main_data = pd.read_csv('../data/train.csv', index_col='ID')

    output = []
    for x in main_data.columns:
        # .loc replaces the removed .ix indexer; the correlation is computed
        # once and reused for both derived fields.
        column = main_data.loc[:, x]
        corr = round(column.corr(main_data.TARGET), 4)
        output.append({
            'variable': x,
            'variance': column.var(),
            'corr_w_target': corr,
            'abs_corr': abs(corr),
        })

    # print csv for later in the presentation docs
    variable_selector = pd.DataFrame(output)
    variable_selector = variable_selector.set_index('variable')
    variable_selector = variable_selector.drop('TARGET')
    variable_selector.sort_values('abs_corr', ascending=False).to_csv('../presentationDocs/corrs.csv')

    selector = SelectPercentile(f_classif, percentile=25)
    subset = pd.DataFrame(selector.fit_transform(main_data.drop('TARGET', axis=1), main_data['TARGET']))

    subset.to_csv('../data/main_data.csv', index=False)
    # to_csv takes `columns`; the old `cols` spelling is not accepted.
    main_data[['TARGET']].to_csv('../data/target.csv', columns=['TARGET'], index=False)

    # print transformed test data to csv
    test_data = pd.read_csv('../data/test.csv', index_col='ID')
    test_data = pd.DataFrame(selector.transform(test_data), index=test_data.index)
    test_data.to_csv('../data/test_transform.csv', index=True, index_label='ID')
def test_select_percentile_regression():
    """
    Percentile selection on a simple regression problem.

    Checks that SelectPercentile keeps the best-scoring features, agrees
    with GenericUnivariateSelect in 'percentile' mode, selects the first
    five of twenty features, and that inverse_transform zero-fills the
    dropped columns while respecting the input dtype.
    """
    percentile = 25
    n_features = 20

    X, y = make_regression(n_samples=200, n_features=n_features,
                           n_informative=5, shuffle=False, random_state=0)

    percentile_filter = SelectPercentile(f_regression, percentile=percentile)
    reduced = percentile_filter.fit(X, y).transform(X)
    assert_best_scores_kept(percentile_filter)

    generic_filter = GenericUnivariateSelect(
        f_regression, mode='percentile', param=percentile)
    reduced_generic = generic_filter.fit(X, y).transform(X)
    assert_array_equal(reduced, reduced_generic)

    support = percentile_filter.get_support()
    expected = np.concatenate([np.ones(5), np.zeros(n_features - 5)])
    assert_array_equal(support, expected)

    # Zero out the unselected columns to build the expected inverse image.
    zero_filled = X.copy()
    zero_filled[:, np.logical_not(support)] = 0
    assert_array_equal(zero_filled, percentile_filter.inverse_transform(reduced))
    # Check inverse_transform respects dtype
    assert_array_equal(zero_filled.astype(bool),
                       percentile_filter.inverse_transform(reduced.astype(bool)))
def preprocess(article_file, lable_file, k):
    """Load pickled articles and labels, vectorize them and select features.

    article_file: path of a pickle holding the article texts.
    lable_file: path of a pickle holding one label per article.
    k: percentile of features to keep (scored with f_classif).

    Returns (features_train_transformed, lables, vectorizer, selector, le,
    features): the dense selected tf-idf matrix, the integer-encoded labels,
    the fitted TfidfVectorizer, the fitted SelectPercentile, the fitted
    LabelEncoder and the raw articles as a numpy array.
    """
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    # Pickles are binary data, so open with 'rb'; `with` closes the handles,
    # which the original pickle.load(open(...)) never did.
    with open(article_file, 'rb') as handle:
        features = pickle.load(handle)
    features = np.array(features)

    # transform non-numerical labels (as long as they are hashable and
    # comparable) to numerical labels
    with open(lable_file, 'rb') as handle:
        lables = pickle.load(handle)
    le = preprocessing.LabelEncoder()
    le.fit(lables)
    lables = le.transform(lables)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features)

    # selector : SelectPercentile
    selector = SelectPercentile(f_classif, percentile=k)
    selector.fit(features_train_transformed, lables)

    features_train_transformed = selector.transform(features_train_transformed).toarray()

    return features_train_transformed, lables, vectorizer, selector, le, features
Example #16
0
def main():
    """Command-line entry point: percentile feature selection on text files.

    Options (all required):
        -x/--scaledfeaturelist  file with the feature values (np.loadtxt)
        -y/--targetdata         file with the target data, read as strings
        -z/--fetpercentile      integer percentile of features to keep

    The selected feature matrix is written to 'featurelist'.  When
    'variancefeatures.txt' exists, the names of the selected features are
    written to 'featuresToTestPrograms' (one per line) and
    'variancefeatures.txt' is then deleted.
    """
    parser = argparse.ArgumentParser(description='Feature Selection')
    required = parser.add_argument_group('required options')

    required.add_argument('-x', '--scaledfeaturelist', required=True, help='File containing feature values')
    required.add_argument('-y', '--targetdata', required=True, help='File containiing target data')
    required.add_argument('-z', '--fetpercentile', required=True, type=int, help='Percentile to select highest scoring percentage of features')

    args = parser.parse_args()

    X = np.loadtxt(args.scaledfeaturelist)
    Y = np.genfromtxt(args.targetdata, dtype='str')

    sel = SelectPercentile(f_classif, percentile=args.fetpercentile)
    result = sel.fit_transform(X, Y)

    # selecting features for test programs
    if os.path.isfile('variancefeatures.txt'):
        varianceFeature = np.genfromtxt("variancefeatures.txt", dtype='str')
        featureFromSelectPercentile = sel.get_support(indices=True)
        with open("featuresToTestPrograms", "w") as featureFile:
            for i in featureFromSelectPercentile:
                featureFile.write(varianceFeature[i])
                featureFile.write("\n")
        # remove the variancefeatures file as we don't need it anymore.
        # The original line was the shell command `rm ...`, which is not
        # valid Python; the removal is done only when the file exists.
        os.remove('variancefeatures.txt')

    np.savetxt('featurelist', result, fmt='%.2f', delimiter='\t')
Example #17
0
 def univariant_feature_selection(self,method, X, y,percentile):
     """Fit a percentile selector and report which features it keeps.

     method: scoring function handed to SelectPercentile.
     X, y: data the selector is fitted on.
     percentile: percentage of features to keep.

     Prints the number of selected features followed by the name of each
     selected column, taken from ``self.X_train.columns``.
     NOTE(review): names come from self.X_train, not from X -- presumably
     both have the same columns; confirm against callers.

     Returns the boolean support mask of the fitted selector.
     """
     fitted = SelectPercentile(method, percentile=percentile).fit(X, y)
     # Compute the mask once instead of on every loop iteration.
     support = fitted.get_support()
     print("The number of feature in ", method, " is: ", (support.sum()) )
     for i, column in enumerate(self.X_train.columns):
         if support[i]:
             print(column)
     return support
Example #18
0
def build_linear_model(X, y, analyzerType):
    """Vectorize X, keep the top 15 percent chi2 features, fit a linear SVC.

    X: raw inputs handed to the vectorizer built by vectorizer(analyzerType).
    y: targets used for feature scoring and for fitting the classifier.

    Returns (fitted_classifier, fitted_vectorizer, fitted_selector).
    """
    text_vectorizer = vectorizer(analyzerType)
    chi2_selector = SelectPercentile(score_func=chi2, percentile=15)
    classifier = SVC(C=12.0, kernel='linear')

    vectorized = text_vectorizer.fit_transform(X)
    reduced = chi2_selector.fit_transform(vectorized, y)
    fitted_classifier = classifier.fit(reduced, y)
    return (fitted_classifier, text_vectorizer, chi2_selector)
Example #19
0
def test(X, y):
    """Print the zero-based indices of the features kept by an F-test filter.

    Fits SelectPercentile with f_regression scoring on X and y, keeping the
    top 20 percent of the features, and prints the selected column indices
    as a list.
    """
    ###############################################################################
    # Univariate feature selection with F-test for feature scoring
    # We keep the 20% most significant features
    selector = SelectPercentile(f_regression, percentile=20)
    selector.fit(X, y)
    # print(...) with one argument works on Python 2 and 3; int() keeps the
    # output as plain integers whatever the numpy scalar repr is.
    print([int(zero_based_index)
           for zero_based_index in selector.get_support(indices=True)])
Example #20
0
def train_type_model():
    """Train the answer-type scorer on n-gram features and pickle it.

    For every evaluated query of the training data set, one example is
    created per notable type seen among the answer entities (label 1) or
    only among the candidates' predicted entities (label 0).  The raw
    examples are dumped to "type_model_data.pickle"; the vectorizer
    (restricted to the top 5 percent chi2 features) and the fitted
    SGDClassifier are dumped to "type-model.pickle".
    """
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)

    def entity_ids(names):
        # Flatten the entity ids found for each name, in order.
        return [mid for name in names
                for mid in KBEntity.get_entityid_by_name(name, keep_most_triples=True)]

    def notable_types(entity_mids):
        # Notable type of each entity, in order (may contain empty values).
        return [KBEntity.get_notable_type(entity_mid) for entity_mid in entity_mids]

    features = []
    labels = []
    for dataset in datasets:
        for query in get_evaluated_queries(dataset, True, parameters):
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            # Types of the correct answers, with empty values dropped.
            correct_notable_types = set(
                filter(None, notable_types(entity_ids(query.target_result))))

            # Types of everything the candidates predicted.
            other_notable_types = set()
            for candidate in query.eval_candidates:
                candidate_types = notable_types(entity_ids(candidate.prediction))
                other_notable_types.update(set(candidate_types))
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)

            for notable_type in correct_notable_types.union(incorrect_notable_types):
                labels.append(1 if notable_type in correct_notable_types else 0)
                features.append(
                    feature_extractor.extract_ngram_features(n_grams, [notable_type, ], "type"))

    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    labels = LabelEncoder().fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)

    # Keep the top 5 percent of features and shrink the vectorizer to match.
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)

    type_scorer = SGDClassifier(loss='log', class_weight='auto',
                                n_iter=1000,
                                alpha=1.0,
                                random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
def main(path,filename):
    """Report how selected features and importances spread over the batches.

    Loads the class labels and a fixed list of feature batches through
    load_batch, then prints:
      * the number of attributes per batch;
      * how many (and what fraction) of each batch's features are kept by a
        chi2 SelectPercentile at the 20th percentile;
      * the summed and per-feature average ExtraTreesClassifier feature
        importance of each batch.
    """
    batchs = ['patronesCirculaesByN_2_5','patronesCirculaesByN_2_9','patronesCirculaesByN_3_9','patronesCirculaesByN_5_9','patronesCirculaesByN_3_5','patronesCirculaesByN_6_12','patronesCirculaesByN_8_12','patronesCirculaesByN_10_12']
    percentil = 20
    X = []
    y = []
    lens = []
    load_batch(y, path, 'clases', filename)
    y = [j for i in y for j in i]
    # lens[k] is the row width after loading batch k, i.e. a cumulative
    # feature count -- assumes load_batch widens the rows of X in place;
    # TODO confirm against load_batch.
    for batch in batchs:
        load_batch(X, path, batch, filename)
        lens.append(len(X[0]))

    # Width of each individual batch, recovered from the cumulative counts.
    total = [lens[0]] + [lens[i] - lens[i - 1] for i in range(1, len(lens))]
    # print(...) with one argument prints the same on Python 2 and 3.
    print('Cantidad de atributos por barch')
    print(total)

    def batch_of(feature_index):
        # Batch k owns the zero-based indices below lens[k] that no earlier
        # batch owns.  The comparison is strict: index == lens[k] is the
        # FIRST feature of batch k + 1 (the original `<=` was off by one).
        for k, upper in enumerate(lens):
            if feature_index < upper:
                return k
        return None

    sp = SelectPercentile(chi2, percentile=percentil)
    sp.fit(X, y)
    sup = sp.get_support(True)
    res = [0] * len(batchs)
    for i in sup:
        k = batch_of(i)
        if k is not None:
            res[k] += 1
    porcentajes = [(1.0 * res[i]) / total[i] for i in range(len(lens))]
    print('Cantidad de variables seleccionas en el'+str(percentil)+'percentil univariado')
    print(res)

    print('Porcentaje de variables seleccionas en el'+str(percentil)+'percentil univariado')
    print(porcentajes)

    clf = ExtraTreesClassifier()
    clf = clf.fit(X, y)
    fi = clf.feature_importances_

    res2 = [0] * len(batchs)
    for i, importance in enumerate(fi):
        k = batch_of(i)
        if k is not None:
            res2[k] += importance
    print('Importancia porcentual acumulada de la seleccion multivariada')
    print(res2)
    porcentajes2 = [(1.0 * res2[i]) / total[i] for i in range(len(lens))]

    print('Importancia porcentual promedio por variable de la seleccion multivariada')
    print(porcentajes2)
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features (top 1 percent by f_classif)

        after this, the features and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    # Pickles are binary data, so open with "rb", and use `with` so the
    # handles are closed even when loading fails.
    with open(authors_file, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "rb") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result

    # selector = SelectPercentile(f_classif, percentile=10)

    ## <Temporary hack for Lesson 3>
    selector = SelectPercentile(f_classif, percentile=1)

    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data (single pre-formatted strings print identically on
    ### Python 2 and Python 3)
    print("no. of Chris training emails: %s" % (sum(labels_train),))
    print("no. of Sara training emails: %s" % (len(labels_train) - sum(labels_train),))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #23
0
 def feature_selection(self,mode='F'):
     """Keep the top 80 percent of the training features and update self.

     mode (case-insensitive) picks the scoring function:
         'M' -- mutual_info_classif
         'F' -- f_classif (default)
         'C' -- chi2

     Side effects: ``self.train`` is replaced by a DataFrame of the selected
     columns, ``self.test`` is restricted to the same columns, and
     ``self.fs_features`` holds the selected column names as a numpy array.

     Returns (X_new, feas): the selected training matrix and a DataFrame
     with a single 'feature' column listing the selected names.

     Raises ValueError for an unknown mode (the original code failed later
     with an UnboundLocalError).
     """
     score_funcs = {'M': mutual_info_classif, 'F': f_classif, 'C': chi2}
     key = mode.upper()
     if key not in score_funcs:
         raise ValueError("mode must be one of 'M', 'F' or 'C', got %r" % (mode,))

     # print(...) with one string emits the same text on Python 2 and 3.
     print('Feature Selection...')
     print('Start:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

     # Work on copies of the instance data.  The original also ran the
     # scoring function on undefined `train` / `train_label` names and never
     # used the result; the selector computes the scores itself.
     X = self.train.copy()
     y = self.train_label['label'].values.copy()
     test = self.test.copy()

     features = self.train.columns.copy()

     selector = SelectPercentile(score_funcs[key], percentile=80)
     X_new = selector.fit_transform(X, y)
     selected = selector.get_support()

     # Names of the columns the selector kept, in their original order.
     fs_features = np.array(
         [name for name, keep in zip(features.tolist(), selected) if keep])

     self.train = pd.DataFrame(X_new, columns=fs_features.tolist())
     self.test = test[fs_features]

     self.fs_features = fs_features

     feas = pd.DataFrame()
     feas['feature'] = fs_features

     print('End:' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

     return X_new, feas
Example #24
0
def selectFeatures(Model, X, y):
    """Print the top-5-percent feature indices and plot two histograms.

    Model: callable returning an object with a ``predict`` method.
    X, y: data used to fit the f_classif SelectPercentile filter.

    Prints the indices of the selected features, then shows a histogram of
    ``Model().predict(X)`` together with a histogram of ``y``.
    """
    # NOTE(review): the model is never fitted in this function -- confirm
    # that Model() returns something usable for predict().
    model = Model()
    fsel = SelectPercentile(score_func=f_classif, percentile=5)
    fsel.fit(X, y)
    arr = fsel.get_support()
    # One pre-formatted string prints the same text on Python 2 and 3 (the
    # double space reproduces the original "features: " plus separator).
    print("features:  %s" % (np.where(arr),))
    plt.hist(model.predict(X))
    plt.hist(y)
    plt.show()
Example #25
0
    def getWeights(self):
        """Return one normalised F-test weight per feature.

        A SelectPercentile(f_classif, percentile=10) filter is fitted on
        ``self.X`` / ``self.y``; each weight is -log10 of the feature's
        p-value divided by the largest such value.
        """
        f_test_filter = SelectPercentile(f_classif, percentile=10)
        f_test_filter.fit(self.X, self.y)

        # Smaller p-values map to larger weights.
        neg_log_p = -np.log10(f_test_filter.pvalues_)
        return neg_log_p / float(neg_log_p.max())
def selectFeatures(X, y):
    """Fit a top-10-percent F-test filter and score every feature.

    Returns (selector, scores): the fitted SelectPercentile instance and an
    array holding -log10(p-value) of each feature divided by the maximum.
    """
    f_test_filter = SelectPercentile(f_classif, percentile=10)
    f_test_filter.fit(X, y)

    # Smaller p-values map to larger scores; the best feature scores 1.
    neg_log_p = -np.log10(f_test_filter.pvalues_)
    normalised = neg_log_p / neg_log_p.max()

    return f_test_filter, normalised
Example #27
0
def predict(classifier_type="tree",selection="Univariate", f="1"):
    """Train a classifier on selected features and predict the test data.

    classifier_type: "SVM" (linear SVC with probability estimates) or "NB"
        (Gaussian naive Bayes).  The default "tree" is not implemented and
        raises ValueError.
    selection: "Univariate" (SelectPercentile with f_classif) or
        "kclusters" (column indices loaded from a pickled feature list).
    f: "1", "2" or "3"; picks the percentile (1, 5 or 25) and the k-means
        feature-list pickle used by the chosen selection.

    Returns the predictions for the imputed test data.

    Raises ValueError for an unsupported classifier_type, selection or f
    (the original code failed with NameError/AttributeError instead).
    """
    # (k-means feature-list pickle, percentile) per feature-set id.  The
    # backslashes are escaped so the paths hold the same characters as
    # before without relying on invalid escape sequences.
    feature_sets = {
        "1": ("GS_pickles\\kmeans_Genes_87_1x_v3.pkl", 1),
        "2": ("GS_pickles\\kmeans_Genes_433_50x_v2.pkl", 5),
        "3": ("GS_pickles\\kmeans_Genes_2163_20x_v1.pkl", 25),
    }
    if f not in feature_sets:
        raise ValueError("f must be one of '1', '2' or '3', got %r" % (f,))
    if selection not in ("Univariate", "kclusters"):
        raise ValueError("unsupported selection: %r" % (selection,))
    if classifier_type not in ("SVM", "NB"):
        raise ValueError("unsupported classifier_type: %r" % (classifier_type,))
    kc_fn, p = feature_sets[f]

    kernel_type = "linear"
    (data_matrix, features, samples) = readData()
    x = data_matrix.data
    y = data_matrix.target
    (m, n) = x.shape

    test = joblib.load("GS_pickles\\imputed_test_data.pkl")
    test_x = np.array(test)
    (i, j) = test_x.shape
    # print(...) with one pre-formatted string works on Python 2 and 3.
    print("Training matrix shape: %s,%s" % (m, n))
    print("Test matrix shape: %s,%s" % (i, j))

    if selection == "Univariate":
        selector = SelectPercentile(f_classif, percentile=p)
        selector.fit(x, y)
        # Trimming the matrices down to p percent of the features
        trimmed_x = selector.transform(x)
        trimmed_test_x = selector.transform(test_x)
    else:  # "kclusters"
        kcluster_flist = joblib.load(kc_fn)
        trimmed_x = np.take(x, kcluster_flist, axis=1)
        trimmed_test_x = np.take(test_x, kcluster_flist, axis=1)

    if classifier_type == "SVM":
        # Linear SVM classifier
        clf = svm.SVC(kernel=kernel_type, degree=3, probability=True)
    else:
        # Gaussian Naive Bayes classifier
        clf = GaussianNB()
    clf.fit(trimmed_x, y)

    result = clf.predict(trimmed_test_x)
    return result
def univariate_feature_selection(dataset, features):
    """Plot univariate F-test scores next to linear-SVM weights per feature.

    Three bar series are drawn on figure 1: the normalised ``-log10(p)``
    score of every feature, the normalised squared weights of a linear SVM
    fitted on all features, and the same weights for an SVM fitted only on
    the 10% of features kept by the selector.

    Parameters:
        dataset: feature matrix; its last axis indexes the features.
        features: feature names used as x-axis tick labels.
    """
    def normalised_sq_weights(model):
        # Sum the squared coefficients over the first axis, then scale so
        # the largest weight is 1.
        weights = (model.coef_ ** 2).sum(axis=0)
        return weights / weights.max()

    # Targets come from the project spreadsheet, not from the arguments.
    data = Data(Spreadsheet('../../Downloads/ip/project data.xlsx'))
    targets = data.targets
    X, y = dataset, data.targets

    plt.figure(1)
    plt.clf()

    X_indices = np.arange(X.shape[-1])

    # Univariate feature selection with the F-test: keep the 10% most
    # significant features.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X, y)
    neg_log_p = -np.log10(selector.pvalues_)
    scores = neg_log_p / neg_log_p.max()
    plt.bar(X_indices - .45, scores, width=.2,
            label=r'Univariate score ($-Log(p_{value})$)', color='g')

    # Reference: weights of an SVM trained on every feature.
    clf = svm.SVC(kernel='linear')
    clf.fit(X, y)
    svm_weights = normalised_sq_weights(clf)
    plt.bar(X_indices - .25, svm_weights, width=.2, label='SVM weight', color='r')

    # Weights of an SVM trained on the selected features only; the bars are
    # placed at the positions of the features the selector kept.
    clf_selected = svm.SVC(kernel='linear')
    clf_selected.fit(selector.transform(X), y)
    svm_weights_selected = normalised_sq_weights(clf_selected)
    plt.bar(X_indices[selector.get_support()] - .05, svm_weights_selected,
            width=.2, label='SVM weights after selection', color='b')

    tick_positions = np.arange(len(features))
    plt.title("Comparing feature selection")
    plt.xlabel('Feature number')
    plt.xticks(tick_positions, features, rotation=45)
    plt.yticks(())
    plt.legend(loc='upper right')
    plt.show()
def featureSelection(reduced_features,labels,clnd_features,percentile,n_components,results=False):
    """
        Select features with an F-test percentile filter and fit a PCA.

        Parameters:
            reduced_features = Unique feature names in a python list after
            dropping non-numeric features. The first entry is skipped when
            pairing names with columns (presumably it is the 'poi' label
            name, which is re-inserted at the front of the output - verify
            against the caller).
            labels = ground truth labels for the data points.
            clnd_features = data point features in numpy array format
            corresponding to the labels.
            percentile = percent of features to keep, forwarded unchanged to
            SelectPercentile; it is a percentage in the range 0-100, not a
            fraction.
            n_components = the n_components for the pca.
            results = if False (default) the list of selected feature names
            is returned; if True the selector statistics and the pca
            explained variance ratios are returned instead.

        Output:
            results False: python list starting with 'poi' followed by the
            names of the features retained by SelectPercentile.
            results True: tuple (f_stat, p_vals, expl_var) where f_stat is a
            list of (name, F-statistic) pairs sorted by decreasing F,
            p_vals is a list of (name, p-value) pairs sorted by increasing
            p-value, and expl_var is pca.explained_variance_ratio_.
    """
    from sklearn.feature_selection import SelectPercentile, f_classif
    from sklearn.decomposition import PCA
    from itertools import compress
    from operator import itemgetter

    # Only the fitted state is needed, so fit() replaces the discarded
    # fit_transform() results.
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(clnd_features, labels)

    pca = PCA(n_components = n_components)
    pca.fit(clnd_features)

    feature_names = reduced_features[1:]

    if results:
        # The fitted selector already holds the f_classif output for this
        # data, so the statistics are read from it instead of recomputed.
        f_stat = sorted(zip(feature_names, selector.scores_),
                        key=itemgetter(1), reverse=True)
        p_vals = sorted(zip(feature_names, selector.pvalues_),
                        key=itemgetter(1))
        expl_var = pca.explained_variance_ratio_

        return f_stat,p_vals,expl_var

    ## boolean mask of the retained features, used to pick the matching names
    retained_features = selector.get_support()
    features_list = list(compress(feature_names, retained_features))

    ## add back in the 'poi' to the first position in the final features list
    features_list.insert(0,'poi')

    return features_list
def eval(ds, testNum, p, splitProportion=0.2):
    """Score AdaBoost with and without percentile feature selection.

    ``ds`` is split ``testNum`` times with ``splitProportion``.  For every
    (train, test) target pair produced by ``labanUtil.fromDStoXY`` whose
    training targets are not all zero, an ``AdaBoostClassifier`` is trained
    once on the features kept by ``SelectPercentile(chooser, percentile=p)``
    and once on all features; F1, recall and precision are recorded for both.

    Returns:
        Tuple of means in this order: F1 (all features), F1 (selected),
        recall (all), recall (selected), precision (all), precision
        (selected), followed by the classifier's class name.
    """
    # Each bucket holds three lists: f1, recall, precision.
    with_all_features = ([], [], [])
    with_selection = ([], [], [])

    def record(bucket, truth, guess):
        bucket[0].append(metrics.f1_score(truth, guess))
        bucket[1].append(metrics.recall_score(truth, guess))
        bucket[2].append(metrics.precision_score(truth, guess))

    for _ in range(testNum):
        tstdata, trndata = ds.splitWithProportion(splitProportion)
        X, Y = labanUtil.fromDStoXY(trndata)
        X_test, Y_test = labanUtil.fromDStoXY(tstdata)

        for y, y_test in zip(Y, Y_test):
            # Nothing to learn from a target that is zero everywhere.
            if not any(v != 0 for v in y):
                continue

            clf = AdaBoostClassifier()
            selector = SelectPercentile(chooser, percentile=p)
            selector.fit(X, y)
            # Class name taken from the estimator's printed representation.
            name = str(clf).split()[0].split('(')[0]

            # Pass 1: train and predict on the selected features only.
            clf.fit(selector.transform(X), y)
            record(with_selection, y_test, clf.predict(selector.transform(X_test)))

            # Pass 2: the same classifier object refitted on every feature.
            clf.fit(X, y)
            record(with_all_features, y_test, clf.predict(X_test))

    all_f1, all_recall, all_precision = [np.mean(s) for s in with_all_features]
    sel_f1, sel_recall, sel_precision = [np.mean(s) for s in with_selection]

    return (all_f1, sel_f1,
            all_recall, sel_recall,
            all_precision, sel_precision,
            name)
df_test_X = pd.read_csv('final_X_test.txt')
df_test_y = pd.read_csv('final_y_test.txt')

# Drop every row that contains a missing value.  ``axis`` is passed by
# keyword because recent pandas releases no longer accept it positionally.
# NOTE(review): the label frames are not filtered here, so any row actually
# dropped leaves features and labels misaligned - confirm the inputs are
# NaN-free or drop the matching label rows as well.
df_train_X.dropna(axis=0, inplace=True)
df_test_X.dropna(axis=0, inplace=True)

# Disabled alternative: PCA dimensionality reduction.  If re-enabled, the
# test set must use pca.transform so it is projected with the training fit.
# pca = PCA(n_components=100)
# df_train_X = pca.fit_transform(df_train_X)
# df_test_X = pca.transform(df_test_X)

# Feature selection: keep the top 90% of features ranked by
# SelectPercentile's default score function.  The selector is fitted on the
# training data only.
select = SelectPercentile(percentile=90)
df_train_X = select.fit_transform(df_train_X, df_train_y)
df_test_X = select.transform(df_test_X)

# Feature scaling to [0, 1].  The scaler learns min/max from the training
# data and the test data is transformed with those same parameters;
# re-fitting on the test set would put the two sets on different scales.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
df_train = min_max_scaler.fit_transform(df_train_X)
df_test = min_max_scaler.transform(df_test_X)

# Training
svm = SVC(gamma=0.1, kernel='rbf', C=3)
svm.fit(df_train, df_train_y)

# Prediction on the training set
y_train_predicted = svm.predict(df_train)
Example #32
0
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
    """
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features (top 10% by f_classif)

        after this, the features are put into numpy arrays, which play nice
        with sklearn functions

        4 objects are returned, in this order:
            -- training features, testing features
            -- training labels, testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    ### NOTE: pickle.load can execute code embedded in the file - only use
    ### these loaders on trusted, locally produced pickles.
    ### the with-blocks close the files even if unpickling fails
    with open(authors_file, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "rb") as words_file_handler:
        word_data = pickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set
    ### (remainder go into training)
    features_train, features_test, labels_train, labels_test = sklearn.model_selection.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    ### (vocabulary and idf weights are learned from the training texts only)
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data: the first count is the sum of the training labels,
    ### the second is every remaining training email
    chris_count = sum(labels_train)
    print("\nno. of Chris training emails: - sum(labels_train) - {}".format(chris_count))
    print("no. of Sara training emails: - sum(labels_train) - {}\n".format(len(labels_train) - chris_count))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #33
0
def predict(self, X):
    """Run ``X`` through every transformer step, then predict with the last.

    ``self.steps`` is a sequence of entries whose second element is the
    step object.  All entries except the last must provide ``transform``;
    the last must provide ``predict``.

    Returns:
        Whatever the final step's ``predict`` returns for the transformed data.
    """
    data = X
    # Feed the output of each transformer into the next one.
    for transformer in (entry[1] for entry in self.steps[:-1]):
        data = transformer.transform(data)
    final_estimator = self.steps[-1][1]
    return final_estimator.predict(data)


##### Information leakage
# Generate random data: X and y are drawn independently of each other.
import numpy as np
rnd = np.random.RandomState(seed=0)
X = rnd.normal(size=(100, 10000))
y = rnd.normal(size=(100,))

from sklearn.feature_selection import SelectPercentile, f_regression
# Leaky variant: the selector is fitted on the whole data set, before any
# cross-validation split is made.
select = SelectPercentile(score_func=f_regression, percentile=5)
select.fit(X, y)
X_selected = select.transform(X)
print("X_selected shape : {}".format(X_selected.shape))

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
leaky_scores = cross_val_score(Ridge(), X_selected, y, cv=5)
print("cross val score(릿지) : {:.3f}".format(np.mean(leaky_scores)))
# The data is random, so there should be no relationship, yet R^2 comes out
# as good as 0.91 - because the selection step saw the entire data set.
pipe = Pipeline(steps=[
    ("select", SelectPercentile(score_func=f_regression, percentile=5)),
    ("ridge", Ridge()),
])
pipeline_scores = cross_val_score(pipe, X, y, cv=5)
print("cross val score : {:.3f}".format(np.mean(pipeline_scores)))
# With the pipeline R^2 is negative: selection now happens inside each fold,
# which prevents the information leak.

## make_pipeline: generates the step names automatically
from sklearn.pipeline import make_pipeline
Example #34
0
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBRegressor
import pickle  #add pickle
from sklearn.metrics import r2_score

# Load the (X, y) pair; the with-block closes the file even if unpickling
# raises.  NOTE: pickle.load can execute code embedded in the file, so only
# load pickles from a trusted source.
with open("data_BA.pkl", "rb") as _data:
    X, y = pickle.load(_data)

# Average CV score on the training set was: -3.2532849505281343
# Pipeline exported by TPOT.  The call is cut off in this excerpt: the
# SGDRegressor arguments and the closing parentheses are not shown.
exported_pipeline = make_pipeline(
    # Step 1: keep the 89% of features with the best f_regression score.
    SelectPercentile(score_func=f_regression, percentile=89),
    # StackingEstimator is imported from tpot.builtins; presumably it adds the
    # wrapped estimator's predictions as extra features - TODO confirm.
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=50,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    # MinMaxScaler (previous line) rescales the features before this stage.
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
Example #35
0
class multiple_classifiers_with_pruned_tree(abstract_classifier):
    """Weighted vote of a 1-NN, an AdaBoost and a pruned decision tree.

    Each model is trained on its own ``SelectPercentile`` feature subset.
    The decision tree is fitted on the first 100 samples and then pruned
    bottom-up against the remaining samples.
    """

    def __init__(self, data, labels, **kwargs):
        """Fit the feature selectors and the three classifiers.

        Parameters:
            data: training feature matrix.
            labels: training labels.  Pruning XORs them with the tree's
                predictions, so they must support ``^`` element-wise.
            **kwargs: ``tree_per`` (required) is the percentile of features
                given to the tree; optional ``w1``, ``w2``, ``w3`` are the
                vote weights of kNN, AdaBoost and the tree (default 1 each).

        Raises:
            KeyError: if ``tree_per`` is not supplied.
        """
        self.args = kwargs
        self.ada = AdaBoostClassifier(n_estimators=50)
        self.tree = tree.DecisionTreeClassifier(max_depth=8)
        self.knn = KNeighborsClassifier(n_neighbors=1)

        # One selector per model so each keeps its own share of features.
        self.sp_knn = SelectPercentile(percentile=24)
        self.sp_tree = SelectPercentile(percentile=kwargs['tree_per'])
        self.sp_ada = SelectPercentile(percentile=85)

        data_knn = self.sp_knn.fit_transform(data, labels)
        data_tree = self.sp_tree.fit_transform(data, labels)
        data_ada = self.sp_ada.fit_transform(data, labels)

        self.knn.fit(data_knn, labels)
        self.ada.fit(data_ada, labels)

        # Fit pruned tree: the first 100 samples grow the tree, everything
        # after them is held out to decide which subtrees to collapse.
        n_tree_train = 100
        train_data = data_tree[:n_tree_train]
        train_labels = labels[:n_tree_train]
        validation_data = data_tree[n_tree_train:]
        validation_labels = labels[n_tree_train:]
        self.tree.fit(train_data, train_labels)
        self.prune(self.tree, 0, validation_data, validation_labels)

    def prune(self, tree_obj, index, validation_data, validation_labels):
        """Prune the subtree rooted at node ``index`` of ``tree_obj`` in place.

        Children are processed first.  The node is then turned into a leaf
        and stays one unless that raises the number of validation errors,
        in which case its children are restored.

        Parameters:
            tree_obj: fitted decision tree estimator exposing ``tree_``.
            index: node id inside ``tree_obj.tree_``.
            validation_data: held-out features used to count errors.
            validation_labels: held-out labels matching ``validation_data``.
        """
        # based on https://stackoverflow.com/a/49496027
        inner_tree = tree_obj.tree_
        left_child = inner_tree.children_left[index]
        right_child = inner_tree.children_right[index]

        # -1 marks "no child"; a leaf has nothing to collapse.
        if left_child == -1 and right_child == -1:
            return

        # Recurse on the estimator that was passed in.  Passing the ``tree``
        # module here would fail, because the module has no ``tree_``.
        if left_child != -1:
            self.prune(tree_obj, left_child, validation_data, validation_labels)
        if right_child != -1:
            self.prune(tree_obj, right_child, validation_data, validation_labels)

        predictions_no_prune = tree_obj.predict(validation_data)
        errors_no_prune = (predictions_no_prune ^ validation_labels).sum()

        # Tentatively collapse this node into a leaf and re-measure.
        inner_tree.children_left[index] = -1
        inner_tree.children_right[index] = -1
        predictions_prune = tree_obj.predict(validation_data)
        errors_prune = (predictions_prune ^ validation_labels).sum()

        # Undo the collapse only when it made the validation error worse.
        if errors_prune > errors_no_prune:
            inner_tree.children_left[index] = left_child
            inner_tree.children_right[index] = right_child

    def classify(self, features):
        """Classify one sample by a weighted vote of the three models.

        Parameters:
            features: 1-D feature array for a single sample.

        Returns:
            ``bool`` of the rounded weighted average of the three integer
            predictions.
        """
        features_mat = features.reshape((1, -1))
        features_knn = self.sp_knn.transform(features_mat)
        features_tree = self.sp_tree.transform(features_mat)
        features_ada = self.sp_ada.transform(features_mat)

        w1 = self.args.get('w1', 1)
        w2 = self.args.get('w2', 1)
        w3 = self.args.get('w3', 1)

        p1 = int(self.knn.predict(features_knn)[0])
        p2 = int(self.ada.predict(features_ada)[0])
        p3 = int(self.tree.predict(features_tree)[0])

        avg = (w1*p1 + w2*p2 + w3*p3)/(w1 + w2 + w3)
        return bool(np.round(avg))
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_regression
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# Everything except the 'target' column is a feature.
features = tpot_data.drop('target', axis=1)
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: -0.9547903226888407
# Pipeline: keep the best 12% of features by f_regression, scale each
# feature by its maximum absolute value, then fit LassoLarsCV.
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=12),
    MaxAbsScaler(),
    LassoLarsCV(normalize=False),
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

results = exported_pipeline.fit(
    training_features, training_target).predict(testing_features)
# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
# Everything except the 'target' column is a feature.
features = tpot_data.drop('target', axis=1)
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.8326392221287445
# Nested feature unions (copies of the input plus a stacked random forest),
# filtered to the best 58% by f_classif, joined with one more copy of the
# input and classified with multinomial naive Bayes.
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            make_union(
                make_union(
                    FunctionTransformer(copy),
                    StackingEstimator(
                        estimator=RandomForestClassifier(
                            bootstrap=True,
                            criterion="entropy",
                            max_features=0.35000000000000003,
                            min_samples_leaf=1,
                            min_samples_split=7,
                            n_estimators=100,
                        )
                    ),
                ),
                make_union(
                    FunctionTransformer(copy),
                    FunctionTransformer(copy),
                ),
            ),
            SelectPercentile(score_func=f_classif, percentile=58),
        ),
        FunctionTransformer(copy),
    ),
    MultinomialNB(alpha=0.1, fit_prior=True),
)

results = exported_pipeline.fit(
    training_features, training_target).predict(testing_features)
Example #38
0
# clf = tree.DecisionTreeClassifier(criterion='entropy')
clf = tree.DecisionTreeClassifier(criterion='gini')
print(clf)
clf.fit(x_train, y_train)
# Report every feature whose importance exceeds 0.005.  The single-argument
# print(...) form produces the same line on Python 2 and Python 3.
for m, importance in enumerate(clf.feature_importances_):
    if importance > 0.005:
        print("feature_importance %s %s" % (m, importance))
# np.savetxt("feat_importance.txt",clf.feature_importances_)

# Write the tree to a Graphviz file.  The handle keeps its name so the
# with-block closes the object it actually opened.
with open("tree.dot", 'w') as f:
    tree.export_graphviz(clf, out_file=f)

# Univariate score per feature: bigger means a stronger feature.
from sklearn.feature_selection import SelectPercentile,f_classif
selector = SelectPercentile(f_classif,percentile=1)
selector.fit(x_train, y_train)
# Left over from a disabled per-line p-value report; kept in case later
# code still refers to these names.
result_dic={}
num=0
# -log10(p-value), scaled so the most significant feature scores 1.
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
# Sanity check: show the first cleaned-up entry of linesOfTrainData.
print("First line after cleanup from test Data: ", linesOfTrainData[0])

# Hold out 10% of the samples.  linesOfTestData is passed in the labels
# position, so its two halves end up in labels_train / labels_test.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    linesOfTrainData,
    linesOfTestData,
    test_size=0.1,
    train_size=0.9,
    random_state=42)

# TF-IDF vocabulary and weights are learned from the training split only and
# then applied to the held-out split.
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
linesOfTrainData_Transformed = vectorizer.fit_transform(features_train)
linesOfTestData_Transformed = vectorizer.transform(features_test)

# Keep the 10% of TF-IDF features with the best f_classif score; the
# selector is fitted on the training split and applied to both.
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(linesOfTrainData_Transformed, labels_train)  #labels_train
linesOfTrainData_Transformed = selector.transform(linesOfTrainData_Transformed)
linesOfTestData_Transformed = selector.transform(linesOfTestData_Transformed)

# NOTE(review): no close() for this handle is visible in this excerpt.
f = open('TestData/Test/format_out.dat', 'w')
# Compare every held-out row against every training row.
for vt in linesOfTestData_Transformed:
    cosineSimilarityValues = []
    for vS in linesOfTrainData_Transformed:
        dotProduct = vt.dot(np.transpose(vS))
        # Norms are taken over .data - presumably the stored non-zero
        # entries of sparse rows; TODO confirm.
        lengtht = np.linalg.norm(vt.data)
        lengthS = np.linalg.norm(vS.data)

        # Only proceed when both norms are non-zero (the body of this
        # branch is not part of this excerpt).

        if lengthS != 0 and lengtht != 0:
    # Branch whose condition is not part of this excerpt: fit on the
    # training split and save class probabilities for devel and test.
    clf.fit(X_train, y_train)
    #y_margins = clf.decision_function(X_devel)
    '''
    y_prob = (y_margins - y_margins.min()) / (y_margins.max() - y_margins.min())
    y_prob = 1./(1 + np.exp(-y_margins))
    '''
    y_prob_devel = clf.predict_proba(X_devel)
    y_prob_test = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)
    np.save('./predictions/SD_devel_svm_baseline.npy', y_prob_devel)
    np.save('./predictions/SD_test_svm_baseline.npy', y_prob_test)
else:
    # Sweep the share of features kept and record the devel-set score of
    # each setting.
    uar = []
    percentile = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    for p in percentile:
        # The selector is fitted on the training split only.
        selection = SelectPercentile(f_classif, percentile=p)
        feat_selected = selection.fit_transform(X_train, y_train)
        feat_devel = selection.transform(X_devel)
        print('\nComplexity {0:.6f}'.format(optimum_complexity))
        #clf = svm.LinearSVC(C=optimum_complexity, random_state=0)
        clf = svm.SVC(C=optimum_complexity, kernel='linear', random_state=0)
        clf.fit(feat_selected, y_train)
        y_pred = clf.predict(feat_devel)
        # Macro-averaged recall over `classes` (the value printed as UAR).
        uar.append(
            recall_score(y_devel, y_pred, labels=classes, average='macro'))
        print('UAR on Devel {0:.1f}'.format(uar[-1] * 100))
        if show_confusion:
            print('Confusion matrix (Devel):')
            print(classes)
            print(confusion_matrix(y_devel, y_pred, labels=classes))
    # Percentile with the highest devel-set UAR (first one on ties).
    optimum_percentile = percentile[np.argmax(uar)]
Example #41
0
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
# Everything except the 'target' column is a feature; both are handed to
# the splitter as plain arrays.
features = tpot_data.drop('target', axis=1).values
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8395341050959029
# Stacked kNN, two successive f_classif percentile filters (90% then 87%),
# stacked Gaussian naive Bayes, standardisation, then Bernoulli naive Bayes.
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=KNeighborsClassifier(n_neighbors=10, p=2, weights="uniform")
    ),
    SelectPercentile(score_func=f_classif, percentile=90),
    SelectPercentile(score_func=f_classif, percentile=87),
    StackingEstimator(estimator=GaussianNB()),
    StandardScaler(),
    BernoulliNB(alpha=0.1, fit_prior=True),
)

results = exported_pipeline.fit(
    training_features, training_target).predict(testing_features)
# KNeighborsClassifier: three-step pipeline (scaler -> dimensionality
# reduction -> classifier).  The step objects given here are placeholders
# that the parameter grids below replace by step name.
steps = [('scaler', MinMaxScaler()), ('red_dim', PCA()),
         ('clf', KNeighborsClassifier())]

pipeline = Pipeline(steps)

# One dict per alternative for the 'red_dim' step.  Keys of the form
# '<step>__<param>' set a parameter on that step.  The list continues past
# the end of this excerpt.
parameteres = [{
    # Grid 1: PCA with a varying number of components.
    'scaler': scalers_to_test,
    'red_dim': [PCA(random_state=42)],
    'red_dim__n_components': n_features_to_test,
    'clf__n_neighbors': k,
    'clf__weights': ['uniform', 'distance'],
    'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}, {
    # Grid 2: top 10% of features by f_classif.
    'scaler': scalers_to_test,
    'red_dim': [SelectPercentile(f_classif, percentile=10)],
    'clf__n_neighbors': k,
    'clf__weights': ['uniform', 'distance'],
    'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}, {
    # Grid 3: top 10% of features by mutual_info_classif.
    'scaler':
    scalers_to_test,
    'red_dim': [SelectPercentile(mutual_info_classif, percentile=10)],
    'clf__n_neighbors':
    k,
    'clf__weights': ['uniform', 'distance'],
    'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}, {
    # Grid 4: 'red_dim' set to None.
    'scaler': scalers_to_test,
    'red_dim': [None],
    'clf__n_neighbors': k,
Example #43
0
def predict_NC():
    """Tune and cross-validate a logistic regression for news coverage.

    Steps: load ``X, y, vectorizer`` from ``get_X_y()``; pass the features
    through ``SelectPercentile(f_classif, percentile=100)``; grid-search the
    regularisation strength ``C`` with 5-fold stratified cross-validation
    scored by ROC AUC; re-run the folds with the best ``C`` to report the
    average recall and precision; save the ROC curve of the first fold to
    "NC/roc"; finally hand the selected feature names to
    ``texify_most_informative_features``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # feature selection
    X, y, vectorizer = get_X_y()
    selector = SelectPercentile(f_classif, percentile=100)
    selector.fit(X, y)
    best_indices = selector.get_support(indices=True)
    best_features = np.array(vectorizer.get_feature_names())[best_indices]
    X = selector.transform(X)

    # use cross validation to choose the best parameter
    lr = LogisticRegression(penalty="l2", fit_intercept=True, class_weight='auto')
    kf = StratifiedKFold(y, n_folds=5, shuffle=True)
    parameters = {"C": [1.0, .1, .01, .001, 0.0001]}
    clf0 = GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    # Single-argument print(...) gives identical output on Python 2 and 3.
    print("fitting model...")
    clf0.fit(X, y)
    print("best auc score is:  %s" % clf0.best_score_)
    print("done.")

    # cross validation on the best parameter: precision, recall and AUC
    aucs, prec, rec = [], [], []
    # CSR supports the row indexing used for the folds below.
    complete_X = X.tocsr()
    clf = LogisticRegression(penalty="l2", fit_intercept=True,
                             class_weight='auto', C=clf0.best_estimator_.C)
    for fold, (train, test) in enumerate(kf):
        clf.fit(complete_X[train, :].tocoo(), y[train])
        probs = clf.predict_proba(complete_X[test, :])[:, 1]

        # Index the CSR matrix here as well, so every fold operation uses
        # the same row-sliceable representation.
        predLabel = clf.predict(complete_X[test, :])
        rec.append(recall_score(y[test], predLabel))
        prec.append(precision_score(y[test], predLabel))
        cur_auc = auc_score(y[test], probs)
        aucs.append(cur_auc)

        # Only the first fold's ROC curve is plotted and saved.
        if fold == 0:
            fpr, tpr, _ = roc_curve(y[test], probs)
            pylab.clf()
            fout = "NC/roc"

            pylab.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % cur_auc)
            pylab.plot([0, 1], [0, 1], 'k--')
            pylab.xlim((-0.025, 1.025))
            pylab.ylim((-0.025, 1.025))
            pylab.xlabel("false positive rate")
            pylab.ylabel("true positive rate")
            pylab.title("ROC curve for news coverage prediction(area = %0.2f)" % cur_auc)
            pylab.tight_layout()
            pylab.savefig(fout)

    print("average recall: %s" % (sum(rec) / float(len(rec))))
    print("average precision: %s" % (sum(prec) / float(len(prec))))
    texify_most_informative_features(best_features, vectorizer, clf0)
    return clf0
Example #44
0
    svc, fmri_masked, conditions, cv=cv, groups=session_label)[1]
print("Permutation test score: {:.3f}".format(null_cv_scores.mean()))

###########################################################################
# Decoding without a mask: Anova-SVM in scikit-learn
# --------------------------------------------------
# We can also implement feature selection before decoding as a scikit-learn
# `pipeline`(:class:`sklearn.pipeline.Pipeline`). For this, we need to import
# the :mod:`sklearn.feature_selection` module and use
# :func:`sklearn.feature_selection.f_classif`, a simple F-score
# based feature selection (a.k.a. `Anova <https://en.wikipedia.org/wiki/Analysis_of_variance#The_F-test>`_).
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC
# Keep the 10% of features with the highest F-score, then classify them
# with a linear SVC.
feature_selection = SelectPercentile(f_classif, percentile=10)
anova_svc = Pipeline([('anova', feature_selection), ('svc', LinearSVC())])
# We can use our ``anova_svc`` object exactly as we were using our ``svc``
# object previously.
# As we want to investigate our model, we use the sklearn `cross_validate`
# function with `return_estimator = True` instead of `cross_val_score`, to
# save the fitted estimators.

fitted_pipeline = cross_validate(anova_svc, fmri_masked, conditions,
                                 cv=cv, groups=session_label, return_estimator=True)
print(
    "ANOVA+SVC test score: {:.3f}".format(fitted_pipeline["test_score"].mean()))

###########################################################################
# Visualize the ANOVA + SVC's discriminating weights
# ...................................................
Example #45
0
        # Create the training data and label
        # We need to take the balanced data
        # Every entry except the one at index idx_lopo_cv is used for
        # training.
        training_data = [arr for idx_arr, arr in enumerate(data_bal)
                         if idx_arr != idx_lopo_cv]
        training_label = [arr for idx_arr, arr in enumerate(label_bal)
                         if idx_arr != idx_lopo_cv]
        # Concatenate the data
        training_data = np.vstack(training_data)
        # Labels are binarized against the class values [0, 255] and
        # flattened to 1-D.
        training_label = np.ravel(label_binarize(
            np.hstack(training_label).astype(int), [0, 255]))
        print 'Create the training set ...'

        # Perform the classification for the current cv and the
        # given configuration
        # Feature selector: p is passed as the percentile of features to
        # keep; it is fitted on the training data and then applied to the
        # testing data.
        sel = SelectPercentile(f_classif, p)
        training_data = sel.fit_transform(training_data, training_label)
        testing_data = sel.transform(testing_data)
        crf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
        pred_prob = crf.fit(training_data, training_label).predict_proba(
            testing_data)

        # Keep the probabilities with the class order, and the indices of
        # the features the selector retained.
        results_cv.append([pred_prob, crf.classes_])
        feat_imp_cv.append(sel.get_support(indices=True))

    # Collect the per-fold results for the current value of p.
    results_p.append(results_cv)
    feat_imp_p.append(feat_imp_cv)

# Save the information
path_store = '/data/prostate/results/mp-mri-prostate/exp-3/selection-extraction/anova/t2w'
# The body of this check is not part of this excerpt.
if not os.path.exists(path_store):
# Translate device ids into row indices of sparse_matrix and slice out the
# rows for the train and test sets.
# NOTE(review): `dec` is presumably a fitted label encoder - confirm.
train_row = dec.transform(train["device_id"])
train_sp = sparse_matrix[train_row, :]

test_row = dec.transform(test["device_id"])
test_sp = sparse_matrix[test_row, :]

# 90% of the training rows are used for fitting, the rest for validation.
X_train, X_val, y_train, y_val = train_test_split(train_sp,
                                                  Y,
                                                  train_size=.90,
                                                  random_state=10)

##################
#   Feature Sel
##################
print("# Feature Selection")
# Keep the 23% of features with the best f_classif score.
selector = SelectPercentile(f_classif, percentile=23)

# The selector is fitted on the 90% training split only.
selector.fit(X_train, y_train)

X_train = selector.transform(X_train)
X_val = selector.transform(X_val)

# The same fitted selector is applied to the full train and test matrices.
train_sp = selector.transform(train_sp)
test_sp = selector.transform(test_sp)

print("# Num of Features: ", X_train.shape[1])

##################
#  Build Model
##################
# get deterministic random numbers
rng = np.random.RandomState(42)
# one row of 50 noise values per sample in cancer.data
noise = rng.normal(size=(len(cancer.data), 50))

# add noise features to the data
# the first 30 features are from the dataset, the next 50 are noise
X_w_noise = np.hstack([cancer.data, noise])

# half of the samples are held out for testing
X_train, X_test, y_train, y_test = train_test_split(X_w_noise,
                                                    cancer.target,
                                                    random_state=0,
                                                    test_size=.5)

# use f_classif (the default) and SelectPercentile to select 50% of features
select = SelectPercentile(percentile=50)
# the selector is fitted on the training set only
select.fit(X_train, y_train)
# transform training set
X_train_selected = select.transform(X_train)

print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))

# boolean mask: True for every feature the selector kept
mask = select.get_support()
print(mask)

# transform test data
X_test_selected = select.transform(X_test)

# logistic regression fitted on all features (original + noise)
lr = LogisticRegression()
lr.fit(X_train, y_train)
                            n_folds=i,
                            shuffle=True,
                            random_state=1)
    scores = cross_val_score(regression,
                             wine,
                             quality,
                             scoring="mean_squared_error",
                             cv=crossvalidation,
                             n_jobs=1)
    print("Folds: %i, mean squared error: %.2f std: %.2f" %
          (len(scores), np.mean(np.abs(scores)), np.std(scores)))

# The mean squared error is still the same; we need feature selection to see
# if we can get better results.

# Print the F-score of every feature.  The selector is fitted only to read its
# scores_; its 25% selection is not applied to the data in the visible code.
f_selector = SelectPercentile(f_regression, percentile=25)
f_selector.fit(wine, quality)
for feature, score in zip(wine.columns.values, f_selector.scores_):
    print("F-Score: %3.2f\t for feature %s" % (score, feature))
"""
we can see that some features are not important for the regression
with a greedy search we can get the optimal number of features
"""
# Recursive feature elimination with 13-fold cross validation.
# NOTE(review): the "mean_squared_error" scorer name looks like the legacy
# spelling used by old scikit-learn releases - confirm the targeted version.
greedy = RFECV(estimator=regression, cv=13, scoring="mean_squared_error")
greedy.fit(wine, quality)
print("Optimal number of features: %d" % greedy.n_features_)

# However, I want to test logistic regression now because the y data look like
# class labels; we might get better results.
# NOTE(review): `logistic` is not used anywhere in the visible code.
logistic = LogisticRegression()
# One-vs-rest and one-vs-one wrappers, each around its own LogisticRegression.
ovr = OneVsRestClassifier(LogisticRegression()).fit(x_train, y_train)
ovo = OneVsOneClassifier(LogisticRegression()).fit(x_train, y_train)
Example #49
0
from newspaper import Article

# Articles whose text will be classified.
urls = [
    'http://www.newsmax.com/Politics/putin-tv-trump-dangerous/2017/04/17/id/784706/',
    'http://www.hollywoodreporter.com/heat-vision/star-wars-rare-archival-footage-shown-at-celebration-had-funny-new-hope-f-bomb-994552?utm_source=feedburner&utm_medium=feed&utm_campaign=Feed%3A+thr%2Ffilm+%28The+Hollywood+Reporter+-+Movies%29&utm_content=FeedBurner',
    'http://www.espn.com/sports/endurance/story/_/id/19177433/boston-marathon-2017-devin-wang-another-year-brings-closure-tragedy'
]

# Download and parse every article, keeping only its text.
prediction_data = []
for url in urls:

    article = Article(url)
    article.download()
    article.parse()
    # NOTE(review): no parser is named here, so BeautifulSoup chooses one
    # itself - confirm that is acceptable.
    soupText = BeautifulSoup(article.text)
    prediction_data.append(soupText.get_text())

# The vectorizer and the selector are fitted on the training data only and are
# then applied to the downloaded articles.
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
features_train_transformed = vectorizer.fit_transform(features_train_vect)
features_test_transformed = vectorizer.transform(prediction_data)
# Keep the top 1 percent of TF-IDF features by f_classif score.
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(features_train_transformed, labels_train)
features_test_transformed = selector.transform(
    features_test_transformed).toarray()

pred = clf.predict(features_test_transformed)

# Parenthesised so the line is valid on Python 3; with a single argument it
# prints the same text on Python 2.
print(pred)
Example #50
0
# Value handed to SelectPercentile(percentile=...) below.
# NOTE(review): SelectPercentile reads this on a 0-100 scale, so 0.8 keeps
# 0.8% of the features - if 80% was intended this should be 80; confirm.
pct = 0.8  # percent of edges kept in feature selection
# 100 log-spaced regularisation strengths from 0.5e10 down to 0.5e-2.
alphas = 10**np.linspace(10, -2, 100) * 0.5  # specify alphas to search
#%%
# Ridge grid search over alpha with 10-fold CV.
# NOTE(review): rg_grid is not used in the visible code.
rg_grid = GridSearchCV(Ridge(normalize=False),
                       cv=10,
                       param_grid={'alpha': alphas},
                       iid=False)
# using LASSO regression instead of ridge
lasso = linear_model.Lasso
lasso_grid = GridSearchCV(lasso(normalize=False),
                          cv=10,
                          param_grid={'alpha': alphas},
                          iid=False)

# Feature selection runs inside the pipeline, so it is re-fitted on the
# training part of every outer fold before the lasso grid search.
reg = Pipeline([('feature_selection',
                 SelectPercentile(f_regression, percentile=pct)),
                ('regression', lasso_grid)])

# NOTE(review): despite the name, cv10 uses 21 splits - confirm intended.
cv10 = KFold(n_splits=21)  #, random_state=665)
# Only referenced by the commented-out cross_val_score call below.
rpcv10 = RepeatedKFold(n_splits=3, n_repeats=3, random_state=665)
# %% Run model
start = time.time()  # time the function
# Out-of-fold predictions for every sample.
all_pred = cross_val_predict(reg, vecs_reshape.T, y, cv=cv10, n_jobs=4)
#all_score = cross_val_score(reg, vecs_reshape.T, y, cv=rpcv10, n_jobs=1) # repeated kfolds
end = time.time()
print(end - start)  # print function running time

# %%
# Correlation matrix between the out-of-fold predictions and the targets.
print(np.corrcoef(all_pred.T, y.T))

# %%
# %%
Example #51
0
def getBestModel(N,
                 xDf,
                 yDf,
                 emptyModel,
                 paramGrid,
                 features,
                 doSelection=True):
    """
    Train and evaluate a grid-searched model on each of N cross-validation folds.

    The inputs are standardized and, if requested, reduced by univariate feature
    selection (f_regression) once, on the full data set. The result is split into
    N shuffled folds; for every fold a GridSearchCV (5 shuffled inner folds, scored
    by r2) is fitted on the training part and evaluated on the held-out part.

    inputs: N - int - the number of outer folds, i.e. how many times the model is trained and evaluated.
            xDf - pandas dataframe - the rows represent the data points, the columns represent the features. These
                                         are the inputs into the model
            yDf - pandas dataframe - the rows represent the data points; only the first column is read. It contains
                                         the target values for the model.
            emptyModel - sklearn model - a valid sci-kit learn model with a 'fit' method.
            paramGrid - dictionary - the para_grid to be used with this model in a grid search. Note that each parameter name
                                     in the grid must start with 'model__' (two underscores).
            features - int or float - if int, then use SelectKBest where k='features'. If float, use SelectPercentile
                                      where 'features' is the percentage of features to keep on a 0-100 scale
                                      (e.g. 25.0 keeps 25%). Ignored when doSelection is false.
            doSelection - boolean - if true, then do feature selection. Otherwise, do not do feature selection.
    outputs: modelsList - the list of the N fitted GridSearchCV objects, one per fold.
             metricsDict - dictionary of the form {mae: [val1, val2,...], mape: [###],...}. The index of each model in
                 'modelsList' matches the index of the values in each list. All values are rounded to 3 decimals;
                 'mae' and 'rmse' are stored multiplied by 2000.
    raises: ValueError - if doSelection is true and 'features' is neither an int nor a float.

    NOTE: xDf is passed through a StandardScaler here, whether or not it was scaled beforehand.

    NOTE: Standardization and feature selection are fitted on the full data set before the folds are split, so every
    fold uses the same features; the held-out part of each fold has therefore already influenced both steps.
    """

    # one list per metric; entry i belongs to modelsList[i]
    metricsDict = {
        'mae': [],
        'mape': [],
        'rmse': [],
        'r': [],
        'rSq': [],
        'explainedVariance': []
    }

    # get the input features in the correct format
    X = xDf.values
    # the target is the first (only) column of yDf, as a 1-D numpy array
    y = np.asarray(yDf.iloc[:, 0])

    # make the cv settings
    cv = KFold(n_splits=N, shuffle=True)

    # apply standardization
    X = preprocessing.StandardScaler().fit_transform(X)

    if doSelection:
        # bool is a subclass of int, but True/False is not a meaningful k
        isCount = isinstance(features, int) and not isinstance(features, bool)
        if isCount:
            X = SelectKBest(f_regression, k=features).fit_transform(X, y)
        elif isinstance(features, float):
            # SelectPercentile already expects a percentage on a 0-100 scale,
            # so 'features' is passed through without rescaling.
            X = SelectPercentile(f_regression,
                                 percentile=features).fit_transform(X, y)
        else:
            raise ValueError(
                "The input 'features' is not an integer or a float.")

    # initialize list of trained models
    modelsList = []

    # inner cross validation used by the grid search
    numFolds = 5

    # for every fold
    for train_index, test_index in cv.split(X):

        # get the train and test data
        xTrain, xTest = X[train_index], X[test_index]
        yTrain, yTest = y[train_index], y[test_index]

        # the single step is named 'model', which is why every key in
        # paramGrid must start with 'model__'
        pipe = Pipeline(steps=[('model', emptyModel)])

        # make the model with optimized hyperparameters via a grid search with cross validation
        model = GridSearchCV(estimator=pipe,
                             param_grid=paramGrid,
                             cv=KFold(n_splits=numFolds, shuffle=True),
                             scoring='r2',
                             return_train_score=False)

        # fit model
        model.fit(xTrain, yTrain)

        # add the model to the list
        modelsList.append(model)

        # predictions on the held-out part of this fold
        pred = model.predict(xTest)

        # find errors
        meanAbsoluteError = mean_absolute_error(yTest, pred)
        rootMeanSquaredError = np.sqrt(mean_squared_error(yTest, pred))
        meanAbsPercError = mean_absolute_percentage_error(yTest, pred)

        # find the R^2 value (coefficient of determination)
        rSq = r2_score(yTest, pred)

        # find the R value (Pearson coefficient of correlation)
        R = np.corrcoef(yTest, pred)[0, 1]

        # find explained variance
        explainedVar = explained_variance_score(yTest, pred)

        # add the metrics to metricsDict
        # NOTE(review): mae and rmse are scaled by 2000 - presumably to convert
        # back to the target's original units; confirm against the caller.
        metricsDict['mae'].append(round(meanAbsoluteError * 2000, 3))
        metricsDict['rmse'].append(round(rootMeanSquaredError * 2000, 3))
        metricsDict['mape'].append(round(meanAbsPercError, 3))
        metricsDict['rSq'].append(round(rSq, 3))
        metricsDict['r'].append(round(R, 3))
        metricsDict['explainedVariance'].append(round(explainedVar, 3))

    ## return the results
    return modelsList, metricsDict
Example #52
0
    f.readline()
    csvreader = csv.reader(f, delimiter='\t')
    for row in csvreader:
        ID_test.append(row[0])
        X_test.append(row[2])

# Use word and character features
# Word-level TF-IDF: English stop words removed, terms must occur in at least
# 3 documents.
words = TfidfVectorizer(analyzer="word",
                        binary=False,
                        use_idf=True,
                        stop_words="english",
                        min_df=3)
# Character-level TF-IDF.
char = TfidfVectorizer(analyzer="char", binary=False, use_idf=True)

# Use percentile-based feature selection
# No percentile is given, so the library default applies.  The 'select' step
# is commented out in the pipeline below, so this object is currently unused
# in the visible code.
select = SelectPercentile(score_func=chi2)

# Stack the features together
feat = FeatureUnion([('words', words), ('char', char)])

# Construct transformation pipeline
# Active steps: the stacked TF-IDF features followed by an L2-penalised
# SGDClassifier.
text_clf = Pipeline([
    ('feat', feat),
    # ('select', select),
    # ('clf', MultinomialNB()),
    ('clf', SGDClassifier(penalty='l2'))
])

# Set the parameters to be optimized in the Grid Search
parameters = {
    'feat__words__ngram_range': [(1, 5), (1, 6)],
Example #53
0
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def tokenize(text):
    """Split *text* into NLTK word tokens and return the list of their stems."""
    porter = PorterStemmer()
    return stem_tokens(nltk.word_tokenize(text), porter)


# Load the labelled training corpus from the directory given as argv[1];
# bytes that cannot be decoded as UTF-8 are ignored.
training_data = datasets.load_files(sys.argv[1],
                                    encoding="utf-8",
                                    decode_error='ignore')
# Unigram + bigram TF-IDF over stemmed tokens.
bigram_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                          tokenizer=tokenize,
                                          stop_words='english')

# Keep the top 25 percent of features by chi2 score.  percentile is passed by
# keyword because current scikit-learn releases reject it as a positional
# argument; the keyword form is accepted by every release.
selector = SelectPercentile(chi2, percentile=25)

print("\nSVM\n")
clf = LinearSVC(penalty="l2", dual=False, C=5.0)

pipe_clf = Pipeline([('vectorizer', bigram_tfidf_vectorizer),
                     ('selector', selector), ('classifier', clf)])

# Fit the whole pipeline and persist it to the path given as argv[2].
pipe_clf.fit(training_data.data, training_data.target)
joblib.dump(pipe_clf, sys.argv[2])
Example #54
0
    train_reader = csv.reader(train_csv)
    cnt = 0
    for tweet in train_reader:
        attr = tweet[CURRENT_ATTRIBUTE + 4]
        train_attrs.append(attr)
        cnt += 1
    del train_attrs[0]

    # get y_train from train_attrs
    y_train = [[float(attr)] for attr in train_attrs]
    # chi-2 select features
    print "start feature selection"
    if (SELECTOR == 0):
        selector = SelectKBest(chi2, k=K_FOR_BEST)
    else:
        selector = SelectPercentile(score_func=chi2,
                                    percentile=SELECT_PERCENTILE)
    selector.fit(x_train, y_train)
    new_x_train = selector.transform(x_train)
    new_x_test = selector.transform(x_test)
    print "feature selection done"
    # convert y_train to right dimension
    # y_train = [attr[0] for attr in y_train]

    # regression
    print "start regression"
    clf = LinearRegression()
    clf = clf.fit(new_x_train, y_train)
    result = clf.predict(new_x_test)
    print "regression done"

    for item in result:
Example #55
0
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.feature_selection import SelectKBest, SelectPercentile

# 5.1 MI for classification
# Get the mutual information between each feature and the target.
mutual_info_classif(X_train, y_train) # note: this variant is specifically for classification

# Use the SelectKBest method to select the top K (here 10) variables,
# then show the names of the selected columns.
selector = SelectKBest(mutual_info_classif, k = 10).fit(X_train, y_train)
X_train.columns[selector.get_support()]

# 5.2 MI for regression
mutual_info_regression(X_train, y_train)
# Select the top 10 percent of features.  Note: this is a proportion, whereas
# SelectKBest above takes an absolute count.
selector = SelectPercentile(mutual_info_regression, percentile = 10).fit(X_train, y_train)
X_train.columns[selector.get_support()]


#### 6. Fisher Score | Chi Square

# 1. Measures the dependency of 2 variables
# 2. Suited for categorical variables
# 3. Target should be binary
# 4. Variable values should be non-negative, and typically boolean, frequencies or counts
# 5. It compares the observed distribution of the classes among the different labels
#    against the distribution that would be expected if there were no labels
# (Author's note: did not fully understand this.)

from sklearn.feature_selection import chi2

# Result of scoring every feature in X_train against y_train with chi2.
f_score = chi2(X_train, y_train)
Example #56
0
        stemmer.stem(word)
        for word in re.sub('[^a-zA-Z]', ' ', data_frame['Text'][i]).split()
        if not word in stopwords_set
    ]).lower()
    document.append(new_text)

# Clean every test row: replace non-letters with spaces, drop tokens found in
# stopwords_set, stem what is left and lower-case the joined result.
for i in range(df_x_test.shape[0]):
    letters_only = re.sub('[^a-zA-Z]', ' ', df_x_test['Text'][i])
    kept_stems = [
        stemmer.stem(word) for word in letters_only.split()
        if word not in stopwords_set
    ]
    new_text_2 = ' '.join(kept_stems).lower()
    document_test.append(new_text_2)

# Same cleaning stored as a column, but filtered against `words` instead.
df_x_test['removed_test'] = df_x_test['Text'].apply(
    lambda x: " ".join([
        stemmer.stem(tok) for tok in re.sub("[^a-zA-Z]", " ", x).split()
        if tok not in words
    ]).lower())

# TF-IDF fitted on the training documents and reused for the test documents.
tfid_v = TfidfVectorizer(sublinear_tf=True, min_df=6, stop_words='english')
X = tfid_v.fit_transform(document).toarray()
X_test = tfid_v.transform(document_test).toarray()

# Labels are the first column of the training frame.
y = data_frame.iloc[:, 0].values

# Keep the top 11.5 percent of features by chi2 score (fitted on train only).
#select_features = SelectKBest(chi2, k=2000)
select_features = SelectPercentile(chi2, percentile=11.5)
X = select_features.fit_transform(X, y)
X_test = select_features.transform(X_test)

# L1-penalised linear SVM; predictions are written one per line to new.txt.
classifier = LinearSVC(C=1.0, penalty='l1', max_iter=4500, dual=False).fit(
    X, y)
y_pred = classifier.predict(X_test)
np.savetxt('new.txt', y_pred, delimiter=" ", fmt="%s")
# Rebuild the test matrix: X_test_temp followed by the columns at positions
# 10 .. c-2 of X_test.
X_test1 = numpy.concatenate((X_test_temp, X_test.iloc[:, 10:c - 1]), axis=1)

# Wrap the processed arrays back into DataFrames that reuse the row index and
# column labels of the original train/test frames.
# NOTE(review): presumably X_train1 / X_test1 hold scaled values - confirm
# against the code that builds them.
scaled_features_train_df = pd.DataFrame(X_train1,
                                        index=X_train.index,
                                        columns=X_train.columns)
scaled_features_test_df = pd.DataFrame(X_test1,
                                       index=X_test.index,
                                       columns=X_test.columns)

# --------------
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif

# Write your solution here:

# Keep the 20 percent of columns with the highest f_classif score.
skb = SelectPercentile(score_func=f_classif, percentile=20)
predictors = skb.fit_transform(X_train1, Y_train)
scores = list(skb.scores_)
print(scaled_features_train_df.columns)

# Rank column positions by descending score and keep as many of them as the
# selector retained, then map the positions back to column names.
n_kept = predictors.shape[1]
ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
top_k_index = [position for position, _ in ranked[:n_kept]]
top_k_predictors = [scaled_features_train_df.columns[i] for i in top_k_index]

print(top_k_predictors)

# --------------
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
# Two separate one-vs-rest classifiers, each wrapping its own
# LogisticRegression; neither is fitted in the visible code.
clf = OneVsRestClassifier(LogisticRegression())
clf1 = OneVsRestClassifier(LogisticRegression())
Example #58
0
 def __init__(self, data, labels, **kwargs):
     """Fit a percentile feature selector and a 1-nearest-neighbour classifier.

     kwargs must contain 'feature_percentile', which is handed to
     SelectPercentile; the classifier is trained on the reduced features.
     """
     self.knn = KNeighborsClassifier(n_neighbors=1)
     keep_percent = kwargs['feature_percentile']
     self.sp_knn = SelectPercentile(percentile=keep_percent)
     self.knn.fit(self.sp_knn.fit_transform(data, labels), labels)
class GenreClassifier():
    """
    Genre classifier made of a TF-IDF vectorizer, an optional percentile
    feature selector and a classifier.

    The parts are either loaded from pickles (__init__ / load) or created by
    the factory and fitted in train().
    """

    def __init__(self, dataset='news'):
        '''
        dataset = {'news', 'bruk'}

        Loads the saved parts for the named dataset. Any other value loads
        nothing and leaves every part as None.
        '''
        self.tfidf = None
        self.clf = None
        self.sel_perc = None
        # built by init(); train() builds it on demand
        self.factory = None
        # NOTE(review): the pickle paths are written with Windows separators.
        if dataset == 'news':
            self.load('saves\\clf_news_MNB_poslex.pkl',
                      'saves\\tfidf_news_MNB_poslex.pkl',
                      'saves\\featsel_news_MNB_poslex.pkl')
        elif dataset == 'bruk':
            # this model was saved without a feature selector
            self.load('saves\\clf_bruk_MNB_poslex.pkl',
                      'saves\\tfidf_bruk_MNB_poslex.pkl')

    def init(self):
        '''
        Build the factory used by train() from the frequent-word list and the
        lexical n-gram list stored under data\\, printing the size of both.
        '''
        freq_words = pd.read_csv('data\\freq_words.txt',
                                 sep=' ',
                                 header=None,
                                 names=['word', 'freq'])['word'].values
        lexngrams = np.loadtxt('data\\news_bigrams.txt',
                               dtype=object,
                               encoding='utf-8')
        print('Frequent words count:', len(freq_words))
        print('Lexical ngrams count:', len(lexngrams))
        self.factory = ClfFactoryPosLex(
            None,
            PosFreqWordsAnalyzer(Morphology().getAnalyzer(),
                                 list(freq_words),
                                 lemmatize_freq=True), lexngrams)

    def train(self, X, y, percentile=0):
        '''
        Fit a fresh vectorizer, optional selector and classifier on X, y.

        percentile - share of features (in percent) kept by mutual-information
                     selection; any value below 1 disables feature selection.
        '''
        # build the factory on demand so train() also works without a prior
        # explicit init() call
        if getattr(self, 'factory', None) is None:
            self.init()
        self.tfidf = self.factory.make_vectorizer()
        self.clf = self.factory.make_classifier()
        # percentile is passed by keyword: current scikit-learn releases
        # reject it as a positional argument
        self.sel_perc = SelectPercentile(
            mutual_info_classif,
            percentile=percentile) if percentile >= 1 else None
        vtrain = self.tfidf.fit_transform(X)
        if self.sel_perc is not None:
            vtrain = self.sel_perc.fit_transform(vtrain, y)
        self.clf.fit(vtrain, y)

    def save(self, clf_name, tfidf_name, sel_perc_name=None):
        '''
        Pickle the classifier, the vectorizer and, if present, the selector.

        Raises ValueError if a selector exists but sel_perc_name is None.
        '''
        # validate before writing anything so a missing name cannot leave a
        # half-saved model on disk
        if self.sel_perc is not None and sel_perc_name is None:
            raise ValueError(
                'sel_perc_name is required when a feature selector is set')
        joblib.dump(self.clf, clf_name)
        joblib.dump(self.tfidf, tfidf_name)
        if self.sel_perc is not None:
            joblib.dump(self.sel_perc, sel_perc_name)

    def load(self, clf_name, tfidf_name, sel_perc_name=None):
        '''
        Load the classifier and vectorizer and, if named, the selector.

        Without sel_perc_name the selector is cleared, so a selector left over
        from an earlier train()/load() is never applied to the loaded model.
        '''
        self.clf = joblib.load(clf_name)
        self.tfidf = joblib.load(tfidf_name)
        if sel_perc_name is not None:
            self.sel_perc = joblib.load(sel_perc_name)
        else:
            self.sel_perc = None

    def predict(self, raw_strings, predict_proba=False):
        '''
        Vectorize raw_strings, apply the selector if there is one, and return
        the classifier output: predict_proba() when predict_proba is true,
        predict() otherwise.
        '''
        vtest = self.tfidf.transform(raw_strings)
        if self.sel_perc is not None:
            vtest = self.sel_perc.transform(vtest)
        predictor = self.clf.predict_proba if predict_proba else self.clf.predict
        y_predicted = predictor(vtest.toarray())
        return y_predicted
Example #60
0
# Here the scaling is done properly during the grid search: instead of using
# the whole training set, it is fitted only on the part of the training set
# that each cross-validation split uses for training.

mglearn.plots.plot_proper_processing()
plt.show()
# Illustration of proper preprocessing

rnd = np.random.RandomState(seed=0)
X = rnd.normal(size=(100, 10000))
y = rnd.normal(size=(100, ))
# To illustrate information leakage, we start with 100 samples, each with
# 10,000 randomly drawn features. Because the data is just noise, we should
# not be able to learn anything from it.

# Leaky variant: the selector sees all 100 samples before cross validation.
select = SelectPercentile(score_func=f_regression, percentile=5).fit(X, y)
X_selected = select.transform(X)
# print is called as a function so the script is valid on Python 3; with a
# single argument the output is identical on Python 2.
print('X_selected.shape: {}'.format(X_selected.shape))
# 5 percent of 10,000 features: the 500 highest-scoring ones are kept first
print('Cross validation accuracy (cv only on Ridge): {:.3f}'.format(
    np.mean(cross_val_score(Ridge(), X_selected, y, cv=5))))
# The R^2 score is 0.91, this cannot be right since the data is just random.
# This is because we did preprocessing on the data outside the cross
# validation.

# Proper variant: selection is part of the pipeline, so it is re-fitted on the
# training part of every split.
pipe = Pipeline([
    ('select', SelectPercentile(score_func=f_regression, percentile=5)),
    ('ridge', Ridge()),
])
print('Cross validation score accuracy (pipeline): {:.3f}'.format(
    np.mean(cross_val_score(pipe, X, y, cv=5))))