Example #1
    def sgdfeature(self,data):

        newdata = pd.DataFrame()

        preproc = Pipeline([('fh',FeatureHasher( n_features=2**20,input_type='string'))])
        ##for SGDClassifier
        newdata['app_id_specs'] = data['app_id'].values+data['app_domain'].values+data['app_category'].values
        newdata['app_dom_specs'] = data['app_domain'].values+data['app_category'].values
        newdata['site_id_specs'] = data['site_id'].values+data['site_domain'].values+data['site_category'].values
        newdata['site_dom_specs'] = data['site_domain'].values+data['site_category'].values
        # data['device'] = data['device_model'].values+(data['device_type'].values.astype(str))+(data['device_conn_type'].values.astype(str))
        newdata['type'] = data['device_type'].values +data['device_conn_type'].values
        newdata['domain'] = data['app_domain'].values +data['site_domain'].values
        newdata['category'] = data['app_category'].values+data['site_category'].values
        newdata['pos_cat'] =  data['banner_pos'].values.astype(str)+data['app_category'].values+data['site_category'].values
        newdata['pos_dom'] =  data['banner_pos'].values.astype(str)+data['app_domain'].values+data['site_domain'].values
        # data['pos_id'] =  data['banner_pos'].values.astype(str)+data['app_id'].values+data['site_id'].values

        newdata['hour'] = data['hour'].map(lambda x: datetime.strptime(x.astype(str),"%y%m%d%H"))
        newdata['dayoftheweek'] = newdata['hour'].map(lambda x: x.weekday())
        newdata['day'] = newdata['hour'].map(lambda x:  x.day)
        newdata['hour'] = newdata['hour'].map(lambda x:  x.hour)
        newdata = newdata.drop('hour',axis=1)
        newdata = newdata.astype(str)
        del data
        X_dict = np.asarray(newdata)

        self.X_train = preproc.fit_transform(X_dict)

        return self.X_train
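The comment above notes these hashed features are intended for an SGDClassifier. A minimal downstream sketch, assuming the class instance is called fe and a binary click-label vector y_train is available separately (both names are hypothetical):

from sklearn.linear_model import SGDClassifier

X_hashed = fe.sgdfeature(data)      # sparse matrix produced by the method above
sgd = SGDClassifier(loss='log')     # logistic loss ('log_loss' in recent scikit-learn)
sgd.fit(X_hashed, y_train)          # y_train: assumed binary click labels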
Example #2
class MachineLearning(object):
    def __init__(self):
        # Initialize classifier and vectorizer
        self.clf = Pipeline([('tfidf', TfidfVectorizer(min_df=1, ngram_range=(1, 2))),
                             ('clf', MultinomialNB(alpha=.01)),
                            ])

    def init_training(self):
        self.x_train = []
        self.y_train = []

    def add_training_data(self, data, label):
        self.x_train.append(data)
        self.y_train.append(label)

    # Train classifier
    # Can also use grid search to optimize accuracy, like
    '''
    parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                  'clf__alpha': (.01, .001),
    }
    gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
    '''
    def train(self):
        self.clf.fit(self.x_train, self.y_train)

    # Predict result
    # We can roughly estimate the accuracy using cross validation, like
    '''
    result = clf.predict(test_dc + test_marvel)
    baseline = [0 for x in range(len(test_dc))] + [1 for x in range(len(test_marvel))]
    print np.sum(result == baseline) / float(len(result))
    '''
    def predict(self, data):
        return self.clf.predict([data])[0]
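A minimal usage sketch of this class, with toy comic-book labels along the lines of the DC/Marvel split hinted at in the cross-validation comment:

ml = MachineLearning()
ml.init_training()
ml.add_training_data("Batman patrols Gotham at night", 0)   # 0: DC
ml.add_training_data("Iron Man builds a new suit", 1)       # 1: Marvel
ml.train()
print(ml.predict("Spider-Man swings through New York"))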
Example #3
    def KFOLDTEST(self, text, sent):
        k_fold = KFold(n=len(text), n_folds=6)

        pipeline = Pipeline(
            [
                ("vectorizer", CountVectorizer(ngram_range=(1, 2), tokenizer=self.tokenize_data)),
                ("tfidf", TfidfTransformer(norm="l2", smooth_idf=False, use_idf=False)),
                ("classifier", OneVsOneClassifier(LinearSVC())),
            ]
        )

        scores = []
        for train_indices, test_indices in k_fold:
            # print('Train: %s | test: %s' % (train_indices, test_indices))
            train_text = text[train_indices]
            train_y = sent[train_indices]

            test_text = text[test_indices]
            test_y = sent[test_indices]

            pipeline.fit(train_text, train_y)
            score = pipeline.score(test_text, test_y)
            scores.append(score)

        score = sum(scores) / len(scores)
        print ("scores ", scores, " Score ", score)
        return score
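The KFold(n=..., n_folds=...) signature above comes from the old sklearn.cross_validation module. An equivalent sketch against the modern model_selection API would be:

from sklearn.model_selection import KFold, cross_val_score

scores = cross_val_score(pipeline, text, sent, cv=KFold(n_splits=6))
print(scores, scores.mean())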
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring):
    '''
    Run normal SVM classification without cross-fold validation.
    '''

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3) # 30% reserved for validation

    # feature selection since we have a small sample space
    fs = SelectPercentile(scoring, percentile=20)

    pipeline = Pipeline([('featureselector', fs), ('scaler', StandardScaler()), ('estimator', estimator)])

    pipeline = OneVsRestClassifier(pipeline)

    clfer = pipeline.fit(x_train, y_train)
    y_predict_train = clfer.predict(x_train)

    print "%% Accuracy on training set: %2.3f" % metrics.accuracy_score(y_train, y_predict_train)

    y_predict_test = clfer.predict(x_test)
    print "\n%% Accuracy on testing set: %2.3f" % metrics.accuracy_score(y_test, y_predict_test)

    print "\nClassification Report:"
    print metrics.classification_report(y_test, y_predict_test)

    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_predict_test)
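A possible invocation of the helper above, assuming a numeric feature matrix x and label vector y are already loaded; the estimator and scoring function are free choices, f_classif being just one option:

from sklearn.feature_selection import f_classif
from sklearn.svm import LinearSVC

simple_classification_without_cross_fold_validation(x, y, LinearSVC(), f_classif)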
 def train_clf(self):
     pipeline = Pipeline([
         ("tfidf", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
         ("svc", LinearSVC(C=100))
     ])
     pipeline.fit(self.dataset.data, self.dataset.target)
     return pipeline
Example #6
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x:x,max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1,verbose=1,class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw,ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    print rf.oob_score_

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print classification_report(y_true=ytrain_raw,y_pred=ytrain_predict)
    print confusion_matrix(y_true=ytrain_raw,y_pred=ytrain_predict)

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print accuracy_score(y_true=ytest_raw,y_pred=ytest_predict)
    print classification_report(y_true=ytest_raw,y_pred=ytest_predict)
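Because the CountVectorizer is built with analyzer=lambda x: x, each training example is expected to be an already-tokenized list rather than a raw string, e.g. (illustrative data only):

Xtrain_raw = [["great", "acting", "loved", "it"], ["boring", "plot"]]
ytrain_raw = [1, 0]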
Example #7
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_dict_equal(
        pipeline.get_params(deep=True),
        {"steps": pipeline.steps, "m2": mult2, "m3": None, "last": mult5, "m2__mult": 2, "last__mult": 5},
    )

    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = ["predict_proba", "predict_log_proba", "decision_function", "transform", "score"]
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_raise_message(AttributeError, "'NoneType' object has no attribute 'predict'", getattr, pipeline, "predict")

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([("m2", mult2), ("m3", None), ("last", mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
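Mult here is a toy estimator from scikit-learn's own test utilities. A rough sketch of what such an estimator looks like (not the exact test-suite definition) is:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class Mult(BaseEstimator, TransformerMixin):
    def __init__(self, mult=1):
        self.mult = mult
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # scale the input by a constant factor
        return np.asarray(X) * self.mult
    def inverse_transform(self, X):
        # undo the scaling
        return np.asarray(X) / self.mult
    def predict(self, X):
        return (np.asarray(X) * self.mult).ravel()
    predict_proba = predict_log_proba = decision_function = predict
    def score(self, X, y=None):
        return np.sum(X)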
Example #8
def runCrossValidationTest(classifier_name,
        classifier_args=None,
        ngram=2,
        folds=5):

  if classifier_args is None:
    classifier_args = {}
  classifier = valid_classifiers[classifier_name](**classifier_args)
  X, y = load_non_preprocessed_data()
  # confusion = numpy.array([[0,0,0],[0,0,0],[0,0,0]])
  ml_pipeline = Pipeline([
                      ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1,ngram))),
                      ('Classifier', classifier),
                      ])
  X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y, test_size = 0.25, random_state=0)
  ml_pipeline.fit(X_train, y_train)
  predictions = ml_pipeline.predict(X_test)
  confusion = confusion_matrix(y_test, predictions)
  f1 = f1_score(y_test, predictions, pos_label=None, average = 'micro')
  precision = precision_score(y_test, predictions, pos_label=None, average = 'micro')
  recall = recall_score(y_test, predictions, pos_label=None, average = 'micro')
  print(" >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
  print("F1 score: " + str(f1))
  print("precision score: " + str(precision)) 
  print("recall score: " + str(recall)) 
  print(confusion)
  numpy.savetxt("data/test_results_confusion_matrix_" + classifier_name+".csv", confusion, delimiter=",")
  return ((f1, precision, recall))
Example #9
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
Example #10
def test():
    target_label = [u'weather', u'audio',u'pic',u'calculate',u'music', u'poem']
    training_text_raw = []
    training_label = []
    with open ('./training_source.csv','r') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            if len(line) > 1 and line[1] in target_label:
                training_text_raw.append(unicode(line[0],"utf-8"))
                training_label.append(line[1])
        print training_label

    training_text = []
    for text in training_text_raw:
        seg_text = seg(text)
        training_text.append(seg_text)
    text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(use_idf=False)),

                     ('clf', MultinomialNB()),
])

    scores = cross_validation.cross_val_score(text_clf, training_text, training_label, cv=8)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    text_clf.fit(training_text, training_label)

    while True:
        k_text = raw_input("\nPlease input:")
        if k_text == "exit":
            break
        print text_clf.predict([seg(unicode(k_text,'utf-8'))])
def main():
    corpus = capitalCorpus()
    transformer = textTransformer()

    continents = np.array(os.listdir('txt/'))

    for continent_dir in enumerate(continents):
        corpus = getText(continent_dir,corpus,transformer)


    #Split corpus into training set and test set
    train_X, test_X, train_Y, test_Y = train_test_split(corpus.data,
                                                            corpus.target, test_size = 0.25, random_state=54321)

    #Build a pipeline
    clf = MultinomialNB()
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer(use_idf = True)

    clf_pipe = Pipeline(
        [
            ('vectorizer', count_vect),
            ('tfidf', tfidf_transformer),
            ('classifier', clf)
        ]
    ).fit(train_X, train_Y)

    predicted = clf_pipe.predict(test_X)

    print(classification_report(test_Y, predicted))
    def clasificador(self,X_train, y_train, X_test, target_names, y_test,all_labels):
        
        lb = preprocessing.MultiLabelBinarizer()
        Y = lb.fit_transform(y_train)
        
        classifier = Pipeline([
            ('vectorizer',CountVectorizer(strip_accents='unicode')),
            ('tfidf',TfidfTransformer()),
            ('to_dense', DenseTransformer()),
            ('clf',OneVsRestClassifier(GaussianNB()))])
            


     
        classifier.fit(X_train,Y)
        
        predicted = classifier.predict(X_test)


        etiquetas = lb.inverse_transform(predicted)

                
        for i in range(0,len(etiquetas)):
            etiquetas[i]=list(etiquetas[i])

        
        valoresMacro = self.macro(etiquetas,y_test)
        valoresMicro = self.micro(etiquetas, y_test)        
Example #13
def svcDictVector():
    recipeData = getRecipeData()
    
    labels = [recipe['cuisine'] for recipe in recipeData]
    ingredientsFixtures = [sorted(set(e['ingredients'])) for e in recipeData]
    for i, w in enumerate(ingredientsFixtures):
        ingredientsFixtures[i] = dict(zip(w, [1] * len(w)))        
                
    pipeline = Pipeline([
        ('dict', DictVectorizer()),
        ('variance', VarianceThreshold()),        
        ('tfidf', TfidfTransformer()),
        ('bayes', svm.LinearSVC()),
    ])    
    
    pipeline.fit(ingredientsFixtures, labels)
    print pipeline
    
    testRecipes = getTestData()    
    testIngredientsFixtures = [sorted(set(e['ingredients'])) for e in testRecipes]
    for i, w in enumerate(testIngredientsFixtures):
        testIngredientsFixtures[i] = dict(zip(w, [1] * len(w)))
        
    predictions = pipeline.predict(testIngredientsFixtures)    
    outputPercentCorrect(predictions)     
    copyAndOutput(predictions, testRecipes)
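For reference, each element handed to the DictVectorizer above is a presence dict built from the sorted ingredient set; for a made-up recipe:

ingredients = ['basil', 'tomato', 'basil']
w = sorted(set(ingredients))
print(dict(zip(w, [1] * len(w))))   # {'basil': 1, 'tomato': 1}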
Example #14
def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]

    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = Pipeline([('tfidf', TfidfVectorizer(min_df=2, dtype=float,
                                              sublinear_tf=True,
                                              ngram_range=(1, 2),
                                              strip_accents='unicode')),
                    ('lr', LogisticRegression(random_state=623, C=5000))])

    if param_search:
        params = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                  'lr__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)

        return gs.best_estimator_

    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
    def predictfactors(self):

        pipeline = Pipeline([("imputer", Imputer(strategy='mean', axis=0)),
                            ("logistic", LogisticRegression())
                             ])

        predict = pipeline.fit(self.X_train, self.y_train).predict(self.X_test)

        firstfactor=[]
        secondfactor=[]
        thirdfactor=[]

        res = np.array(pipeline.named_steps['logistic'].coef_ * self.X_test.iloc[[0]])
        threemaxindexes = np.array((-res).argsort().ravel())
        #[3 1 5 4 2 6 0]
        print(res)
        print(threemaxindexes)
        print(self.names)

        #sys.exit()
        for i in range(0,len(self.X_test)):
            res = np.array(pipeline.named_steps['logistic'].coef_ * self.X_test.iloc[[i]])

            threemaxindexes = np.array((-res).argsort().ravel())

            firstfactor.append(self.names[threemaxindexes[0]])
            secondfactor.append(self.names[threemaxindexes[1]])
            thirdfactor.append(self.names[threemaxindexes[2]])

        for i in range(0,len(self.X_test)):
            print([self.df[self.idcol][i], predict[i], firstfactor[i], secondfactor[i], thirdfactor[i]])
def train_optimal_classifier(clf, X, y, params, scale=False, folds=1000):
    pipeline = 0

    combined_features = FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])

    if scale:
        pipeline = Pipeline([("minmax", MinMaxScaler()),
                             ("features", combined_features), ("clf", clf)])
    else:
        pipeline = Pipeline([("features", combined_features), ("clf", clf)])

    param_grid = dict(features__pca__n_components=[0,1,3,6,9,12,15],
            features__univ_select__k=list(range(0, len(X[0]))))

    for k, v in params.iteritems():
        param_grid["clf__" + k] = v

    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=cross_validation.StratifiedShuffleSplit(y, folds),
        verbose=1,
        scoring='f1',
        error_score=0,
        refit=True,
        )
    grid_search.fit(X, y)
    return (grid_search.best_estimator_, grid_search.best_score_, pipeline.fit(X,y))
Example #18
class Model10(Model):
  def __init__(self):
    pass
  def fit(self, Xmask, y):
    pr = prepare.Prepare_0(model=10, preproc=1, min_df=1, use_svd=False, tfidf=2,
        stemmer=0)
    (X_all_df,_,BP,params) = pr.load_transform(update=False)
    names = list(X_all_df.columns)
    X_all = np.asarray(X_all_df)
    self.X_all, self.names = X_all, names

    clf0 = GaussianNB()
    clf1 = MultinomialNB(alpha=0.8)
    clf2 = BernoulliNB(alpha=1, binarize=0.01)

    clf = clf1
    self.rd = Pipeline([
        ("trans", Transformer(names=self.names, X_all=X_all, BP=BP)),
        #("scaler",StandardScaler(with_mean=False)), 
        ("est", clf)
        ])

    self.rd.fit(Xmask,np.asarray(y))
    return self
  def predict_proba(self, Xmask):
    return self.rd.predict_proba(Xmask)
  def predict(self, Xmask):
    return self.rd.predict(Xmask)
  
  def starter(self):
    print "Model10 starter"
    self.fit(np.arange(100),np.arange(100))
Example #19
class Classifier:
    def __init__(self, clf, scaler=None, selector=False):
        if scaler:
            if selector:
                self.clf = Pipeline([
                    ('scaler', scaler),
                    ('selector', SelectFromModel(SELECTOR_POOL['extra_trees_classifier'], .001)),
                    ('classifier', clf)
                ])
            else:
                self.clf = Pipeline([
                    ('scaler', scaler),
                    ('classifier', clf)
                ])
        else:
            if selector:
                self.clf = Pipeline([
                    ('selector', SelectFromModel(SELECTOR_POOL['extra_trees_classifier'], .001)),
                    ('classifier', clf)
                ])
            else:
                self.clf = clf

    def __str__(self):
        if isinstance(self.clf, Pipeline):
            return ', '.join(type(v).__name__ for k, v in self.clf.steps)
        return type(self.clf).__name__

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
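A minimal usage sketch of this wrapper (SELECTOR_POOL is defined elsewhere in the original module; the estimator, scaler, and the X_train/y_train/X_test arrays below are arbitrary placeholders):

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

model = Classifier(LogisticRegression(), scaler=StandardScaler())
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(model)   # e.g. "StandardScaler, LogisticRegression"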
Example #20
    def op_machine_predict(self):
        """
        与machine_predict的区别在于,op_machine_predict在每个window中都通过grid_search
        的方法确定最后的参数。该模型的训练及预测步骤如下:
        对于每一个窗口的数据
        1) 对输入的ta_factors进行标准化的处理
        2) Feature selection:方法可选择
        3) PCA降维
        4) 训练并Grid_Search
        """
        ta_factors, labels = self.set_factors_labels()
        svc = SVC(kernel='linear')
        min_max_scaler = preprocessing.MinMaxScaler()
        pre = pd.DataFrame(index=ta_factors.index[self.window_size:], columns=['pre_label', 'pre_actual'])
        Cs = range(10, 100, 10)
        gammas = range(5, 100, 5)
        n_s = self.window_size
        for num in range(0, len(ta_factors)-n_s):
            ta_factors_scaled = min_max_scaler.fit_transform(ta_factors.ix[num:num+n_s+1])
            x_train = ta_factors_scaled[:-1]
            x_test = ta_factors_scaled[-1:]
            y_train = labels[num:num+n_s]
            y_test = labels[num+n_s]
            # ta_factors_scaled_pca = pca.fit_transform(ta_factors_scaled)
            rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y_train, 2))
            clf = Pipeline([('feature_select', rfecv), ('svm', SVC())])
            # estimator = GridSearchCV(clf, dict(svm__C=Cs, svm__gamma=gammas))
            pre_model = clf.fit(x_train, y_train)
            pre['pre_label'][num] = pre_model.predict(x_test).item()
            pre['pre_actual'][num] = y_test

        pre['pre_acu'] = pre['pre_label'] == pre['pre_actual']
        self.prediction_results = pre

        return pre
Example #21
def train_polynomialRegressionModel(X, y, degree=2, interaction_only=False, include_bias=True):
    """
    Train a polynomial model using Linear Regression Pipeline with degrees
    """
    model = Pipeline([("poly", PolynomialFeatures(degree=degree)), ("linear", LinearRegression(fit_intercept=False))])
    model = model.fit(X, y)
    return model
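A quick usage sketch with synthetic data; the bias column added by PolynomialFeatures is the reason fit_intercept=False is safe on the LinearRegression step:

import numpy as np

X = np.arange(10).reshape(-1, 1)
y = 2 * X.ravel() ** 2 + 3
model = train_polynomialRegressionModel(X, y, degree=2)
print(model.predict([[12]]))   # close to 2*12**2 + 3 = 291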
Example #22
def train_regressor(data, X_columns, y_show=y_init+y_curr):
    X = data.loc[:,X_columns]
    ys = data.loc[:, [i for i in y_show if i not in X_columns]]
    
    print()
    for n_trees in [256]:
    #list(range(4, 16)) + [18,20] + [2**n for n in range(4, 12)]:
    #[n for n in range(4, 64)]:#[2**n for n in range(1, 12)]:
        forest = Pipeline(steps=[
            ('forest', ExtraTreesRegressor(
                #RandomForestRegressor(
                n_estimators=n_trees, 
                n_jobs=min(n_trees, 62),
                oob_score=True, bootstrap=True))])
        start = time()
        forest.fit(X, ys)#new_ys)
        end = time()
        print(n_trees, forest.steps[0][1].oob_score_, end-start)
    
    print()
    print("%.5g seconds to train regressor" % (end-start))
    print()
    
    y_names = ys.columns
    X_names = X.columns
    return [forest, y_names, X_names]
def classify(text, label):
    #~ Testing purpose: 10-fold cross validation
    cv = KFold(n = len(label), n_folds = 10)
    n_c = [100, 200, 500, 1000, 2000, 5000, 10000]

    for i in n_c:
        clf = Pipeline([
                ('vect',
                        TfidfVectorizer(
                                analyzer='word',
                                ngram_range=(1, 1),
                                stop_words = 'english',
                                lowercase=True,
                                token_pattern=r'\b\w+\b',
                                tokenizer=tokenize_doc,
                                min_df = 1)),
                ('dim_reduction',
                        TruncatedSVD(n_components=i)),
                #~ ('feature_selection',
                        #~ SelectKBest(
                                #~ chi2,
                                #~ k=35)),
                ('classification',
                        LogisticRegression())
                        #~ SVC(kernel = 'linear'))
        ])
    
        print "len(label) ", len(label), " | text ", len(text)
        print ""
    
        clf.fit(np.asarray(text), np.asarray(label))
    
        cv_score = cross_val_score(clf, text, label, cv = cv, verbose = 1)
        print "Log Reg | n_c = ", i
        print "Accuracy List ", cv_score, " | Avg Accuracy ", np.mean(cv_score)
Example #24
class Vectorizer():
    def __init__(self, hash=False, min_df=0.015, max_df=0.9):
        """
        `min_df` is set to filter out extremely rare words,
        since we don't want those to dominate the distance metric.

        `max_df` is set to filter out extremely common words,
        since they don't convey much information.
        """

        if hash:
            args = [
                ('vectorizer', HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer())),
                ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
                ('feature_reducer', TruncatedSVD(n_components=400)),
                ('normalizer', Normalizer(copy=False))
            ]
        else:
            args = [
                ('vectorizer', CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer(), min_df=min_df, max_df=max_df)),
                ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
                ('normalizer', Normalizer(copy=False))
            ]

        self.pipeline = Pipeline(args)

    def vectorize(self, docs, train=False):
        if train:
            return self.pipeline.fit_transform(docs)
        else:
            return self.pipeline.transform(docs)

    @property
    def vocabulary(self):
        return self.pipeline.named_steps['vectorizer'].get_feature_names()
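A minimal usage sketch (Tokenizer comes from the original module; the documents are made up):

docs = ["the cat sat on the mat", "dogs chase cats", "markets fell sharply today"]
vec = Vectorizer(hash=False, min_df=1, max_df=1.0)
X_train = vec.vectorize(docs, train=True)
X_new = vec.vectorize(["another short document"])
print(vec.vocabulary)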
Example #25
def train(docs):
    """
    Trains and serializes (pickles) a vectorizing pipeline
    based on training data.

    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer(), min_df=0.015, max_df=0.9)),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('feature_reducer', TruncatedSVD(n_components=100)),
        ('normalizer', Normalizer(copy=False))
    ])

    print('Training on {0} docs...'.format(len(docs)))
    pipeline.fit(docs)

    # Note: without a `global PIPELINE` statement this assignment only binds a local
    # name, so a module-level PIPELINE is not updated here.
    PIPELINE = pipeline

    print('Serializing pipeline to {0}'.format(PIPELINE_PATH))
    with open(PIPELINE_PATH, 'wb') as pipeline_file:
        pickle.dump(pipeline, pipeline_file)
    print('Training complete.')
Example #26
 def test_multiple_cols(self):
     t = bt.Split_transform(input_features=["a","b"],output_feature="res")
     df = pd.DataFrame.from_dict([{"a":"a b","b":"c d","c":3},{"a":"word1","b":"word2"}])
     transformers = [("split_transform",t)]
     p = Pipeline(transformers)
     df2 = p.transform(df)
     self.assertTrue(len(df2["res"][0]) == 4)
Example #27
 def test_multiple_cols_numbers_ignored(self):
     t = bt.Split_transform(input_features=["a","b"],ignore_numbers=True,output_feature="res")
     df = pd.DataFrame.from_dict([{"a":"a b","b":"c 1","c":3}])
     transformers = [("split_transform",t)]
     p = Pipeline(transformers)
     df2 = p.transform(df)
     self.assertTrue(len(df2["res"][0]) == 3)
class ModelPipeline(object):

    def __init__(self, clf):

        self.columns =[]

        self.pipeline = Pipeline([
            ('clf', clf)
            ])



    def fit(self, X_train, y_train):
        self.pipeline.fit(X_train, y_train)
        self.columns = list(X_train.columns)

    def predict(self, X_test):
        return self.pipeline.predict(X_test)


    def feat_importances(self, n=10, string=True):

        imp = self.pipeline.steps[0][1].feature_importances_
        if string:
            return ''.join('%s: %s%%\n' % (self.columns[feat], round(
                imp[feat] * 100, 3)) for feat in np.argsort(imp)[-1:-(n+1):-1])
        else:
            # self.columns is a plain list, so index it element-wise
            top = np.argsort(imp)[-1:-(n+1):-1]
            return [self.columns[feat] for feat in top], sorted(imp)[-1:-(n+1):-1]

    def grid_search(self, X, y):

        parameters = {
            'clf__n_estimators': [100, 200, 300] ,
            'clf__max_features': ['sqrt', 50, 80],
            'clf__max_depth': [None, 50, 100],
            'clf__oob_score': [False, True],
            'clf__random_state':[29],
            'clf__class_weight':['balanced', None, 'balanced_subsample'],
            'clf__min_samples_split': [2, 10, 20]
        }


        grid_search = GridSearchCV(self.pipeline, parameters, n_jobs=-1, verbose=1, scoring = "recall")

        print("Performing grid search...")
        print("pipeline:", [name for name, _ in self.pipeline.steps])
        print("parameters:")
        pprint(parameters)
        t0 = time()
        grid_search.fit(X, y)
        print("done in %0.3fs" % (time() - t0))
        print()

        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
        return best_parameters
Example #29
 def test_sklearn_pipeline(self):
     df = pd.DataFrame.from_dict([{"a":"something","b":1},{"a":"something2"}])
     t = bt.Exclude_features_transform(excluded=["b"])
     transformers = [("exclude_transform",t)]
     p = Pipeline(transformers)
     df2 = p.fit_transform(df)
     self.assertEquals(len(df2.columns),1)
Example #30
def predict():

    pipeline = Pipeline([
        ('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('neural network', Classifier(layers=[Layer("ExpLin", units=5), Layer("Softmax")], n_iter=25))])

    X = np.load('All_features.npz')['arr_0']

    D = np.load('Akunin_features.npz')['arr_0']

    all_samples = [1]*141 + [0]*123
    y = np.array(all_samples)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0, random_state=0)

    pipeline.fit(X_train, y_train)
    pickle.dump(pipeline, open('NeuralNet_model.pkl', 'wb'))
    prediction = pipeline.predict(D)
    probs = pipeline.predict_proba(D)

    gradation = {1.01: 5, 0.9: 4, 0.8: 3, 0.7: 2, 0.6: 1}
    ress1 = []
    simple_predicts = []
    scale_predicts = []
    for i in prediction:
        simple_predicts.append(i[0])
    for i in probs:
        scale_predicts.append(i[1]*10)
        compare = []
        for u in gradation:
            if i[1] < u:
                compare.append(gradation[u])
        ress1.append(min(compare))

    return simple_predicts, scale_predicts
def model_pipeline(X, Y, model_params):
    if (model_params["name"] == 'logistic'):
        if model_params["bin"]:
            pclf = Pipeline([
                ('vect', CountVectorizer(stop_words='english', max_features=model_params["num_words"],
                                     ngram_range=model_params['ngram'], binary=model_params['bin'])),
                ('norm', Normalizer()),
                ('clf', LogisticRegression(solver='lbfgs')),
            ])
        else:
            pclf = Pipeline([
                ('vect', CountVectorizer(stop_words='english', max_features=model_params["num_words"],
                                         ngram_range=model_params['ngram'], binary=model_params['bin'])),
                ('tfidf', TfidfTransformer()),
                ('norm', Normalizer()),
                ('clf', LogisticRegression(solver='lbfgs')),
            ])
    elif (model_params["name"] == 'tree'):
        pclf = Pipeline([
            ('vect', CountVectorizer(stop_words='english', max_features=model_params["num_words"],
                                     ngram_range=model_params['ngram'], binary=model_params['bin'])),
            ('tfidf', TfidfTransformer()),
            ('norm', Normalizer()),
            ('clf', tree.DecisionTreeClassifier(max_depth=model_params['depth'])),
        ])
    elif((model_params["name"] == 'svm')) :
        pclf = Pipeline([
            ('vect', CountVectorizer(stop_words='english', max_features=model_params["num_words"], ngram_range=model_params['ngram'] , binary = model_params['bin'] )),
            ('tfidf', TfidfTransformer()),
            ('norm', Normalizer()),
            ('scaler', StandardScaler(with_mean=False)),
            ('clf', svm.SVC(max_iter=model_params["iter"], gamma="scale", probability=model_params["prob"])),
        ])
    elif ((model_params["name"] == 'gnb')):
        pclf = Pipeline([
            ('vect', CountVectorizer(stop_words='english', max_features=model_params["num_words"], ngram_range=model_params['ngram'] , binary = model_params['bin'] )),
            # ('tfidf', TfidfTransformer()),
            ('norm', Normalizer()),
            ('clf', MultinomialNB()),
            ])
    elif ((model_params["name"] == 'stacklog')):
        pclf = StackingLogistic(model_params)
    pclf.fit(X, Y)
    # print("xddqsdfd")
    return pclf
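For illustration, a model_params dict for the logistic branch might look like this; the key names are taken from the function above, while the values (and the raw-text X and labels Y) are hypothetical:

params = {"name": "logistic", "bin": False, "num_words": 10000, "ngram": (1, 2)}
pclf = model_pipeline(X, Y, params)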
Example #32
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn import metrics


filepath = unicode('20news-bydate-train','utf-8')
rawData = datasets.load_files(filepath,encoding="latin1")

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(rawData.data)

tfidf_transformer = TfidfTransformer()
tfid = tfidf_transformer.fit_transform(x_train_counts)

clf = MultinomialNB().fit(tfid,rawData.target)


test_clf = Pipeline([
	('vect',CountVectorizer()),
	('tfid',TfidfTransformer()),
	('clf',MultinomialNB()),
	])
test_clf.fit(rawData.data , rawData.target)
testData = datasets.load_files("20news-bydate-test",encoding="latin1")
predicted = test_clf.predict(testData.data)
result = metrics.classification_report(testData.target,predicted,target_names = testData.target_names)
print(result)

Example #33
import views_utils.dbutils as dbutils
sys.path.insert(0, "../../../osa")
from osa.wrapper_sm import SMLogit

import osa.utils as osa

uname = "VIEWSADMIN"
prefix = "postgresql"
db = "views"
port = "5432"
hostname = "VIEWSHOST"
connectstring = dbutils.make_connectstring(prefix, db, uname, hostname, port)

rf_500 = RandomForestClassifier(n_estimators=500, n_jobs=10)
scaler = StandardScaler()
pipe_rf_500 = Pipeline([('scaler', scaler), ('rf', rf_500)])

output_schema = "landed_test"
output_table = "osa_pgm_acled_protest_eval_calib_pr"

models = [{
    "dir_pickles":
    "$SNIC_TMP/osa/pickles/osa_pgm_acled_protest_eval_calib_pr/pgm_acled_protest_eval_calib_logit_fullsample_pr",
    "estimator":
    SMLogit(),
    "features": [
        "l2_acled_dummy_pr", "l3_acled_dummy_pr", "l4_acled_dummy_pr",
        "l5_acled_dummy_pr", "l6_acled_dummy_pr", "l7_acled_dummy_pr",
        "l8_acled_dummy_pr", "l9_acled_dummy_pr", "l10_acled_dummy_pr",
        "l11_acled_dummy_pr", "l12_acled_dummy_pr", "q_1_1_l2_acled_dummy_pr",
        "q_1_1_l3_acled_dummy_pr", "l1_acled_dummy_pr",
    'SURF_medissim': SURF(learned_metric_func=produce_learned_metric_func),
    'SURF': SURF(),
}

# Initialize dictionary for storing results.
res_dict = dict.fromkeys(rbas.keys())
for key in res_dict.keys():
    res_dict[key] = np.empty(NUM_FEATURES_TO_SELECT_LIM, dtype=np.float)

# Go over RBAs.
for rba_name in rbas.keys():

    print("### Testing {0} ###".format(rba_name))

    # Initialize next pipeline.
    clf_pipeline = Pipeline([('scaling', StandardScaler()),
                             ('rba', rbas[rba_name]), ('clf', clf)])

    # Go over values on x axis.
    for num_features_to_select in np.arange(1, NUM_FEATURES_TO_SELECT_LIM + 1):

        print("{0}/{1}".format(num_features_to_select,
                               NUM_FEATURES_TO_SELECT_LIM))

        # Set parameter.
        clf_pipeline.set_params(
            rba__n_features_to_select=num_features_to_select)

        # Compute score of 10 runs of 10 fold cross-validation.
        score = np.mean(
            cross_val_score(clf_pipeline,
                            data,
Example #35
    def __init__(self, **kwargs):
        """
        set _node_transformer, _edge_transformer, tdifNodeTextVectorizer
        """
        FeatureDefinition.__init__(self)

        nbTypes = self._getTypeNumber(kwargs)

        block_transformer = FeatureUnion(
            [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                (
                    "xywh",
                    Pipeline([
                        ('selector', NodeTransformerXYWH_v2()),
                        #v1 ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                        ('xywh',
                         QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                             copy=False)
                         )  #use in-place scaling
                    ])),
                (
                    "neighbors",
                    Pipeline([
                        ('selector', NodeTransformerNeighbors()),
                        #v1 ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                        ('neighbors',
                         QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                             copy=False)
                         )  #use in-place scaling
                    ])),
                (
                    "1hot",
                    Pipeline([('1hot', Node1HotFeatures()
                               )  #does the 1-hot encoding directly
                              ]))
            ])
        grid_line_transformer = GridLine_NodeTransformer_v2()

        self._node_transformer = TransformerListByType(
            [block_transformer, grid_line_transformer])

        edge_BB_transformer = FeatureUnion(
            [  #CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
                ("1hot",
                 Pipeline([('1hot',
                            Edge1HotFeatures(PageNumberSimpleSequenciality()))
                           ])),
                ("boolean", Pipeline([('boolean', EdgeBooleanFeatures_v2())])),
                (
                    "numerical",
                    Pipeline([
                        ('selector', EdgeNumericalSelector()),
                        #v1 ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  #use in-place scaling
                        ('numerical',
                         QuantileTransformer(n_quantiles=self.n_QUANTILES,
                                             copy=False)
                         )  #use in-place scaling
                    ]))
            ])
        edge_BL_transformer = Block2GridLine_EdgeTransformer()
        edge_LL_transformer = GridLine2GridLine_EdgeTransformer()
        self._edge_transformer = TransformerListByType([
            edge_BB_transformer,
            edge_BL_transformer,
            edge_BL_transformer,  # useless but required
            edge_LL_transformer
        ])

        self.tfidfNodeTextVectorizer = None  #tdifNodeTextVectorizer
Example #36
plt.scatter(x_train.T[0],
            x_train.T[1],
            c=y_train.ravel(),
            edgecolors='k',
            s=40,
            cmap=plt_dark)  # all of the data
plt.xlabel(u'特征属性1', fontsize=15)
plt.ylabel(u'特征属性2', fontsize=15)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.grid(True)
plt.title(u'鸢尾花数据的决策树分类', fontsize=18)
plt.show()

# parameter tuning
pipe = Pipeline([('mms', MinMaxScaler()), ('skb', SelectKBest(chi2)),
                 ('pca', PCA()), ('decision', DecisionTreeClassifier())])

# parameter grid
parameters = {
    "skb__k": [1, 2, 3, 4],
    "pca__n_components": [0.5, 1.0],
    "decision__criterion": ["gini", "entropy"],
    "decision__max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
}
# data
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
# build the model
gscv = GridSearchCV(pipe, param_grid=parameters)
# train the model
gscv.fit(x_train2, y_train2)
# optimal solution found by the search
Example #37
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA

import time

start_computing_time = time.time()

estimators = [("PCA", PCA()), ("nmz", Normalizer()), ("LSVC", LinearSVC())]

model = Pipeline(steps=estimators)

param_grid = {"LSVC__loss": ['hinge'], "LSVC__class_weight": ['balanced']}

Grid = GridSearchCV(model, param_grid=param_grid, cv=5)
Grid.fit(X_train, Y_train)

Best_Grid_estimator = Grid.best_estimator_
Best_Grid_estimator.fit(X_train, Y_train)

print(Best_Grid_estimator)

pred = Best_Grid_estimator.predict(X_test)

print("Accuracy of predictions:")
print(accuracy_score(Y_test, pred))
Example #38
from sklearn.preprocessing import StandardScaler

from nlp4musa2020.dataloaders.alf200k import ALF200KLoader
from nlp4musa2020.dataloaders.alf200k import genre_target_labels
import nlp4musa2020.evaluators as evaluators

dataloader = ALF200KLoader(
    path='data/processed/dataset-lfm-genres.pickle',
    load_feature_groups=[
        'audio',
    ],
    text_vectorizers=None,
    target=genre_target_labels(),
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(n_jobs=-1)),
])

evaluator = GridEvaluator(
    parameters={
        'model__n_estimators': [10, 100, 300],
    },
    grid_parameters=evaluators.grid_parameters_genres(),
)

result_handlers = [
    result_handlers.print_gridsearch_results,
]
Example #39
def train_and_fit_models(df_preprocessed, filename_input, param_grid=[], save_all=False, personal_note=""):
    """
    Method to train model(s) on a preprocessed dataset and return the gridsearch object.

    Parameters
    ----------
    df_preprocessed : pd.DataFrame()

    filename_input : str
    param_grid : list(dict(str:list()))
    save_all: Bool
        Boolean value to save DataFrame and settings
    personal_note : str
        String value to add to the filename to make recognition of the saved files easier.

    Returns
    -------
    Complete gridsearch object, where `gridsearchobject.best_estimator_` will give the best model.
    """

    # Delete rows where target value is NaN
    df = df_preprocessed.copy()
    df = drop_nan_from_specific_columns(df, settings.train['Y_VALUE'])

    # Make X and y dataset
    X = df.drop(settings.Y_TARGET_COLS, axis=1)
    y = df[settings.train['Y_VALUE']]
    # Split X and y into train and test dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=settings.train['TEST_SIZE'], random_state=settings.train['RANDOM_STATE'])

    # Construct basic pipeline for gridsearch
    pl_gs_total = Pipeline([('clf', LinearRegression())])  # Placeholder Estimator

    # Make/use param_grid for all classifiers and hyper parameters
    if len(param_grid) == 0:
        param_grid_total = [{'clf': [LinearRegression()],
                             'clf__normalize': settings.train['GRIDSEARCH_NORMALIZE'], },

                            {'clf': [Ridge()],
                             'clf__alpha': settings.train['GRIDSEARCH_ALPHA']},

                            {'clf': [Lasso()],
                             'clf__alpha': settings.train['GRIDSEARCH_ALPHA']},

                            {'clf': [KNeighborsRegressor()],
                             'clf__n_neighbors': settings.train['GRIDSEARCH_NEIGHBORS']},

                            {'clf': [XGBRegressor()],
                             'clf__gamma': settings.train['GRIDSEARCH_GAMMA'],
                             'clf__n_estimators': settings.train['GRIDSEARCH_N_ESTIMATORS']},
                            ]
    else:
        param_grid_total = param_grid

    # Initiate gridsearch object
    grid_search_total = GridSearchCV(pl_gs_total, param_grid_total, cv=settings.train['CROSS_VALIDATE'],
                                     scoring=settings.train['MODEL_SCORING'],
                                     return_train_score=True)

    # Fit gridsearch object on to the data
    grid_search_total.fit(X_train, y_train)
    # Get the best estimator (based on best train score)
    print(f"The model with the best train score is:\n{grid_search_total.best_estimator_['clf']}")
    # Calculate the RMSE for best estimator
    print(f"This model has a train score (RMSE) of: {rmse_from_neg_mean_squared_error(grid_search_total.best_score_)}")
    print(
        f"This model has a test score (RMSE) of: {rmse_from_gridsearch_best_estimator(grid_search_total, X_test, y_test)}")

    # Save
    if save_all:
        # Save the best estimator of the gridsearch in a Pickle file
        suffix_datetime = datetime.strftime(datetime.now(), format='%Y%m%d%H%M')
        filename_output = f'best_model_{suffix_datetime}_{personal_note}'
        pickle.dump(grid_search_total.best_estimator_, open(f'{settings.DATAPATH}{filename_output}.pickle', 'wb'))

        # Save log of train step
        df_log = pd.DataFrame({"Model": [split_clf_and_params(grid_search_total.best_estimator_['clf'])[0]],
                              "Gridsearch_Params": [
                                  split_clf_and_params(grid_search_total.best_estimator_['clf'])[1]],
                              "Train_RMSE": [rmse_from_neg_mean_squared_error(grid_search_total.best_score_)],
                              "Test_RMSE": [rmse_from_gridsearch_best_estimator(grid_search_total, X_test, y_test)],
                              "Number_of_features": [len(X.columns)],
                              "Y_value": settings.train['Y_VALUE'],
                              "Input_filename": [filename_input],
                              "Output_filename": [filename_output],
                              })
        df_log.to_csv(settings.train['LOG_PATH'] + filename_output + '.csv', index=False,
                                     header=True)

    return grid_search_total
Example #40
class SimpleModel:
    def __init__(self, train_dataset, val_dataset):
        tfidf = TfidfVectorizer(
            min_df=0.01,
            max_df=0.9,
            max_features=10000,
            # stop_words='english',
            use_idf=True,
            ngram_range=(1, 3),
            tokenizer=tokenize_text)

        self.model = Pipeline([("Vectorizer", tfidf),
                               ("model", XGBClassifier(seed=utils.RANDOM_SEED))
                               ])
        self.search_parameters = {
            "Vectorizer__min_df": [0, 0.05, 0.1],
            "Vectorizer__max_df": [0.9, 0.95, 1],
            "Vectorizer__max_features": [1000, 5000, 10000],
            "Vectorizer__ngram_range": [(1, 2), (1, 3), (1, 4)]
        }
        self.train_X = train_dataset["opinion"]
        self.train_y = train_dataset["outcome"]
        self.val_X = val_dataset["opinion"]
        self.val_y = val_dataset["outcome"]
        split = PredefinedSplit(
            np.concatenate(
                (np.repeat(-1,
                           len(self.train_y)), np.repeat(0, len(self.val_y)))))
        self.search = GridSearchCV(self.model,
                                   self.search_parameters,
                                   cv=split,
                                   scoring="f1",
                                   verbose=2,
                                   n_jobs=4)

    def fit(self):
        X = np.concatenate((self.train_X, self.val_X))
        y = np.concatenate((self.train_y, self.val_y))
        self.search.fit(X, y)
        self.model = self.search.best_estimator_
        return self.search.cv_results_

    def evaluate(self, dataset):
        return self.model.score(dataset["opinion"], dataset["outcome"])

    def load(self, result_dataset):
        params = result_dataset.sort_values("rank_test_score")
        params["param_Vectorizer__ngram_range"] = params["param_Vectorizer__ngram_range"]\
            .apply(lambda s: s.replace("(", "").replace(")", "").split(", "))\
            .apply(lambda q: (int(q[0]), int(q[1])))
        tfidf = TfidfVectorizer(
            min_df=params["param_Vectorizer__min_df"].values[0],
            max_df=params["param_Vectorizer__max_df"].values[0],
            max_features=params["param_Vectorizer__max_features"].values[0],
            # stop_words='english',
            use_idf=True,
            ngram_range=params["param_Vectorizer__ngram_range"].values[0],
            tokenizer=tokenize_text)
        self.model = Pipeline([("Vectorizer", tfidf),
                               ("model", XGBClassifier())])
        self.model.fit(self.train_X, self.train_y)

    def predict(self, documents):
        if type(documents) == str:
            return self.model.predict_proba([documents])
        return self.model.predict_proba(documents)

    def get_model_vocabulary(self):
        return self.model.named_steps["Vectorizer"].vocabulary_

    def get_model_max_ngrams(self):
        return self.model.get_params()["Vectorizer__ngram_range"][1]
Example #41
    sentiments.append(int(elements[0]))

# SVM Solution
# Data_txt preproccessing - tokenization, selecting the best features
vectorizer = CountVectorizer(tokenizer=negation_handling.tokenize,
                             ngram_range=(1, 1),
                             max_df=0.5,
                             lowercase=False)
tfidfTans = TfidfTransformer(use_idf=True,
                             sublinear_tf=True,
                             smooth_idf=False,
                             norm='l2')

classifier = Pipeline([
    ('vect', vectorizer),
    ('tfidf', tfidfTans),
    ('feature_selection', SelectPercentile(chi2, percentile=93)),
    ('clf', LinearSVC(C=0.10000000000000001, multi_class='ovr')),
])

print "With negation handling: "
skf = cross_validation.StratifiedKFold(sentiments, n_folds=5)
scores = cross_validation.cross_val_score(classifier,
                                          sentences,
                                          sentiments,
                                          cv=skf,
                                          scoring='f1')

# print "Without negation handling: "
# vectorizer = CountVectorizer(tokenizer=None, ngram_range=(1, 1), max_df=0.5, lowercase=False)
# tfidfTans = TfidfTransformer(use_idf=True, sublinear_tf=True, smooth_idf=False, norm='l2')
#
Example #42
                 header=None,
                 low_memory=False)
df, _ = prep_data('')
print(df.describe())

print('=== linear regression ===')
regr = linear_model.LinearRegression()
print('r2 = %.2f' % cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean())
print('rmse = %.2f' %
      np.sqrt(-1 * cross_val_score(regr,
                                   df.iloc[:, :-1],
                                   df.iloc[:, -1:],
                                   cv=10,
                                   scoring='neg_mean_squared_error')).mean())
regr = Pipeline([('trans', preprocessing.StandardScaler()), ('regr', regr)])
print('r2 = %.2f' % cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean())
print('rmse = %.2f' %
      np.sqrt(-1 * cross_val_score(regr,
                                   df.iloc[:, :-1],
                                   df.iloc[:, -1:],
                                   cv=10,
                                   scoring='neg_mean_squared_error')).mean())
print('=== ridge ===')
regr = linear_model.Ridge(alpha=.05)
print(cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10).mean())
print('=== lasso ===')
regr = linear_model.Lasso(alpha=.05)
print(cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10).mean())
print('=== Poly Linear ===')
Example #43
from sklearn.model_selection import cross_validate
import itertools

from sklearn.model_selection import GridSearchCV

# enter file name of data you want to analyze
filename = ''
full_dataset = pd.read_pickle("./" + str(filename) + " SVD.pkl")
print(full_dataset.head())

X = full_dataset.drop(['X', 'Y', 'Z', 'F'], axis=1)
y = full_dataset['F']

scaler = StandardScaler()
mlp = MLPRegressor(early_stopping=True)
pipeline = Pipeline([('transformer', scaler), ('estimator', mlp)])

parameters = {
    'estimator__learning_rate': ['constant'],
    'estimator__learning_rate_init': [0.005],
    'estimator__hidden_layer_sizes':
    [x for x in itertools.product((20, 50, 100, 150, 200), repeat=2)],
    'estimator__activation': ['tanh', 'relu', 'logistic'],
    'estimator__max_iter': [5000],
    'estimator__batch_size': [20, 50, 100, 150, 200]
}

clf = GridSearchCV(pipeline, parameters, cv=5)

clf.fit(X, y)
print("Best parameter (CV score=%0.3f):" % clf.best_score_)

X_train = corpus_train_tfidf_kpca 
X_test = corpus_test_tfidf_kpca
y_train = train_category
y_test = test_category

#Initialize K-Fold for cross validation
K = 5
kfold = KFold(n_splits=K, random_state=seed)

#Create Pipeline
estimators = []
estimators.append(('Normalizer', Normalizer()))
estimators.append(('knn_clf', KNeighborsClassifier()))
reg_knn_pipe1 = Pipeline(estimators)
reg_knn_pipe1.set_params(knn_clf__algorithm='ball_tree',knn_clf__weights='uniform')

#Create a grid search over n_neighbors values
parameters = {
        'knn_clf__n_neighbors' : np.arange(5,50)
}
estimator_knnreg = GridSearchCV(reg_knn_pipe1, parameters, cv=kfold)
                  
#Evaluate the grid search and print best regressor
print('Starting Grid Search')
estimator_knnreg.fit(X_train, y_train)

alphas = [x['knn_clf__n_neighbors'] for x in estimator_knnreg.cv_results_['params']]
means = [x for x in estimator_knnreg.cv_results_['mean_test_score']]
stds = [x for x in estimator_knnreg.cv_results_['std_test_score']]
Example #45
                      column_params={'diag': {
                          'top_n': 200,
                          'min_support': 0
                      }})),
       ('imputer', Imputer(missing_values='NaN', strategy='median')),
       ('scaler', StandardScaler())]

model_stack = [
    fe1 + [('lr', LogisticRegression(class_weight="balanced"))], fe2 +
    [('rf', RandomForestClassifier(random_state=1, class_weight="balanced"))],
    fe2 +
    [('xgb',
      XGBClassifier(seed=1, scale_pos_weight=(1 / np.mean(ydata_train) - 1)))]
]

model_stack = [(m[-1][0], Pipeline(steps=m)) for m in model_stack]

# hyperparameter tuning for each model individually
ss = ShuffleSplit(n_splits=5, train_size=0.25, random_state=1)
tuning_constants = {
    'scoring': 'roc_auc',
    'cv': ss,
    'verbose': 1,
    'refit': False
}
grid_search_tuning_arg = tuning_constants.copy()
rand_search_tuning_arg = dict(tuning_constants, **{
    'random_state': 1,
    'n_iter': 20
})
tuning_types = {
def stem_tokens(tokens, stemmer):
    stemmed = []
    for token in tokens:
        stemmed.append(stemmer.stem(token))
    return stemmed


def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


#obtain stop words
stop_words = text.ENGLISH_STOP_WORDS

#define pipeline for tokenizing, feature extraction, feature selection, and softSVC
from sklearn.model_selection import cross_val_score

parameters = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
for alpha in parameters:
    text_clf = Pipeline([('vect',
                          CountVectorizer(tokenizer=tokenize,
                                          stop_words=stop_words,
                                          analyzer='word')),
                         ('tfidf', TfidfTransformer()),
                         ('dimensionality_reduction',
                          TruncatedSVD(n_components=50, random_state=42)),
                         ('clf', SGDClassifier(alpha=alpha))])
    scores = cross_val_score(text_clf,
                             data,
                             target,
                             cv=5,
                             scoring='f1_weighted')
    print(scores)
Ejemplo n.º 47
class AgeBucket(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X.Age // 15 * 15)


class RelativesOnboard(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X.SibSp + X.Parch)


column_transformer = ColumnTransformer([("Age_Bucket", AgeBucket(), ["Age"]), 
                                        ("Relatives_On_board", RelativesOnboard(), ["SibSp", "Parch"]), 
                                        ("one_hot_enc", OneHotEncoder(), ["Pclass","Sex","Embarked"])], 
                                        remainder='passthrough') 

preprocess_Pipeline = Pipeline([("col_trans", column_transformer), 
                                ("imputer", SimpleImputer(strategy="median"))])

X_train = preprocess_Pipeline.fit_transform(train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']])
y_train = train_data["Survived"]

for i in [SVC(gamma="auto"), RandomForestClassifier(n_estimators=100, random_state=42)]:
    i.fit(X_train, y_train)
    y_pred = i.predict(X_train)
    print(i, '\n', confusion_matrix(y_train, y_pred))
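# The confusion matrices above are computed on the training data itself, which is
# optimistic. A hedged alternative (same X_train/y_train) using cross-validated predictions:
from sklearn.model_selection import cross_val_predict

for model in [SVC(gamma="auto"), RandomForestClassifier(n_estimators=100, random_state=42)]:
    y_cv_pred = cross_val_predict(model, X_train, y_train, cv=5)
    print(model, '\n', confusion_matrix(y_train, y_cv_pred))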
Ejemplo n.º 48
def construct_pipeline(selected_features, selected_classifier):
    feature_pipelines = construct_feature_pipelines(selected_features)
    return Pipeline([('features', FeatureUnion(feature_pipelines)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', classifier_dict[selected_classifier])])
Ejemplo n.º 49
masker = NiftiMasker(mask_img=imag_mask,
                     standardize=True,
                     memory="nilearn_cache",
                     memory_level=5)
X = masker.fit_transform(dataset)
# Apply our condition_mask
X = X[condition_mask]

# PREDICTION FUNCTION
from sklearn.svm import SVC
svc = SVC(kernel='linear', max_iter=1000)

# FEATURE SELECTION
feature_selection = SelectKBest(f_classif, k=500)

anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])

anova_svc.fit(X, y)
#y_pred = anova_svc.predict(X)

##########################################################################

# NESTED CROSS VALIDATION
from sklearn.model_selection import GridSearchCV
k_range = [[15, 50, 150, 300], [500, 1000, 3000, 5000]]

#cv_scores = cross_val_score(anova_svc, X, conditions,)
# Print the results
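# The grid-search half of the nested cross-validation is cut off below; a minimal
# sketch of how it is typically wired up, assuming anova_svc, X and y from above
# (the candidate k values come from k_range):
from sklearn.model_selection import cross_val_score

grid = GridSearchCV(anova_svc,
                    param_grid={'anova__k': k_range[0] + k_range[1]},
                    cv=5)
nested_scores = cross_val_score(grid, X, y, cv=5)
print("Nested CV accuracy: %.3f +/- %.3f" % (nested_scores.mean(), nested_scores.std()))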


def run_CV(params):
grid.fit(X_train_scaled, y_train)
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Best set score: {:.2f}".format(grid.score(X_test_scaled, y_test)))
print("Best parameters: ", grid.best_params_)
# Best cross-validation accuracy: 0.98
# Best set score: 0.97
# Best parameters:  {'C': 1, 'gamma': 1}


# ----------------------------------------------------------------------------------------------
#                       One of the correct approaches using pipelines
# ----------------------------------------------------------------------------------------------

from sklearn.pipeline import Pipeline

pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])
pipe.fit(X_train, y_train)
print("Test score: {:.2f}".format(pipe.score(X_test, y_test)))
# Test score: 0.95 - same as in the initial example

# ----------------------------------------------------------------------------------------------
#                       Pipelines for grid searches
# ----------------------------------------------------------------------------------------------
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
# dictionary keys consist of the pipeline step name ('svm'), a double underscore ('__'), and the parameter name ('C' or 'gamma')


grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
Ejemplo n.º 51
        value['proportion_pay_to_salary'] = (total_payments - salary) / salary
    # If either value is NaN, set proportion_pay_to_salary to NaN
    else:
        value['proportion_pay_to_salary'] = 'NaN'

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.


estimators = [('scaler', MinMaxScaler()), ('skb', SelectKBest()),
                ('gnb', GaussianNB())]
pipe = Pipeline(estimators)

param_grid = dict(skb__k=range(2,7))


### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using tester.py

cv = StratifiedShuffleSplit(labels, test_size=0.3, random_state = 42)

gs = GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring='f1_weighted')

gs.fit(features, labels)

clf = gs.best_estimator_
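# Hypothetical follow-up (not in the original script): check which k the search chose
# and which feature indices SelectKBest kept ('skb' is the step name in the pipeline).
print("Best k:", gs.best_params_['skb__k'])
print("Selected feature indices:", clf.named_steps['skb'].get_support(indices=True))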
Ejemplo n.º 52
#%% Validation for Part2

dims1 = [2, 4, 5, 7, 10, 15, 20, 22, 26]

grid = {
    'pca__n_components': dims1,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
pca = PCA(random_state=5)
mlp = MLPClassifier(solver='lbfgs',
                    activation='identity',
                    max_iter=2000,
                    early_stopping=True,
                    random_state=5)
pipe = Pipeline([('pca', pca), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(loans_X, loans_Y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Loan_dim_red_ownNN.csv')

dims2 = [2, 3, 4, 5, 6, 7, 8, 9, 10]
#dims2 = [2,10]
grid = {'pca__n_components': dims2}
pca = PCA(random_state=5)
mlp = MLPClassifier(solver='lbfgs',
                    activation='logistic',
                    alpha=0.1,
                    hidden_layer_sizes=(50, ),
                    max_iter=2000,
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False,
                                                  tol=1e-3))),
  ('classification', LinearSVC(penalty="l2"))])))
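# Illustrative, self-contained sketch (not part of the benchmark script) of the remark
# above: with an L1-penalized LinearSVC inside SelectFromModel, smaller C means stronger
# regularization and fewer surviving features.
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

X_demo, y_demo = make_classification(n_samples=300, n_features=50,
                                     n_informative=5, random_state=0)
for C in (0.01, 0.1, 1.0):
    sel = SelectFromModel(LinearSVC(C=C, penalty="l1", dual=False,
                                    tol=1e-3, max_iter=5000))
    sel.fit(X_demo, y_demo)
    print("C=%g -> %d features kept" % (C, sel.get_support().sum()))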

# make some plots

indices = np.arange(len(results))

results = [[x[i] for x in results] for i in range(4)]

clf_names, score, training_time, test_time = results
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='navy')
Ejemplo n.º 54
# 1. Build a Pipeline object that defines the order of the data-processing steps
"""
Ridge parameter notes:
  alpha=1.0: the regularization coefficient (the λ from the slides),
  fit_intercept=True: whether the model fits an intercept term (default: True),
  normalize=False: whether to normalize the data before training; usually left unchanged,
  max_iter=None: maximum number of solver iterations; None means no limit,
  tol=1e-3: convergence criterion; the iterative updates stop once the change in the loss is smaller than this value,
  solver="auto": which solver to use.
RidgeCV:
  alphas: the candidate alpha values to search over
  cv: how many cross-validation folds to use
"""
model = Pipeline(steps=[
    ('Poly', PolynomialFeatures()),  # first step, named 'Poly'
    ('Linear', RidgeCV(alphas=[0.1, 0.2, 0.3], cv=5))  # second step, named 'Linear'
])

# 1.2 Set parameters on the Pipeline object
# Poly__degree: 'Poly' is the step name given when the Pipeline was defined, 'degree' is an attribute of that step's object; the two are joined by a double underscore
model.set_params(Poly__degree=2)
model.set_params(Linear__normalize=True)

# 2. Train the model (the first step preprocesses the data, then the second step fits the estimator)
# With n steps, the first n-1 steps run fit + transform and the final step only runs fit
model.fit(X_train, Y_train)
"""
model.fit is equivalent to linear.fit(poly.fit_transform(X_train, Y_train), Y_train)
"""

print("多项式模型:{}".format(model.get_params()['Poly']))
Ejemplo n.º 55
# (e.g., country code, profession, species, etc.), then one-hot encoding will result in a
# large number of input features. This may slow down training and degrade performance.
#  If this happens, you will want to produce denser representations called embeddings,
# but this requires a good understanding of neural networks (see Chapter 14 for more details).
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

#As you can see, there are many data transformation steps
#  that need to be executed in the right order. Fortunately,
# Scikit-Learn provides the Pipeline class to help with such sequences
# of transformations. Here is a small pipeline for the numerical attributes:

#The Pipeline constructor takes a list of name/estimator pairs defining a sequence of steps
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
#When you call the pipeline’s fit() method, it calls fit_transform()
# sequentially on all transformers, passing the output of each call as
# the parameter to the next call, until it reaches the final estimator,
# for which it just calls the fit() method.

housing_num_tr = num_pipeline.fit_transform(housing_num)
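# For illustration only (assuming the same housing_num, Imputer, CombinedAttributesAdder
# and StandardScaler as above): the pipeline call is roughly equivalent to chaining the
# transformers by hand, each one consuming the previous one's output.
imputer = Imputer(strategy="median")
attribs_adder = CombinedAttributesAdder()
std_scaler = StandardScaler()
housing_num_tr_manual = std_scaler.fit_transform(
    attribs_adder.fit_transform(
        imputer.fit_transform(housing_num)))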

num_attributes = list(housing_num)
cat_attributes = ["ocean_proximity"]
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attributes)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
Ejemplo n.º 56
class ModelTransformer(TransformerMixin):

    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict(X))
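# Hypothetical usage sketch (not part of the original code): ModelTransformer turns a
# model's predictions into a feature column, so several base models can be combined in
# a FeatureUnion and blended by a downstream estimator. Note that, fit this way, the
# base models predict on their own training data, so this is only an illustration.
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor

stacked = Pipeline([
    ('base_models', FeatureUnion([
        ('ridge', ModelTransformer(Ridge())),
        ('rf', ModelTransformer(RandomForestRegressor(n_estimators=50, random_state=0))),
    ])),
    ('blender', LinearRegression()),
])
# stacked.fit(X_train, y_train); stacked.predict(X_test)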
    
abstract_pipeline = Pipeline([
           ('extract_text', DataFrameGenericColumnExtractor('abstract'))
         , ('count_vec', HashingVectorizer(analyzer="word", stop_words='english', n_features=1000, binary=False))
         , ('tfidf_vec', TfidfTransformer() ) #TfidfVectorizer(analyzer="word", stop_words='english', max_df=0.5, max_features=1000, min_df=2, use_idf=True))
        ])

authors_pipeline = Pipeline([
           ('extract_text', DataFrameGenericColumnExtractor('keywords'))
         , ('count_vec', HashingVectorizer(analyzer="word", stop_words='english', binary=False))
         , ('tfidf_vec', TfidfTransformer() ) #TfidfVectorizer(analyzer="word", stop_words='english', max_df=0.5, max_features=1000, min_df=2, use_idf=True))
        ])

groups_pipeline = Pipeline([
           ('extract_text', DataFrameGenericColumnExtractor('groups'))
         , ('count_vec', HashingVectorizer(analyzer="word", stop_words='english', binary=False))
         , ('tfidf_vec', TfidfTransformer() ) #TfidfVectorizer(analyzer="word", stop_words='english', max_df=0.5, max_features=1000, min_df=2, use_idf=True))
        ])
Ejemplo n.º 57
    def common_test_model_tfidf_vectorizer_pipeline_cls(
            self, kind=None, verbose=False):
        if kind == 'stop':
            if ort_version.startswith('1.4'):
                # regression with stopwords in onnxruntime 1.4
                stopwords = ['theh']
            else:
                stopwords = ['the', 'and', 'is']
        else:
            stopwords = None
        X_train = numpy.array([
            "This is the first document",
            "This document is the second document.",
            "And this is the third one",
            "Is this the first document?",
        ]).reshape((4, 1))
        y_train = numpy.array([0, 1, 0, 1])

        if kind is None:
            model_pipeline = Pipeline([
                ('vectorizer',
                 TfidfVectorizer(stop_words=stopwords,
                                 lowercase=True,
                                 use_idf=True,
                                 ngram_range=(1, 3),
                                 max_features=30000)),
            ])
        elif kind == 'cls':
            model_pipeline = Pipeline([('vectorizer',
                                        TfidfVectorizer(stop_words=stopwords,
                                                        lowercase=True,
                                                        use_idf=True,
                                                        ngram_range=(1, 3),
                                                        max_features=30000)),
                                       ('feature_selector', SelectKBest(k=10)),
                                       ('classifier',
                                        SVC(class_weight='balanced',
                                            kernel='rbf',
                                            gamma='scale',
                                            probability=True))])
        elif kind == 'stop':
            model_pipeline = Pipeline([
                ('vectorizer',
                 CountVectorizer(stop_words=stopwords,
                                 lowercase=True,
                                 ngram_range=(1, 2),
                                 max_features=30000)),
            ])
        elif kind == 'reg':
            model_pipeline = Pipeline([('vectorizer',
                                        TfidfVectorizer(stop_words=stopwords,
                                                        lowercase=True,
                                                        use_idf=True,
                                                        ngram_range=(1, 3),
                                                        max_features=30000)),
                                       ('feature_selector', SelectKBest(k=10)),
                                       ('classifier',
                                        SVR(kernel='rbf', gamma='scale'))])
        else:
            raise AssertionError(kind)

        model_pipeline.fit(X_train.ravel(), y_train)
        initial_type = [('input', StringTensorType([None, 1]))]
        model_onnx = convert_sklearn(model_pipeline,
                                     "cv",
                                     initial_types=initial_type,
                                     options={SVC: {
                                         'zipmap': False
                                     }})

        if kind in (None, 'stop'):
            exp = [model_pipeline.transform(X_train.ravel()).toarray()]
        elif kind == 'cls':
            exp = [
                model_pipeline.predict(X_train.ravel()),
                model_pipeline.predict_proba(X_train.ravel())
            ]
        elif kind == 'reg':
            exp = [model_pipeline.predict(X_train.ravel()).reshape((-1, 1))]

        sess = InferenceSession(model_onnx.SerializeToString())
        got = sess.run(None, {'input': X_train})
        if verbose:
            voc = model_pipeline.steps[0][-1].vocabulary_
            voc = list(sorted([(v, k) for k, v in voc.items()]))
            for kv in voc:
                print(kv)
        for a, b in zip(exp, got):
            if verbose:
                print(stopwords)
                print(a)
                print(b)
            assert_almost_equal(a, b)
from sklearn.model_selection import GridSearchCV

df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
    header=None)
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=1)

pipe_svc = Pipeline([('sc1', StandardScaler()), ('clf', SVC(random_state=1))])
param_range = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = [{
    'clf__C': param_range,
    'clf__kernel': ['linear']
}, {
    'clf__C': param_range,
    'clf__gamma': param_range,
    'clf__kernel': ['rbf']
}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
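# The snippet stops before the search is run; the natural continuation (an assumption,
# following the usual pattern for this kind of grid) would be:
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

clf = gs.best_estimator_
print('Test accuracy: %.3f' % clf.score(X_test, y_test))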
Ejemplo n.º 59
        splitters.append(convert_dataset.kfoldsplit(tagged_sents, k=k))

    splits = []
    for _ in range(k):
        train_sents = []
        eval_sents = []
        for splitter in splitters:
            train, eval = next(splitter)
            train_sents.extend(train)
            eval_sents.extend(eval)

        splits.append(
            transform_to_dataset(train_sents) +
            transform_to_dataset(eval_sents))

    clf = Pipeline([('vectorizer', DictVectorizer(sparse=False)),
                    ('classifier', LogisticRegression())])

    start = datetime.now()
    scores = np.array(
        Parallel(-1)(delayed(fit_and_score)(clf, *split) for split in splits))
    end = datetime.now()

    acc = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
    print("Accuracy:", acc)

    timestamp = start.isoformat(" ", "seconds")
    commit_id = subprocess.run(
        "git rev-parse --short HEAD".split(" "),
        capture_output=True).stdout.decode("utf-8").strip()

    os.makedirs("results", exist_ok=True)
Ejemplo n.º 60

data_raw=pd.read_csv(r'C:\Users\Mohit\Desktop\web\ML\ML_Project_1\ML Project 1\housing_data.csv')
# data_raw.hist(bins=50, figsize=(20, 15))
# plt.show()

# print(data_raw.info())
#print(data_raw.head())

split=StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index,test_index in split.split(data_raw,data_raw['CHAS']):
    strat_train_set=data_raw.loc[train_index]
    strat_test_set=data_raw.loc[test_index]

my_pipeline=Pipeline(
    [('imputer',SimpleImputer(strategy="median")),
     ('std_scaler',StandardScaler())]
)

strat_train_set_temp=strat_train_set.drop('MEDV',axis=1)

some_data=strat_train_set_temp.iloc[:5]
housing_tr=my_pipeline.fit_transform(strat_train_set_temp)
housing_labels=strat_train_set['MEDV'].copy()
some_labels=housing_labels.iloc[:5]

prepared_dt=my_pipeline.transform(some_data)
model=LinearRegression()

model.fit(housing_tr,housing_labels)
predicted_labels=model.predict(prepared_dt)
print(list(some_labels),predicted_labels)
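# A possible follow-up (not in the original snippet): quantify the fit on the training
# set instead of eyeballing five predictions.
from sklearn.metrics import mean_squared_error
import numpy as np

train_predictions = model.predict(housing_tr)
rmse = np.sqrt(mean_squared_error(housing_labels, train_predictions))
print("Training RMSE:", rmse)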