def train_svm(x_train,
              y_train,
              x_valid,
              y_valid,
              loss,
              penalty,
              alpha,
              n_iter):

    full_svm_classifier = SGDClassifier(loss=loss,
                                        penalty=penalty,
                                        alpha=alpha,
                                        verbose=True,
                                        class_weight='balanced',
                                        n_iter=n_iter,
                                        learning_rate="optimal")
    full_svm_classifier.fit(x_train, y_train)

    cm = confusion_matrix(y_valid,full_svm_classifier.predict(x_valid))

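    # confusion_matrix rows are true classes, columns are predicted classes
    # (index 0 = negative class, index 1 = positive class)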
    accuracy_negative = cm[0,0] / np.sum(cm[0,:])
    accuracy_positive = cm[1,1] / np.sum(cm[1,:])

    precision = cm[1,1] / (cm[1,1] + cm[0,1])
    recall = cm[1,1] / (cm[1,1] + cm[1,0])

    f1_score = 2 * precision * recall / (precision + recall)

    print(accuracy_positive,accuracy_negative,precision,recall,f1_score)

    return full_svm_classifier
Example 2
def test_underflow_or_overflow():
    with np.errstate(all="raise"):
        # Generate some weird data with hugely unscaled features
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 10

        X = rng.normal(size=(n_samples, n_features))
        X[:, :2] *= 1e300
        assert_true(np.isfinite(X).all())

        # Use MinMaxScaler to scale the data without introducing a numerical
        # instability (computing the standard deviation naively is not possible
        # on this data)
        X_scaled = MinMaxScaler().fit_transform(X)
        assert_true(np.isfinite(X_scaled).all())

        # Define a ground truth on the scaled data
        ground_truth = rng.normal(size=n_features)
        y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32)
        assert_array_equal(np.unique(y), [0, 1])

        model = SGDClassifier(alpha=0.1, loss="squared_hinge", n_iter=500)

        # smoke test: model is stable on scaled data
        model.fit(X_scaled, y)
        assert_true(np.isfinite(model.coef_).all())

        # model is numerically unstable on unscaled data
        msg_regxp = (
            r"Floating-point under-/overflow occurred at epoch #.*"
            " Scaling input data with StandardScaler or MinMaxScaler"
            " might help."
        )
        assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y)
Example 3
def algo(a):
    global data
    global week 
    target = data['target']
    data = data[["id", "cpu", "creator", "dbs", "dtype", "era", "nblk", "nevt", "nfiles", "nlumis", "nrel", "nsites", "nusers", "parent", "primds", "proc_evts", "procds", "rnaccess", "rnusers", "rtotcpu", "size", "tier", "totcpu", "wct", "naccess"]]
    week['target'] = 0
    week['target'] = week.apply(convert, axis=1)
    week['target'] = week['target'].astype(int)
    test1 = week
    week = week[["id", "cpu", "creator", "dbs", "dtype", "era", "nblk", "nevt", "nfiles", "nlumis", "nrel", "nsites", "nusers", "parent", "primds", "proc_evts", "procds", "rnaccess", "rnusers", "rtotcpu", "size", "tier", "totcpu", "wct", "naccess"]]
    if a == 'rf':
        #RANDOM FOREST CLASSIFIER 
        rf = RandomForestClassifier(n_estimators=100)
        rf = rf.fit(data, target)
        predictions = rf.predict(week)
        cal_score("RANDOM FOREST", rf, predictions, test1['target'])
    if a == "sgd":
        #SGD CLASSIFIER     
        clf = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
            fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
            loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
            random_state=None, shuffle=True, verbose=0,
            warm_start=False)
        clf.fit(data, target)
        predictions = clf.predict(week)
	cal_score("SGD Regression",clf, predictions, test1['target'])
    if a == "nb":
        clf = GaussianNB()
        clf.fit(data, target)
        predictions = clf.predict(week)
        cal_score("NAIVE BAYES", clf, predictions, test1['target'])
def SGDC_SVM_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    print("***************Starting SVM***************")
    t0 = time()
    clf = SGDClassifier(loss='log', penalty='l2',alpha=1e-5, n_iter=100)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv,Y_cv)

    print("{0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv), label_enc.inverse_transform(preds),
                      rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=0)).max(axis=1)*100
    print(Summary)

    #Check with log loss function
    epsilon = 1e-15
    #ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2= log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)

    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':]))
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending SVM***************")
    return pd.DataFrame(preds2),pd.DataFrame(preds3),pd.DataFrame(preds4)
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        n_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
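    # evaluate the decision function at every grid point; the contour levels
    # -1, 0 and +1 below trace the margins and the separating hyperplane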
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight')
Example 6
def validate():
  """
  Runs a 10-fold cross validation on the classifier, reporting
  accuracy.
  """
  trainDf = pd.read_csv("../NewData/train.csv")
  X = np.matrix(pd.DataFrame(trainDf, index=None,
    columns=["invited", "user_reco", "evt_p_reco", "evt_c_reco",
    "user_pop", "frnd_infl", "evt_pop"]))
  y = np.array(trainDf.interested)
  nrows = len(trainDf)
  kfold = KFold(nrows, 10)
  avgAccuracy = 0
  run = 0
  for train, test in kfold:
    Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test]
    clf = SGDClassifier(loss="log", penalty="l2")
    clf.fit(Xtrain, ytrain)
    accuracy = 0
    ntest = len(ytest)
    for i in range(0, ntest):
      yt = clf.predict(Xtest[i, :])
      if yt == ytest[i]:
        accuracy += 1
    accuracy = float(accuracy) / ntest
    print "accuracy (run %d): %f" % (run, accuracy)
    avgAccuracy += accuracy
    run += 1
  print "Average accuracy", (avgAccuracy / run)
def main():
    """ Generates features and fits classifier. """
    
    featureIndexes = processData(os.path.join(dataFolder, "avito_train.tsv"), itemsLimit=300000)
    trainFeatures,trainTargets, trainItemIds=processData(os.path.join(dataFolder,"avito_train.tsv"), featureIndexes, itemsLimit=300000)
    testFeatures, testItemIds=processData(os.path.join(dataFolder,"avito_test.tsv"), featureIndexes)
    joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds), os.path.join(dataFolder,"train_data.pkl"))
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(os.path.join(dataFolder,"train_data.pkl"))
    logging.info("Feature preparation done, fitting model...")
    clf = SGDClassifier(    loss="log", 
                            penalty="l2", 
                            alpha=1e-4, 
                            class_weight="auto")
    clf.fit(trainFeatures,trainTargets)

    logging.info("Predicting...")
    
    predicted_scores = clf.predict_proba(testFeatures).T[1]

    
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder,output_file), "w")
    f.write("id\n")
    
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
        f.write("%d\n" % (item_id))
    f.close()
    logging.info("Done.")
    def test_create_model(self):
        print("labeled sentence worked?")
        x_train = labelizeReviews(self.xTrain, 'TRAIN')
        x_test = labelizeReviews(self.xTest, 'TEST')
        model_dm = gensim.models.Doc2Vec(min_count=1, window=5, size=self.size, sample=1e-3, negative=5, workers=3)
        model_dbow = gensim.models.Doc2Vec(min_count=1, window=6, size=self.size, sample=1e-3, negative=5, dm=0, workers=3)
        sentences = x_train
        model_dm.build_vocab(sentences)
        model_dbow.build_vocab(sentences)
        # npArray = np.array(x_train)
        for epoch in range(10):
            print("Starting epoch:", str(epoch))
            # perm = np.random.permutation(npArray.shape[0])
            model_dm.train(random.sample(sentences, len(sentences)))
            model_dbow.train(random.sample(sentences, len(sentences)))
        # model_dm.train(x_train)
        train_vecs = getVecs(model_dm, x_train, self.size)
        train_vecs_dbow = getVecs(model_dbow, x_train, self.size)
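        # concatenate the DM and DBOW paragraph vectors into one feature matrix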
        train_vecs_total = np.hstack((train_vecs, train_vecs_dbow))

        sentences = x_test
        for epoch in range(10):
            print("Starting epoch:", str(epoch))
            # perm = np.random.permutation(npArray.shape[0])
            model_dm.train(random.sample(sentences, len(sentences)))
            model_dbow.train(random.sample(sentences, len(sentences)))
        test_vecs = getVecs(model_dm, x_test, self.size)
        test_vecs_dbow = getVecs(model_dbow, x_test, self.size)
        test_vecs_total = np.hstack((test_vecs, test_vecs_dbow))
        lr = SGDClassifier(loss='log', penalty='l1')
        lr.fit(train_vecs_total, self.labelsTrain[:self.samples])

        print('Test Accuracy: %.2f'%lr.score(test_vecs_total, self.labelsTest[:self.samples]))
Example 9
def classify(dummy_train,dummy_test,feature_pkl,output_file):
    # Train classifier, iterating over subsets
    # Load Features
    print 'Loading features...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    trainTargets = np.array(trainTargets)
    testItemIds = np.array(testItemIds)
    predicted_ids = []
    predicted_scores = []
    # SGD Logistic Regression per sample 
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
          eta0=0.0, fit_intercept=True, l1_ratio=0.15,
          learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
          penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False,
          verbose=0, warm_start=False)
    for col in range(np.shape(dummy_train)[1]):
        # Get nonzero dummy indices as array
        idx_train = dummy_train[:,col].astype('bool').T.toarray()[0]
        print 'Training subset {} of {}...'.format(col,np.shape(dummy_train)[1])
        sub_train = normalize(trainFeatures.tocsr()[idx_train,:], norm='l2', axis=0)
        clf.fit(sub_train,trainTargets[idx_train])
       # Use probabilities instead of binary class prediction in order to generate a ranking    
        idx_test = dummy_test[:,col].astype('bool').T.toarray()[0]
        sub_test = normalize(testFeatures.tocsr()[idx_test,:], norm='l2', axis=0)
        predicted_scores += clf.predict_proba(sub_test).T[1].tolist()
        predicted_ids += testItemIds[idx_test].tolist()
    
    with open(os.path.splitext(feature_pkl)[0]+'_'+output_file,'w') as out_fid:
        out_fid.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, predicted_ids), reverse = True):
           # only writes item_id per output spec, but may want to look at predicted_scores
            out_fid.write("%d\n" % (item_id))
Example 10
def do_classify():
    corpus = MyCorpus()
    # tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    # corpus_lsi = lsi_model[corpus_idf]
    num_terms = len(corpus.dictionary)
    # num_terms = 400
    corpus_sparse = matutils.corpus2csc(corpus_idf, num_terms).transpose(copy=False)
    # print corpus_sparse.shape
    # corpus_dense = matutils.corpus2dense(corpus_idf, len(corpus.dictionary))
    # print corpus_dense.shape
    penalty = "l2"
    clf = SGDClassifier(loss="hinge", penalty=penalty, alpha=0.0001, n_iter=50, fit_intercept=True)
    # clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    y = np.array(corpus.cls_y)
    # print y.shape
    clf.fit(corpus_sparse, y)
    filename = os.path.join(HERE, "sgdc_clf.pkl")
    _ = joblib.dump(clf, filename, compress=9)
    print "train completely"

    X_test = []
    X_label = []
    for obj in SogouCorpus.objects.filter(id__in=corpus.test_y):
        X_test.append(obj.tokens)
        X_label.append(cls_ids[obj.classify])
        # result = classifier.predict(obj.tokens)
    test_corpus = [dictionary.doc2bow(s.split(",")) for s in X_test]
    test_corpus = tfidf_model[test_corpus]
    test_corpus = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    pred = clf.predict(test_corpus)
    score = metrics.f1_score(X_label, pred)
    print ("f1-score:   %0.3f" % score)
Example 11
def classify_reviews():
	import featurizer
	import gen_training_data
	import numpy as np
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.linear_model import SGDClassifier

	data = gen_training_data.gen_data();
	stemmed_data = featurizer.stem(data);
	tfidf= featurizer.tfidf(data);
	clf = MultinomialNB().fit(tfidf['train_tfidf'], data['training_labels']);
	predicted = clf.predict(tfidf['test_tfidf']);
	num_wrong = 0;
	tot = 0;
	for expected, guessed in zip(data['testing_labels'], predicted):
		if(expected-guessed != 0):	
			num_wrong += 1;

	print("num_wrong: %d",num_wrong)

	sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42);
	_ = sgd_clf.fit(tfidf['train_tfidf'], data['training_labels']);
	sgd_pred = sgd_clf.predict(tfidf['test_tfidf']);
	print np.mean(sgd_pred == data['testing_labels']);

	stem_tfidf = featurizer.tfidf(stemmed_data);
	_ = sgd_clf.fit(stem_tfidf['train_tfidf'], data['training_labels']);
	sgd_stem_prd = sgd_clf.predict(stem_tfidf['test_tfidf']);
	print np.mean(sgd_stem_prd==data['testing_labels']);
def SGD(x, y):
#Using Stochastic Gradient Descent of Sklearn
	from sklearn.linear_model import SGDClassifier
	clf = SGDClassifier()
	clf.fit(x, y)

	return clf.predict(x)
def buildModel(size):
	with open('Sentiment Analysis Dataset.csv', 'rb') as csvfile:
		pos_tweets =[]
		neg_tweets =[]
		spamreader = csv.reader(csvfile, delimiter=',')
		for row in spamreader:
			if row[1] == '1':
				if not (len(pos_tweets) > size):
					pos_tweets.append(_cleanTweet(row[3]))
			else:
				if not (len(neg_tweets) > size):
					neg_tweets.append(_cleanTweet(row[3]))
	y = np.concatenate((np.ones(len(pos_tweets[0:size])), np.zeros(len(neg_tweets[0:size]))))
	x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets[0:size], neg_tweets[0:size])), y, test_size=0.2)
	x_train = _cleanText(x_train)
	x_test = _cleanText(x_test)
	n_dim = 100
	#Initialize model and build vocab
	imdb_w2v = Word2Vec(size=n_dim, min_count=10)
	imdb_w2v.build_vocab(x_train)
	imdb_w2v.train(x_train)
	train_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_train])
	train_vecs = scale(train_vecs)
	#Train word2vec on test tweets
	imdb_w2v.train(x_test)
	#Build test tweet vectors then scale
	test_vecs = np.concatenate([buildWordVector(z, n_dim,imdb_w2v) for z in x_test])
	test_vecs = scale(test_vecs)
	lr = SGDClassifier(loss='log', penalty='l1')
	lr.fit(train_vecs, y_train)
	imdb_w2v.save("imdb_w2v")
	f = open("Accuracy.txt","w")
	f.write(str(lr.score(test_vecs, y_test))+" "+str(size*2))
	f.close()
def main(date):
    """
    Runs linear regression (classification) between the herbicide 
    resistance classes based on all wavelengths. The weights
    associated with each wavelength are then plotted, allowing
    the user to see the contribution to classification by each
    wavelength.

    :param date: (string) Data collection date YYYY_MMDD

    :return: (None)
    """
    
    # Load the training data from disk   
    X, y = FileIO.loadTrainingData(date)
    X = np.nan_to_num(X)

    # Train the classifier on the loaded data
    clf = SGDClassifier()
    clf.fit(X, y)

    # Plot the feature weights to visualize feature contributions
    featureWeights = np.fabs(clf.coef_)

    for i in xrange(3):
        plt.plot(WAVELENGTHS, featureWeights[i])
        plt.title("Linear Classifier Weights for " + RESISTANCE_STRINGS[INDEX_TO_LABEL[i]] + " vs Others")
        plt.xlabel("Wavelength (nm)")
        plt.ylabel("Absolute Weight")
        plt.show()
Example 15
class twoclass(SGDClassifier):
    # THE HACK IS NOW GETTING EVEN MORE EVIL
    def __init__(self):
        self.clazz= SGDClassifier(loss='log')

    def fit(self,X,y, crossval=False):

        if crossval:
            print "layers crossvalscore:",sklearn.model_selection.cross_val_score(SGDClassifier(loss='log'),X, y).mean()

        self.clazz.fit(X,y)
        self.intercept_= self.clazz.intercept_
        self.classes_= self.clazz.classes_
        return self

    # eden can't annotate two classes if the estimator is not an SGDRegressor
    #  -> this hack is made!
    '''
    details: decision_function returns a one-d array.
    eden only accepts these if the estimator is an instance of SGDRegressor,
    so I make a two-d array from my 1-d array.
    If I hack something like this in the future, the intercept array may also need to be provided
    (see the annotator code).
    '''

    # default guy:
    #def decision_function(self, vector):
    #    answer =  super(self.__class__,self).decision_function(vector)
    #    return np.vstack((answer, (answer-1))).T

    def decision_function(self,vector):
        return self.clazz.predict_proba(vector)

def runSGDPipeline(entries, langs):
	t0 = time()
	sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                            alpha=0.001, n_iter=5, random_state=42))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.fit_transform(X_train_counts)

	clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001, n_iter=5, random_state=42)
	clf.fit(X_train_tfidf, langs)

	X_new_counts = vect.transform(entries)
	X_new_tfidf = tfidf.transform(X_new_counts)
	predicted = clf.predict(X_new_tfidf.toarray())

	print(np.mean(predicted == langs))
	print(metrics.classification_report(langs, predicted, target_names=langs))
	print(metrics.confusion_matrix(langs, predicted))
	print("Took %s seconds." % (time()-t0))
	print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
	return sgd_pipeline
Example 17
def sgc_test(X, y, weight):
    from sklearn.linear_model import SGDClassifier
    from sklearn import cross_validation
    from sklearn.metrics import confusion_matrix
    from sklearn.preprocessing import StandardScaler

    for i in range(0,1):
        X_train, X_test, y_train, y_test, weight_train, weight_test = cross_validation.train_test_split(
            X, y, weight, test_size=0.2, random_state=0)
        clf = SGDClassifier(loss="hinge", n_iter=100, n_jobs=-1, penalty="l2")
        #clf = LogisticRegression( max_iter=100)

        scaler = StandardScaler(with_mean=False)
        scaler.fit(X_train)  # Don't cheat - fit only on training data
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)  # apply same transformation to test data

        clf.fit(X_train, y_train, sample_weight=weight_train)

        y_pred = clf.predict(X_train)
        #print(confusion_matrix(y_train, y_pred))
        print(clf.score(X_train,y_train,weight_train))

        y_pred = clf.predict(X_test)

        #print(confusion_matrix(y_test, y_pred))
        print(clf.score(X_test,y_test,weight_test))
Example 18
def crossvalidate(feas, labels, param):
    labels = np.array(list(labels), dtype=int)
    accs = []
    for train_ids, valid_ids in StratifiedKFold(labels, 10):
        idf=train_idf([feas[i] for i in train_ids])
        X,vocab=extract_feas(feas, idf)
        #lda=train_lda(X, vocab, num_topics)
        #X=transform_lda(X, lda)
        labels_train = labels[train_ids].copy()
        weights = balance_weights(labels_train, param['bg_weight'])
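        # fold the weakly-negative class (label 0) into class 1; its influence
        # is controlled by the sample weights computed above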
        labels_train[labels_train == 0] = 1
        model=SGDClassifier(loss='log',
                            alpha=param['regu']/len(labels_train),
                            fit_intercept=True,
                            shuffle=True, n_iter=50)
        model.fit(X[train_ids], labels_train, sample_weight=weights)
        pp = model.predict_proba(X[valid_ids])
        pred_labels = np.argmax(pp, 1)
        pred_labels = model.classes_[pred_labels]
        #a=accuracy(labels[valid_ids], pred_labels, 1)
        # return all scores for "good" class
        assert model.classes_[1] == 2
        pred_scores = pp[:,1]
        a=avg_precision(labels[valid_ids], pred_scores)
        print '%.2f' % a,
        accs.append(a)
    return np.mean(accs)
Example 19
def train_vectorized(feats, Y, model_path=None, grid=False):

    # Vectorize labels
    labels = [ labels_map[y] for y in Y ]
    Y = np.array( labels )

    # Vectorize feature dictionary
    vec = DictVectorizer()
    X = vec.fit_transform(feats)
    norm_mat( X , axis=0 , copy=False)

    # Grid Search
    if grid:
        print 'Performing Grid Search'
        clf = do_grid_search(X, Y)
    else:
        #clf = LinearSVC(C=0.1, class_weight='auto')
        #clf = LogisticRegression(C=0.1, class_weight='auto')
        clf = SGDClassifier(penalty='elasticnet',alpha=0.001, l1_ratio=0.85, n_iter=1000,class_weight='auto')
        clf.fit(X, Y)


    # Save model
    if model_path:
        with open(model_path+'.dict' , 'wb') as f:
            pickle.dump(vec, f)

        with open(model_path+'.model', 'wb') as f:
            pickle.dump(clf, f)


    # return model
    return vec, clf
Example 20
class kernelsvm():
    def __init__(self, theta0, alpha, loss_metric):
        self.theta0 = theta0
        self.alpha = alpha
        self.loss_metric = loss_metric
    def fit(self, X, y, idx_SR):
        n_SR = len(idx_SR)
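        # approximate the RBF kernel with an explicit Nystroem feature map so a
        # linear SGD classifier can be trained in the mapped feature space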
        self.feature_map_nystroem = General_Nystroem(kernel='rbf', gamma=self.theta0, n_components=n_SR)
        X_features = self.feature_map_nystroem.fit_transform(X,idx_SR)
        print("fitting SGD")
        self.clf = SGDClassifier(loss=self.loss_metric,alpha=self.alpha)
        self.clf.fit(X_features, y)
        print("fitting SGD finished")
    def predict(self, X):
        print("Predicting")
        X_transform = self.feature_map_nystroem.transform(X)
        return self.clf.predict(X_transform), X_transform
    def decision_function(self, X):
        # X should be the transformed input!
        return self.clf.decision_function(X)
    def err_rate(self, y_true, y_pred):
        acc = accuracy_score(y_true, y_pred)
        err_rate = 1.0-acc
        return err_rate
    def get_params(self):
        return self.clf.get_params()
Example 21
def train(docs, labels, regu=1, bg_weight=.1):
    '''
    :param docs: iterator of (title, body) pairs
    :param labels: integer labels for docs (0 is weakly-negative)
    :return: model
    '''
    num_topics=50
    feas = map(extract_words,  docs)
    labels = np.array(list(labels), dtype=int)
    idf=train_idf(feas)
    X,vocab=extract_feas(feas, idf)
    #lda=train_lda(X, vocab, num_topics)
    #X=transform_lda(X, lda)
    # set up sample weights
    weights = balance_weights(labels, bg_weight)
    labels=labels.copy()
    labels[labels == 0] = 1
    model=SGDClassifier(loss='log',
                        alpha=regu/len(labels),
                        fit_intercept=True,
                        n_iter=100,
                        shuffle=True)
    model.fit(X, labels, sample_weight=weights)
    #print accuracy(labels, model.predict(X))
    return dict(idf=idf, logreg=model, lda=None)
def stochasticGD(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans stochasticGD split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Stochastic Gradient Descent "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"Stochastic_GD_metrics_test.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Stochastic Gradient Descent %f"%test_size
    save = Output + "Stochastic_GD_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
Example 23
def predict_sgd(X_train, y_train, X_test, sample_weight):
    clf = SGDClassifier(loss='log', alpha=0.01, l1_ratio=0, n_jobs=2,
                        n_iter=50)
    clf.fit(X_train, y_train, sample_weight=sample_weight)

    predictions = clf.predict_proba(X_test)
    return predictions
def train_and_predict_m3 (train, test, labels) :
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean (train, test, stemmerEnableM3, stemmer_type = 'porter')

    """
    # Beautiful soup cleanup and stemming
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train = True)
    testData = modified_cleanup(test, stemmer, is_train = False)
    """
				
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df = 3,  max_features = None, strip_accents = 'unicode', analyzer = 'word', token_pattern = r'\w{1,}', ngram_range = (1, 6), smooth_idf = 1, sublinear_tf = 1, stop_words = ML_STOP_WORDS)
    tfv.fit(trainData)
    X =  tfv.transform(trainData) 
    X_test = tfv.transform(testData)
    
    ## Create the classifier
    clf = SGDClassifier(random_state = randomState, n_jobs = 1, penalty = 'l2', loss = 'huber', n_iter = 50, class_weight = 'auto', learning_rate = 'optimal', epsilon = 1)
    
    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'n_iter' : [30, 50, 80, 100, 200],  'loss': ['huber'], 'epsilon' : [0.3, 1], 'alpha' : [0.0001, 0.0003, 0.001] }
    
    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if (gridSearch) :
        model = perform_grid_search (clf, param_grid, X, labels)    	
        pred = model.predict(X_test)
    else :
        clf.fit(X, labels)    	
        pred = clf.predict(X_test)
    return pred
Example 25
def train_kaggle(dataset, alg="rig", data="bow"):
    train_x, train_y, test_x = dataset
    print "shape for training data is", train_x.shape

    if alg == "svm":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20)
    elif alg == "svm_sq":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20, loss="squared_hinge")
    elif alg == "log":
        clf = LogisticRegression(verbose=1, n_jobs=2)
    elif alg == "per":
        clf = Perceptron(verbose=1, n_jobs=2, n_iter=25)
    elif alg == "rig":
        clf = RidgeClassifier()
    elif alg == "pa":
        clf = PassiveAggressiveClassifier(n_jobs=2, n_iter=25)
    else:
        raise NotImplementedError

    print "training with %s..." % alg

    clf.fit(train_x, train_y)
    # clf.fit(validate_x, validate_y)
    predicted = clf.predict(test_x)
    save_csv(predicted, fname=alg + "_" + data)

    if alg != "nb":
        return clf.decision_function(train_x), clf.decision_function(test_x)
    else:
        return clf.predict_proba(train_x), clf.predict_proba(test_x)
Example 26
class SGD(object):
	def __init__(self):
		self.sgd = SGDClassifier(loss='modified_huber', alpha = .00001, penalty='elasticnet',shuffle=True, n_jobs=-1,random_state = 2014)
	def predict(self, X):
		return self.sgd.predict_proba(X)[:,1][:,np.newaxis]
	def fit(self, X, y):
		self.sgd.fit(X,y)
def scikit_GDS(x,y, X_test,y_test=None, prevmodel="yes", output=False):
    from sklearn.linear_model import SGDClassifier
    from sklearn.externals import joblib

    clf = SGDClassifier(loss="hinge", penalty="l2")
    ##
    if prevmodel !="yes":
    	clf.fit(X, y)
    	joblib.dump(clf, 'trained_GDS_model.pkl') 
    else:
    	clf =joblib.load('trained_GDS_model.pkl')

    if output == False:
        predictions =  clf.predict(X_test)
        correctcount = 0
        totalcount = 0
        for index, each in enumerate(predictions):
            if y_test[index] == each:
                correctcount += 1
            totalcount += 1

        print str(correctcount) +" / " + str(totalcount) +" = " + str(float(correctcount)/totalcount)
    else:
        predictions =  clf.predict(X_test)
        return predictions
Example 28
def sgd_classifier(V_train, y_train, V_val, y_val, V_test, y_test):

    t0 = time.time()

    print 'Building SGD classifier model'

    clf = SGDClassifier(n_iter = 50)

    #clf = grid_search.GridSearchCV(svm_clf, parameters)                                                                                                                            

    clf.fit(V_train, y_train)

    #print clf.best_params_                                                                                                                                                         

    t1 = time.time()
    print 'Building SGD classifier model ... Done', str(int((t1 - t0)*100)/100.)
    print ''

    p_val =clf.predict(V_val)

    print 'Accuracy on validation set', accuracy_score(y_val, p_val)

    p_test = clf.predict(V_test)

    print 'Accuracy on testing set'

    print classification_report(y_test, p_test)
Example 29
def main(feature_pkl):
    print 'Loading data...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    print 'Normalizing data...'
    trainFeatures = sklearn.preprocessing.normalize(trainFeatures.tocsc(), norm='l2', axis=0)
    testFeatures = sklearn.preprocessing.normalize(testFeatures.tocsc(), norm='l2', axis=0)
    #trainSplit, testSplit = splitTuple
    # Best estimator from grid search:
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False,
       verbose=0, warm_start=False)

    print 'Fitting model...'
    clf.fit(trainFeatures,trainTargets)

    # Use probabilities or decision function to generate a ranking    
    predicted_scores = clf.decision_function(testFeatures)
    with open(os.path.splitext(feature_pkl)[0]+'_testRanking.csv', 'w') as f:
        f.write('id\n')
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True):
            f.write('%d\n' % (item_id))

   # Turn estimator params into word clouds
    features, indices = zip(*sorted(featureIndex.iteritems(), key=operator.itemgetter(1)))
    coef_tuple = zip(clf.coef_[0],indices)
    coef_sort = sorted(coef_tuple, reverse=True)
    print 'Top 20 for illicit:'
    wordle_print(coef_sort[:20],features)
    print 'Top 20 for licit:'
    wordle_print(coef_sort[-20:],features)
def plot_sgd_classifier(num_samples, clt_std):
    #generation of data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)

    #fitting of data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)

    #plotting of data
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)

    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)

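    # evaluate the decision function over the grid; its zero level set is the
    # decision boundary and the -1/+1 levels mark the margins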
    for (i, j), val in np.ndenumerate(X_):
        x1 = val
        x2 = Y_[i, j]
        conf_score = clf.decision_function([[x1, x2]])
        Z[i, j] = conf_score[0]

    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']

    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')
    ax.contour(X_, Y_, Z, colors=colors,
               levels=levels, linestyles=linestyles, labels='Boundary')
    ax.scatter(X[:, 0], X[:, 1], c=y)
with open('ghost_train.csv', 'Ur') as f:
    data = list(tuple(rec) for rec in csv.reader(f, delimiter=' '))

ghostarray = np.array(data)
ghostarray = ghostarray.astype(np.float)
#print type(ghostarray[2][2])

X = ghostarray[:, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
Y = np.ravel(ghostarray[:, [1]])

#print X
print Y

#fit classification model
clf = SGDClassifier(loss="hinge", penalty="l1")
clf.fit(X, Y)
SGDClassifier(alpha=0.00001,
              class_weight=None,
              epsilon=0.1,
              eta0=0.0,
              fit_intercept=True,
              l1_ratio=0.15,
              learning_rate='optimal',
              loss='hinge',
              n_iter=100,
              n_jobs=100,
              penalty='l1',
              power_t=0.5,
              random_state=False,
              rho=None,
              shuffle=False,
from sklearn.metrics import classification_report
from sklearn import metrics
import nltk
# Train the classifier

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
SGDC = SGDClassifier()
LSVC = LinearSVC()

LSVC.fit(X_train, y_train_sent)
accuracy_score_lsvc = metrics.accuracy_score(LSVC.predict(X_test), y_test_sent)
print('accuracy_score_lsvc = ' +
      str('{:4.2f}'.format(accuracy_score_lsvc * 100)) + '%')

SGDC.fit(X_train, y_train_sent)
accuracy_score_sgdc = metrics.accuracy_score(SGDC.predict(X_test), y_test_sent)
print('accuracy_score_sgdc = ' +
      str('{:4.2f}'.format(accuracy_score_sgdc * 100)) + '%')

import matplotlib.pyplot as plt
import datetime
dates = y_train['date'].unique()
converted_dates = list(
    map(datetime.datetime.strptime, dates,
        len(dates) * ['%Y-%m-%d']))
#y_axis = y_train['Confirmed']
#state = 'Connecticut'
#stateX = X_train.toarray()[(y_train['location']==state)] #& (y_train['date']< '2020-04-19')]
stateX = X_train.toarray()
#ylist = classifier_linear.predict(stateX)
class LogRegIntentClassifier(IntentClassifier):
    """Intent classifier which uses a Logistic Regression underneath"""

    config_type = LogRegIntentClassifierConfig

    def __init__(self, config=None, **shared):
        """The LogReg intent classifier can be configured by passing a
        :class:`.LogRegIntentClassifierConfig`"""
        super(LogRegIntentClassifier, self).__init__(config, **shared)
        self.classifier = None
        self.intent_list = None
        self.featurizer = None

    @property
    def fitted(self):
        """Whether or not the intent classifier has already been fitted"""
        return self.intent_list is not None

    @log_elapsed_time(logger, logging.DEBUG,
                      "LogRegIntentClassifier in {elapsed_time}")
    def fit(self, dataset):
        """Fits the intent classifier with a valid Snips dataset

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        logger.debug("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        language = dataset[LANGUAGE]
        random_state = check_random_state(self.config.random_seed)

        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, self.resources,
            random_state)

        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            config=self.config.featurizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources)
        self.featurizer.language = language

        none_class = max(classes)
        try:
            x = self.featurizer.fit_transform(dataset, utterances, classes,
                                              none_class)
        except _EmptyDatasetUtterancesError:
            self.featurizer = None
            return self

        alpha = get_regularization_factor(dataset)
        self.classifier = SGDClassifier(random_state=random_state,
                                        alpha=alpha,
                                        **LOG_REG_ARGS)
        self.classifier.fit(x, classes)
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self

    @fitted_required
    def get_intent(self, text, intents_filter=None):
        """Performs intent classification on the provided *text*

        Args:
            text (str): Input
            intents_filter (str or list of str): When defined, it will find
                the most likely intent among the list, otherwise it will use
                the whole list of intents defined in the dataset

        Returns:
            dict or None: The most likely intent along with its probability or
            *None* if no intent was found

        Raises:
            :class:`snips_nlu.exceptions.NotTrained`: When the intent
                classifier is not fitted

        """
        return self._get_intents(text, intents_filter)[0]

    @fitted_required
    def get_intents(self, text):
        """Performs intent classification on the provided *text* and returns
        the list of intents ordered by decreasing probability

        The length of the returned list is exactly the number of intents in the
        dataset + 1 for the None intent

        Raises:
            :class:`snips_nlu.exceptions.NotTrained`: when the intent
                classifier is not fitted
        """
        return self._get_intents(text, intents_filter=None)

    def _get_intents(self, text, intents_filter):
        if isinstance(intents_filter, str):
            intents_filter = {intents_filter}
        elif isinstance(intents_filter, list):
            intents_filter = set(intents_filter)

        if not text or not self.intent_list or not self.featurizer:
            results = [intent_classification_result(None, 1.0)]
            results += [
                intent_classification_result(i, 0.0) for i in self.intent_list
                if i is not None
            ]
            return results

        if len(self.intent_list) == 1:
            return [intent_classification_result(self.intent_list[0], 1.0)]

        # pylint: disable=C0103
        X = self.featurizer.transform([text_to_utterance(text)])
        # pylint: enable=C0103
        proba_vec = self._predict_proba(X)
        logger.debug(
            "%s", DifferedLoggingMessage(self.log_activation_weights, text, X))
        results = [
            intent_classification_result(i, proba)
            for i, proba in zip(self.intent_list, proba_vec[0])
            if intents_filter is None or i is None or i in intents_filter
        ]

        return sorted(results, key=lambda res: -res[RES_PROBA])

    def _predict_proba(self, X):  # pylint: disable=C0103
        self.classifier._check_proba()  # pylint: disable=W0212

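        # map decision scores to probabilities with the logistic sigmoid,
        # p = 1 / (1 + exp(-score)), computed in place below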
        prob = self.classifier.decision_function(X)
        prob *= -1
        np.exp(prob, prob)
        prob += 1
        np.reciprocal(prob, prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        return prob

    @check_persisted_path
    def persist(self, path):
        """Persists the object at the given path"""
        path.mkdir()

        featurizer = None
        if self.featurizer is not None:
            featurizer = "featurizer"
            featurizer_path = path / featurizer
            self.featurizer.persist(featurizer_path)

        coeffs = None
        intercept = None
        t_ = None
        if self.classifier is not None:
            coeffs = self.classifier.coef_.tolist()
            intercept = self.classifier.intercept_.tolist()
            t_ = self.classifier.t_

        self_as_dict = {
            "config": self.config.to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": t_,
            "intent_list": self.intent_list,
            "featurizer": featurizer
        }

        classifier_json = json_string(self_as_dict)
        with (path / "intent_classifier.json").open(mode="w") as f:
            f.write(classifier_json)
        self.persist_metadata(path)

    @classmethod
    def from_path(cls, path, **shared):
        """Loads a :class:`LogRegIntentClassifier` instance from a path

        The data at the given path must have been generated using
        :func:`~LogRegIntentClassifier.persist`
        """
        path = Path(path)
        model_path = path / "intent_classifier.json"
        if not model_path.exists():
            raise LoadingError("Missing intent classifier model file: %s" %
                               model_path.name)

        with model_path.open(encoding="utf8") as f:
            model_dict = json.load(f)

        # Create the classifier
        config = LogRegIntentClassifierConfig.from_dict(model_dict["config"])
        intent_classifier = cls(config=config, **shared)
        intent_classifier.intent_list = model_dict['intent_list']

        # Create the underlying SGD classifier
        sgd_classifier = None
        coeffs = model_dict['coeffs']
        intercept = model_dict['intercept']
        t_ = model_dict["t_"]
        if coeffs is not None and intercept is not None:
            sgd_classifier = SGDClassifier(**LOG_REG_ARGS)
            sgd_classifier.coef_ = np.array(coeffs)
            sgd_classifier.intercept_ = np.array(intercept)
            sgd_classifier.t_ = t_
        intent_classifier.classifier = sgd_classifier

        # Add the featurizer
        featurizer = model_dict['featurizer']
        if featurizer is not None:
            featurizer_path = path / featurizer
            intent_classifier.featurizer = Featurizer.from_path(
                featurizer_path, **shared)

        return intent_classifier

    def log_best_features(self, top_n=50):
        if not hasattr(self.featurizer, "feature_index_to_feature_name"):
            return None

        log = "Top {} features weights by intent:".format(top_n)
        index_to_feature = self.featurizer.feature_index_to_feature_name
        for intent_ix in range(self.classifier.coef_.shape[0]):
            intent_name = self.intent_list[intent_ix]
            log += "\n\n\nFor intent {}\n".format(intent_name)
            top_features_idx = np.argsort(
                np.absolute(self.classifier.coef_[intent_ix]))[::-1][:top_n]
            for feature_ix in top_features_idx:
                feature_name = index_to_feature[feature_ix]
                feature_weight = self.classifier.coef_[intent_ix, feature_ix]
                log += "\n{} -> {}".format(feature_name, feature_weight)
        return log

    def log_activation_weights(self, text, x, top_n=50):
        if not hasattr(self.featurizer, "feature_index_to_feature_name"):
            return None

        log = "\n\nTop {} feature activations for: \"{}\":\n".format(
            top_n, text)
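        # per-feature contribution to each intent score: element-wise product of
        # the classifier weights and the input feature vector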
        activations = np.multiply(self.classifier.coef_,
                                  np.asarray(x.todense()))
        abs_activation = np.absolute(activations).flatten().squeeze()

        if top_n > activations.size:
            top_n = activations.size

        top_n_activations_ix = np.argpartition(abs_activation,
                                               -top_n,
                                               axis=None)[-top_n:]
        top_n_activations_ix = np.unravel_index(top_n_activations_ix,
                                                activations.shape)

        index_to_feature = self.featurizer.feature_index_to_feature_name
        features_intent_and_activation = [
            (self.intent_list[i], index_to_feature[f], activations[i, f])
            for i, f in zip(*top_n_activations_ix)
        ]

        features_intent_and_activation = sorted(features_intent_and_activation,
                                                key=lambda x: abs(x[2]),
                                                reverse=True)

        for intent, feature, activation in features_intent_and_activation:
            log += "\n\n\"{}\" -> ({}, {:.2f})".format(intent, feature,
                                                       float(activation))
        log += "\n\n"
        return log
Example 34
    train_a=OHE.transform(train_tree_index[:,index].reshape(-1,1))
    valid_a=OHE.transform(valid_tree_index[:,index].reshape(-1,1))
    if index %50==0:
        print index
        print 'train_a ----> one hot--->shape',train_a.shape
    data_x_train_2=sparse.hstack((data_x_train_2,train_a))
    data_x_valid_2=sparse.hstack((data_x_valid_2,valid_a))


df_feature_map.to_csv(save_path+"feature_important_mapping_cut_corr.csv")
sparse.save_npz(save_path+"data_x_train_cut_corr.npz",data_x_train)
sparse.save_npz(save_path+"data_x_valid_cut_corr.npz",data_x_valid)

eval_list=[(data_x_train,y_train),(data_x_valid,y_valid)]

gbm_clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=-1, n_estimators=1000, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.02, min_child_weight=50,random_state=20180506,n_jobs=7)

gbm_clf.fit(data_x_train,y_train,eval_set=eval_list,eval_metric ='auc',early_stopping_rounds =100)
joblib.dump(gbm_clf, '/home/heqt/jupyter_project/model/gbm_clf_cnt_80W_corr.pkl')


SGDLR_clf=SGDClassifier(loss='log', penalty='l1', alpha=1.0, l1_ratio=0.15, 
     random_state=20150511,learning_rate='optimal',n_jobs=15)


SGDLR_clf.fit(data_x_train_LR,y_train)
joblib.dump(SGDLR_clf, '/home/heqt/jupyter_project/model/feature_SGDLR_clf.pkl')
from fully_connected.utils import treshhold_labels, normalize_data, load_monolithic

if __name__ == '__main__':
    train, dev, test = load_monolithic('data_monolithic_mfcc.pkl')

    X_train, S_train, Y_train = train
    X_dev, S_dev, Y_dev = dev
    X_test, S_test, Y_test = test
    X_train, X_dev, X_test = normalize_data(X_train, X_dev, X_test)
    Y_train, Y_dev, Y_test = treshhold_labels(Y_train, Y_dev, Y_test, .25)

    # rbf_feature = RBFSampler(gamma=1, n_components=800, random_state=1)
    rbf_feature = Nystroem(gamma=1, n_components=200, random_state=1)
    print('transform features')
    X_train_features = rbf_feature.fit_transform(X_train)
    X_dev_features = rbf_feature.transform(X_dev)
    print('finish')
    clf = SGDClassifier(max_iter=400, loss='log', n_jobs=-1, random_state=1,
                        alpha=0.00000001, tol=1e-9, early_stopping=False,
                        verbose=1, n_iter_no_change=40)

    clf.fit(X_train_features, Y_train)
    print('=== Training Set Performance ===')
    print(clf.score(X_train_features, Y_train))
    print(confusion_matrix(Y_train, clf.predict(X_train_features)))
    print(roc_auc_score(Y_train, clf.predict_proba(X_train_features)[:, 1]))
    print('=== Dev Set Performance ===')
    print(clf.score(X_dev_features, Y_dev))
    print(confusion_matrix(Y_dev, clf.predict(X_dev_features)))
    print(roc_auc_score(Y_dev, clf.predict_proba(X_dev_features)[:, 1]))
# In[ ]:


linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc


# In[ ]:


sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd


# In[ ]:


decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree

    print("Accuracy Score: ", '{:,.3f}'.format(float(accuracy_score(test, pred)) * 100), "%")
    print("     Precision: ", '{:,.3f}'.format(float(precision_score(test, pred, average='macro')) * 100), "%")
    print("        Recall: ", '{:,.3f}'.format(float(recall_score(test, pred, average='macro')) * 100), "%")
    print("      F1 score: ", '{:,.3f}'.format(float(f1_score(test, pred, average='macro')) * 100), "%")
    
#Let's split the data into train/test sets

from sklearn.model_selection import train_test_split

# test set size of 20% of the data and the random seed 42 <3
X_train, X_test, y_train, y_test = train_test_split(X_beng.toarray(),y_pred, test_size=0.2, random_state=42)

print("X_train size:", len(X_train))
print("X_test size:", len(X_test), "\n")

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier

# SGD instance
sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3, random_state=42, n_jobs=4)
# train SGD
sgd_clf.fit(X_train, y_train)

# cross validation predictions
sgd_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3, n_jobs=4)

# print out the classification report
classification_report("Stochastic Gradient Descent Report (Training Set)", y_train, sgd_pred)

Example 38
print y_train.value_counts(), '\n', y_test.value_counts(),

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lr = LogisticRegression()
sgdc = SGDClassifier()
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)

sgdc.fit(X_train, y_train)
sgdc_y_predict = sgdc.predict(X_test)

# print lr_y_predict,'\n',sgdc_y_predict

from sklearn.metrics import classification_report
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
print(
    classification_report(y_test,
                          lr_y_predict,
                          target_names=['Benign', 'Malignant']))

print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
print(
    classification_report(y_test,
                          sgdc_y_predict,
Example 39
for i in range(iterations):
    X, y = shuffle(data, target)
    grd_lm = LogisticRegression()
    sgd = SGDClassifier(alpha=0.01, n_iter=100)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    # It is important to train the ensemble of trees on a different subset
    # of the training data than the linear regression model to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train,
                                                                y_train,
                                                                test_size=0.5)

    mod_sgd = sgd.fit(X=X_train, y=y_train)

    #logistic regression
    mod_lr = grd_lm.fit(X=X_train, y=y_train)

    #embed features in space with gbt
    grd = GradientBoostingClassifier(n_estimators=10)
    grd.fit(X_train, y_train)
    #Hot encoding of the resulting leaves for each sample
    grd_enc = OneHotEncoder()
    grd_enc.fit(grd.apply(X_train)[:, :, 0])

    #create new dataset consisting of old features and the hot encoded gbt result
    X_train_enr_lr = np.hstack(
        (X_train_lr, grd_enc.transform(grd.apply(X_train_lr)[:, :,
                                                             0]).toarray()))
test_tags = y[cutoff:]

print "Training set size : " + str(training_sentences.shape[0])
print "Testing set size : " + str(test_sentences.shape[0])

# default SGD
clf = SGDClassifier(loss='log')
print "Algorithm", configuration["algorithm"]

if (configuration["algorithm"] == "SGD"):
    clf = SGDClassifier(loss='log')
elif (configuration["algorithm"] == "SVM"):
    clf = svm.SVC(decision_function_shape='ovo')
elif (configuration["algorithm"] == "MLP"):
    epoch = 200
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=(17, 17, 17, 17),
                        random_state=1,
                        max_iter=epoch)

print 'Training started'
clf.fit(training_sentences, training_tags)
print 'Training completed'

print "Testing started"
score = clf.score(test_sentences, test_tags)
print "F1 Score"
print f1_score(test_tags, clf.predict(test_sentences), average='weighted')
print "Accuracy:", score
Example 41
sc = StandardScaler()

## Scaling the training set
X_train = sc.fit_transform(X_train)

## Scaling the test set.
X_test = sc.transform(X_test)

sgd_lr = SGDClassifier(loss="log",
                       penalty='l1',
                       alpha=0.0001,
                       fit_intercept=True,
                       max_iter=100,
                       learning_rate='constant',
                       eta0=0.01)

sgd_lr.fit(X=X_train, y=y_train)

# The logistic regression model estimates the probability that each test
# example belongs to each class label.
probability = sgd_lr.predict_proba(X=X_test)
print(probability)

# For every test example, the predicted probabilities for class 1, class 2,
# and class 3 sum to 1.
sum_up = sgd_lr.predict_proba(X=X_test).sum(axis=1)
print(sum_up)

# Identifying labels predicted by the model.
class_predicted = sgd_lr.predict(X=X_test)
print(class_predicted)
vectorizer = CountVectorizer(
    stop_words='english'
)  # setting stop-words, so words like "the" and "it" are ignored
X = vectorizer.fit_transform(news['TITLE'])  # convert TITLE samples to vectors
y = news['CATEGORY']  # label
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3)  # 30% split

# model (best params established through gridsearch in notebook)
sgd = SGDClassifier(n_jobs=-1,
                    n_iter=10,
                    alpha=1e-05,
                    loss='hinge',
                    random_state=1234)
sgd.fit(X_train, y_train)


# custom function that inputs a news title, and outputs one of 4 specified categories
def title_to_category(title):
    categories = {
        'b': 'business',
        't': 'science and technology',
        'e': 'entertainment',
        'm': 'health'
    }
    predicted = sgd.predict(vectorizer.transform([title]))
    return categories[predicted[0]]


# testing a headline from The Onion
def SGDClassify():

    trainPath = sys.argv[1]
    testPath = sys.argv[2]

    files_list_train = pm.files(trainPath)
    files_list_test = pm.files(testPath)

    list_dict_train, words_list_train = pm.text_processing(files_list_train)

    bagOfWords_train, bernoulli_train = pm.generate_models(
        list_dict_train, words_list_train)

    list_dict_test, words_list_test = pm.text_processing(files_list_test)

    bagOfWords_test, bernoulli_test = pm.generate_models(
        list_dict_test, words_list_train)

    frame_train_BOW = pd.DataFrame(bagOfWords_train[1:])
    frame_test_BOW = pd.DataFrame(bagOfWords_test[1:])

    X_train_BOW = frame_train_BOW.iloc[:, :-1]
    y_train_BOW = frame_train_BOW.iloc[:, -1]

    X_test_BOW = frame_test_BOW.iloc[:, :-1]
    y_test_BOW = frame_test_BOW.iloc[:, -1]

    frame_train_Ber = pd.DataFrame(bernoulli_train[1:])
    frame_test_Ber = pd.DataFrame(bernoulli_test[1:])

    X_train_Ber = frame_train_Ber.iloc[:, :-1]
    y_train_Ber = frame_train_Ber.iloc[:, -1]

    X_test_Ber = frame_test_Ber.iloc[:, :-1]
    y_test_Ber = frame_test_Ber.iloc[:, -1]

    #sklearn.linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True,
    #                                   max_iter=None, tol=None, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None,
    #                                   random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5,
    #                                   early_stopping=False, validation_fraction=0.1, n_iter_no_change=5,
    #                                   class_weight=None, warm_start=False, average=False, n_iter=None)

    SGDClassifierModelBOW = SGDClassifier(max_iter=1500)
    print(SGDClassifierModelBOW)
    SGDClassifierModelBOW.fit(X_train_BOW, y_train_BOW)

    #Calculating Prediction
    y_pred_BOW = SGDClassifierModelBOW.predict(X_test_BOW)

    accuracyBOW = accuracy_score(y_test_BOW, y_pred_BOW)
    print('The Accuracy with BOW :', accuracyBOW * 100)

    SGDClassifierModelBer = SGDClassifier(max_iter=1500)
    SGDClassifierModelBer.fit(X_train_Ber, y_train_Ber)

    #Calculating Prediction
    y_pred_Ber = SGDClassifierModelBer.predict(X_test_Ber)

    accuracyBer = accuracy_score(y_test_Ber, y_pred_Ber)
    print('The Accuracy with Bernoulli Model :', accuracyBer * 100)

    # Applying Grid Search to find the best model and the best parameters
    from sklearn.model_selection import GridSearchCV
    parameters = [
        {
            'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
        },
    ]
    grid_search_BOW = GridSearchCV(estimator=SGDClassifierModelBOW,
                                   param_grid=parameters,
                                   scoring='accuracy',
                                   cv=10,
                                   n_jobs=-1)
    grid_search_BOW = grid_search_BOW.fit(X_train_BOW, y_train_BOW)

    accuracyBOW = grid_search_BOW.best_score_
    print('The Accuracy with BOW with grid search :', accuracyBOW * 100)

    grid_search_Ber = GridSearchCV(estimator=SGDClassifierModelBer,
                                   param_grid=parameters,
                                   scoring='accuracy',
                                   cv=10,
                                   n_jobs=-1)
    grid_search_Ber = grid_search_Ber.fit(X_train_Ber, y_train_Ber)

    accuracyBer = grid_search_Ber.best_score_
    print('The Accuracy with Bernoulli with grid search :', accuracyBer * 100)
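    # A small follow-up sketch (not in the original code): inspect the
    # hyper-parameters selected by each grid search.
    print('Best alpha for BOW :', grid_search_BOW.best_params_)
    print('Best alpha for Bernoulli :', grid_search_Ber.best_params_)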

    from sklearn.metrics import classification_report

    print('Classification Report for BOW')
    print(classification_report(y_test_BOW, y_pred_BOW.round()))

    print('Classification Report for Bernoulli')
    print(classification_report(y_test_Ber, y_pred_Ber.round()))
Esempio n. 44
0
    train_x_normalize, test_x_normalize, train_y, test_y = train_test_split(
        X_kbest_features, y, train_size=0.8, test_size=0.2)
    #run LSA
    '''lsa = helper.run_lsa(train_x_normalize)
    train_x_normalize = lsa.transform(train_x_normalize)
    test_x_normalize = lsa.transform(test_x_normalize)
    val_kbest_features = lsa.transform(val_kbest_features)'''

    clf = SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3)

    acc = model_selection.cross_val_score(clf,
                                          X_kbest_features,
                                          y,
                                          cv=5,
                                          scoring='accuracy')
    print("accuracy", acc.mean())

    start_time = time.time()
    clf.fit(train_x_normalize, train_y)
    print("--- %s runtime in seconds ---" % (time.time() - start_time))
    # predict
    clf_pred = clf.predict(test_x_normalize)
    val_pred = clf.predict(val_kbest_features)

    #write to validation file
    helper.generate_prediction_csv(val_pred)

    # evaluation on test set
    print(metrics.classification_report(test_y, clf_pred))
    print(metrics.accuracy_score(test_y, clf_pred))
Esempio n. 45
0
# Split dataset in testing and training datasets. MNIST already comes separated
# into testing and training datasets: the first 60k images are training, and
# the last 10k are testing.

X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Shuffle training sets. Good idea since some learning algorithms are sensitive
# to the order of training sets. Not always a good idea, for example, on time
# series data.

import numpy as np

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Creating a "5-detector". Create the target vectors for the classification
# task:
y_train_5 = (y_train == 5)  # true for all 5s, False for all other digits.
y_test_5 = (y_test == 5)

# Pick a classifier to train. Will use Stochastic Gradient Descent.

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

# Use model to detect images of 5s:
sgd_clf.predict([some_digit])  # output is True: the classifier correctly predicts that the image is a 5.
Esempio n. 46
0
#Load the dataset
for line in HouseFile:  
    matchObj = re.match( r'\d+ \w+ \w+,\w+,(\w+),\w+,(\d+),(\d+),(\d+),\w+,\w+ \w+ \d+ \d+:\d+:\d+ \w+ \d+,(\d+)', line, re.M|re.I)

    if matchObj:
        # keep only rows where none of the captured numeric fields is zero
        if all(matchObj.group(i) != '0' for i in (2, 3, 4, 5)):
            new_value = np.matrix([float(matchObj.group(2)),float(matchObj.group(3)),float(matchObj.group(4))])
            info_list = np.concatenate((info_list, new_value), axis=0)
            new_price = np.matrix([float(matchObj.group(5))])
            price_matrix = np.concatenate((price_matrix, new_price), axis=0)

X = info_list
y = price_matrix.transpose()
list_y = np.array(y)[0].tolist()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)  # Don't cheat - fit only on training data
X = scaler.transform(X)  # apply the fitted scaler before training

clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X, list_y)
# The block below appears to be an estimator repr pasted from an interactive
# session; kept as a comment since it is never assigned or used:
# SGDClassifier(alpha=0.01, average=True, class_weight=None, epsilon=0.1,
#        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
#        learning_rate='optimal', loss='hinge', n_iter=1000, n_jobs=1,
#        penalty='l2', power_t=0.5, random_state=None, shuffle=True,
#        verbose=0, warm_start=False)

# Create your views here.

Esempio n. 47
0
 def fit(self, X, y, *args, **kw):
     X = sp.csr_matrix(X)
     return SGDClassifier.fit(self, X, y, *args, **kw)
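# The method above appears to belong to a subclass of SGDClassifier that coerces
# the input to a sparse CSR matrix before delegating to the parent fit. A minimal
# self-contained sketch of such a wrapper (class name and imports are assumptions):
import scipy.sparse as sp
from sklearn.linear_model import SGDClassifier


class SparseSGDClassifier(SGDClassifier):
    def fit(self, X, y, *args, **kw):
        # Convert the input to CSR so the downstream solver always sees sparse data.
        X = sp.csr_matrix(X)
        return SGDClassifier.fit(self, X, y, *args, **kw)
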
Esempio n. 48
0
# partition the data into training and testing splits, using 75%
# of the data for training and the remaining 25% for testing
print("[INFO] constructing training/testing split...")
(trainData, testData, trainLabels,
 testLabels) = train_test_split(np.array(data),
                                labels,
                                test_size=0.25,
                                random_state=42)

# train a Stochastic Gradient Descent classifier using the log (logistic)
# loss function and 10 epochs
model = SGDClassifier(loss="log", random_state=42, n_iter=10)
print("model", model, "trainData", trainData, "testData", testData,
      "trainLabels", trainLabels, "testLabels", testLabels)
model.fit(trainData, trainLabels)

# evaluate the classifier
print("[INFO] evaluating classifier...")
predictions = model.predict(testData)
print("88", predictions)  ##edited
print(classification_report(testLabels, predictions, target_names=le.classes_))

# to demonstrate that our classifier actually "learned" from
# our training data, randomly sample a few training images
idxs = np.random.choice(np.arange(0, len(trainData)), size=(5, ))

# loop over the training indexes
for i in idxs:
    # predict class probabilities based on the extracted color
    # histogram
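    # The loop body was truncated here. A minimal sketch of what it might do,
    # assuming trainData rows are the already-extracted feature vectors:
    probabilities = model.predict_proba(trainData[i].reshape(1, -1))[0]
    prediction = model.classes_[probabilities.argmax()]
    print("[INFO] predicted: {}, actual: {}".format(prediction, trainLabels[i]))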
Esempio n. 49
0
        content = regex1.sub(' ', content)
        content = regex2.sub(' ', content)
        content = regex3.sub(' ', content)
        X_test.append(content)

    outF1 = open("unigram.output.txt", "w")
    outF2 = open("unigramtfidf.output.txt", "w")
    outF3 = open("bigram.output.txt", "w")
    outF4 = open("bigramtfidf.output.txt", "w")
    """ unigram  """
    unigram = CountVectorizer(stop_words=stopwords)
    X_train_unigram = unigram.fit_transform(X)
    X_test_unigram = unigram.transform(X_test)

    sgd1 = SGDClassifier(penalty='l1')
    sgd1.fit(X_train_unigram, Y_train)
    Y_test = sgd1.predict(X_test_unigram)
    for result in Y_test:
        outF1.write(str(result))
        outF1.write("\n")

    outF1.close()
    """ unigram with tfidf  """
    tfidf_unigram = TfidfVectorizer(stop_words=stopwords)
    X_train_tf_unigram = tfidf_unigram.fit_transform(X)
    X_test_tf_unigram = tfidf_unigram.transform(X_test)

    sgd2 = SGDClassifier(penalty='l1')
    sgd2.fit(X_train_tf_unigram, Y_train)
    Y_test = sgd2.predict(X_test_tf_unigram)
    for result in Y_test:
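        # The rest of this example was truncated. A minimal sketch of the missing
        # loop body and clean-up, mirroring the unigram block above:
        outF2.write(str(result))
        outF2.write("\n")

    outF2.close()
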
#Train word2vec on test tweets
# imdb_w2v.train(x_test)

#Build test tweet vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs = scale(test_vecs)

# Use a classification algorithm (logistic regression trained with SGD) on the training set, then assess model performance on the test set
from sklearn.linear_model import SGDClassifier
scores = []
precision_scores=[]
recall_scores=[]
for x in range(10):
    print x
    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs, y_train)
    scores.append(lr.score(test_vecs, y_test))
    precision_scores.append(precision_score(y_test, lr.predict(test_vecs)))
    recall_scores.append(recall_score(y_test, lr.predict(test_vecs)))

print "accuracy: %f" % np.mean(scores)
print "precision: %f" % np.mean(precision_scores)
print "recall_scores: %f" % np.mean(recall_scores)

# Try a naivebayes classifier
from sklearn.naive_bayes import GaussianNB

scores = []
precision_scores=[]
recall_scores=[]
for x in range(10):
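    # The body of this loop was truncated. A minimal sketch mirroring the SGD
    # loop above, with GaussianNB in place of SGDClassifier (an assumption):
    nb = GaussianNB()
    nb.fit(train_vecs, y_train)
    scores.append(nb.score(test_vecs, y_test))
    precision_scores.append(precision_score(y_test, nb.predict(test_vecs)))
    recall_scores.append(recall_score(y_test, nb.predict(test_vecs)))
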
from sklearn.linear_model import SGDClassifier
X = [[0., 0.], [1., 1.]]
y = [0, 1]
clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X, y)
print clf.predict([[2., 2.]])
#To get the signed distance to the hyperplane use SGDClassifier.decision_function
print clf.decision_function([[2., 2.]])

clf1 = SGDClassifier(loss="log").fit(X, y)
clf1.predict_proba([[1., 1.]])
Esempio n. 52
0
#                     learning_rate_init=0.1, shuffle=True)

# classifier = MLPClassifier(solver='lbfgs', alpha=1e-5,
# 	hidden_layer_sizes=(30, 30), random_state=1, verbose=True)
# mlp.fit(train_x, train_y)

# score = mlp.score(vali_x, vali_y)
# print(score)

clf = SGDClassifier(loss="log",
                    penalty="l2",
                    max_iter=150,
                    tol=1e-4,
                    shuffle=True,
                    verbose=1)
clf.fit(train_x, train_y)
score = clf.score(vali_x, vali_y)
print(score)

# with open('data_B.csv', 'r') as f:
# 	data = f.readlines()

# f.close()

# raw_data = []

# for i in data:
# 	i = i.strip('\n')
# 	date, daylength, time, pid, label = i.split(',')
# 	temp_x = []
# 	temp_y = []
Esempio n. 53
0
from sklearn.datasets import load_iris
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

iris=load_iris()
X_iris, y_iris = iris.data, iris.target
X, y = X_iris[:, :2], y_iris

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 33)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = clf.predict(X_train)
print metrics.accuracy_score(y_train, y_train_predict)

y_predict = clf.predict(X_test)
print metrics.accuracy_score(y_test, y_predict)

print metrics.classification_report(y_test, y_predict, target_names = iris.target_names)
Esempio n. 54
0
X.pop('Parch')

X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, random_state=0)

#my_model_reg=DecisionTreeRegressor()
#my_model_reg.fit(X_train,Y_train)
#pred=my_model_reg.predict(X_valid)
#print('MAE DTR: ',mean_absolute_error(pred,Y_valid))

class_model = DecisionTreeClassifier(max_leaf_nodes=54)
class_model.fit(X_train, Y_train)
pred_class = class_model.predict(X_valid)
print('MAE DTC: ', mean_absolute_error(pred_class, Y_valid))

SGD_model = SGDClassifier()
SGD_model.fit(X_train, Y_train)
pred_SGD = SGD_model.predict(X_valid)
print('MAE SGD: ', mean_absolute_error(pred_SGD, Y_valid))

RF_model = RandomForestClassifier(max_depth=17, random_state=0)
RF_model.fit(X_train, Y_train)
pred_RF = RF_model.predict(X_valid)
print('MAE RF: ', mean_absolute_error(pred_RF, Y_valid))

KN_model = KNeighborsClassifier(n_neighbors=55,
                                weights='distance',
                                algorithm='auto')
KN_model.fit(X_train, Y_train)
pred_KN = KN_model.predict(X_valid)
print('MAE KN: ', mean_absolute_error(pred_KN, Y_valid))
Esempio n. 55
0
def main():
    #TRAIN
    # with codecs.open("train.csv", 'r', encoding="utf-8-sig") as file:
    # 	lines = [ x.strip() for x in file.readlines() ]

    # 	train = map( lambda s: s.split(","), lines)

    # 	Y, X = zip(*train)

    # 	model = ResearchModel()
    # 	model.train(X, Y, n_epochs=2000)
    # 	model.save("weightings.pickle")

    #TEST
    # with codecs.open("validate.csv", 'r', encoding="utf-8-sig") as file:
    # 	lines = [ x.strip() for x in file.readlines() ]

    # 	test = map( lambda s: s.split(","), lines)

    # 	Y, X = zip(*test)

    # 	# This is messy but whatevs
    # 	model = ResearchModel()
    # 	model.train(X, Y, n_epochs=0) # This isn't actually training im only initializing values

    # 	# Set the models WEIGHTINGS
    # 	# model.load("weightings.pickle")
    # 	model.weightings = WEIGHTINGS;
    # 	accuracy = model.test(X,Y)

    # 	print("ACCURACY = %f" % accuracy)
    # 	print(model.weightings)

    #SLACK
    with codecs.open("all.csv", 'r', encoding="utf-8-sig") as file:
        lines = [x.strip() for x in file.readlines()]

        data = map(lambda s: s.split(","), lines)

        Y, X = zip(*data)

        # Load weightings
        with open("weightings.pickle", "rb") as file:
            WEIGHTINGS = pickle.load(file)

        print(WEIGHTINGS)
        # Make Ymap
        Ymap = {}
        i = 0
        for y in Y:
            if not y in Ymap:
                Ymap[y] = i
                i += 1

        # Convert sentences to vectors and labels to integer ids
        X = [get_sentence_vector(x) for x in X]
        Y = [Ymap[y] for y in Y]

        model = SGDClassifier()
        model.fit(X, Y)

        print("Model ready")

        while True:
            events = slack_client.rtm_read()

            for event in events:
                if ('channel' in event and 'text' in event
                        and event['user'] == 'UCRPZ9R4K'
                        and event.get('type') == 'message'):

                    channel = event['channel']
                    text = event['text'].replace("\n", "").replace("?", "")
                    input_vector = get_sentence_vector(text.lower())

                    prediction = model.predict([input_vector])

                    label = None
                    for k in Ymap:
                        if Ymap[k] == prediction[0]:
                            label = k

                    tokens = pos_tag(word_tokenize(text))
                    chunks = cp.parse(tokens)

                    # Extract entities
                    meta = [
                        " ".join([token[0] for token in ch.leaves()])
                        for ch in filter(lambda x: x.label() == "ENTITY",
                                         chunks.subtrees())
                    ]

                    # Remove empty meta
                    meta = [m for m in meta if len(m) > 0]

                    print("Predicted intent: %s, entities: %s" % (label, meta))

                    slack_client.api_call(
                        'chat.postMessage',
                        channel=channel,
                        text=
                        "I predicted the intent to be: %s with the entities: %s\n"
                        % (label, ", ".join(meta)),
                        as_user='******')
Esempio n. 56
0
def fitness2(self, mode, solution, data, dummiesList, createDummies,
             normalize):

    matrix_length = len(np.unique(self.data[self.target]))

    if mode == 'sgd':
        model = SGDClassifier(class_weight='balanced',
                              loss='modified_huber',
                              random_state=1)
    elif mode == 'svr':
        model = SVC(kernel='linear', class_weight='balanced', probability=True)
    elif mode == 'rdf':
        model = SVC(kernel='rbf', class_weight='balanced', probability=True)
    elif mode == 'pol':
        model = SVC(kernel='poly', class_weight='balanced', probability=True)
    elif mode == 'rdc':
        # model = RandomForestClassifier(n_estimators=16, bootstrap=False, class_weight='balanced', random_state=1)
        # model = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='entropy',
        #                                max_depth=12, max_features=0.2, max_leaf_nodes=None,
        #                                min_impurity_split=1e-07, min_samples_leaf=1, min_samples_split=20,
        #                                min_weight_fraction_leaf=0.0, n_estimators=512, n_jobs=-1, oob_score=False,
        #                                random_state=3, verbose=False, warm_start=False)
        simpleimputer = SimpleImputer(add_indicator=False,
                                      copy=True,
                                      fill_value=None,
                                      missing_values=np.nan,
                                      strategy="median",
                                      verbose=0)
        standardscaler = StandardScaler(copy=True,
                                        with_mean=True,
                                        with_std=True)
        randomforestclassifier = RandomForestClassifier(
            bootstrap=False,
            class_weight=None,
            criterion="gini",
            max_depth=None,
            max_features=0.21975649694764154,
            max_leaf_nodes=None,
            min_impurity_decrease=0,
            min_impurity_split=None,
            min_samples_leaf=2,
            min_samples_split=4,
            min_weight_fraction_leaf=0.0,
            n_estimators=300,
            n_jobs=1,
            oob_score=False,
            random_state=1,
            verbose=0,
            warm_start=False)
        model = Pipeline(memory=None,
                         steps=[('simpleimputer', simpleimputer),
                                ('standardscaler', standardscaler),
                                ('randomforestclassifier',
                                 randomforestclassifier)])
    elif mode == 'dtc':
        model = DecisionTreeClassifier(class_weight='balanced', random_state=1)
    elif mode == 'gdc':
        model = GradientBoostingClassifier(random_state=1)
    elif mode == 'etc':
        model = ExtraTreesClassifier(class_weight='balanced', random_state=1)
    elif mode == 'adc':
        model = AdaBoostClassifier(random_state=1)
    elif mode == 'bac':
        model = BaggingClassifier(random_state=1)
    elif mode == 'lda':
        model = LinearDiscriminantAnalysis()
    elif mode == 'qda':
        model = QuadraticDiscriminantAnalysis()
    elif mode == 'gnb':
        model = GaussianNB()
    elif mode == 'rrc':
        model = RidgeClassifier(class_weight='balanced')
    else:
        model = LogisticRegression(solver='liblinear',
                                   C=10.0,
                                   class_weight='balanced')
    k = model_selection.StratifiedKFold(5)
    if not any(solution):
        solution[random.randint(0, len(solution) - 1)] = True
    try:
        tab_data, tab_val = tab.get([int(x) for x in solution], self.tab_data,
                                    self.tab_vals)
        tab_val = np.array(tab_val)
        accuracy = (getTotalTruePositive(tab_val) + getTotalTrueNegative(tab_val)) / \
                   (getTotalTruePositive(tab_val) + getTotalTrueNegative(tab_val) +
                    getTotalFalsePositive(tab_val) + getTotalFalseNegative(tab_val))
        precision_tab = []
        recall_tab = []
        for i in range(len(tab_val)):
            a = getTruePositive(tab_val, i) / (getFalsePositive(tab_val, i) +
                                               getTruePositive(tab_val, i))
            b = getTruePositive(tab_val, i) / (getFalseNegative(tab_val, i) +
                                               getTruePositive(tab_val, i))
            precision_tab.append(a)
            recall_tab.append(b)
        precision = sum(precision_tab) / len(precision_tab)
        recall = sum(recall_tab) / len(recall_tab)
        fscore = 2 * (1 / ((1 / precision) + (1 / recall)))
        matrix = tab_val
        tmp = self.data.drop([self.target], axis=1)
        tmp = tmp.iloc[:, solution]
        cols = tmp.columns
        self.tab_find = self.tab_find + 1
    except:
        matrix = np.zeros((matrix_length, matrix_length), dtype=int)
        X, y, cols = ready(self, solution, data, dummiesList, createDummies,
                           normalize)
        originalclass = []
        predictedclass = []
        for train_index, test_index in k.split(X, y):  # Split in X
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            if mode in ('knn', 'dct', 'gbc', 'lda', 'qda', 'adc', 'bac'):
                if mode == 'knn':
                    model = KNeighborsClassifier(
                        n_neighbors=int(len(X_train)**(1 / 2)))
                sm = SMOTE(sampling_strategy='auto')
                X_train, y_train = sm.fit_resample(X_train, y_train)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            originalclass.extend(y_test)
            predictedclass.extend(y_pred)

            matrix += confusion_matrix(y_test, y_pred)

        accuracy = (getTotalTruePositive(matrix) + getTotalTrueNegative(matrix)) / \
                   (getTotalTruePositive(matrix) + getTotalTrueNegative(matrix) +
                    getTotalFalsePositive(matrix) + getTotalFalseNegative(matrix))
        precision, recall, fscore, support = s(originalclass,
                                               predictedclass,
                                               average='macro')
        self.tab_data, self.tab_vals = tab.add([int(x) for x in solution],
                                               matrix.tolist(), self.tab_data,
                                               self.tab_vals)
        self.tab_insert = self.tab_insert + 1

    return accuracy, recall, precision, fscore, cols, matrix, self
Esempio n. 57
0
X, y = fiasco.createXY(inputFile)

X_train, X_test, y_train, y_test = fiasco.createTrainTestSplit()

# Defining classification methods

clf1 = NearestCentroid(
    metric='manhattan', shrink_threshold=500
)  # Performs better with the manhattan metric and a high shrink threshold. Remove params to see result with default params.
clf1.fit(X_train, y_train)

clf2 = SGDClassifier(
    loss="modified_huber", penalty="l1", max_iter=10000
)  # Modified huber is more tolerant to outliers than the default hinge loss. The l1 penalty adds a penalty equal to the absolute value of the coefficients.
clf2.fit(X_train, y_train)

clf3 = svm.SVC()
clf3.fit(X_train, y_train)

clf4 = svm.LinearSVC()
clf4.fit(X_train, y_train)

clf5 = KNeighborsClassifier()
clf5.fit(X_train, y_train)

# Plotting results

plt = fiasco.plot_learning_curve(
    clf1, "Nearest Centroid, metric = manhattan, shrink threshold = 500", X, y)
plt = fiasco.plot_learning_curve(clf2, "SGD Classifier", X, y)
Esempio n. 58
0
    stack_test /= n_folds

    stack = np.vstack([stack_train, stack_test])

    df_stack['pack_tfidf_vec_lr_classfiy_{}'.format(label)] = stack[:, 0]

########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train_uid), 1))
stack_test = np.zeros((len(test_uid), 1))
score_va = 0

for i, (tr, va) in enumerate(folds.split(train_feature, score)):
    print('stack:%d/%d' % ((i + 1), n_folds))
    sgd = SGDClassifier(random_state=1017, loss='log')
    sgd.fit(train_feature[tr], score[tr])
    score_va = sgd.predict_proba(train_feature[va])[:, 1]
    score_te = sgd.predict_proba(test_feature)[:, 1]
    print('score: ' +
          str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
    stack_train[va, 0] = score_va
    stack_test[:, 0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])

df_stack['pack_tfidf_vec_sgd_classfiy_{}'.format(label)] = stack[:, 0]

########################### pac(PassiveAggressiveClassifier) ################################
print('pac stacking')
stack_train = np.zeros((len(train_uid), 1))
stack_test = np.zeros((len(test_uid), 1))
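# The PassiveAggressiveClassifier fold loop was truncated here. A minimal sketch
# mirroring the SGD stacking loop above; PassiveAggressiveClassifier has no
# predict_proba, so decision_function scores are stacked instead (an assumption,
# not the original author's code).
from sklearn.linear_model import PassiveAggressiveClassifier

for i, (tr, va) in enumerate(folds.split(train_feature, score)):
    print('stack:%d/%d' % ((i + 1), n_folds))
    pac = PassiveAggressiveClassifier(random_state=1017)
    pac.fit(train_feature[tr], score[tr])
    stack_train[va, 0] = pac.decision_function(train_feature[va])
    stack_test[:, 0] += pac.decision_function(test_feature)
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])

df_stack['pack_tfidf_vec_pac_classfiy_{}'.format(label)] = stack[:, 0]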
Esempio n. 59
0
class PredictiveMarketVariables(MarketVariables):
    def __init__(self, shape, options):
        super(PredictiveMarketVariables, self).__init__(shape, options)
        self.shape = (self.shape[0] + 3, )
        self.observations = deque(maxlen=15000)
        self.mids = deque(maxlen=15000)
        self.regressor = ElasticNet(warm_start=True, alpha=20, l1_ratio=0)
        self.classifier = SGDClassifier(warm_start=True,
                                        max_iter=100,
                                        alpha=0.001,
                                        l1_ratio=0.5,
                                        penalty='elasticnet')
        self.is_fitted = False
        self.train_k = 1

    def transform(self, observation):
        if self.train_k == 1:
            prev_mid = (observation[0][Q_BID] + observation[0][Q_ASK]) / 2
        else:
            prev_mid = (self.prev_ask + self.prev_bid) / 2

        obs = MarketVariables.transform(self, observation)
        mid = (observation[0][Q_BID] + observation[0][Q_ASK]) / 2

        self.mids.append(mid)
        self.observations.append((obs[0:4]))

        self.train_k += 1
        if not self.is_fitted:
            if self.train_k == 1000:
                self.train()
        elif self.train_k % 10000 == 0:
            self.train()

        if self.is_fitted:
            mid_diff = (mid - prev_mid) / mid
            return obs + self.predict(obs[0:4] + (mid_diff, )) + (mid_diff, )
        else:
            return obs + (
                0, 0, 0)  # Zero percentage change and zero class (no change)

    def train(self):
        y = pd.Series(list(self.mids))
        y_pct = y.pct_change().fillna(0)
        y_sign = np.sign(y_pct)

        X = np.array(self.observations)
        X = np.concatenate(
            [X, y_pct.shift(-1).fillna(0).values.reshape(-1, 1)], axis=1)
        X_scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(X)
        #t = time.time()
        self.regressor.fit(X_scaled, y_pct)
        #print('Reg fit time:{:.2f}'.format(time.time()-t))

        #t = time.time()
        self.classifier.fit(X_scaled, y_sign)
        #print('Class fit time:{:.2f}'.format(time.time()-t))

        self.is_fitted = True

    def predict(self, obs):
        obs = np.array(obs).reshape(1, -1)
        return self.regressor.predict(obs)[0], self.classifier.predict(obs)[0]
Esempio n. 60
0
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# In[638]:

# shuffle the training data with a single permutation so features and labels stay aligned
import numpy as np
shuffle_index = np.random.permutation(len(X_train))
X_train1 = X_train[shuffle_index]
Y_train1 = y_train[shuffle_index]

# In[639]:

from sklearn.linear_model import SGDClassifier
lm = SGDClassifier()
lm.fit(X_train, y_train)

# In[640]:

# Find predicted values with cross-validation
from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(lm, X_train1, Y_train1, cv=5)

# In[641]:

y_pred = y_pred.astype(int)

# In[642]:

y_pred
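# A follow-up sketch (not in the original notebook): compare the out-of-fold
# predictions against the shuffled training labels.
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y_train1.astype(int), y_pred))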