Example #1
t_val = df_val['target']
x_val = df_val.drop('target',axis=1)

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN,SMOTETomek
from imblearn.under_sampling import ClusterCentroids


# test the sampling methods
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn import linear_model
from sklearn.neural_network import MLPClassifier

LogisticRegression = linear_model.LogisticRegression(solver='lbfgs',max_iter=500,tol=1e-3)
SGD = linear_model.SGDClassifier(loss="hinge", penalty="l2", max_iter=100,tol=1e-3)
RF = RandomForestClassifier(10)
GBD = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=10)
nn = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100,50,50,20, 2),tol=1e-3)
nn2 = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(10, 2),tol=1e-3)

mds = [LogisticRegression,SGD,RF,GBD,nn,nn2]
mds_name = ['LogisticRegression','SGD','RF','GBD','nn','nn2']
sample_name = ['ClusterCentroids','SMOTEENN','SMOTETomek','SMOTE','SMOTE borderline1','SMOTE borderline2','ADASYN']
sample_methods = [ClusterCentroids(),SMOTEENN(),SMOTETomek(),SMOTE(),SMOTE(kind='borderline1'),SMOTE(kind='borderline2'),ADASYN()]


import time
sample_roc = []
i=0
for s in sample_methods:
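    # --- illustrative sketch: the original loop body is truncated here ---
    # One plausible continuation: resample assumed training data (x_train, t_train,
    # not defined in this excerpt) with each method, then record the cross-validated
    # ROC AUC of every model.
    start = time.time()
    x_res, t_res = s.fit_resample(x_train, t_train)  # fit_sample() in older imbalanced-learn
    for md, md_name in zip(mds, mds_name):
        cv = cross_validate(md, x_res, t_res, scoring='roc_auc', cv=3)
        sample_roc.append((sample_name[i], md_name, cv['test_score'].mean()))
    print(sample_name[i], 'done in %.1f s' % (time.time() - start))
    i += 1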
Example #2
import pandas as pd
import sklearn.linear_model as lm

import classifier  # assumed local helper module, analogous to `evaluation`
import evaluation
import features.zip_codes


if __name__ == '__main__':
    data_train = pd.read_csv("../data/zip.train", header = None, sep =" ")
    cleaned_train_data = data_train.dropna(axis=1, thresh=2)

    input_data = cleaned_train_data.iloc[:, 1:].values
    targets = cleaned_train_data[0].values

    input_data2 = features.zip_codes.multires(input_data)

    # log reg with simple feature set
    print("Evaluating simple feature set")
    log_reg = lm.SGDClassifier(n_jobs=1, loss="log", max_iter = 50)

    classifier.fit(log_reg, input_data, targets)
    pred, pred_proba = classifier.predict(log_reg, input_data)

    evaluation.print_errors(targets, pred)
    print("")

    # log reg with advanced feature set
    print("Evaluating modified feature set")
    log_reg2 = lm.SGDClassifier(n_jobs=1, loss="log", max_iter=50)

    classifier.fit(log_reg2, input_data2, targets)
    pred, pred_proba = classifier.predict(log_reg2, input_data2)

    evaluation.print_errors(targets, pred)
Example #3
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, linear_model

x_pos = np.random.uniform(3.8, 4.2, (10000, 2))
x_neg = np.random.uniform(-4.2, -3.8, (100, 2))

y_pos = np.full(10000, 0)
y_neg = np.full(100, 1)

x = np.concatenate((x_pos, x_neg), axis=0)
y = np.concatenate([y_pos, y_neg])

#svc = svm.SVC(kernel='linear', C=10000).fit(x, y)
hinglesgd = linear_model.SGDClassifier(loss="hinge",
                                       penalty="l2",
                                       shuffle=True,
                                       average=10,
                                       alpha=0.00001).fit(x, y)
logsgd = linear_model.SGDClassifier(loss="log",
                                    penalty="l2",
                                    shuffle=True,
                                    average=10,
                                    alpha=0.00001).fit(x, y)

# create a mesh to plot in
h = .02  # step size in the mesh
#x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
#y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
x_min, x_max = -6, 6
y_min, y_max = -6, 6
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
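# --- illustrative sketch: the original snippet is truncated here ---
# Typical use of the mesh: draw each classifier's decision regions over the data.
for clf, title in [(hinglesgd, 'SGD, hinge loss'), (logsgd, 'SGD, log loss')]:
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(x[:, 0], x[:, 1], c=y, s=5)
    plt.title(title)
    plt.show()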
Example #4
                                r'\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b',
                            ))])),
                ('features_tweets', TweetsToFeatures()),
            ])

            clasificador_usado = Pipeline([
                ('features', feature_union),
                ('clf', naive_bayes.MultinomialNB(alpha=0.01)),
            ])
        elif args.clasificador == "LB2":
            clasificador_usado = Mayoria()
        elif args.clasificador == "MNB":
            clasificador_usado = naive_bayes.MultinomialNB()
            parameters_grid_search = parameters_mnb
        elif args.clasificador == "SGD":
            clasificador_usado = linear_model.SGDClassifier(shuffle=True)
        else:  # "SVM"
            clasificador_usado = svm.SVC()
            parameters_grid_search = parameters_svm

        if args.grid_search:
            grid_search = GridSearchCV(clasificador_usado,
                                       parameters_grid_search,
                                       cv=5,
                                       verbose=2,
                                       n_jobs=8)

            grid_search.fit(features, clases)
            print("Mejores parámetros encontrados para " + args.clasificador +
                  ":")
            for nombre_parametro, valor_parametro in clasificador_usado.get_params(
Example #5
train.dropna(inplace=True)
ans = train.pop('target')

# process test
test.fillna(value=0, inplace=True)

total = train.append(test)

# scaling
scaler = MinMaxScaler()
scaler.fit(total)
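# Note (not in the original): the fitted scaler is never applied below;
# presumably the intent was e.g.  total = scaler.transform(total)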

# split total back into train and test
train = total[:pretrain.shape[0]]

test = total[pretrain.shape[0]:]

# can modify variable
clf = linear_model.SGDClassifier(n_jobs=-1, verbose=1)
clf.fit(train, ans.astype('int'))
result = clf.predict(test)

# output result
result = pd.DataFrame(
    {
        'id': [str(i) for i in range(0, len(result))],
        'target': result
    },
    columns=['id', 'target'])
result.to_csv('result.csv', index=False, quoting=2)
Example #6
    print(i)
    for j in range(ch):
        spike_interval = np.int32(steps / (train_data[i, j] * 50 + 0.0001))
        jitter = np.random.randint(20)
        spikes = np.zeros(steps - 20)
        spikes[jitter::spike_interval] = 60
        train_in_spikes[j, :-20] = spikes
    reservoir_network.add_input(train_in_spikes)
    rate_coding = reservoir_network.simulate()
    #X[i,:] = rate_coding
    X[i, :] = rate_coding / (np.max(rate_coding) + 0.0001)

#maxX = (np.max(X) + 0.0001)
#X = X/maxX
print("training linear model")
clf = linear_model.SGDClassifier(max_iter=100000, tol=1e-3)
clf.fit(X, train_labels)

X_test = np.zeros((test_labels.shape[0], reservoir_network.n_nodes))
test_in_spikes = np.zeros((ch, steps))
initial_activities = np.zeros(test_labels.shape[0])
extra_activities = np.zeros(test_labels.shape[0])
for i in range(test_labels.shape[0]):
    print(i)
    for j in range(ch):
        spike_interval = np.int32(steps / (test_data[i, j] * 50 + 0.0001))
        jitter = np.random.randint(20)
        spikes = np.zeros(steps - 20)
        spikes[jitter::spike_interval] = 60
        test_in_spikes[j, :-20] = spikes
    reservoir_network.add_input(test_in_spikes)
Example #7
# Again with important features
clf2 = RandomForestClassifier(n_estimators=500, max_depth=30, random_state=0).fit(data[data.columns[clf.feature_importances_>0.01]], Y_train)
# Predictions
results=clf2.predict(test[data.columns[clf.feature_importances_>0.01]])
results = pd.DataFrame({'outcome':results[:]})
# Creating submission file
sub=pd.read_csv('sample_submission.csv',header=0)
outcome={0:"no", 1:"yes"}
sub['outcome'] = results["outcome"].map(outcome)
sub.to_csv('random_for_imp.csv')
#Score: 75.6083

# SGDClassifier
from sklearn import linear_model
clf = linear_model.SGDClassifier(max_iter=1000).fit(data,Y_train)
results=clf.predict(test)
results = pd.DataFrame({'outcome':results[:]})
# Creating submission file
sub=pd.read_csv('sample_submission.csv',header=0)
outcome={0:"no", 1:"yes"}
sub['outcome'] = results["outcome"].map(outcome)
sub.to_csv('sgdc.csv')
#Score: 76.2646

# LinearSVC with feature selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(data, Y_train)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(data)
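# Illustrative continuation (not part of the original snippet): refit on the
# selected features and transform the test data the same way before predicting.
clf3 = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_new, Y_train)
results = clf3.predict(model.transform(test))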
Example #8
def train_classifiers(train_x,
                      train_y,
                      test_x,
                      test_y,
                      articulatory=False,
                      dataset_name='',
                      classifiers=['lda'],
                      nframes_mfcc=1):
    """ train classifiers on the features to look at baseline classifications
    """
    print("size of input layer (== dimension of the features space) %d" %
          train_x.shape[1])
    ### Training a SVM to compare results TODO
    if 'sgd' in classifiers:
        ### Training a linear model (elasticnet) to compare results
        print("*** training a linear model with SGD ***")
        from sklearn import linear_model
        clf = linear_model.SGDClassifier(
            loss='modified_huber',
            penalty='elasticnet')  # TODO change and CV params
        clf.fit(train_x, train_y)
        print "score linear classifier (elasticnet, SGD trained)", clf.score(
            test_x, test_y)
        with open('linear_elasticnet_classif.pickle', 'w') as w_f:
            cPickle.dump(clf, w_f)

    if 'rf' in classifiers:
        ### Training a random forest to compare results
        print("*** training a random forest ***")
        from sklearn.ensemble import RandomForestClassifier
        clf2 = RandomForestClassifier(n_jobs=-1,
                                      max_features='log2',
                                      min_samples_split=3)
        clf2.fit(train_x, train_y)
        print "score random forest", clf2.score(test_x, test_y)

    if 'lda' in classifiers:
        print "*** training a linear discriminant classifier ***"
        from sklearn.lda import LDA
        from sklearn.metrics import confusion_matrix
        from sklearn import cross_validation

        def lda_on(train_x,
                   train_y,
                   test_x,
                   test_y,
                   feats_name='all_features'):
            """ Linear Discriminant Analysis """
            lda = LDA()
            lda.fit(train_x, train_y, store_covariance=True)
            print feats_name, "(train):", lda.score(train_x, train_y)
            print feats_name, "(test):", lda.score(test_x, test_y)
            with open(dataset_name + '_lda_classif_' + feats_name + '.pickle',
                      'w') as w_f:
                cPickle.dump(lda, w_f)
            y_pred = lda.predict(test_x)
            X_train, X_validate, y_train, y_validate = cross_validation\
                    .train_test_split(train_x, train_y, test_size=0.2,
                            random_state=0)
            lda.fit(X_train, y_train)
            print feats_name, "(validation):", lda.score(
                X_validate, y_validate)
            y_pred_valid = lda.predict(X_validate)
            cm_test = confusion_matrix(test_y, y_pred)
            cm_valid = confusion_matrix(y_validate, y_pred_valid)
            np.set_printoptions(threshold='nan')
            with open("cm_test" + feats_name + ".txt", 'w') as w_f:
                print >> w_f, cm_test
            with open("cm_valid" + feats_name + ".txt", 'w') as w_f:
                print >> w_f, cm_valid

        if articulatory:
            lda_on(train_x[:, :39 * nframes_mfcc],
                   train_y,
                   test_x[:, :39 * nframes_mfcc],
                   test_y,
                   feats_name='mfcc')
            lda_on(train_x[:, 39 * nframes_mfcc:],
                   train_y,
                   test_x[:, 39 * nframes_mfcc:],
                   test_y,
                   feats_name='arti')
        else:
            lda_on(train_x, train_y, test_x, test_y, feats_name='both')

    if 'featselec' in classifiers:
        ### Feature selection
        print("*** feature selection now: ***")
        print(" - Feature importances for the random forest classifier")
        print clf2.feature_importances_
        from sklearn.feature_selection import SelectPercentile, f_classif
        # SelectKBest TODO?
        selector = SelectPercentile(f_classif, percentile=10)  # ANOVA
        selector.fit(train_x, train_y)
        print selector.pvalues_
        scores = -np.log10(selector.pvalues_)
        scores /= scores.max()
        print(" - ANOVA scoring (order of the MFCC)")
        print scores
        from sklearn.feature_selection import RFECV
        print(" - Recursive feature elimination with cross-validation w/ LDA")
        lda = LDA()
        rfecv = RFECV(estimator=lda, step=1, scoring='accuracy')
        rfecv.fit(train_x, train_y)
        print("Optimal number of features : %d" % rfecv.n_features_)
        print("Ranking (order of the MFCC):")
        print rfecv.ranking_
Example #9
def sgd_classifiers():
    sgd = OneVsRestClassifier(linear_model.SGDClassifier())
    return sgd
def main():

    path = "../../../Herts/"
    extension = ".csv"

    numberLocations = len(files)
    locationList6 = []
    locationList3 = []
    locationList1 = []

    for i in range(0, numberLocations):

        toHoldOut = files[i]

        #day to (location to count)
        data = {}
        for dataFile in files:
            if dataFile != toHoldOut:
                filename = path + dataFile + extension
                with open(filename, 'rb') as f:
                    reader = csv.reader(f)
                    count = 0
                    for row in reader:
                        if (count > 0):
                            date = datetime.datetime.fromtimestamp(
                                float(row[0]))
                            dateString = str(date.month) + "/" + str(
                                date.day) + "/" + str(date.year)
                            if dateString in data:
                                locationCountMap = data[dateString]
                                if dataFile in locationCountMap:
                                    locationCountMap[
                                        dataFile] = locationCountMap[
                                            dataFile] + 1
                                else:
                                    locationCountMap[dataFile] = 1
                                data[dateString] = locationCountMap
                            else:
                                locationCountMap = {}
                                locationCountMap[dataFile] = 1
                                data[dateString] = locationCountMap
                        else:
                            count = 1
        # trace: "3/4/2016", "3/10/2016", "2/29/2016", "2/26/2016", "2/22/2016", "2/19/2016", "2/13/2016", "2/11/2016", "2/9/2016", "2/1/2016", "1/27/2016", "1/26/2016", "1/15/2016", "1/14/2016", "1/13/2016", "12/26/2015", "12/10/2015", "12/8/2015", "12/3/2015", "11/23/2015", "11/13/2015", "11/6/2015", "11/5/2015", "11/1/2015"
        setOfAllPrecipitationDays = [
            "3/2/2016", "2/25/2016", "2/24/2016", "2/23/2016", "2/20/2016",
            "2/16/2016", "2/15/2016", "2/10/2016", "2/8/2016", "2/5/2016",
            "2/4/2016", "2/3/2016", "1/23/2016", "1/18/2016", "1/17/2016",
            "1/16/2016", "1/12/2016", "1/10/2016", "1/4/2016", "12/31/2015",
            "12/30/2015", "12/29/2015", "12/27/2015", "12/24/2015",
            "12/23/2015", "12/22/2015", "12/18/2015", "12/17/2015",
            "12/15/2015", "12/14/2015", "12/2/2015", "12/1/2015", "11/28/2015",
            "11/22/2015", "11/20/2015", "11/19/2015", "11/12/2015",
            "11/11/2015", "11/10/2015"
        ]
        #x is a list of (list of counts), where each index in the inner lists represents a location
        x = []
        y = []
        for day in data:
            locationCountMap = data[day]
            toAdd = []
            for location in files:
                if location in locationCountMap:
                    toAdd.append(locationCountMap[location])
                else:
                    toAdd.append(0)
            x.append(toAdd)
            if day in setOfAllPrecipitationDays:
                #1 indicates a precipitation day
                y.append(1)
            else:
                #0 indicates a non-precipitation day
                y.append(0)

        #"Training" the data (which in this case is just maintaining these training pairs for later use)
        y = np.array(y)

        # K nearest neighbors
        neighbors = KNeighborsClassifier()
        neighbors.fit(x, y)

        # SGD
        sgd = linear_model.SGDClassifier()
        sgd.fit(x, y)

        # SVC
        svc = SVC()
        svc.fit(x, y)

        # Bernoulli Naive Bayes
        nb = BernoulliNB()
        nb.fit(x, y)

        # Decision tree
        decisionTree = tree.DecisionTreeClassifier()
        decisionTree.fit(x, y)

        toAdd6 = []
        toAdd3 = []
        toAdd1 = []

        scores1 = cross_validation.cross_val_score(neighbors, x, y, cv=5)
        scores2 = cross_validation.cross_val_score(sgd, x, y, cv=5)
        scores3 = cross_validation.cross_val_score(svc, x, y, cv=5)
        scores4 = cross_validation.cross_val_score(nb, x, y, cv=5)
        scores5 = cross_validation.cross_val_score(decisionTree, x, y, cv=5)

        toAdd6.append(np.mean(scores1))
        toAdd6.append(np.mean(scores2))
        toAdd6.append(np.mean(scores3))
        toAdd6.append(np.mean(scores4))
        toAdd6.append(np.mean(scores5))

        #using best 3 classifiers, svc, sgd, decision tree
        toAdd3.append(np.mean(scores3))
        toAdd3.append(np.mean(scores2))
        toAdd3.append(np.mean(scores5))

        #using best classifier, svc
        toAdd1.append(np.mean(scores3))

        averageAccuracy6 = np.mean(toAdd6)
        averageAccuracy3 = np.mean(toAdd3)
        averageAccuracy1 = np.mean(toAdd1)

        locationList6.append((averageAccuracy6, i))
        locationList3.append((averageAccuracy3, i))
        locationList1.append((averageAccuracy1, i))

    print "*****************************"
    print "Average of all 6 classifier methods:"
    clusterLocations(locationList6)

    print "*****************************"
    print "Average of best 3 classifier methods:"
    clusterLocations(locationList3)

    print "*****************************"
    print "Using only best classifier method:"
    clusterLocations(locationList1)
Example #11
import numpy
import pandas as pd
import joblib
from sklearn import linear_model
from sklearn.model_selection import train_test_split

df = pd.read_csv(
    "https://raw.githubusercontent.com/tyler-martin-12/alexa_check_flag_skill/master/df_final.csv",
    index_col=0)

print(df.head())

x = df.copy().drop('label', axis=1)
y = df['label']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.3 , random_state=1)

lm = linear_model.SGDClassifier(alpha=.1,loss='log')
lm.fit(x_train, y_train)

joblib.dump(lm, 'model.pkl')
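# Later, the dumped model can be reloaded and used for scoring (illustrative):
# clf = joblib.load('model.pkl')
# clf.predict_proba(x_test)   # loss='log' exposes probability estimates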
Example #12
scaler = preprocessing.StandardScaler()
scaler.fit(X)

scaler.mean_
scaler.scale_

X_scaled = scaler.transform(X)

# According to the scikit-learn documentation, the following is a good guess for
# the number of iterations required to achieve convergence
n_iter = np.ceil(10**6 / X.shape[0])
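# e.g. with 100,000 training samples this heuristic gives n_iter = ceil(1e6 / 1e5) = 10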

# As usual, the regularisation parameter 'alpha' can be tuned using
# `grid_search.GridSearchCV`
gs = grid_search.GridSearchCV(
    estimator=lm.SGDClassifier(loss='log', penalty='l2', n_iter=n_iter),
    param_grid={'alpha': 10.0**-np.arange(1, 7)},
    scoring='roc_auc',
    cv=kf
)
gs.fit(X_scaled, y)

gs.best_estimator_

# Before using this model to predict, we'd need to call `scaler.transform` on
# the new data
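# e.g. (illustrative, with `X_unseen` standing in for new data):
# gs.best_estimator_.predict(scaler.transform(X_unseen))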

# We can also put everything together in a pipeline…

sgd_pipeline = Pipeline([
    ('scale', preprocessing.StandardScaler()),
Example #13
perceptron.fit(X_train, Y_train)

Y_pred = perceptron.predict(X_test)

acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)

Y_pred = linear_svc.predict(X_test)

acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
decision_tree = DecisionTreeClassifier() 
decision_tree.fit(X_train, Y_train)  
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
sgd = linear_model.SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)

sgd.score(X_train, Y_train)

acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
print(acc_sgd)

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)

Y_prediction = random_forest.predict(X_test)
print(Y_prediction)
import csv
with open("output_result.csv", 'w', newline='') as myfile:
plt.legend(loc='best')
plt.show()

# 4: Reshape the data-------------------------------------------------------
X_std = X_xor
z = y_xor

# 5: Apply the kernel approximation------------------------------------------
rbf_feature = RBFSampler(gamma=1, n_components=100, random_state=1)

X_std = rbf_feature.fit_transform(X_std)
print("X_stdの大きさ ", pd.DataFrame(X_std).shape)
#pd.DataFrame(X_std).to_clipboard() #これでクリップボードに保持できるのでエクセルに貼れる

# 6: Classify with machine learning---------------------------------------------------
clf_result = linear_model.SGDClassifier(
    loss="hinge")  #loss="hinge", loss="log"

# 7: Evaluate performance with K-fold cross-validation---------------------
scores = cross_validation.cross_val_score(clf_result, X_std, z, cv=10)
print("平均正解率 = ", scores.mean())
print("正解率の標準偏差 = ", scores.std())

# 8: Split into training and test data and run again------------------
X_train, X_test, train_label, test_label = cross_validation.train_test_split(
    X_std, z, test_size=0.1, random_state=1)
clf_result.fit(X_train, train_label)
# compute the accuracy
pre = clf_result.predict(X_test)
ac_score = metrics.accuracy_score(test_label, pre)
print("正答率 = ", ac_score)
    start = time.time()
    estimator.fit(X_train, y_train)

    fit_time = time.time() - start
    n_iter = estimator.n_iter_
    train_score = estimator.score(X_train, y_train)
    test_score = estimator.score(X_test, y_test)

    return fit_time, n_iter, train_score, test_score


# Define the estimators to compare
estimator_dict = {
    'No stopping criterion':
    linear_model.SGDClassifier(tol=1e-3, n_iter_no_change=3),
    'Training loss':
    linear_model.SGDClassifier(early_stopping=False,
                               n_iter_no_change=3,
                               tol=0.1),
    'Validation score':
    linear_model.SGDClassifier(early_stopping=True,
                               n_iter_no_change=3,
                               tol=0.0001,
                               validation_fraction=0.2)
}

# Load the dataset
X, y = load_mnist(n_samples=10000)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
def predict(dir, name, validate):

    base = "%s/%s" % (dir, name)
    negpath = "%strain_neg%s.txt" % (base, suffix)
    pospath = "%strain_pos%s.txt" % (base, suffix)
    testpath = "%stest_data.txt" % (base)

    if validate:
        printer("Validating " + base)
    else:
        printer("Predicting " + base)

    printer("Reading train tweets...")

    negtweets = [[0, t, -1] for t in p.read_tweets(negpath)]
    postweets = [[0, t, 1] for t in p.read_tweets(pospath)]
    testweets = [[0, t, 0] for t in p.read_tweets(testpath)]

    printer("Reading test tweets...")
    #testtweets = pd.DataFrame.from_records(p.process_testdata(testpath), columns=["ind","tweet"])
    #testtweets["label"] = 0

    printer("Processing data...")
    data = pd.DataFrame(testweets + negtweets + postweets,
                        columns=["ind", "tweet", "label"])

    printer("Vectorising data...")
    featureCount = 0
    if useVectoriser:

        count_vectorizer = CountVectorizer(
            preprocessor=preprocessor,
            ngram_range=(1, 3),
            min_df=3,
            lowercase=True,
            binary=False,
            token_pattern=r'(?u)(?<=\s)\S+(?=\s)')
        vec_data = count_vectorizer.fit_transform(data["tweet"])

        features = count_vectorizer.get_feature_names()
        featureCount = len(features)
        printer(features[:60])
        printer("Found " + str(featureCount) + " features")

    else:

        data["norm"] = [line.split(' ') for line in data["tweet"]]
        vec_data = bf.vectorise(data["norm"])

    printer("Vectorized, learning...")
    if validate:

        vec_train, vec_test, labels_train, labels_test = train_test_split(
            vec_data[10000:],
            data["label"][10000:],
            test_size=0.25,
            random_state=1)

    else:

        vec_train = vec_data[10000:]
        labels_train = data["label"][10000:]
        vec_test = vec_data[:10000]

    # printer("Start MLP\n")
    # clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(128, 64), random_state=1)

    printer("Start SGD\n")
    clf = linear_model.SGDClassifier(shuffle=True,
                                     max_iter=10000,
                                     tol=0.0001,
                                     loss='hinge',
                                     penalty='l2',
                                     alpha=0.0001)

    printer("Predicting...")
    clf_output = clf.fit(vec_train, labels_train)

    # predict data
    ans = clf.predict(vec_test)
    pred = clf.decision_function(vec_test)

    if validate:

        printer(metrics.classification_report(labels_test, ans))
        score = metrics.f1_score(labels_test, ans)
        return (setting, score, featureCount)

    else:

        # save data
        res = pd.DataFrame(ans)
        res.index += 1
        res.to_csv(path_or_buf="%ssubmission.csv" % name,
                   index=True,
                   index_label="Id",
                   header=["Prediction"])

        return ((setting, 0, featureCount))
# Choose model
# from sklearn import gaussian_process
# Gaussian = gaussian_process.GaussianProcess(theta0=1e-2, thetaL=1e-4, thetaU=1e-1)
# GaussianProcessRegressor
# from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
svm = SVC(kernel='linear')
from sklearn.svm import LinearSVC
svmLinear = LinearSVC()
from sklearn import tree
cartTree = tree.DecisionTreeClassifier()

linear_square = lm.SGDClassifier(loss='squared_loss',
                                 penalty='none',
                                 max_iter=maxIter,
                                 tol=tolerance)
ridge = lm.SGDClassifier(loss='squared_loss',
                         penalty='l2',
                         max_iter=maxIter,
                         tol=tolerance,
                         alpha=0.5)
# ridgel1 = lm.SGDClassifier(loss='squared_loss', penalty='l2', max_iter=maxIter, tol=tolerance)
lasso = lm.SGDClassifier(loss='squared_loss',
                         penalty='l1',
                         max_iter=maxIter,
                         tol=tolerance)
logistic = lm.LogisticRegression()
# bayes = lm.BayesianRidge()

# Bagging
    def setSGD(self):
        self.classifier = linear_model.SGDClassifier(loss="log")
        print "Using SGD classifier"
Example #19
def main():

    #November 1st 2015 to March 10th 2016
    #1447306774
    #Thu, 12 Nov 2015 05:39:34 GMT
    path = "../../../Herts/"
    extension = ".csv"

    #	numberLocations = len(files)
    #	locationList6 = []
    #	locationList3 = []
    #	locationList1 = []

    #	for i in range(0, numberLocations):

    #		toHoldOut = files[i]

    #day to (location to count)
    data = {}
    #earliestDay = float("inf")
    #latestDay = float("-inf")
    for dataFile in files:
        #			if dataFile != toHoldOut:
        filename = path + dataFile + extension
        with open(filename, 'rb') as f:
            reader = csv.reader(f)
            count = 0
            for row in reader:
                if (count > 0):
                    date = datetime.datetime.fromtimestamp(float(row[0]))
                    dateString = str(date.month) + "/" + str(
                        date.day) + "/" + str(date.year)
                    #thisDay = date.year * 10000 + date.month * 100 + date.day
                    #if(thisDay < earliestDay):
                    #	earliestDay = thisDay
                    #if(thisDay > latestDay):
                    #	latestDay = thisDay
                    if dateString in data:
                        locationCountMap = data[dateString]
                        if dataFile in locationCountMap:
                            locationCountMap[
                                dataFile] = locationCountMap[dataFile] + 1
                        else:
                            locationCountMap[dataFile] = 1
                        data[dateString] = locationCountMap
                    else:
                        locationCountMap = {}
                        locationCountMap[dataFile] = 1
                        data[dateString] = locationCountMap
                else:
                    count = 1

    #print earliestDay
    #print latestDay
    #setOfAllSchoolDays = ["11/2/2015", "11/3/2015", "11/4/2015", "11/5/2015", "11/6/2015", "11/9/2015", "11/10/2015", "11/11/2015", "11/12/2015", "11/13/2015", "11/16/2015", "11/17/2015", "11/18/2015", "11/19/2015", "11/20/2015", "11/23/2015", "11/24/2015", "11/30/2015", "12/1/2015", "12/2/2015", "12/3/2015", "12/4/2015", "12/7/2015", "12/8/2015", "12/9/2015", "12/10/2015", "12/11/2015", "1/27/2016", "1/28/2016", "1/29/2016", "2/1/2016", "2/2/2016", "2/3/2016", "2/4/2016", "2/5/2016", "2/9/2016", "2/10/2016", "2/11/2016", "2/12/2016", "2/15/2016", "2/16/2016", "2/17/2016", "2/18/2016", "2/19/2016", "2/24/2016", "2/25/2016", "2/26/2016", "2/29/2016", "3/1/2016", "3/2/2016", "3/3/2016", "3/4/2016", "3/7/2016", "3/8/2016", "3/9/2016", "3/10/2016"]
    setOfAllPrecipitationDays = [
        "3/4/2016", "3/10/2016", "2/29/2016", "2/26/2016", "2/22/2016",
        "2/19/2016", "2/13/2016", "2/11/2016", "2/9/2016", "2/1/2016",
        "1/27/2016", "1/26/2016", "1/15/2016", "1/14/2016", "1/13/2016",
        "12/26/2015", "12/10/2015", "12/8/2015", "12/3/2015", "11/23/2015",
        "11/13/2015", "11/6/2015", "11/5/2015", "11/1/2015", "3/2/2016",
        "2/25/2016", "2/24/2016", "2/23/2016", "2/20/2016", "2/16/2016",
        "2/15/2016", "2/10/2016", "2/8/2016", "2/5/2016", "2/4/2016",
        "2/3/2016", "1/23/2016", "1/18/2016", "1/17/2016", "1/16/2016",
        "1/12/2016", "1/10/2016", "1/4/2016", "12/31/2015", "12/30/2015",
        "12/29/2015", "12/27/2015", "12/24/2015", "12/23/2015", "12/22/2015",
        "12/18/2015", "12/17/2015", "12/15/2015", "12/14/2015", "12/2/2015",
        "12/1/2015", "11/28/2015", "11/22/2015", "11/20/2015", "11/19/2015",
        "11/12/2015", "11/11/2015", "11/10/2015"
    ]
    #x is a list of (list of counts), where each index in the inner lists represents a location
    x = []
    y = []
    for day in data:
        locationCountMap = data[day]
        toAdd = []
        for location in files:
            if location in locationCountMap:
                toAdd.append(locationCountMap[location])
            else:
                toAdd.append(0)
        x.append(toAdd)
        if day in setOfAllPrecipitationDays:
            #1 indicates a precipitation day
            y.append(1)
        else:
            #0 indicates a non-precipitation day
            y.append(0)

    #"Training" the data (which in this case is just maintaining these training pairs for later use)
    y = np.array(y)

    # K nearest neighbors
    neighbors = KNeighborsClassifier()
    neighbors.fit(x, y)

    # SGD
    sgd = linear_model.SGDClassifier()
    sgd.fit(x, y)

    # SVC
    svc = SVC()
    svc.fit(x, y)

    # Bernoulli Naive Bayes
    nb = BernoulliNB()
    nb.fit(x, y)

    # Decision tree
    decisionTree = tree.DecisionTreeClassifier()
    decisionTree.fit(x, y)

    scores1 = cross_validation.cross_val_score(neighbors, x, y, cv=5)
    scores2 = cross_validation.cross_val_score(sgd, x, y, cv=5)
    scores3 = cross_validation.cross_val_score(svc, x, y, cv=5)
    scores4 = cross_validation.cross_val_score(nb, x, y, cv=5)
    scores5 = cross_validation.cross_val_score(decisionTree, x, y, cv=5)

    print "Neighbors:"
    print np.mean(scores1)
    print "SGD:"
    print np.mean(scores2)
    print "SVM:"
    print np.mean(scores3)
    print "NB:"
    print np.mean(scores4)
    print "Tree:"
    print np.mean(scores5)
import numpy as np
from sklearn import linear_model
from sklearn import tree
from sklearn import ensemble
from sklearn import svm

MODELS = {
    "SVC": svm.SVC(kernel="linear"),
    "SGDClassifier": linear_model.SGDClassifier(max_iter=5, tol=-np.infty, random_state=42)
}
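# Illustrative usage of the MODELS registry (not part of the original snippet):
# for name, model in MODELS.items():
#     print(name, model.fit(X_train, y_train).score(X_test, y_test))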
#!/usr/bin/env python
# uses GridSearchCV to optimize SVM
import numpy as np
import sklearn.linear_model as lm
import sklearn.grid_search as gs  # moved to sklearn.model_selection in newer scikit-learn
import lib.loader as ld
import sklearn.feature_extraction.text as tfidf

# training data
trainx, trainy = ld.loadtrain('data/trainingdata.txt')
trainx2 = tfidf.TfidfTransformer().fit_transform(trainx)
parameters = {'alpha': [10**i for i in np.arange(-5, -2, 0.2)], 
        'loss': ['hinge', 'log']}
mdl = lm.SGDClassifier()
clf = gs.GridSearchCV(mdl, parameters, n_jobs=-1, cv=5)
clf.fit(trainx2.toarray(), trainy)

# print results
print(clf.best_score_)              # best score
print(clf.best_params_)             # best params

parameters = {'alpha': [10**i for i in np.arange(-3, 0, 0.2)], 
        'loss': ['hinge', 'log']}
mdl = lm.SGDClassifier()
clf = gs.GridSearchCV(mdl, parameters, n_jobs=-1, cv=5)
clf.fit(trainx2.toarray(), trainy)

# print results
print(clf.best_score_)              # best score
print(clf.best_params_)             # best params
Example #22
# y_XXX_all_labels stores the label rank results of each review
X_train, X_test, y_train_all_labels, y_test_all_labels = train_test_split(
    X, y_array, test_size=0.2, random_state=42)

#now we only take the first label(real label) to do classification
y_train = np.array(y_train_all_labels)[:, 0].tolist()
y_test = np.array(y_test_all_labels)[:, 0].tolist()

#---------------------------Start Training------------------------------------

if (classifier_type == 'svm'):
    #    classifier = OneVsRestClassifier(svm.SVC(kernel='linear', C=1, probability=True, random_state=0))
    classifier = OneVsRestClassifier(
        linear_model.SGDClassifier(max_iter=500,
                                   tol=1e-3,
                                   random_state=21,
                                   warm_start=warm_start_set))

if (classifier_type == 'mlp'):
    classifier = MLPClassifier(hidden_layer_sizes=(100, 100),
                               max_iter=500,
                               alpha=0.0001,
                               solver='adam',
                               verbose=0,
                               random_state=21)
#                     warm_start=warm_start_set)

sys.exit(0)
print("Strat training...")
classifier.fit(X_train, y_train)
#sys.exit(0)
Example #23
def main():

    t0 = time.time()

    ## 1.b) Load and convert datasets                                              start
    #
    #    #Get cvs paths
    #    relation_path = r"C:\\Users\\V\\Desktop\\UW Vidal\\Winter 18\\TCSS455 Introduction to Machine Learning\\Project\\training\\relation\\relation.csv"
    #    profile_path = r"C:\\Users\\V\\Desktop\\UW Vidal\\Winter 18\\TCSS455 Introduction to Machine Learning\\Project\\training\\profile\\profile.csv"
    #
    #    #Convert csv into pandas DataFrame
    #    relation_df = pd.read_csv(relation_path)
    #    profile_df = pd.read_csv(profile_path)

    # 2. Summarize Data ###########################################################
    # 2.a) Descriptive statistics
    #    print(profile_df.describe())
    #    pd.set_option('display.width', 100)
    #    pd.set_option('precision', 3)
    #    correlations = profile_df.corr(method='pearson')
    #    print(correlations)
    #    profile_df.hist()
    #    pyplot.show()
    #
    #
    #
    ## 3. Prepare Data #############################################################
    ## a) Data Cleaning
    ## b) Feature Selection
    ## c) Data Transforms
    #
    #
    #    userid_col = relation_df[['userid']]
    #    row_counter = 1
    #    num_users = 1
    #    userid_dict = {}
    #
    #    #put all userids' in a dictionary
    #    for index, row in userid_col.iterrows():
    #        l = row.tolist()
    #        userid = l[0].strip()
    #        if (userid not in userid_dict):
    #            userid_dict[userid] = ""
    #            num_users += 1
    #
    #        row_counter += 1
    #        if (row_counter < -2000): #15change
    #            break
    #
    #
    #    #relation_head = relation_df.head(2000) #25change
    #    #profile_head = profile_df.head(2000) #35change
    #    #head_profile.to_csv("head.csv", sep=',')
    #
    #
    #    print("Here now")
    #
    #    #combine all likeids' associated with a userid
    #    #make this the value of the userid in the dictionary
    #    for index, row in relation_df.iterrows():#45change
    #
    #        row_list = row.tolist()
    #
    #        userid = str(row_list[1])
    #        user_vals = userid_dict[userid]
    #        userid_dict[userid] = user_vals + " " + str(row_list[2])
    #
    #    t_df = pd.DataFrame.from_dict(userid_dict, orient='index')
    #    t_df = t_df.reset_index() ## remember to reassign when calling a function
    #    t_df.columns = ["userid", "likes"]
    #
    #    merge_df = pd.merge(t_df, profile_df, on="userid") #55change                 end
    merge_df = pd.read_csv('merged.csv', sep=',')

    #    pd.set_option('display.width', 100)
    #    pd.set_option('precision', 3)
    #    correlations = merge_df.corr(method='pearson')

    # 4. Evaluate Algorithms ######################################################
    # a) Split-out validation dataset
    X = merge_df['likes']
    y_gender = merge_df['gender']
    y_age = merge_df['age']

    #age convert
    y_age = y_age.apply(convert_age_to_class)
    y_age.to_csv("age_classified.csv", sep=',')
    y = y_age  # assumption: the split below targets the age classes

    #general algorithm
    valida_size = 0.20
    seed = 7
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, test_size=valida_size, random_state=seed)
    #
    ## b) Test options and evaluation metric
    #
    count_vect1 = CountVectorizer()
    X_train = count_vect1.fit_transform(X_train)  #X_train is sparse.csr_matrix
    ##    print(type(X_train))
    ##    print(X_train.shape)
    ##    print(count_vect1.get_feature_names())
    count_vect2 = CountVectorizer(vocabulary=count_vect1.vocabulary_)
    X_validation = count_vect2.fit_transform(X_validation)
    ##    print(type(X_validation))
    ##    print(X_validation.shape)
    #
    #    kNN = KNeighborsClassifier()
    #    bNB_clf = BernoulliNB()
    #    dt_clf = DecisionTreeClassifier(random_state=0)
    #    lr_clf = LogisticRegression(random_state=seed)
    #    sgd_clf = linear_model.SGDClassifier(max_iter=10, learning_rate='optimal', random_state=seed)
    #    mNB_clf = MultinomialNB()
    #
    #    enclf = VotingClassifier(estimators=[('dt', dt_clf),('lr', lr_clf), ('sgd', sgd_clf), ('mNB', mNB_clf)], voting='hard')
    #
    #    dt_clf.fit(X_train, y_train)
    #    bNB_clf.fit(X_train, y_train)
    #    lr_clf.fit(X_train, y_train)
    #    sgd_clf.fit(X_train, y_train)
    #    mNB_clf.fit(X_train, y_train)
    #    enclf = enclf.fit(X_train, y_train)
    #
    #    print("Here")
    #
    #    results_dt = dt_clf.predict(X_validation)
    #    results_bNB = bNB_clf.predict(X_validation)
    #    results_lr = lr_clf.predict(X_validation)
    #    results_sgd = sgd_clf.predict(X_validation)
    #    results_nMB = mNB_clf.predict(X_validation)
    #    results_enclf = enclf.predict(X_validation)
    #
    #    print("results_bNB")
    #    print(accuracy_score(y_validation, results_dt))
    #    print(confusion_matrix(y_validation, results_dt))
    #    print(classification_report(y_validation, results_dt))
    #    print("results_bNB")
    #    print(accuracy_score(y_validation, results_bNB))
    #    print(confusion_matrix(y_validation, results_bNB))
    #    print(classification_report(y_validation, results_bNB))
    #    print("results_lr")
    #    print(accuracy_score(y_validation, results_lr))
    #    print(confusion_matrix(y_validation, results_lr))
    #    print(classification_report(y_validation, results_lr))
    #    print("results_sgd")
    #    print(accuracy_score(y_validation, results_sgd))
    #    print(confusion_matrix(y_validation, results_sgd))
    #    print(classification_report(y_validation, results_sgd))
    #    print(results_lr)
    #    print(results_sgd)
    #    print()
    #    print("results_mNB")
    #    print(accuracy_score(y_validation, results_nMB))
    #    print(confusion_matrix(y_validation, results_nMB))
    #    print(classification_report(y_validation, results_nMB))
    #
    #    print("results_enclf")
    #    print(accuracy_score(y_validation, results_enclf))
    #    print(confusion_matrix(y_validation, results_enclf))
    #    print(classification_report(y_validation, results_enclf))
    ## c) Spot Check Algorithms
    #    models = []
    #    models.append(('multiNB', MultinomialNB()))
    #    models.append(('bernoulliNB', BernoulliNB()))
    #    models.append(('kNN', KNeighborsClassifier()))
    #    models.append(('LogReg', LogisticRegression()))
    #    models.append(('SGD', linear_model.SGDClassifier(max_iter=10, learning_rate='optimal', random_state=seed)))
    ##
    ##
    ##    print()
    #    print('Comparing Algorithms')
    #    results = []
    #    names = []
    #    for name, model in models:
    #        kfold = KFold(n_splits=10, random_state = seed)
    #        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    #        results.append(cv_results)
    #        names.append(name)
    #        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))
    #
    ## d) Compare Algorithms
    #    fig = pyplot.figure()
    #    fig.suptitle('Algorithm Comparison')
    #    ax = fig.add_subplot(111)
    #    pyplot.boxplot(results)
    #    ax.set_xticklabels(names)
    #    pyplot.show()

    # 'learning_rate': ('optimal', 'constant', 'invscaling') could also be added to the grid below
    parameters = {'max_iter': (1, 5, 10, 20), 'shuffle': (True, False)}
    sgd = linear_model.SGDClassifier()
    clf = GridSearchCV(sgd, parameters)
    clf.fit(X_train, y_train)
    print(clf.cv_results_)
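    # Illustrative addition: report the winning parameter combination
    # print(clf.best_params_, clf.best_score_)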
    #    lR = LogisticRegression()
    #    lR.fit(X_train,y_train)
    #    count_vect_val = CountVectorizer()
    #    print("Here")
    ##    X_validation = count_vect_val.fit_transform(X_validation)
    #    predictions = lR.predict(X_validation)
    #    print(accuracy_score(y_validation, predictions))
    #    print(confusion_matrix(y_validation, predictions))
    #    print(classification_report(y_validation, predictions))
    #
    #    t1 = time.time()
    #    print("\n\n--- %s seconds ---" % (t1-t0))
    #    winsound.Beep(500,1000)
    print("Done")
Example #24
    df1 = pd.DataFrame()
    for i in range(0, a):
        txt = deepcopy(df["text"][i])
        txt1 = re.sub("[^a-zA-Z]", " ", txt)
        txt2 = txt1.lower().split()
        txt3 = [j for j in txt2 if not j in content]
        txt4 = " ".join(txt3)
        df1 = df1.append([[i, txt4, df["polarity"][i]]], ignore_index=True)
    df1.columns = ['row_number', 'text', 'polarity']

    voc = []
    for i in xrange(0, a):
        voc.append(df1["text"][i])

    vectorizer1 = CountVectorizer(max_features=5000)
    SGD1 = linear_model.SGDClassifier(loss='hinge', penalty='l1')
    unigram = vectorizer1.fit_transform(voc)
    unigram_model = SGD1.fit(unigram.toarray(), df1['polarity'])

    vectorizer2 = CountVectorizer(ngram_range=(1, 2), max_features=5000)
    SGD2 = linear_model.SGDClassifier(loss='hinge', penalty='l1')
    bigram = vectorizer2.fit_transform(voc)
    bigram_model = SGD2.fit(bigram.toarray(), df1['polarity'])

    tfidf1 = TfidfTransformer(smooth_idf=False)
    SGD3 = linear_model.SGDClassifier(loss='hinge', penalty='l1')
    unigram_tfidf = tfidf1.fit_transform(unigram.toarray())
    unigramtfidf_model = SGD3.fit(unigram_tfidf.toarray(), df1['polarity'])

    tfidf2 = TfidfTransformer(smooth_idf=False)
    SGD4 = linear_model.SGDClassifier(loss='hinge', penalty='l1')
MLA = [
    #ensemble method
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #gaussian processes
    gaussian_process.GaussianProcessClassifier(),

    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #naive_bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    #nearest neighbours
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    svm.LinearSVC(),

    #trees
    tree.DecisionTreeClassifier(),
]
Example #26
    L = mat['L'][0][0]
    y = D[:, :L]
    X = D[:, L:]

    print(X.shape)
    print(y.shape)

    n_instances = X.shape[0]
    n_features = X.shape[1]
    n_labels = y.shape[1]

    classifiers = []

    for j in range(n_labels):
        classifier = linear_model.SGDClassifier(loss='hinge',
                                                tol=1e-3,
                                                max_iter=1)
        classifier.partial_fit([X[0, :]], [y[0, j]], classes=[0, 1])
        classifiers.append(classifier)

    predictions = np.zeros((n_instances - 1, n_labels))
    probabilities = np.zeros((n_instances - 1, n_labels))
    truth = y[1:, :]
    # truth = np.array(y[1:, :].todense())

    # Initialize adwin detector
    adwin = AdWin()

    # Start online learning
    for i in range(1, n_instances):
        cur_probs = np.zeros(n_labels)
Example #27
        regression(light_reg.SAGARegressor(random_state=RANDOM_SEED)),
        regression(light_reg.SAGRegressor(random_state=RANDOM_SEED)),
        regression(light_reg.SDCARegressor(random_state=RANDOM_SEED)),

        # Sklearn Linear Classifiers
        classification(
            linear_model.LogisticRegression(random_state=RANDOM_SEED)),
        classification(
            linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
        classification(
            linear_model.PassiveAggressiveClassifier(
                random_state=RANDOM_SEED)),
        classification(linear_model.Perceptron(random_state=RANDOM_SEED)),
        classification(linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
        classification(linear_model.RidgeClassifierCV()),
        classification(linear_model.SGDClassifier(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.LogisticRegression(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.LogisticRegressionCV(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.PassiveAggressiveClassifier(
                random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.Perceptron(random_state=RANDOM_SEED)),
        classification_binary(
            linear_model.RidgeClassifier(random_state=RANDOM_SEED)),
        classification_binary(linear_model.RidgeClassifierCV()),
        classification_binary(
            linear_model.SGDClassifier(random_state=RANDOM_SEED)),
Example #28
def generate_model(custid, last_post_time):
    # custid=17967798
    #x = datetime.now()
    #today_date=str(x).split(" ")[0]
    #yesterday=datetime.strftime(datetime.now() - timedelta(1), '%Y-%m-%d')
    docs = mdb[user_rating_collection].find(
        {
            "custid": custid,
            "seen_unix_time": {
                '$gt': last_post_time
            }
        }, {
            "category": 1,
            "keywords": 1,
            "rating": 1,
            "seen_unix_time": 1
        })
    #docs=mdb[user_rating_collection].find({"custid" : custid,"date" :{'$nin':["2019-03-07","2019-03-06"]}},{"category":1,"keywords":1,"rating":1})
    X = []
    y = []
    new_last_post_time = 0
    for doc in docs:
        #print(doc)
        s = ''
        if "category" in doc and doc["category"]:
            s = s + cat[
                doc["category"]] if doc["category"] in cat else doc["category"]
        if "keywords" in doc and doc["keywords"]:
            s = s + " " + " ".join(doc["keywords"])
        X.append(s)
        y.append(doc["rating"])
        #print(custid,doc)
        new_last_post_time = doc['seen_unix_time']
    if len(X) > 0:
        vectorizer = HashingVectorizer()
        vect_X = vectorizer.transform(X)
        #vectorizer = TfidfVectorizer()
        #vectorizer = CountVectorizer()
        #tfidf_model=vectorizer.fit_transform(X)
        #test_X=vectorizer.transform(tX)

        #from sklearn.linear_model import LogisticRegression
        #classifier = LogisticRegression(max_iter=500,random_state = 0,multi_class="multinomial",solver="lbfgs",penalty="l2")
        #classifier = LogisticRegression(max_iter=500,C=1000,solver='lbfgs')

        #classifier.fit(tfidf_model, y)
        #model=Pipeline([("tfidf",vectorizer),("lr",classifier)])
        #model=joblib.load("../models/"+str(custid))
        #model.partial_fit(vect_X,y)
        try:
            model = joblib.load("../models/" + str(custid))
            model.partial_fit(vect_X, y)
            joblib.dump(model, "../models/" + str(custid))
            print(str(custid) + "  partial_fit")
        except:
            if len(set(y)) > 1:
                print(str(custid) + " normal fit")
                model = linear_model.SGDClassifier(loss='log',
                                                   penalty="l2",
                                                   n_iter=500,
                                                   n_jobs=-1)
                model.fit(vect_X, y)
                joblib.dump(model, "../models/" + str(custid))
            else:
                print(
                    str(custid) +
                    " The number of classes has to be greater than one; got 1 class"
                )
        #joblib.dump(vectorizer, "../models/vect_"+str(custid))
        print(str(custid) + " last post time " + str(new_last_post_time))
        mdb[active_users_collection].update_one({"_id": custid}, {
            '$set': {
                'updated': 1,
                'up_time': time.time(),
                'last_post_time': new_last_post_time
            }
        })
    else:
        print(str(custid) + " don't have atleast 1 flips")
        print(custid + " last post time" + new_last_post_time)
        mdb[active_users_collection].update_one(
            {"_id": custid}, {'$set': {
                'updated': 1,
                'up_time': time.time()
            }})
Example #29
                          inplace=False,
                          kind='quicksort',
                          na_position='last')
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]

preprocessed_reviews = []
for sentence in final['Text'].values:
    preprocessed_reviews.append(clean_text(sentence))

count_vect = CountVectorizer()
count_vect.fit(preprocessed_reviews)
joblib.dump(count_vect, 'count_vect.pkl')
X = count_vect.transform(preprocessed_reviews)
print(X.shape)
Y = final['Score'].values
clf = linear_model.SGDClassifier(max_iter=1000,
                                 tol=1e-3,
                                 eta0=0.1,
                                 alpha=0.001)
clf.fit(X, Y)
joblib.dump(clf, 'model.pkl')

print(
    predict(
        'Have been having this since years. Much better option than Bru.Nescafe still managing to do well in market with '
        'all the competitors breathing down it\'s neck. Good one!'))

print(
    predict(
        'The Magic behind that massal I found is nothing but Artificial Food colour. Dear ITC , please provide Natural and safe food to your beloved Indians. Even though the food colouring is approved but try to avoid such artificial food even in small traces and make the young Indian more stronger.'
    ))
    stopword=True,
    more_stopwords=None,
    spellcheck=False,
    stemming=True,
    remove_numbers=True,
    deasciify=True,
    remove_punkt=True,
    lowercase=True,
    wordngramrange=(1, 2),
    charngramrange=(2, 2),
    nmaxfeature=None,
    norm="l2",
    use_idf=True,
    classifier=sklinear.SGDClassifier(loss='hinge',
                                      penalty='l2',
                                      alpha=1e-3,
                                      n_iter=5,
                                      random_state=42),
)


def tr_sentiment_analysis(lang, weights, stopword, more_stopwords, spellcheck,
                          stemming, remove_numbers, deasciify, remove_punkt,
                          lowercase, wordngramrange, charngramrange,
                          nmaxfeature, norm, use_idf, classifier,
                          train_data_folder, train_data_fname, text_col,
                          cat_col, csvsep, shuffle_dataset,
                          cross_val_performance, modelfolder, modelname):

    conf_sentiment = prepconfig.FeatureChoice(
        lang, weights, stopword, more_stopwords, spellcheck, stemming,