Example #1
def learnPhase():
    if os.path.isfile("Doc2VecSVMNauceni.pkl"):
        return None
    tablecolrow = loadData("train.csv")
    tablecolrow[3] = FilterQuestions(tablecolrow[3])
    tablecolrow[4] = FilterQuestions(tablecolrow[4])

    model = prepareDoc2Vec(tablecolrow[3], tablecolrow[4])

    for i in range(len(tablecolrow[3])):
        tablecolrow[3][i] = model.infer_vector(tablecolrow[3][i].split(" "))
        tablecolrow[4][i] = model.infer_vector(tablecolrow[4][i].split(" "))

    traindataX = [None] * len(tablecolrow[3])
    traindataY = [None] * len(tablecolrow[3])
    for i in range(len(traindataX)):
        traindataX[i] = tablecolrow[3][i] + tablecolrow[4][i]  # element-wise sum of the two question vectors
        traindataY[i] = int(tablecolrow[5][i])

    svmKlasifikator = SVC(kernel='rbf',
                          verbose=True,
                          probability=True,
                          max_iter=1000000)
    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(traindataX, traindataY)
    tmEnd = timer()
    print("Predicting lasted", tmEnd - tmStart)
    joblib.dump(svmKlasifikator, 'Doc2VecSVMNauceni.pkl')
    print("Spremljen je napredak ucenja")
class SVMLearner(RLRLearner):
    def __init__(self, data_model):
        super().__init__(data_model)

        self.svm = SVC(kernel='linear', probability=True, tol=0.0001)

    def fit_transform(self, pairs, y):
        y = numpy.array(y)
        # If the incoming batch is all negative (while the stored labels
        # contain a positive), inject a synthetic exact match -- a record
        # paired with itself -- labelled 1, so the SVM still sees both classes.
        if not y.any() and self.y.any():
            random_pair = random.choice(self.candidates)
            exact_match = (random_pair[0], random_pair[0])
            pairs = pairs + [exact_match]
            y = numpy.concatenate([y, [1]])
        # Conversely, if both the incoming and stored labels are all positive,
        # inject a random candidate pair labelled 0.
        elif (numpy.count_nonzero(y) == len(y)
              and numpy.count_nonzero(self.y) == len(self.y)):
            random_pair = random.choice(self.candidates)
            pairs = pairs + [random_pair]
            y = numpy.concatenate([y, [0]])

        super().fit_transform(pairs, y)

    def fit(self, X, y):
        self.y = y
        self.X = X
        self.svm.fit(X, y)

    def predict_proba(self, examples):
        return self.svm.predict_proba(examples)[:, 1].reshape(-1, 1)
Example #3
def learnModel(data):
    if os.path.isfile("BagOfWordsSVMNauceni.pkl"):
        return None
    data[0] = FilterQuestions(data[0])
    data[1] = FilterQuestions(data[1])
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag-of-words tool.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=20000)

    allQuestions = data[0] + data[1]

    vectorizer.fit(allQuestions)    
    joblib.dump(vectorizer, 'BagOfWordsVectorizerNauceni.pkl') 
    
    znacajkePitanja = [vectorizer.transform(data[0]), vectorizer.transform(data[1])]
    for i, r in enumerate(data[2]):
        data[2][i] = int(r)
        
    znacajkePitanja = hstack(znacajkePitanja).tocsr()
    svmKlasifikator = SVC(kernel='rbf', verbose=True, probability=True, max_iter=1000000)
   
    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(znacajkePitanja, data[2])
    tmEnd = timer()
    print("Learning ended")
    print("Learning lasted", tmEnd - tmStart)
    
    joblib.dump(svmKlasifikator, 'BagOfWordsSVMNauceni.pkl') 
    print("Spremljen je napredak ucenja")
Example #4
class SVCImpl:

    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated',
                 coef0=0.0, shrinking=True, probability=False, tol=0.001,
                 cache_size=200, class_weight='balanced', verbose=False,
                 max_iter=-1, decision_function_shape='ovr', random_state=None):
        self._hyperparams = {
            'C': C,
            'kernel': kernel,
            'degree': degree,
            'gamma': gamma,
            'coef0': coef0,
            'shrinking': shrinking,
            'probability': probability,
            'tol': tol,
            'cache_size': cache_size,
            'class_weight': class_weight,
            'verbose': verbose,
            'max_iter': max_iter,
            'decision_function_shape': decision_function_shape,
            'random_state': random_state}

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if (y is not None):
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def predict(self, X):
        return self._sklearn_model.predict(X)

    def predict_proba(self, X):
        return self._sklearn_model.predict_proba(X)
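A small usage sketch, under the assumption that SKLModel is sklearn.svm.SVC (which the hyperparameter names suggest); the toy data is illustrative only.

from sklearn.datasets import make_classification

# Illustrative data, not from the original snippet.
X, y = make_classification(n_samples=100, n_features=5, random_state=0)

clf = SVCImpl(kernel='linear', gamma='scale', probability=True)
clf.fit(X, y)
print(clf.predict(X[:3]))
print(clf.predict_proba(X[:3]))  # requires probability=True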
Example #5
class SVMClassifier(ClassifierI):

    """Wrapper for scikit-learn svm classifier."""

    def __init__(self, C=1.0, kernel='rbf', degree=3, gamma='auto', coef0=0.0,
                 shrinking=True, probability=False, tol=1e-3, cache_size=200,
                 class_weight=None, verbose=False, max_iter=-1,
                 decision_function_shape=None, random_state=None):
        """Init. See scikit-learn."""
        self._clf = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma,
                        coef0=coef0, shrinking=shrinking,
                        probability=probability, tol=tol, cache_size=cache_size,
                        class_weight=class_weight, verbose=verbose,
                        max_iter=max_iter,
                        decision_function_shape=decision_function_shape,
                        random_state=random_state)
        self.classes_ = None

    def __repr__(self):
        return "<SVMClassifier(%r)>" % self._clf

    def classify_many(self, vectors):
        """Classify a batch of verbs.

        :param vectors: A doc-term array of vectors
        :return: The predicted class label for each input sample.
        :rtype: list
        """
        classes = self.classes_
        return [classes[i] for i in self._clf.predict(vectors)]

    def prob_classify_many(self, vectors):
        """Compute per-class probabilities for a batch of samples.
        :param vectors: A doc-term array of vectors
        :rtype: list of ``ProbDistI``
        """
        y_proba_list = self._clf.predict_proba(vectors)
        return [self._make_probdist(y_proba) for y_proba in y_proba_list]

    def labels(self):
        """The class labels learned by this classifier.
        :rtype: list
        """
        return list(self.classes_)

    def train(self, vectors, labels):
        """
        Train (fit) the scikit-learn svm classifier.
        :param vectors: a doc-term array of vectors to learn from
        :param labels: a list of labels corresponding to the rows
        of the doc-term array.
        """
        self.classes_, labels = np.unique(labels, return_inverse=True)
        self._clf.fit(vectors, labels)

        return self

    def _make_probdist(self, y_proba):
        classes = self.classes_
        return dict((classes[i], p) for i, p in enumerate(y_proba))
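A hedged usage sketch: train() maps arbitrary labels to integer indices via np.unique, so string labels round-trip through classify_many. The vectors and labels below are made up for illustration; decision_function_shape='ovr' sidesteps the legacy None default.

import numpy as np

# Illustrative doc-term vectors and string labels.
vectors = np.array([[1, 0], [0, 1], [1, 1], [0, 0]])
labels = ['verb', 'noun', 'verb', 'noun']

clf = SVMClassifier(kernel='linear', probability=True,
                    decision_function_shape='ovr')
clf.train(vectors, labels)
print(clf.classify_many(vectors))       # e.g. ['verb', 'noun', 'verb', 'noun']
print(clf.prob_classify_many(vectors))  # list of {label: probability} dicts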
Example #6
class CreateSVC(CreateLinearSVC):
    def fit(self, data, args):
        self.model = SVC(probability=True)

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval
Example #7
def svm_train(X, y, model_path):
    model = SVC()
    model.fit(X, y)
    expected = y  # note: the report below is computed on the training data itself
    predicted = model.predict(X)
    print(metrics.classification_report(expected, predicted))
    print(metrics.confusion_matrix(expected, predicted))
    joblib.dump(model, model_path)
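For completeness, a minimal loading sketch (assumed, not from the source): the model persisted with joblib can be restored later for inference.

import joblib

def svm_predict(X_new, model_path):
    # Restore the classifier saved by svm_train() and predict new samples.
    model = joblib.load(model_path)
    return model.predict(X_new)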
Example #8
def train_svm(params, suffix, train_X, train_Y, test_X, test_Y):
    C = params['C']
    kernel = params['kernel']
    model = SVC(gamma='scale', probability=True, C=C, kernel=kernel)
    print("Params C:", C, "kernel:", kernel)
    model.fit(train_X, train_Y)
    print("Train score", model.score(train_X, train_Y))
    test_score = model.score(test_X, test_Y)
    print("Test score", test_score)
    return test_score, None
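Because train_svm takes its hyperparameters from a dict, it is easy to drive from a small grid search. A hedged sketch (the grid values are illustrative, and train_X/train_Y/test_X/test_Y are assumed to be in scope):

from itertools import product

best_score, best_params = -1.0, None
for C, kernel in product([0.1, 1, 10], ['rbf', 'linear']):
    params = {'C': C, 'kernel': kernel}
    score, _ = train_svm(params, 'grid', train_X, train_Y, test_X, test_Y)
    if score > best_score:
        best_score, best_params = score, params
print("Best:", best_params, best_score)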
Example #9
def phoneAccelerometerISVM():
    print("Loading data...")
    data = pd.read_csv("./Train_Phone-Acc-nexus4_1-a.csv")
    print("Done!")

    # Parse data and make a sitting vs. not-sitting classification using an SVM
    # (the label check below is on 'sit'). Note: I'm assuming a window width of 500.
    print("Finding time series window indexes for each class kind...")
    pos = 0
    y = []
    X = []
    window = 500
    while pos + window <= data.shape[0]:  # stop before a partial final window so rows stay equal length
        # Make y label.
        if str(data.iloc[pos]['gt']) == 'sit':
            y.append(1)
        else:
            y.append(-1)

        # Make X row.
        X.append(data.iloc[pos:pos + window]['y'])

        # Move to the next window
        pos += window
    print("Done!")

    # Build and fit the SVM.
    print("Training SVM on all data accelerometer data...")
    X = np.array(X)
    y = np.array(y)
    #clfs = LinearSVC()
    clfs = SVC()
    clfs.fit(X, y)
    print("Done!")

    # print("Predicting accelerometer classes on all data using SVM...")
    # ypred = predict(X, clfs.coef_.reshape(len(clfs.coef_.ravel()), 1))
    # print("Done!")
    # error = calculateTotalAbsoluteError(y, ypred) / y.shape[0]
    # print("Accelerometer training error (Means kind of nothing): %f"%error)

    # Cross validation
    print("Training SVM on accelerometer training only data...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1)  #, random_state = 0
    clfs = SVC()
    clfs.fit(X_train, y_train)
    yhat = clfs.predict(X_test)
    print("Abs Error = %f"%( calculateTotalAbsoluteError(yhat, y_test)/len(yhat)))
    print("Test data mean accuracy SVM score: %f"%clfs.score(X_test, y_test))
    f1_c0 = f1_score(y_test, clfs.predict(X_test), pos_label=1, average='binary')
    #print("Test data f1 score for class -1: %f"%(f1_c0))
    print("Test data f1 score for class +1: %f" % (f1_c0))
    print("Done!")
Example #11
    def SVCClassify(self, x_train, y_train):
        '''
        Basic Support Vector Machine Classifier
        '''

        # the parameter can be set
        kernel = 'rbf'
        # init classifier and train it
        # if need the proba-predict result, parameter probability must be =True
        clf = SVC(kernel=kernel, probability=True)
        clf.fit(x_train, y_train)

        return clf
def cross_validate(samples, labels, outputDir):
    '''
    Function to perform K-fold cross validation
    '''
    # K(=10) FOLD CROSS VALIDATION
    K = 10
    fold_samples, fold_labels = cv_split(samples, np.array(labels), K)
    log_loss = [['Log Loss'],[]]
    total_ll = 0.0
    for fold in range(K):
        samples_chunk = fold_samples[:fold] + fold_samples[fold+1:]
        labels_chunk = fold_labels[:fold] + fold_labels[fold+1:]
    
        #Training L1 logistic regression
        logRegrL1 = linear_model.LogisticRegression(C=1, penalty='l1', solver='liblinear')  # l1 needs liblinear/saga
        logRegrL1.fit(np.concatenate(samples_chunk, axis=0), np.concatenate(labels_chunk, axis=0))
    
        #Training SVM with linear kernel
        svmLin = SVC(kernel='linear', probability=True)
        svmLin.fit(np.concatenate(samples_chunk, axis=0), np.concatenate(labels_chunk, axis=0))

        #Training Random Forest Classifier
        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(np.concatenate(samples_chunk, axis=0), np.concatenate(labels_chunk, axis=0))
    
        #TEST ON CROSS VALIDATION HOLD OUT SET
        val = [i for i in range(len(fold_labels[fold]))]
        id = 0
        for item in fold_samples[fold]:
            # predict_proba expects a 2-D array, so wrap the single sample;
            # first component is the probability of class 0, second of class 1
            predictionL1 = logRegrL1.predict_proba([item])
            predictionSvmLin = svmLin.predict_proba([item])
            predictionRfc = rfc.predict_proba([item])

            #Taking the average of the three model predictions as the final health status prediction
            val[id] = (predictionL1[0][1] + predictionSvmLin[0][1] + predictionRfc[0][1]) / 3.0
            id = id + 1
    
        
        for i in range(len(fold_labels[fold])):
            total_ll += logloss(fold_labels[fold][i], val[i])
    
    
    log_loss[1] = total_ll/len(samples)
    #Save csv file in the output directory with name Dota2Val.csv
    np.savetxt(outputDir + "\\Dota2Val.csv", 
           log_loss,
           delimiter=',', 
           fmt='%s'
           )
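cross_validate relies on a logloss helper that is not shown; one plausible definition is the standard binary log loss, clipped to avoid log(0):

import numpy as np

def logloss(y_true, p):
    # Standard binary cross-entropy for a single example; clipping keeps
    # log() finite when the averaged prediction saturates at 0 or 1.
    p = np.clip(p, 1e-15, 1 - 1e-15)
    return -(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))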
def train_and_predict(samples, labels, feature_selector, inputDir, outputDir):
    #Training L1 logistic regression
    logRegrL1 = linear_model.LogisticRegression(C=1, penalty='l1', solver='liblinear')  # l1 needs liblinear/saga
    logRegrL1.fit(samples, labels)

    #Training SVM with linear kernel
    svmLin = SVC(kernel='linear', probability=True)
    svmLin.fit(samples, labels)

    #Training Random Forest Classifier
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(samples, labels)

    #test set
    testDir = inputDir + "/set_test"
    testFiles = sorted([
        join(testDir, f) for f in listdir(testDir) if isfile(join(testDir, f))
    ],
                       key=numericalSort)

    #Read feature vectors of test images
    testSamples = cubeVoxelsVar(testFiles)
    testSamples = feature_selector.transform(testSamples)
    print(len(testSamples))

    #2D array to report final prediction in format (ID,Prediction)
    final = [[0 for j in range(2)] for i in range(139)]
    final[0][0] = 'ID'
    final[0][1] = 'Prediction'
    id = 1

    #Predict health status of test image using each of the 3 models trained above
    for item in testSamples:
        # predict_proba expects a 2-D array, so wrap the single sample;
        # first component is the probability of class 0, second of class 1
        predictionL1 = logRegrL1.predict_proba([item])
        predictionSvmLin = svmLin.predict_proba([item])
        predictionRfc = rfc.predict_proba([item])

        final[id][0] = id
        #Taking the average of each of the model predictions as final health status prediction
        final[id][1] = (predictionL1[0][1] + predictionSvmLin[0][1] +
                        predictionRfc[0][1]) / 3.0
        id = id + 1

    #Save csv file in the output directory with name final_sub.csv
    np.savetxt(outputDir + "/final_sub.csv", final, delimiter=',', fmt='%s')
Example #14
def learnModel(train):

    data = []
    for duplicate in train["is_duplicate"]:
        data.append(int(duplicate))

    znacajkePitanja = get_avg(train)
    svmKlasifikator = SVC(kernel='rbf',
                          verbose=True,
                          probability=True,
                          max_iter=10000)

    print("Learning started")
    tmStart = timer()
    svmKlasifikator.fit(znacajkePitanja, data)
    tmEnd = timer()
    print("Learning ended")
    print("Learning lasted", tmEnd - tmStart)

    joblib.dump(svmKlasifikator, 'Word2VecSVMNauceni.pkl')
    print("Spremljen je napredak ucenja")
    def train_all(self, g):
        X = np.concatenate([self.train_X, self.val_X], axis=0)
        if self.use_scale:
            self.scale.fit(X)
            X = self.scale.transform(X)
        for i in range(3):
            y = np.concatenate([self.train_y, self.val_y], axis=0)
            # One-vs-rest relabelling: class i+1 becomes 1, everything else 0.
            y[y != i + 1] = 0
            y[y != 0] = 1
            clf = SVC()
            clf.set_params(**g)
            self.model_a.append(clf.fit(X, y))
Example #16
def classifier_panchenko2016(X_train,
                             y_train,
                             X_test,
                             y_test,
                             separateClassifier=False):
    train_or_test_labels = ["train"
                            for i in y_train] + ["test" for i in y_test]
    y_train, X_train, y_test, X_test = outlier_removal(train_or_test_labels,
                                                       X_train + X_test,
                                                       y_train + y_test)

    y_train, X_train = features_extraction(
        y_train,
        X_train,
        separateClassifier=separateClassifier,
        featuresCount=100)

    y_test, X_test = features_extraction(y_test,
                                         X_test,
                                         separateClassifier=separateClassifier,
                                         featuresCount=100)

    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier = SVC(kernel="rbf",
                     C=2e11,
                     gamma=2e-1,
                     max_iter=5000,
                     class_weight="balanced",
                     verbose=1)

    print("fitting")
    classifier.fit(X_train, y_train)

    print("testing")
    y_predictions = classifier.predict(X_test)

    return y_test, y_predictions
    def train(self, g):
        self.model = []
        X = self.train_X.copy()
        if self.use_scale:
            self.scale.fit(X)
            X = self.scale.transform(X)
        for i in range(3):
            y = self.train_y.copy()
            # One-vs-rest relabelling: class i+1 becomes 1, everything else 0.
            y[y != i + 1] = 0
            y[y != 0] = 1
            clf = SVC()
            clf.set_params(**g)
            self.model.append(clf.fit(X, y))
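Both train() and train_all() build one binary SVM per class in one-vs-rest fashion. A hedged sketch of the matching prediction step (predict_ovr and its argmax rule are assumptions, not shown in the source):

import numpy as np

def predict_ovr(models, X):
    # One column of decision values per binary classifier; the class whose
    # SVM is most confident wins. Labels are 1..3 as in the training loops.
    scores = np.column_stack([m.decision_function(X) for m in models])
    return np.argmax(scores, axis=1) + 1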
    #Read age labels of training images
    labels = []
    for t in targets:
        labels.append(t[0])

    #Training LASSO regressor, alpha value tuned to produce best result when used alone on the test set
    regrL = linear_model.Lasso(alpha=15.0)
    regrL.fit(samples, labels)

    #Training Ridge regressor, alpha value tuned to produce best result when used alone on the test set
    regrR = linear_model.Ridge(alpha=1e-13, normalize=True)
    regrR.fit(samples, labels)

    #Training SVM with linear kernel
    regrS = SVC(kernel='linear')
    regrS.fit(samples, labels)

    #Training Random Forest Classifier
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(samples, labels)

    #test set
    testDir = inputDir + "\\set_test"
    testFiles = sorted([
        join(testDir, f) for f in listdir(testDir) if isfile(join(testDir, f))
    ],
                       key=numericalSort)

    #Read feature vectors of test images
    testSamples = readVoxels(testFiles)
    print(len(testSamples))
    X_train, Y_train = train_data_set.convert_2_binary_format()

    test_data_set = DataSet()
    test_data_set.load(config.get_value('test'), class_index, has_header=False)
    Xtest, Ytest = test_data_set.convert_2_binary_format_with(
        X_train.item_dict, Y_train.item_dict)
    Ytest = Ytest.flatten()

    class_count = train_data_set.number_of_classes()

    unexpected_rules = IOHelper.load_json_object(config.get_value('rules'))
    refined_unexpected_rules = filter_association_rules(unexpected_rules)

    print('svm testing...')
    svc_model = SVC(kernel='poly', degree=3, coef0=0.1, random_state=1)
    svc_model.fit(X_train.relation_matrix, Y_train.values.flatten())

    svc_y_pred = svc_model.predict(Xtest)
    print(f1_score(Ytest, svc_y_pred, average=None))
    if class_count <= 2:
        fpr, tpr, _ = roc_curve(Ytest, svc_y_pred.flatten())
        print(auc(fpr, tpr))

    refine_with_unexpectedness(test_data_set, Y_train.item_dict, svc_y_pred,
                               Ytest, refined_unexpected_rules)

    print('Random forest testing...')
    rf_model = RandomForestClassifier(n_estimators=20, random_state=1)
    rf_model.fit(X_train.relation_matrix, Y_train.values.flatten())

    rf_y_pred = rf_model.predict(Xtest)
train_arrays = []
train_labels = []
test_arrays = []
test_labels = []

for email in emails:
    email_id = email.id
    prefix_train_pos = 'email_' + str(email_id)
    if email_id % 5 != 0:
        train_arrays.append(model.docvecs[prefix_train_pos])
        train_labels.append(int(email.label))
    else:
        test_arrays.append(model.docvecs[prefix_train_pos])
        test_labels.append(int(email.label))
        
classifier = SVC()
classifier.fit(numpy.array(train_arrays), numpy.array(train_labels))

print("Overall score is %f." % classifier.score(numpy.array(test_arrays), numpy.array(test_labels)))

corrects = []
wrongs = []
for email in emails:
    email_id = email.id
    prefix_train_pos = 'email_' + str(email_id)
    if email_id % 5 == 0:
        prediction = classifier.predict([model.docvecs[prefix_train_pos]])[0]
        actual = int(email.label)
        if prediction != actual:
            wrongs.append((email.id, prediction, actual))
        else:
            corrects.append((email.id, prediction, actual))
#             print(max(classifier.predict_proba([model.docvecs[prefix_train_pos]])[0]), actual)
Example #21
def train_and_predict(samples, labels, feature_selector, inputDir, outputDir):
    #test set
    testDir = inputDir + "\\set_test"
    testFiles = sorted([
        join(testDir, f) for f in listdir(testDir) if isfile(join(testDir, f))
    ],
                       key=numericalSort)

    # Different features for gender
    testSamples_gender = cubeVoxelsVar_gender(testFiles)

    # Same features for age and health
    testSamples_age = cubeVoxelsVar_age(testFiles)
    testSamples_health = testSamples_age

    testSamples = [testSamples_gender, testSamples_age, testSamples_health]

    #2D array to report final prediction in format (ID,Prediction)
    final = [[0 for j in range(4)] for i in range(1 + 138 * 3)]
    final[0][0] = 'ID'
    final[0][1] = 'Sample'
    final[0][2] = 'Label'
    final[0][3] = 'Predicted'

    total_labels = ['gender', 'age', 'health']

    for label in range(3):
        print("Prediction for '%s' started!" % total_labels[label])
        id_count = label
        #Training logistic regression
        logRegrL1 = linear_model.LogisticRegression()
        logRegrL1.fit(samples[label], labels[label])

        #Training SVM with linear kernel
        svmLin = SVC(kernel='linear')
        svmLin.fit(samples[label], labels[label])

        #Training Random Forest Classifier
        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(samples[label], labels[label])

        print("Training complete!")

        # Do feature selection only for age and health
        if label == 0:
            testSamples_curr = testSamples[label]
        else:
            testSamples_curr = feature_selector[label].transform(
                testSamples[label])
        print(len(testSamples_curr))

        id = label + 1

        #Predict gender, age and health status of test image using each of the 3 models trained above
        for sampleNum, sample in enumerate(testSamples_curr):
            # predict() expects a 2-D array, so wrap the single sample
            predictionL1 = logRegrL1.predict([sample])
            predictionSvmLin = svmLin.predict([sample])
            predictionRfc = rfc.predict([sample])

            final[id][0] = id_count
            final[id][1] = sampleNum
            final[id][2] = total_labels[label]

            votes = predictionL1[0] + predictionSvmLin[0] + predictionRfc[0]

            # Majority vote of the three classifiers (labels are 0/1)
            final[id][3] = 'TRUE' if votes >= 2.0 else 'FALSE'
            id = id + 3
            id_count = id_count + 3
        print('Prediction done!')

    #Save csv file in the output directory with name final_sub.csv
    np.savetxt(outputDir + "\\final_sub.csv", final, delimiter=',', fmt='%s')
    "Sidorovbigramsdeprel"
]

print("train: ", len(tweets_training))
print("test: ", len(tweets_test))

X, X_test, feature_name, feature_index = feature_manager.create_feature_space(
    tweets_training, feature_type, tweets_test)

print(feature_name)
print("feature space dimension X:", X.shape)
print("feature space dimension X_test:", X_test.shape)

clf = SVC(kernel="linear")

clf.fit(X, labels_training)
test_predict = clf.predict(X_test)
"""prec, recall, f, support = precision_recall_fscore_support(
labels_test,
test_predict,
beta=1)

accuracy = accuracy_score(
test_predict,
labels_test
)

print(prec, recall, f, support )
print(accuracy)"""

for i in range(0, len(tweets_test)):
Example #23
            label_string += '0 '
    fopen.close()

    # generate the label vector
    label_vec = numpy.fromstring(label_string.strip(), dtype=int, sep=' ')
    print('In total we get %s labels' % label_vec.shape[0])  # for debugging

    # create the feature matrix, in which each row represents a video
    video_num = len(video_list)
    feat_mat = numpy.zeros([video_num, feat_dim])
    for i in range(video_num):
        # BOW features of this video
        feat_vec = numpy.genfromtxt(feat_dir + video_list[i],
                                    dtype=numpy.float32,
                                    delimiter=";")
        assert (feat_vec.shape[0] == feat_dim)
        # fill the feature vector to the matrix
        feat_mat[i, :] = feat_vec

    # initialize svm
    svm = SVC(kernel=chi2_kernel)
    #    svm = SVC(probability=True)

    # train the svm models
    svm.fit(feat_mat, label_vec)

    # finally save the k-means model
    cPickle.dump(svm, open(output_file, "wb"), cPickle.HIGHEST_PROTOCOL)

    print('SVM trained successfully for event %s!' % event_name)
        audio_name = line.split(" ")[0]
        count = count + 1
        label = line.split(" ")[1].split("\n")[0]
        if "imtraj" in feat_dir:
            feat_vec = import_imtraj_txt(feat_dir + audio_name + ".spbof")
        else:
            feat_vec = np.genfromtxt(feat_dir + audio_name, delimiter=";")
        if (label==event_name):
            label=1
            pos_count+=1
        else:
            label=0
            neg_count+=1
        if len(X)==0:
            X=[feat_vec]
        else:
            X=np.append(X,[feat_vec],axis=0)
        Y=Y+[label]
    
    print "Data loading finished positive "+str(pos_count)+" negative "+str(neg_count)
    #pipe_lrSVC=SVC(C=10,gamma=0.0001,probability=True)
    pipe_lrSVC=SVC(probability=True)
    #svm=LinearSVC(C=10)
    #pipe_lrSVC=CalibratedClassifierCV(svm)
    pipe_lrSVC.fit(preprocessing.scale(X),Y)
    pickle.dump(pipe_lrSVC,open(output_file+'.pickle','wb'))
    print 'SVM trained successfully for event %s!' % (event_name)+" round num %s" % (round_num)
Example #25
    X_val, Y_val = get_data("val_esea_real.lst", ranks)

    trn_embedding = embed(X_trn, triplet_model)
    val_embedding = embed(X_val, triplet_model)

    # print (X_trn, X_val)
    # print (trn_embedding, val_embedding)
    # print (triplet_model.get_weights())

    clf = SVC(
        # class_weight='balanced',
        probability=True,
        # tol=1e-4,
    )

    clf.fit(trn_embedding, Y_trn)

    print(clf.score(val_embedding, Y_val))
    print(clf.predict_proba(val_embedding))

    print(roc_auc_score(Y_val, clf.predict(val_embedding)))
    print(classification_report(Y_val, clf.predict(val_embedding), digits=4))

    all_files = [x[:-8] for x in os.listdir(ALL_FILES)]
    X = [
        pickle.load(open(os.path.join(FEATURE_PATH, x + ".fkmeans"), "rb"),
                    encoding='latin1') for x in all_files
    ]
    # Y = [ranks[x.split()[0].strip()] for x in all_files]

    proba = clf.predict_proba(embed(np.array(X), triplet_model))
    # read in features
    features = []
    for video_id in video_ids:
        feat_path = feat_dir + video_id + "." + feat_suffix
        feature = [0] * feat_dim
        if os.path.exists(feat_path):
            if feat_type == 'dense':
                feature = numpy.genfromtxt(feat_path, delimiter=';')
            else:
                line = numpy.genfromtxt(feat_path, delimiter=' ', dtype=str)
                if len(line.shape) == 0:
                    line = numpy.array([line])
                for item in line:
                    if len(item) == 0:
                        continue
                    tokens = item.split(':')
                    key = int(tokens[0])-1
                    value = float(tokens[1])
                    if key < feat_dim:
                        feature[key] = value
        features.append(feature)

    # train svm
    clf = SVC(probability=True)
    clf.fit(features, labels)
    # Dump model
    with open(output_file, 'wb') as f:
        cPickle.dump(clf, f)

    print('SVM trained successfully for event %s!' % event_name)
                [list(feature_names).index(f) for f in feature_filtered])
            feature_index_filtered = numpy.concatenate(
                feature_index_global[list(feature_index_filtered)])
            #print(feature_name_global[feature_index_filtered])
            X_filter = X[:, feature_index_filtered]
            #print(feature_filtered,X.shape,X_filter.shape)
            predict = []
            golden = []
            for index_train, index_test in kf:

                X_train = X_filter[index_train]
                X_test = X_filter[index_test]

                clf = SVC(kernel='linear')

                clf.fit(X_train, stance[index_train])
                test_predict = clf.predict(X_test)
                predict = numpy.concatenate((predict, test_predict))
                golden = numpy.concatenate((golden, stance[index_test]))

            prec, recall, f, support = precision_recall_fscore_support(golden,
                                                                       predict,
                                                                       beta=1)

            accuracy = accuracy_score(golden, predict)

            print('"' + (' '.join(feature_filtered)) + '"' + '\t' +
                  str(((f[0] + f[1] + f[2]) / 3)) + '\t' +
                  str(((f[1] + f[2]) / 2)) + '\t' + str(prec) + '\t' +
                  str(recall) + '\t' + str(f) + '\n')
Example #28
    feat_dir = "kmeans/"
    feat_dim = 50
    output_file = "mfcc_pred/svm.%s.model" % event_name

    fread = open("list/train", "r")
    clf = SVC(probability=True)
    X, Y = [], []
    for i in fread.readlines():
        i = i.split(" ")
        line = i[0]
        label = i[1].replace('\n', '')
        kmeans_path = "kmeans/" + line + ".kmeans.txt"
        if os.path.exists(kmeans_path):
            kmeans_feat = numpy.genfromtxt(kmeans_path, delimiter=";")
        else:
            kmeans_feat = numpy.zeros(feat_dim)
            label = "NULL"
        if label != event_name:
            label = "NULL"
        X.append(kmeans_feat)
        Y.append(label)
    X = numpy.array(X)
    Y = numpy.array(Y)
    clf.fit(X, Y)
    cPickle.dump(clf, open(output_file, "wb"))
    print " "
    t2 = time.time() - t1
    print "Time taken for training %s SVM : %f seconds" % (event_name, t2)

    print 'SVM trained successfully for event %s!' % event_name
Example #29
                Y_all = Y
                i = 1
            else:
                X_all = np.vstack((X_all, X))
                Y_all = np.append(Y_all, Y)
                i += 1

    clf = SVC(kernel=laplacian_kernel)

    features = ['.cnn.', '.mfcc.', '.asr.']
    X_score = np.zeros((len(X_all), 3))

    for j, feature in enumerate(features):
        X = X_all[:, dimension_i[j]:dimension_i[j] + dimension_i[j + 1]]
        clf.fit(X, Y_all)
        print('saving scores...')
        X_score[:, j] = clf.decision_function(X)
        # print (X_score[:, j])
        np.save(event_name + feature + 'score', X_score[:, j])
        cPickle.dump(clf, open(output_file + feature + 'score', "wb"))

    clf = SVC(kernel='linear')
    clf.fit(X_score, Y_all)

    fread.close()

    cPickle.dump(clf, open(output_file, "wb"))

    print('SVM trained successfully for event %s!' % event_name)
Example #30
                                                    test_size=0.2,
                                                    random_state=42)

#:# preprocessing

transform_pipeline = Pipeline([('scaler', StandardScaler())])

X_train = pd.DataFrame(transform_pipeline.fit_transform(X_train),
                       columns=X_train.columns)

#:# model

params = {'gamma': 5, 'kernel': 'sigmoid', 'probability': True}

classifier = SVC(**params)
classifier.fit(X_train, y_train)

#:# hash
#:# aad366f6d5961bc98783c2ad9fb3918d
md5 = hashlib.md5(str(classifier).encode('utf-8')).hexdigest()
print(f'md5: {md5}')

#:# audit
y_pred = classifier.predict(transform_pipeline.transform(X_test))
y_pred_proba = classifier.predict_proba(
    transform_pipeline.transform(X_test))[:, 1]

tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

print(f'acc: {accuracy_score(y_test, y_pred)}')
print(f'auc: {roc_auc_score(y_test, y_pred_proba)}')
Example #31
    file_read.close()

    # Highly imbalanced data for training
    X_df = np.array(X_df)
    y_df = np.array(y_df)
    #print(Counter(y_df))

    #print(X_df.shape)

    # Add SMOTE resampling for dealing with imbalanced issue
    smote = SMOTE()
    X_df_res, y_df_res = smote.fit_resample(X_df, y_df)  # fit_sample() in older imbalanced-learn
    #print(Counter(y_df_res))

    # Train svm
    clf = SVC(kernel=chi2_kernel,
              class_weight='balanced',
              C=2.0,
              gamma='scale',
              probability=True,
              random_state=0)
    #    clf = SVC(kernel = 'rbf', class_weight='balanced',
    #              C = 1.0, gamma = 'scale', probability=True, random_state=0)
    #    clf.fit(X_df, y_df)
    clf.fit(X_df_res, y_df_res)

    # Save the SVC model (binary mode is required for pickling)
    with open(output_file, 'wb') as f:
        cPickle.dump(clf, f)

    print('SVM trained successfully for event %s!' % event_name)
Example #32
            if Y_label != 'NULL' or random.random() > 0:
                if Y_label == event_name:
                    Y = 1
                else:
                    Y = 0

                if i == 0:
                    X_all = X
                    Y_all = Y
                    i = 1
                else:
                    X_all = np.vstack((X_all, X))
                    Y_all = np.append(Y_all, Y)
                    i += 1
        # print (i)
    # print (np.sum(X_all, axis = 1))
    # print(X_all, Y_all)

    clf = SVC(kernel=chi2_kernel)
    # clf = SVC()
    clf.fit(X_all, Y_all)

    print(clf.score(X_all, Y_all))
    print(clf.predict(X_all))

    fread.close()

    cPickle.dump(clf, open(output_file, "wb"))

    print('SVM trained successfully for event %s!' % event_name)
Example #33
                i += 1

    clf = SVC(kernel=laplacian_kernel)

    features = ['.cnn.', '.mfcc.', '.asr.', '.cnn+mfcc.', '.cnn+mfcc+asr.']
    X_score = np.zeros((len(X_all), len(features)))

    for j, feature in enumerate(features):
        if j <= 2:
            X = X_all[:, dimension_i[j]:dimension_i[j] + dimension_i[j + 1]]
        elif j == 3:
            X = X_all[:, 0:150]
        elif j == 4:
            X = X_all
        clf.fit(X, Y_all)
        print('saving scores...')
        X_score[:, j] = clf.decision_function(X)
        # print (X_score[:, j])
        np.save(event_name + feature + 'score', X_score[:, j])
        cPickle.dump(clf, open(output_file + feature + 'score', "wb"))

    clf = SVC(kernel='linear')
    clf.fit(X_score[:, [0, 1, 2, 4]], Y_all)

    fread.close()

    cPickle.dump(clf, open(output_file, "wb"))

    print('SVM trained successfully for event %s!' % event_name)