def test_build_extraction_dataset():
    """Build the extraction dataset from the emails/P folder and check it loads."""
    dataset_path = os.path.join(TMP_DIR, 'extraction.data')
    emails_path = os.path.join(EMAILS_DIR, 'P')

    # Remove any dataset file left over from a previous run.
    if os.path.exists(dataset_path):
        os.remove(dataset_path)

    d.build_extraction_dataset(emails_path, dataset_path, 1)
    test_data = SparseDataSet(dataset_path, labelsColumn=-1)

    # the result is a loadable signature extraction dataset
    # 32 comes from 3 emails in emails/P folder, 11 lines checked to be
    # a signature, one email has only 10 lines
    eq_(test_data.size(), 32)
    eq_(len(features('')), test_data.numFeatures)
def train(classifier, train_data_filename, save_classifier_filename=None):
    '''Fit ``classifier`` on the dataset file and optionally persist it.

    The dataset is read from ``train_data_filename`` with the label taken
    from the last column. When ``save_classifier_filename`` is truthy the
    trained classifier is saved there so it can be loaded later. The same
    classifier object is returned.
    '''
    training_set = SparseDataSet(train_data_filename, labelsColumn=-1)
    classifier.train(training_set)

    if save_classifier_filename:
        classifier.save(save_classifier_filename)

    return classifier
def load(saved_classifier_filename, train_data_filename):
    """Restore a previously saved classifier.

    The classifier has to be loaded together with the same data it was
    trained against, so the training dataset is read again (labels in the
    last column) and handed to the freshly initialised classifier.
    """
    restored = init()
    training_set = SparseDataSet(train_data_filename, labelsColumn=-1)
    restored.load(saved_classifier_filename, training_set)
    return restored
def train_test(ds_path):
    """Cross-validate a gaussian-kernel SVM on a dataset and pickle the result.

    The dataset at ``ds_path`` is loaded, a gaussian kernel is attached and
    an SVM is cross-validated on it. The cross-validation result is pickled
    to ``ds_path + '.pkl'`` and ``ds_path`` is printed once done.
    """
    data = SparseDataSet(ds_path)
    gamma, cost, num_folds = 0.25, 128, 2

    ##################################################
    #### This part of the code does statistical
    #### feature selection ...
    #labels = np.array([int(n) for n in data.labels.L])
    #ranks = rank_feat(data.getMatrix().T, labels)
    #ranks = [(abs(r),i) for i, r in enumerate(ranks)]
    #ranks.sort()
    #ranks.reverse()
    #feats = [f[1] for f in ranks]
    #data.keepFeatures(feats[:2662])

    data.attachKernel('gaussian', gamma=gamma)
    svm = SVM(C=cost)
    result = svm.cv(data, numFolds=num_folds)

    # The context manager closes the file even if pickling fails.
    with open(ds_path + '.pkl', 'wb') as out_file:
        pickle.dump(result, out_file)

    print(ds_path)
def train_test(ds_path):
    """Cross-validate a gaussian-kernel SVM on a dataset and pickle the result.

    The dataset at ``ds_path`` is loaded, a gaussian kernel is attached and
    an SVM is cross-validated on it. The cross-validation result is pickled
    to ``ds_path + '.pkl'`` and ``ds_path`` is printed once done.
    """
    data = SparseDataSet(ds_path)
    gamma, cost, num_folds = 0.25, 128, 2

    ##################################################
    #### This part of the code does statistical
    #### feature selection ...
    #labels = np.array([int(n) for n in data.labels.L])
    #ranks = rank_feat(data.getMatrix().T, labels)
    #ranks = [(abs(r),i) for i, r in enumerate(ranks)]
    #ranks.sort()
    #ranks.reverse()
    #feats = [f[1] for f in ranks]
    #data.keepFeatures(feats[:2662])

    data.attachKernel('gaussian', gamma=gamma)
    svm = SVM(C=cost)
    result = svm.cv(data, numFolds=num_folds)

    # The context manager closes the file even if pickling fails.
    with open(ds_path + '.pkl', 'wb') as out_file:
        pickle.dump(result, out_file)

    print(ds_path)
def train(self, datalist, labelslist):
    """Train the wrapped SVM on ``datalist`` labelled by ``labelslist``.

    Sets the SVM's ``C`` to 20 and attaches a gaussian kernel to the
    dataset before training.
    """
    self.svminstance.C = 20

    training_set = SparseDataSet(datalist, L=labelslist)
    # NOTE(review): a gaussian kernel is usually parameterised by gamma,
    # not degree -- confirm how PyML treats this argument.
    training_set.attachKernel('gaussian', degree=5)

    self.svminstance.train(training_set)
def is_signature_line(line, sender, classifier):
    '''Tell whether ``line`` belongs to a signature.

    A pattern is built for the line from the sender-specific features,
    wrapped in a one-item dataset, and scored by the classifier. The line
    counts as a signature line when the decision value is positive.
    '''
    pattern = build_pattern(line, features(sender))
    sample = SparseDataSet([pattern])
    score = classifier.decisionFunc(sample, 0)
    return score > 0
from PyML import SparseDataSet, SVM

__author__ = 'basir'

# Load the sparse-format heart dataset; labels are in the last column.
data = SparseDataSet('data/heartSparse.data', labelsColumn=-1)

# Cross-validate a default SVM, passing 5 as the second argument to cv().
svm = SVM()
res = svm.cv(data, 5)

# Print each item yielded by the result, then the result object itself.
for fold in res:
    print(fold)
print(res)

# print data
# help(sequenceData.spectrum_data)
def trainforTD(self, datalist, labelslist):
    """Train the wrapped SVM on ``datalist`` labelled by ``labelslist``.

    No kernel is attached and the SVM's parameters are left untouched.
    """
    training_set = SparseDataSet(datalist, L=labelslist)
    self.svminstance.train(training_set)
def train(self, datalist, labelslist):
    """Train the wrapped SVM on ``datalist`` labelled by ``labelslist``.

    Sets the SVM's ``C`` to 20 and attaches a gaussian kernel to the
    dataset before training.
    """
    self.svminstance.C = 20

    training_set = SparseDataSet(datalist, L=labelslist)
    # NOTE(review): a gaussian kernel is usually parameterised by gamma,
    # not degree -- confirm how PyML treats this argument.
    training_set.attachKernel('gaussian', degree=5)

    self.svminstance.train(training_set)
def predict(self, datalist):
    """Run the wrapped SVM on ``datalist`` and return the first predicted label."""
    samples = SparseDataSet(datalist)
    outcome = self.svminstance.test(samples)
    predicted_labels = outcome.getPredictedLabels()
    return predicted_labels[0]