def fit(self, X, y):
    # The smaller C, the stronger the regularization.
    # The more regularization, the more sparsity.
    self.transformer_ = LinearSVC(C=1000, penalty="l1", dual=False, tol=1e-3)
    X = self.transformer_.fit_transform(X, y)
    return LinearSVC.fit(self, X, y)
def test_dense_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
    # simulate iterables
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('svc', LinearSVC())])

    parameters = {'vect__analyzer__max_n': (1, 2),
                  'svc__loss': ('l1', 'l2')}

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    pred = grid_search.fit(list(train_data), y_train).predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset bigram representation which is used in the last of
    # the grid_search is considered the best estimator since they all converge
    # to 100% accuracy models
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
    self._dbDir = dbDir
    self._sourceDomain = sourceDomain
    self._rawDataFolder = rawDataFolder
    self._targetDomain = targetDomain
    self._tableName = sourceDomain + "to" + targetDomain
    self._connection = sqlite.connect(path.join(dbDir, sourceDomain))
    self._cursor = self._connection.cursor()
    self._lsvc = LinearSVC(C=10000)
def train(labeled_featuresets, C=1e5):
    """
    :param labeled_featuresets: A list of classified featuresets,
        i.e., a list of tuples ``(featureset, label)``.
    """
    feat = [featureset for featureset, label in labeled_featuresets]
    feature_vectorizer = MVectorizer.DictsVectorizer()
    X = feature_vectorizer.fit_transform(feat)
    X = Normalizer().fit_transform(X)
    label_set = set([label for featureset, label in labeled_featuresets])
    label_vectorizer = dict([(label, num) for num, label in enumerate(label_set)])
    y = numpy.array([label_vectorizer[label] for featureset, label in labeled_featuresets])
    print "Training on %d examples with %d features..." % (X.shape[0], X.shape[1]),
    classifier = OneVsRestClassifier(LinearSVC(loss='l2', penalty='l2', dual=True,
                                               tol=1e-5, C=C, scale_C=True))
    classifier.fit(X, y)
    print "done"
    return scikit_classifier(feature_vectorizer, label_vectorizer, classifier)
    print metrics.confusion_matrix(y_test, pred)
    print
    return score, train_time, test_time


for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN")):
    print 80 * '='
    print name
    results = benchmark(clf)

for penalty in ["l2", "l1"]:
    print 80 * '='
    print "%s penalty" % penalty.upper()

    # Train Liblinear model
    liblinear_results = benchmark(LinearSVC(loss='l2', penalty=penalty, C=1000,
                                            dual=False, tol=1e-3))

    # Train SGD model
    sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                          penalty=penalty))

# Train SGD with Elastic Net penalty
print 80 * '='
print "Elastic-Net penalty"
sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                      penalty="elasticnet"))

# Train sparse Naive Bayes classifiers
print 80 * '='
print "Naive Bayes"
mnnb_results = benchmark(MultinomialNB(alpha=.01))
# # Feature selection for the L1 dataset
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k=select_chi2)
# X_L1 = ch2.fit_transform(X_L1, y_L1)
# print "Done in %fs" % (time() - t0)
# print "L1: n_samples: %d, n_features: %d" % X_L1.shape
# print

# Train L1 classifier
print "Training L1 Classifier..."
t0 = time()
clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
print clf
clf.fit(X_L1, y_L1)
train_time = time() - t0
print "Train time: %0.3fs" % train_time
print

# Train L2 classifiers
print "Training L2 Classifiers..."
t0 = time()
# comment out all linearSVC
# clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2)
# clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# X: feature matrix; y: result array; z_k: prediction result array for k's model
#
# Setup 10 fold cross validation
fold_num = 10
kf = KFold(n_samples, k=fold_num, indices=True)

# set number of neighbors for kNN
n_neighb = 19

# Brute-force implementation
clf_bNB = BernoulliNB(alpha=.01)
clf_mNB = MultinomialNB(alpha=.01)
clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge = RidgeClassifier(tol=1e-1)
clf_lSVC = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_SVC = SVC(C=1000, gamma=0.0625, probability=True)
# clf_SGD = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

###############################################################################
# Stacking
#
# initialize empty y and z

n_categories = len(set(y))
# z = np.array([[0, 0]], dtype=float)
z = np.array([[0, 0, 0]], dtype=float)
# z = np.array([[0, 0, 0, 0]], dtype=float)
# z = np.zeros( (n_samples, n_categories) , dtype=float)

# Test for 10 rounds using the results from 10 fold cross validations
def predict(self, X):
    X = self.transformer_.transform(X)
    return LinearSVC.predict(self, X)
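# A minimal sketch (class name and data variables are assumptions, not part of
# the original snippet) of how the fit/predict pair above combines into a
# LinearSVC subclass: an L1-penalized LinearSVC prunes the feature space, and a
# second LinearSVC is trained and evaluated on the reduced matrix.
class L1LinearSVC(LinearSVC):

    def fit(self, X, y):
        # sparse feature selection via the L1 penalty
        self.transformer_ = LinearSVC(C=1000, penalty="l1", dual=False, tol=1e-3)
        X = self.transformer_.fit_transform(X, y)
        return LinearSVC.fit(self, X, y)

    def predict(self, X):
        # apply the same feature selection before predicting
        X = self.transformer_.transform(X)
        return LinearSVC.predict(self, X)

# usage sketch (X_train, y_train, X_test assumed to exist):
# clf = L1LinearSVC().fit(X_train, y_train)
# pred = clf.predict(X_test)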
import numpy as np

from preprocess import get_clf, load_data, preprocess_data
from sklearn.metrics import classification_report
from sklearn.cross_validation import KFold, LeaveOneOut
from sklearn.grid_search import GridSearchCV

if __name__ == '__main__':
    filename = 'inf-all-labeled.txt'
    X, y = load_data(filename)
    n = len(X)

    scores = np.empty((5, 2, 2), dtype=np.float)
    best_C = np.empty((5, 2, 2), dtype=np.float)

    for i, ngrams in enumerate((2, 3, 4, 5, 6)):
        for j, suffix in enumerate(('', '$')):
            for k, binarize in enumerate((True, False)):
                print "ngrams=%d, suffix=%s, binarize=%s" % (ngrams, suffix, binarize)
                X_new = preprocess_data(X, n=ngrams, suffix=suffix, binarize=binarize)
                grid = GridSearchCV(
                    estimator=LinearSVC(), n_jobs=4, verbose=False,
                    param_grid={'C': (0.01, 0.03, 0.1, 0.3, 1, 1.3)},
                    cv=LeaveOneOut(n, indices=True))
                grid.fit(X_new, y)
                scores[i, j, k] = grid.best_score
                best_C[i, j, k] = grid.best_estimator.C
for i, n in enumerate((2, 3, 4, 5, 6)):
    for j, suffix in enumerate(('', '$')):
        for k, binarize in enumerate((True, False)):
            print "%d-%d-%d out of 411" % (i, j, k)
            X_sg_p, v_sg = preprocess.preprocess_data(X_sg, suffix=suffix, n=n,
                                                      return_vect=True, binarize=binarize)
            X_pl_p, v_pl = preprocess.preprocess_data(X_pl, suffix=suffix, n=n,
                                                      return_vect=True, binarize=binarize)

            grid1 = GridSearchCV(estimator=LinearSVC(), n_jobs=-1, verbose=True,
                                 param_grid={'C': np.logspace(-2, 2, 5)},
                                 cv=KFold(len(X_sg), k=10, indices=True))
            grid1.fit(X_sg_p, y_sg)
            scores_sg[i, j, k] = grid1.best_score
            best_C_sg = grid1.best_estimator.C

            clf = grid1.best_estimator
            X_sg_n_p = v_sg.transform(X_sg_n)
            y_sg_n = clf.predict(X_sg_n_p)
            predict_sg[i, j, k] = (y_sg_n == 0).mean()

            grid2 = GridSearchCV(estimator=LinearSVC(), n_jobs=-1,
        return unicode_content.lower()

    def __repr__(self):
        return "LowerCasePreprocessor()"


analyzer1 = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer1)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(twenty_train.data, twenty_train.target)

# Predict the outcome on the testing set
y_predicted = clf.predict(doc_test)

# Predict the result on some short new sentences:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
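# A hedged follow-up sketch (not in the original snippet; twenty_train.target_names
# as the list of language labels is an assumption about the surrounding script):
# print the predicted language for each of the short test sentences above.
predicted = clf.predict(sentences)
for s, p in zip(sentences, predicted):
    print(u'The language of "%s" is "%s"' % (s, twenty_train.target_names[p]))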
"Like most web-based services, Ning automatically receives and records information on our server logs from your browser when you use the Ning Platform. We may use a variety of methods, including clear GIFs (also known as web beacons), and cookies to collect this information. The information that we collect with these automated methods may include, for example, your IP address, Ning cookie information, a unique device or user ID, browser type, system type, the content and pages that you access on the Ning Platform, and the referring URL (i.e., the page from which you navigated to the Ning Platform).", "Other Information We Receive and Store : When you register to use MailChimp, we store 'cookies,' which are strings of code, on your computer. We also use electronic images known as Web beacons. With those cookies, we are aware of and collect information concerning when you visit our Website, when you use MailChimp, your browser type and version, your operating system and platform and other similar information. With Web beacons, we can determine when you open email we send you, and collect other data. You may turn off all cookies that have been placed on your computer by following the instructions on your browser on how to block cookies that have been placed on your computer. However, if you block our cookies it will be more difficult, and maybe impossible, to use the Services", "EMC strives to keep your personal information accurate. We have implemented technology, management processes and policies to maintain data integrity. We will provide you with access to your information when reasonable, or in accordance with relevant laws, including making reasonable effort to provide you with online access and the opportunity to change your information. To protect your privacy and security, we will take steps to verify your identity before granting access or making changes to your personal information. To access and/or correct information, you can do so online or notify us via the appropriate method below depending on which site is at issue", "Your information to our service providers. We use service providers who help us to provide you with our services. We give relevant persons working for some of these providers access to your information, but only to the extent necessary for them to perform their services for us. We also implement reasonable contractual and technical protections to ensure the confidentiality of your personal information and data is maintained, used only for the provision of their services to us, and handled in accordance with this privacy policy. Examples of service providers include payment processors, email service providers, and web traffic analytics tools", "Some Microsoft sites allow you to choose to share your personal information with select Microsoft partners so that they can contact you about their products, services or offers. Other sites, such as MSN instead may give you a separate choice as to whether you wish to receive communications from Microsoft about a partner's particular offering (without transferring your personal information to the third party). See the Communication Preferences section below for more information.", ] X_new = vectorizer.transform(docs_new) # Train classifiers print "Training Classifiers..." 
t0 = time()

clf_nb = MultinomialNB()
clf_lsvc = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_svc = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)
clf_rdg = RidgeClassifier(tol=1e-1)
clf_sgd = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# Logistic regression requires OneVsRestClassifier which hides
# its methods such as decision_function
# It will require extra implementation efforts to use it as a candidate
# for multilabel classification
# clf_lgr = OneVsRestClassifier(LogisticRegression(C=1000, penalty='l1'))
# kNN does not have decision function due to its nature
# clf_knn = KNeighborsClassifier(n_neighbors=13)

# train
clf_nb.fit(X, y)
clf_lsvc.fit(X, y)
class SpectralFeatureAlignment():

    def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
        self._dbDir = dbDir
        self._sourceDomain = sourceDomain
        self._rawDataFolder = rawDataFolder
        self._targetDomain = targetDomain
        self._tableName = sourceDomain + "to" + targetDomain
        self._connection = sqlite.connect(path.join(dbDir, sourceDomain))
        self._cursor = self._connection.cursor()
        self._lsvc = LinearSVC(C=10000)

    def _getFeatures(self, maxDIFeatures=500, minFrequency=5):
        features = []
        self._cursor.execute("SELECT term FROM " + self._tableName + " WHERE freqSource + freqTarget >= ?", [minFrequency])
        features = [a[0] for a in self._cursor.fetchall()]
        self._cursor.execute("SELECT term FROM mostinformatives")
        mostInformatives = set([a[0] for a in self._cursor.fetchall()][30000:-30000])
        features = [feature for feature in features if feature not in mostInformatives]
        return sorted(features[:maxDIFeatures]), sorted(features[maxDIFeatures:])

    def _createCooccurrenceMatrix(self, domainIndependentFeatures, domainDependentFeatures):
        domainIndependentFeaturesSet = set(domainIndependentFeatures)
        domainDependentFeaturesSet = set(domainDependentFeatures)

        def __parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                    reviewFeatures = set([tupel.split(":")[0].decode("utf-8") for tupel in review.split()])
                    independentFeatures = reviewFeatures & domainIndependentFeaturesSet
                    dependentFeatures = reviewFeatures & domainDependentFeaturesSet
                    for dependentFeature in dependentFeatures:
                        rowIndex = bisect_left(domainDependentFeatures, dependentFeature)
                        for independentFeature in independentFeatures:
                            matrix[rowIndex, bisect_left(domainIndependentFeatures, independentFeature)] += 1

        matrix = np.zeros((len(domainDependentFeatures), len(domainIndependentFeatures)))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "negative.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "negative.review"))
        return sparse.coo_matrix(matrix)

    def _createSquareAffinityMatrix(self, cooccurrenceMatrix):
        height = np.size(cooccurrenceMatrix, 0)
        width = np.size(cooccurrenceMatrix, 1)
        topMatrix = sparse.coo_matrix((height, height))
        topMatrix = sparse.hstack((topMatrix, cooccurrenceMatrix))
        bottomMatrix = sparse.coo_matrix((width, width))
        bottomMatrix = sparse.hstack((cooccurrenceMatrix.transpose(), bottomMatrix))
        matrix = sparse.vstack((topMatrix, bottomMatrix))
        return matrix

    def _createDiagonalMatrix(self, squareAffinityMatrix):
        rows = range(squareAffinityMatrix.get_shape()[0])
        data = [0. if rowSum == 0 else np.sqrt(1.0 / rowSum)
                for rowSum in np.array(squareAffinityMatrix.sum(1)).reshape(-1,)]
        return sparse.coo_matrix((data, (rows, rows)),
                                 shape=(squareAffinityMatrix.get_shape()[0], squareAffinityMatrix.get_shape()[1]))

    def _createDocumentVectors(self, domainDependentFeatures, domainIndependentFeatures, domain):
        numDomainDep = len(domainDependentFeatures)
        numDomainIndep = len(domainIndependentFeatures)
        domainDepSet = set(domainDependentFeatures)
        domainIndepSet = set(domainIndependentFeatures)
        documentVectors = []
        classifications = []

        def __parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                    classification = 1 if "#label#:positive" in review else -1
                    reviewList = [tupel.split(":") for tupel in review.split() if "#label#" not in tupel]
                    reviewDict = {x[0].decode("utf-8"): int(x[1]) for x in reviewList}
                    reviewFeatures = set(reviewDict.keys())
                    domainDepReviewFeatures = domainDepSet & reviewFeatures
                    domainIndepReviewFeatures = domainIndepSet & reviewFeatures
                    domainDepValues, domainDepIndizes = [], []
                    domainIndepValues, domainIndepIndizes = [], []
                    for feature in domainIndepReviewFeatures:
                        #domainIndepValues.append(reviewDict[feature])
                        domainIndepValues.append(1)
                        domainIndepIndizes.append(bisect_left(domainIndependentFeatures, feature))
                    for feature in domainDepReviewFeatures:
                        #domainDepValues.append(reviewDict[feature])
                        domainDepValues.append(1)
                        domainDepIndizes.append(bisect_left(domainDependentFeatures, feature))
                    domainIndepVector = sparse.csr_matrix((domainIndepValues, (np.zeros(len(domainIndepIndizes)), domainIndepIndizes)),
                                                          shape=(1, numDomainIndep))
                    domainDepVector = sparse.csr_matrix((domainDepValues, (np.zeros(len(domainDepIndizes)), domainDepIndizes)),
                                                        shape=(1, numDomainDep))
                    documentVectors.append((domainIndepVector, domainDepVector))
                    classifications.append(classification)

        __parseFile(path.join(self._rawDataFolder, domain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, domain, "negative.review"))
        return documentVectors, classifications

    def _trainClassifier(self, trainingVectors, classifications):
        self._lsvc.fit(sparse.vstack(trainingVectors), classifications)

    def _testClassifier(self, testVectors, classifications):
        return self._lsvc.score(sparse.vstack(testVectors), classifications)

    def go(self, K=100, Y=6, DI=500, minFreq=5):
        print self._sourceDomain + " -> " + self._targetDomain
        domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI, minFreq)
        numDomainIndep = len(domainIndependentFeatures)
        numDomainDep = len(domainDependentFeatures)
        #print "number of independent features %i, number of dependent features %i" % (numDomainIndep, numDomainDep)
        #print "creating cooccurrenceMatrix..."
        a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
        #print "creating SquareAffinityMatrix..."
        a = self._createSquareAffinityMatrix(a)
        #print "creating DiagonalMatrix..."
        b = self._createDiagonalMatrix(a)
        #print "multiplying..."
        c = b.dot(a)
        del a
        c = c.dot(b)
        del b
        #print "calculating eigenvalues and eigenvectors"
        eigenValues, eigenVectors = eigsh(c, k=K, which="LA")
        del c
        #print "building document vectors..."
        documentVectorsTraining, classificationsTraining = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures, self._sourceDomain)
        documentVectorsTesting, classificationsTesting = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures, self._targetDomain)
        #print "training and testing..."
        U = [eigenVectors[:, x].reshape(np.size(eigenVectors, 0), 1) for x in eigenValues.argsort()[::-1]]
        U = np.concatenate(U, axis=1)[:numDomainDep]
        U = sparse.csr_matrix(U)
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTraining]
        trainingVectors = [sparse.hstack((documentVectorsTraining[x][0], documentVectorsTraining[x][1], clustering[x]))
                           for x in range(np.size(documentVectorsTraining, axis=0))]
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTesting]
        testVectors = [sparse.hstack((documentVectorsTesting[x][0], documentVectorsTesting[x][1], clustering[x]))
                       for x in range(np.size(documentVectorsTesting, axis=0))]
        self._trainClassifier(trainingVectors, classificationsTraining)
        print "accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i" % (self._testClassifier(testVectors, classificationsTesting) * 100, K, DI, Y, minFreq)
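# A minimal usage sketch (database directory, raw-data folder and domain names
# are assumptions, not part of the original file): run one source -> target
# transfer experiment with the default hyperparameters.
sfa = SpectralFeatureAlignment("db/", "processed_acl/", "books", "dvd")
sfa.go(K=100, Y=6, DI=500, minFreq=5)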
print

# # Feature selection for the L1 dataset
# select_chi2 = 1000
# print ("Extracting %d best features by a chi-squared test" % select_chi2)
# t0 = time()
# ch2 = SelectKBest(chi2, k=select_chi2)
# X_L1 = ch2.fit_transform(X_L1, y_L1)
# print "Done in %fs" % (time() - t0)
# print "L1: n_samples: %d, n_features: %d" % X_L1.shape
# print

# Train L1 classifier
print "Training L1 Classifier..."
t0 = time()
clf = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
print clf
clf.fit(X_L1, y_L1)
train_time = time() - t0
print "Train time: %0.3fs" % train_time
print

# Train L2 classifiers
print "Training L2 Classifiers..."
t0 = time()
# comment out all linearSVC
# clf_ca = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_collect = LinearSVC(loss='l2', penalty='l2', C=256, dual=False, tol=1e-2)
# clf_cookies = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
# clf_share = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
def find_best_lsvc(**params):
    parameters = {'C': [0.01, 0.1, 1, 10, 100, 1000]}
    return GridSearchCV(LinearSVC(**params), parameters)
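# A hedged usage sketch (X_train / y_train are assumed to exist; the
# best_estimator attribute follows the older GridSearchCV API used elsewhere
# in these snippets): fit the returned grid search and inspect the selected C.
search = find_best_lsvc(dual=False)
search.fit(X_train, y_train)
print search.best_estimator.C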
# N: number of training examples; K: number of models in level 0
# X: feature matrix; y: result array; z_k: prediction result array for k's model
#
# Setup 10 fold cross validation
fold_num = 10
kf = KFold(n_samples, k=fold_num, indices=True)

# set number of neighbors for kNN
n_neighb = 19

# Brute-force implementation
clf_mNB = MultinomialNB(alpha=.01)
clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge = RidgeClassifier(tol=1e-1)
clf_lSVC = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3)
clf_SVC = SVC(C=32, gamma=0.0625)
# clf_SGD = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")

# empty ndarrays for prediction results z_kn
z_mNB = np.array([], dtype=np.int32)
z_kNN = np.array([], dtype=np.int32)
z_ridge = np.array([], dtype=np.int32)
z_lSVC = np.array([], dtype=np.int32)
z_SVC = np.array([], dtype=np.int32)

###############################################################################
# Stacking
#
# initialize empty y and z
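# A hedged sketch (not part of the original file; the X_den / y variable names
# are assumptions) of one way the z_* arrays above can be filled: each level-0
# model is refit on every fold and its out-of-fold predictions are appended,
# so the stacked meta-features eventually cover all samples.
for train_index, test_index in kf:
    X_tr, X_te = X_den[train_index], X_den[test_index]
    y_tr = y[train_index]
    z_mNB = np.append(z_mNB, clf_mNB.fit(X_tr, y_tr).predict(X_te))
    z_lSVC = np.append(z_lSVC, clf_lSVC.fit(X_tr, y_tr).predict(X_te))
    z_SVC = np.append(z_SVC, clf_SVC.fit(X_tr, y_tr).predict(X_te))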
class SpectralFeatureAlignment():

    def __init__(self, dbDir, rawDataFolder, sourceDomain, targetDomain):
        self._dbDir = dbDir
        self._sourceDomain = sourceDomain
        self._rawDataFolder = rawDataFolder
        self._targetDomain = targetDomain
        self._tableName = sourceDomain + "to" + targetDomain
        self._connection = sqlite.connect(path.join(dbDir, sourceDomain))
        self._cursor = self._connection.cursor()
        self._lsvc = LinearSVC(C=10000)
        self._featuresWithSynsets = {}
        self._featuresWithoutSynsets = {}
        self._allSynsets = []

    def _getFeatures(self, maxDIFeatures=500, minFrequency=5):
        features = []
        self._cursor.execute("SELECT term FROM " + self._tableName + " WHERE freqSource + freqTarget >= ?", [minFrequency])
        features = [a[0] for a in self._cursor.fetchall()]
        self._cursor.execute("SELECT term FROM mostinformatives")
        mostInformatives = set([a[0] for a in self._cursor.fetchall()][30000:-30000])
        features = [feature for feature in features if feature not in mostInformatives]
        return sorted(features[:maxDIFeatures]), sorted(features[maxDIFeatures:])

    def _getSynsets(self, domainIndependentFeatures, minSyn):
        #unigramTagger = UnigramTagger(brown.tagged_sents(simplify_tags=True))
        #bigramTagger = BigramTagger(brown.tagged_sents(simplify_tags=True), backoff=unigramTagger)
        #taggedBigrams = [bigramTagger.tag(feature.split('_')) for feature in domainIndependentFeatures if "_" in feature and "<" not in feature]
        #tmp = ("PRO", "CNJ", "DET", "EX", "MOD", "P", "TO")
        #for x in taggedBigrams:
            #firstWord, firstTag = x[0]
            #secondWord, secondTag = x[1]
            #feature = "_".join((firstWord, secondWord))
            #if firstTag in tmp and secondTag not in tmp:
                #self._featuresWithSynsets[feature] = wn.synsets(secondWord)
            #elif firstTag not in tmp and secondTag in tmp:
                #self._featuresWithSynsets[feature] = wn.synsets(firstWord)
        Bigrams = [feature for feature in domainIndependentFeatures if "_" in feature and "<" not in feature]
        #filterWords = ("a", "and", "are", "be", "has", "have", "i", "is", "it", "of", "the", "to", "will", "had", "as", "my", "that", "was")
        stopwordList = set(stopwords.words("english")) - set(("no", "nor", "not"))
        for bigram in Bigrams:
            firstWord, secondWord = bigram.split("_")
            if firstWord in stopwordList and secondWord in stopwordList:
                pass
            elif firstWord in stopwordList:
                self._featuresWithSynsets[bigram] = wn.synsets(secondWord)
            elif secondWord in stopwordList:
                self._featuresWithSynsets[bigram] = wn.synsets(firstWord)
        self._featuresWithSynsets = {feature: [str(synset) for synset in synsets]
                                     for feature, synsets in self._featuresWithSynsets.items() if synsets}
        unigrams = [feature for feature in domainIndependentFeatures if "_" not in feature]
        for unigram in unigrams:
            synsets = wn.synsets(unigram)
            if synsets:
                self._featuresWithSynsets[unigram] = [str(synset) for synset in synsets]
        allSynsets = [synsets for sublist in self._featuresWithSynsets.values() for synsets in sublist]
        allSynsets = set([synset for synset in allSynsets if allSynsets.count(synset) >= minSyn])
        self._featuresWithSynsets = {feature: set(synsets) & allSynsets
                                     for feature, synsets in self._featuresWithSynsets.items() if set(synsets) & allSynsets}
        self._featuresWithoutSynsets = sorted(set(domainIndependentFeatures) - set(self._featuresWithSynsets.keys()))
        return sorted(allSynsets)

    def _createCooccurrenceMatrix(self, domainIndependentFeatures, domainDependentFeatures):
        domainIndependentFeaturesSet = set(domainIndependentFeatures)
        domainDependentFeaturesSet = set(domainDependentFeatures)
        numSyn = len(self._allSynsets)

        def __parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                    reviewFeatures = set([tupel.split(":")[0].decode("utf-8") for tupel in review.split()])
                    independentFeatures = reviewFeatures & domainIndependentFeaturesSet
                    dependentFeatures = reviewFeatures & domainDependentFeaturesSet
                    for dependentFeature in dependentFeatures:
                        rowIndex = bisect_left(domainDependentFeatures, dependentFeature)
                        for independentFeature in independentFeatures:
                            if independentFeature in self._featuresWithSynsets:
                                for synset in self._featuresWithSynsets[independentFeature]:
                                    matrix[rowIndex, bisect_left(self._allSynsets, synset)] += 1
                            else:
                                matrix[rowIndex, bisect_left(self._featuresWithoutSynsets, independentFeature) + numSyn] += 1

        matrix = np.zeros((len(domainDependentFeatures), len(self._featuresWithoutSynsets) + numSyn))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._sourceDomain, "negative.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, self._targetDomain, "negative.review"))
        return sparse.coo_matrix(matrix)

    def _createSquareAffinityMatrix(self, cooccurrenceMatrix):
        height = np.size(cooccurrenceMatrix, 0)
        width = np.size(cooccurrenceMatrix, 1)
        topMatrix = sparse.coo_matrix((height, height))
        topMatrix = sparse.hstack((topMatrix, cooccurrenceMatrix))
        bottomMatrix = sparse.coo_matrix((width, width))
        bottomMatrix = sparse.hstack((cooccurrenceMatrix.transpose(), bottomMatrix))
        matrix = sparse.vstack((topMatrix, bottomMatrix))
        return matrix

    def _createDiagonalMatrix(self, squareAffinityMatrix):
        rows = range(squareAffinityMatrix.get_shape()[0])
        data = [0. if rowSum == 0 else np.sqrt(1.0 / rowSum)
                for rowSum in np.array(squareAffinityMatrix.sum(1)).reshape(-1,)]
        return sparse.coo_matrix((data, (rows, rows)),
                                 shape=(squareAffinityMatrix.get_shape()[0], squareAffinityMatrix.get_shape()[1]))

    def _createDocumentVectors(self, domainDependentFeatures, domainIndependentFeatures, domain):
        numDomainDep = len(domainDependentFeatures)
        numDomainIndep = len(domainIndependentFeatures)
        domainDepSet = set(domainDependentFeatures)
        domainIndepSet = set(domainIndependentFeatures)
        documentVectors = []
        classifications = []
        numSynsets = len(self._allSynsets)

        def __parseFile(filePath):
            with open(filePath, "r") as f:
                for review in f:
                    classification = 1 if "#label#:positive" in review else -1
                    reviewList = [tupel.split(":") for tupel in review.split() if "#label#" not in tupel]
                    reviewDict = {x[0].decode("utf-8"): int(x[1]) for x in reviewList}
                    reviewFeatures = set(reviewDict.keys())
                    domainDepReviewFeatures = domainDepSet & reviewFeatures
                    domainIndepReviewFeatures = domainIndepSet & reviewFeatures
                    domainDepValues, domainDepIndizes = [], []
                    domainIndepValues, domainIndepIndizes = [], []
                    for feature in domainIndepReviewFeatures:
                        if feature in self._featuresWithSynsets:
                            for synset in self._featuresWithSynsets[feature]:
                                domainIndepIndizes.append(bisect_left(self._allSynsets, synset))
                                domainIndepValues.append(1)
                        else:
                            domainIndepIndizes.append(bisect_left(self._featuresWithoutSynsets, feature) + numSynsets)
                            domainIndepValues.append(1)
                            #domainIndepValues.append(reviewDict[feature])
                    for feature in domainDepReviewFeatures:
                        #domainDepValues.append(reviewDict[feature])
                        domainDepValues.append(1)
                        domainDepIndizes.append(bisect_left(domainDependentFeatures, feature))
                    domainIndepVector = sparse.csr_matrix((domainIndepValues, (np.zeros(len(domainIndepIndizes)), domainIndepIndizes)),
                                                          shape=(1, len(self._featuresWithoutSynsets) + numSynsets))
                    domainDepVector = sparse.csr_matrix((domainDepValues, (np.zeros(len(domainDepIndizes)), domainDepIndizes)),
                                                        shape=(1, numDomainDep))
                    documentVectors.append((domainIndepVector, domainDepVector))
                    classifications.append(classification)

        __parseFile(path.join(self._rawDataFolder, domain, "positive.review"))
        __parseFile(path.join(self._rawDataFolder, domain, "negative.review"))
        return documentVectors, classifications

    def _trainClassifier(self, trainingVectors, classifications):
        self._lsvc.fit(sparse.vstack(trainingVectors), classifications)

    def _testClassifier(self, testVectors, classifications):
        return self._lsvc.score(sparse.vstack(testVectors), classifications)

    def go(self, K=100, Y=6, DI=500, minFreq=5, minSyn=10):
        print self._sourceDomain + " -> " + self._targetDomain
        domainIndependentFeatures, domainDependentFeatures = self._getFeatures(DI, minFreq)
        numDomainIndep = len(domainIndependentFeatures)
        numDomainDep = len(domainDependentFeatures)
        #print "number of independent features %i, number of dependent features %i" % (numDomainIndep, numDomainDep)
        #print "finding synsets..."
        self._allSynsets = self._getSynsets(domainIndependentFeatures, minSyn)
        print self._featuresWithSynsets
        for k, v in self._featuresWithSynsets.items():
            print str(k) + " : " + str(v)
        if not self._allSynsets:
            return
        #print "creating cooccurrenceMatrix..."
        a = self._createCooccurrenceMatrix(domainIndependentFeatures, domainDependentFeatures)
        #print "creating SquareAffinityMatrix..."
        a = self._createSquareAffinityMatrix(a)
        #print "creating DiagonalMatrix..."
        b = self._createDiagonalMatrix(a)
        #print "multiplying..."
        c = b.dot(a)
        del a
        c = c.dot(b)
        del b
        #print "calculating eigenvalues and eigenvectors"
        eigenValues, eigenVectors = eigsh(c, k=K, which="LA")
        del c
        #print "building document vectors..."
        documentVectorsTraining, classificationsTraining = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures, self._sourceDomain)
        documentVectorsTesting, classificationsTesting = self._createDocumentVectors(domainDependentFeatures, domainIndependentFeatures, self._targetDomain)
        #print "training and testing..."
        U = [eigenVectors[:, x].reshape(np.size(eigenVectors, 0), 1) for x in eigenValues.argsort()[::-1]]
        U = np.concatenate(U, axis=1)[:numDomainDep]
        U = sparse.csr_matrix(U)
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTraining]
        trainingVectors = [sparse.hstack((documentVectorsTraining[x][0], documentVectorsTraining[x][1], clustering[x]))
                           for x in range(np.size(documentVectorsTraining, axis=0))]
        self._trainClassifier(trainingVectors, classificationsTraining)
        clustering = [vector[1].dot(U).dot(Y).astype(np.float64) for vector in documentVectorsTesting]
        testVectors = [sparse.hstack((documentVectorsTesting[x][0], documentVectorsTesting[x][1], clustering[x]))
                       for x in range(np.size(documentVectorsTesting, axis=0))]
        print "accuracy: %.2f with K=%i AND DI=%i AND Y=%.1f AND minFreq=%i AND minSyn=%i" % (self._testClassifier(testVectors, classificationsTesting) * 100, K, DI, Y, minFreq, minSyn)
    print
    return score, train_time, test_time


for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"),
                  (KNeighborsClassifier(n_neighbors=10), "kNN")):
    print 80 * '='
    print name
    results = benchmark(clf)

for penalty in ["l2", "l1"]:
    print 80 * '='
    print "%s penalty" % penalty.upper()

    # Train Liblinear model
    liblinear_results = benchmark(
        LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False, tol=1e-3))

    # Train SGD model
    sgd_results = benchmark(
        SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty))

# Train SGD with Elastic Net penalty
print 80 * '='
print "Elastic-Net penalty"
sgd_results = benchmark(
    SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"))

# Train sparse Naive Bayes classifiers
print 80 * '='
print "Naive Bayes"
mnnb_results = benchmark(MultinomialNB(alpha=.01))
X_train = X_train.toarray()
X_test = X_test.toarray()
# X = X.toarray()
# X_den = X.toarray()

n_samples, n_features = X_train.shape

###############################################################################
# Test classifier on test dataset

# clf = DecisionTreeClassifier(max_depth=14, min_split=5)
# clf = MultinomialNB(alpha=.01)
# clf = KNeighborsClassifier(n_neighbors=19)
# clf = RidgeClassifier(tol=1e-1)
clf = LinearSVC(loss='l2', penalty='l2', C=0.5, dual=False, tol=1e-3)
# clf = SVC(C=32, gamma=0.0625)
print clf

t0 = time()
clf.fit(X_train, y_train)
print (time() - t0)

t1 = time()
pred = clf.predict(X_test)
print (time() - t1)

pre_score = metrics.precision_score(y_test, pred)
rec_score = metrics.recall_score(y_test, pred)

print "average f1-score: %0.2f" % (100 * ((2 * pre_score * rec_score) / (pre_score + rec_score)))
print "average f5-score: %0.2f" % (100 * ((1.25 * pre_score * rec_score) / (0.25 * pre_score + rec_score)))
#
# Setup 10 fold cross validation
fold_num = 10
kf = KFold(n_samples, k=fold_num, indices=True)

# set number of neighbors for kNN
n_neighb = 13

# Brute-force implementation
clf_bNB = BernoulliNB(alpha=.01)
clf_mNB = MultinomialNB(alpha=.01)
clf_kNN = KNeighborsClassifier(n_neighbors=n_neighb)
clf_ridge = RidgeClassifier(tol=1e-1)
clf_SGD = SGDClassifier(alpha=.0001, n_iter=50, penalty="l2")
clf_lSVC = LinearSVC(loss='l2', penalty='l2', C=1000, dual=False, tol=1e-3)
clf_SVC = SVC(C=1024, kernel='rbf', degree=3, gamma=0.001, probability=True)

###############################################################################
# Stacking
#
# initialize empty y and z

print 'X_den shape: ', X_den.shape
print 'y shape:     ', y.shape

n_categories = len(set(y))
z = np.array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=float)
# z = np.zeros( (n_samples, n_categories) , dtype=float)
categories = ['HUM', 'LOC', 'NUM', 'ENTY', 'DESC', 'ABBR']

train = load_files('coarse/', categories=categories, shuffle=True, random_state=42)

# save train pickle
filehandler = open('pickle_training_coarse.pkl', 'wb')
pickle.dump(train, filehandler)
filehandler.close()

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC()),
])
_ = text_clf.fit(train.data, train.target)

# save text_clf pickle
filehandler = open('pickle_clf_coarse.pkl', 'wb')
pickle.dump(text_clf, filehandler)
filehandler.close()

#new = ['Where is the Amazon river located?',
#       'Where can I get a good sandwhich',
#       'In what state was Columbus born?',
#       'What is the best cheese?']

text = """
# split ~140k into ~100k training and ~40k test
ff_train, ff_val = split_dataframe(test_ff)

print("Training...")
t1 = time()

vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(ff_train["TitlePlusBody"])

tfidf_transformer = TfidfTransformer(use_idf=False)
# 98190x285052
train_tfidf_table = tfidf_transformer.fit_transform(train_counts)

clf = LinearSVC().fit(train_tfidf_table, ff_train["OpenStatus"])

print("Testing...")
test_counts = vectorizer.transform(ff_val["TitlePlusBody"])
test_tfidf_table = tfidf_transformer.transform(test_counts)

predict = clf.predict(test_tfidf_table)
print("np.mean: %f" % (np.mean(predict == ff_val["OpenStatus"])))

linear_decisions = clf.decision_function(test_tfidf_table)
predicted_probs = (1 / (1 + np.exp(-linear_decisions))) ** 3.5
print("MCLL: %f" % (mcll(predicted_probs, ff_val["OpenStatus"].values)))

t2 = time()
print("done in %d seconds" % (t2 - t1))
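# The mcll() helper used above is defined elsewhere in this project. A minimal
# sketch of a compatible mean log-loss, assuming binary 0/1 labels and the
# sigmoid-squashed decision values computed above, might look like this
# (the function body is an assumption, not the original implementation):
def mcll_sketch(probs, labels, eps=1e-15):
    # clip to avoid log(0), then average the negative log-likelihood
    probs = np.clip(np.asarray(probs, dtype=float), eps, 1 - eps)
    labels = np.asarray(labels, dtype=float)
    return -np.mean(labels * np.log(probs) + (1 - labels) * np.log(1 - probs))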