class TopicEmbeddingModel(object):
    '''
    Wrapper class for different topic models.

    Text is first turned into a bag-of-words matrix by a ``Vectorizer``
    (hashing + tf-idf steps) and then embedded with one of the supported
    decomposition models.
    '''

    def __init__(self, folder='model', modeltype='kpca', topics=10):
        '''
        INPUT
        folder      folder handed to the Vectorizer
        modeltype   'kpca' (RBF kernel PCA) or 'nmf'
        topics      number of components of the decomposition

        Raises ValueError if modeltype is not supported.
        '''
        # bag-of-words transformer used by fit() and predict()
        self.bow = Vectorizer(folder=folder, steps=['hashing', 'tfidf'])
        self.folder = folder
        self.modeltype = modeltype
        self.topics = topics
        # Compare strings by value: `is` only works when both strings happen
        # to be the same interned object.
        if modeltype == 'kpca':
            from sklearn.decomposition import KernelPCA
            self.model = KernelPCA(kernel='rbf', gamma=1.,
                                   n_components=topics)
        elif modeltype == 'nmf':
            from sklearn.decomposition import NMF
            self.model = NMF(n_components=topics)
        else:
            # Fail early instead of leaving the object without a model.
            raise ValueError(
                "unsupported modeltype %r (expected 'kpca' or 'nmf')"
                % (modeltype,))

    def fit(self, X):
        '''
        fits a topic model

        INPUT
        X   list of strings
        '''
        # transform list of strings into the BoW matrix, then train
        self.model.fit(self.bow.transform(X))

    def predict(self, X):
        '''
        predicts the topic embedding from a list of strings

        INPUT
        X   list of strings (a single non-list item is wrapped in a list)

        Returns whatever the underlying model's transform() returns.
        '''
        # `X is not list` compared against the type object itself and was
        # always true, which nested every list input one level too deep.
        if not isinstance(X, list):
            X = [X]
        return self.model.transform(self.bow.transform(X))
def main():
    """Load model and vectorizer, predict on freshly loaded data, print a sample.

    Every stage runs inside a ``timer(...)`` context labelled with the
    stage name.
    """
    with timer("model loading"):
        # Load the model and the preprocessing pipeline.
        model = ModelMLP()
        model.load_model()
        vectorizer = Vectorizer()
        vectorizer.load_vectorizer()

    with timer("data loading"):
        # Load the data to run predictions on.
        frame = load_data_from_gcs()

    with timer("preprocess"):
        frame = preprocess(frame)

    with timer("predict"):
        # The "price" column is dropped before vectorizing.
        features = vectorizer.transform(frame.drop(columns="price"))
        predictions = model.predict(features)
        print(predictions[:10])
class Trainer(object):
    """Trains the classifiers with training data and does the cross validation.

    Three classifiers (multinomial naive Bayes, Bernoulli naive Bayes and a
    linear SVC) are trained on the same vectorized data. During cross
    validation their predictions are combined by majority vote.
    """

    def __init__(self):
        """Initializes the datastructures required."""
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()

        # A list of already hand classified tweets to train our classifier.
        self.data = None

        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None

        self.classifier = None
        self.scores = None

    def initialize_training_data(self):
        """Loads the hand-classified tweet corpus into the trainer.

        Only ``full-corpus.csv`` is used. The IMDB review corpora were
        previously parsed here as well, but the results were never used,
        so that work has been dropped.
        """
        corpus_path = os.path.join(datasettings.DATA_DIRECTORY,
                                   'full-corpus.csv')
        # assumes parse_training_corpus reads the file eagerly -- TODO confirm
        with open(corpus_path) as corpus_file:
            classification, tweets = parse_training_corpus(corpus_file)

        self.data = tweets
        self.classification = classification

    def initial_fit(self):
        """Initializes the vectorizer by doing a fit and then a transform.

        Returns:
          A tuple (classification_vector, feature_vector).
        """
        # We map the sentiments to the values specified in the SENTIMENT_MAP.
        # For any sentiment that is not part of the map we give a value 0.
        # A list (not a lazy map object) keeps numpy.array working on
        # Python 3 as well.
        classification_vector = numpy.array(
            [SENTIMENT_MAP.get(s.lower(), 0) for s in self.classification])

        feature_vector = self.vectorizer.fit_transform(self.data)

        return (classification_vector, feature_vector)

    def build_word_dict(self):
        """Builds the sentiment dictionary and a vector of weights for tweets.

        Reads the tab-separated ``AFINN-96.txt`` and ``AFINN-111.txt``
        lexicons (entries of the later file override the earlier one).

        Returns:
          A list with one integer per tweet in self.data: the sum of the
          lexicon scores of its whitespace-separated words.
        """
        word_scores = {}
        for lexicon_name in ('AFINN-96.txt', 'AFINN-111.txt'):
            lexicon_path = os.path.join(datasettings.DATA_DIRECTORY,
                                        lexicon_name)
            with open(lexicon_path) as lexicon:
                for line in lexicon:
                    fields = line.split('\t')
                    if len(fields) < 2:
                        # Skip blank or malformed lines instead of crashing.
                        continue
                    word_scores[fields[0]] = int(fields[1])

        # dict.get is a constant-time lookup; unknown words score 0.
        return [sum(word_scores.get(word, 0) for word in tweet.split())
                for tweet in self.data]

    def transform(self, test_data):
        """Performs the transform using the already initialized vectorizer.

        Returns:
          The feature vector produced by the vectorizer for test_data.
        """
        return self.vectorizer.transform(test_data)

    def score_func(self, true, predicted):
        """Score function for the validation."""
        return metrics.precision_recall_fscore_support(
            true, predicted,
            pos_label=[
                SENTIMENT_MAP['positive'],
                SENTIMENT_MAP['negative'],
                SENTIMENT_MAP['neutral'],
                ],
            average='macro')

    @staticmethod
    def _majority_vote(predictions):
        """Returns the label predicted by more than half of the classifiers.

        Without a strict majority the first classifier's prediction wins.
        """
        for label in predictions:
            if predictions.count(label) * 2 > len(predictions):
                return label
        return predictions[0]

    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Requires train_and_validate() to have set up the feature and
        classification vectors and the three classifiers.

        Args:
          k: The number of folds for cross validation.
        """
        self.scores = []

        _, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(
            k, self.feature_vector, self.classification_vector,
            classifier=True)

        classifiers = (self.classifier1, self.classifier2, self.classifier3)
        for train, test in cv:
            for classifier in classifiers:
                classifier.fit(self.feature_vector[train],
                               self.classification_vector[train])
            per_classifier = [
                classifier.predict(self.feature_vector[test])
                for classifier in classifiers]

            # One tuple of votes per test sample.
            classification = numpy.array(
                [self._majority_vote(votes)
                 for votes in zip(*per_classifier)])

            self.scores.append(self.score_func(y[test], classification))

    def train_and_validate(self, cross_validate=False, mean=False,
                           serialize=False):
        """Trains the classifiers and optionally cross validates them.

        Args:
          cross_validate: Falsy to fit on all data; otherwise passed on as
              the number of folds ``k`` to cross_validate().
          mean: Unused; kept for interface compatibility.
          serialize: If true, pickles the three classifiers and the
              vectorizer into the data directory.

        Returns:
          self.scores (None unless cross validation was run).
        """
        self.classification_vector, self.feature_vector = self.initial_fit()

        self.classifier1 = naive_bayes.MultinomialNB()
        self.classifier2 = naive_bayes.BernoulliNB()
        self.classifier3 = svm.LinearSVC(loss='l2', penalty='l1',
                                         C=1000, dual=False, tol=1e-3)

        if cross_validate:
            self.cross_validate(k=cross_validate)
        else:
            for classifier in (self.classifier1, self.classifier2,
                               self.classifier3):
                classifier.fit(self.feature_vector,
                               self.classification_vector)

        if serialize:
            # `with` guarantees the pickles are flushed and closed.
            classifiers_path = os.path.join(
                datasettings.DATA_DIRECTORY, 'classifiers.pickle')
            with open(classifiers_path, 'wb') as classifiers_file:
                cPickle.dump([self.classifier1,
                              self.classifier2,
                              self.classifier3], classifiers_file)

            vectorizer_path = os.path.join(
                datasettings.DATA_DIRECTORY, 'vectorizer.pickle')
            with open(vectorizer_path, 'wb') as vectorizer_file:
                cPickle.dump(self.vectorizer, vectorizer_file)

        return self.scores

    def build_ui(self, mean=False):
        """Prints out all the scores calculated.

        Prints nothing when no cross validation has been run yet.
        """
        # Single-argument print(...) behaves the same on Python 2 and 3.
        for i, score in enumerate(self.scores or []):
            print("Cross Validation: %d" % (i + 1))
            print("*" * 40)
            if mean:
                print("Mean Accuracy: %f" % (score))
            else:
                print("Precision\tRecall\t\tF-Score")
                print("~~~~~~~~~\t~~~~~~\t\t~~~~~~~")
                precision = score[0]
                recall = score[1]
                f_score = score[2]
                print("%f\t%f\t%f" % (precision, recall, f_score))

            print("")
class CRF:
    """Class for training and predicting CRF based named entity recognition.

    main API functions are:
        CRF.fit(): training using a perceptron update rule
        CRF.inference(): prediction using the viterbi algorithm
        save_weights / load_weights: persist weights and vectorizer
    """

    def __init__(self):
        # vectorizer class
        # based on composition instead of inheritance principles
        self.vectorizer = Vectorizer()

        # weights learned and used by model
        self.weights = np.array([])

        # list of (index, tag) pairs and its inverse tag -> index mapping
        self.tag_enums = []
        self.tag_dict = {}

    def fit(self, file_name, iterations=5):
        """ Wrapper function for initializing and training CRF model

        params:
            file_name: training data file in GermEval format
            iterations: number of iterations. default to 5"""

        tags, tokens = self.read_file(file_name)

        # build essential indices maps for the vectorizer
        self.vectorizer.build_tag_map(tags)
        self.vectorizer.build_word_map(tokens)
        self.vectorizer.build_name_list("data/first_names.txt")

        ### EDIT HERE TO ADD FUNCTIONS ###
        # add feature functions here
        # descriptions in vectorizer file
        n_words = len(self.vectorizer.word_map)
        n_tags = len(self.vectorizer.tag_map)
        self.vectorizer.add_feature(
            "word", self.vectorizer.sparse_feat_word_in, n_words)
        self.vectorizer.add_feature(
            "prev word", self.vectorizer.sparse_feat_prev_word, n_words)
        self.vectorizer.add_feature(
            "word tag", self.vectorizer.sparse_feat_word_and_tag,
            n_words * n_tags, True)
        self.vectorizer.add_feature(
            "DE name gazetter", self.vectorizer.sparse_feat_in_names, 2)
        self.vectorizer.add_feature(
            "Caps", self.vectorizer.sparse_feat_is_all_cap, 2)
        self.vectorizer.add_feature(
            "hyphenated", self.vectorizer.sparse_feat_hyphenated, 2)

        # tag transitions must be added last so that the Viterbi can know
        # where to look
        self.vectorizer.add_feature(
            "prev tag", self.vectorizer.sparse_feat_prev_tag,
            n_tags * n_tags, True)

        # fit vectorizer
        self.vectorizer.fit(tokens, tags)

        # initialize weights
        self.tag_enums = list(enumerate(self.vectorizer.tag_list))
        self.tag_dict = {word: idx for idx, word in self.tag_enums}
        self.initialize_weights(self.vectorizer.vector_size + 1)

        # perceptron train
        self.train(tokens, tags, iterations)
        print("Classifier Fitted")

    def predict(self, file_name):
        """ Wrapper for viterbi inference.

        Takes filename in GermEval format.
        Returns predicted tag sequence and actual tag sequence, in that
        order, both flattened over all sentences."""

        tags, tokens = self.read_file(file_name)

        predicted_list = []
        actual_list = []

        for token_seq, tag_seq in tqdm(list(zip(tokens, tags))):
            predicted_list.extend(self.inference(token_seq))
            actual_list.extend(tag_seq)

        return predicted_list, actual_list

    def read_file(self, fname):
        """ GermEval file parser.

        Lines are tab separated; the second column is read as the token and
        the third as the tag. Lines whose first column is '#' are skipped,
        and a line with fewer than two columns ends the current sentence.

        Returns (tag sequences, token sequences), each a list of lists.
        """
        tag_seq = []
        tok_seq = []
        curr_tok = []
        curr_tag = []

        # explicit encoding: the platform default may not be UTF-8
        with open(fname, 'r', encoding='utf-8') as df:
            for line in df:
                line = line.strip().split("\t")
                if len(line) < 2:
                    # sentence boundary
                    tag_seq.append(curr_tag)
                    tok_seq.append(curr_tok)
                    curr_tok = []
                    curr_tag = []
                elif line[0] != '#':
                    curr_tok.append(line[1])
                    curr_tag.append(line[2])

        # A file without a trailing blank line used to lose its last
        # sentence; flush whatever is still pending.
        if curr_tok:
            tag_seq.append(curr_tag)
            tok_seq.append(curr_tok)

        return tag_seq, tok_seq

    def inference(self, token_seq, feats_list=False, int_tags=False):
        """ Viterbi Algorithm for decoding.

        params:
            token_seq: list of tokens of one sentence
            feats_list: if True, also return the feature vector chosen at
                each position
            int_tags: if True, return tag indices instead of tag names

        Returns a list of predicted tags, or a tuple
        (list of tags, deque of feature vectors) when feats_list is True.
        Both have one entry per token.
        """
        # Empty input: keep the return shape consistent with feats_list so
        # that callers extending a flat list do not pick up stray lists.
        if len(token_seq) < 1:
            return ([], deque()) if feats_list else []

        tag_len = len(self.vectorizer.tag_list)
        seq_len = len(token_seq)

        # initialize viterbi / backpointer / feature charts
        viterbi_chart = np.zeros((seq_len, tag_len))
        bp_chart = np.full((seq_len, tag_len), -1)
        feature_chart = [[{} for j in range(tag_len)]
                         for i in range(seq_len)]

        # initialize first trellis column
        for i, tag in self.tag_enums:
            viterbi_chart[0][i] = self.vectorizer.feature_dot(
                token_seq, tag, 0, self.weights)
            feature_chart[0][i] = self.vectorizer.join_features(
                token_seq, [tag], 0, 0)

        # for each word
        for i in range(1, seq_len):
            # for each state
            for j, tag_1 in self.tag_enums:
                best_val = -1000000000000000000
                idx = -1

                # argmax over the states with a known transition into tag_1
                tag_curr_id = (
                    self.tag_dict[tag_1] * self.vectorizer.tag_sentinel
                    + self.vectorizer.partitions[-1])
                for tag_2 in self.vectorizer.tag2tag[tag_1]:
                    ind_tag_2 = self.tag_dict[tag_2]
                    ind = tag_curr_id + ind_tag_2
                    vs = viterbi_chart[i - 1][ind_tag_2] + self.weights[ind]
                    if vs > best_val:
                        best_val = vs
                        idx = ind_tag_2

                # update charts
                # NOTE(review): if no transition beat the initial best_val,
                # idx stays -1 and indexes the last tag -- confirm intended.
                bp_chart[i][j] = idx
                tag_2 = self.tag_enums[idx][1]
                feature = self.vectorizer.feature(token_seq, tag_1, tag_2, i)
                viterbi_chart[i][j] = viterbi_chart[i - 1][idx] + sum(
                    [val * self.weights[k] for k, val in feature.items()])
                feature_chart[i][j] = feature

        # find max and backtrace; deques allow prepending
        best = int(np.argmax(viterbi_chart[seq_len - 1]))
        res = deque([best])
        feat_vs = deque()

        for i in range(seq_len - 1, 0, -1):
            # Record the feature vector of position i exactly once (the
            # last position used to be added twice).
            feat_vs.appendleft(feature_chart[i][best])
            best = int(bp_chart[i][best])
            res.appendleft(best)
        feat_vs.appendleft(feature_chart[0][best])

        if int_tags:
            res = list(res)
        else:
            res = [self.vectorizer.tag_list[w] for w in res]

        if feats_list:
            return res, feat_vs
        return res

    def get_wrong_tags(self, y, tags):
        """Return the positions at which prediction y differs from tags."""
        return [i for i, (predicted, gold) in enumerate(zip(y, tags))
                if predicted != gold]

    def train(self, tok_seq, tag_seq, iters=5, learning_rate=1):
        """ Train CRF model using perceptron updates.

        params:
            tok_seq: list of token sequences
            tag_seq: list of tag sequences aligned with tok_seq
            iters: number of epochs, default 5
            learning_rate: step size handed to the weight update

        After the last epoch the weights are divided by
        (number of sequences * iters).
        """
        # NOTE(review): this is an alias of self.weights, not a copy, so the
        # updates below are presumably applied in place and seen by
        # inference() -- confirm against Vectorizer.add_weights.
        avg_weights = self.weights

        # zip training data to allow it to shuffle
        # convert to list because shuffling won't work otherwise
        new_tags = [[self.vectorizer.tag_map[tg] for tg in tg_list]
                    for tg_list in tag_seq]
        train_data = list(enumerate(zip(tok_seq, new_tags)))
        num_words = sum([len(seq) for seq in tok_seq])
        num_samples = len(tok_seq)

        # pre calculate gold vectors
        gold_data = self.vectorizer.transform(tok_seq, tag_seq)

        for epoch in range(iters):
            shuffle(train_data)
            print("starting epoch:", epoch + 1)
            wrong = 0
            for idx, (tokens, tags) in tqdm(train_data):
                y, y_feats = self.inference(tokens,
                                            feats_list=True,
                                            int_tags=True)
                wrong_tags = self.get_wrong_tags(y, tags)

                # update only on the wrongly predicted positions
                if len(wrong_tags) > 0:
                    # collect wrong for accuracy displayed after epoch
                    wrong += len(wrong_tags)

                    predicted = fn.reduce(
                        self.vectorizer.sum_features,
                        [y_feats[pos] for pos in wrong_tags])
                    gold = fn.reduce(
                        self.vectorizer.sum_features,
                        [gold_data[idx][pos] for pos in wrong_tags])
                    diff = self.vectorizer.subtract_features(gold, predicted)
                    self.vectorizer.add_weights(avg_weights,
                                                diff,
                                                lr=learning_rate)

            if num_words:
                print("accuracy:", (num_words - wrong) / num_words)

        # scale the weights; skipped when there is nothing to divide by
        total_updates = num_samples * iters
        if total_updates:
            self.weights = avg_weights / total_updates

    def initialize_weights(self, size, fill=0):
        """Set self.weights to a float64 vector of `size` entries of `fill`."""
        # dtype to double just in case
        # but normal float or half could also work
        self.weights = np.full((size, ), fill, dtype=np.float64)

    def save_weights(self, weights_fname, vectorizer_fname):
        """ save to binary.

        arg1: name of weight filename
        arg2: name of vectorizer filename"""
        with open(weights_fname, "wb") as w_file:
            np.save(w_file, self.weights)

        with open(vectorizer_fname, "wb") as vec_file:
            pickle.dump(self.vectorizer, vec_file)

    def load_weights(self, weights_fname, vectorizer_fname):
        """ load from binary.

        arg1: name of weight filename
        arg2: name of vectorizer filename"""
        with open(weights_fname, "rb") as w_file:
            self.weights = np.load(w_file)

        # SECURITY: unpickling executes arbitrary code; only load vectorizer
        # files from a trusted source.
        with open(vectorizer_fname, "rb") as vec_file:
            self.vectorizer = pickle.load(vec_file)
def test_with_nested_CV(folder='model', folds=5, plot=True, steps=None):
    '''
    Evaluates the classifier by doing nested CV
    i.e. keeping 1/folds of the data out of the training and doing training
    (including model selection for regularizer) on the training set and
    testing on the held-out data

    Also prints some stats and figures; the report and the confusion matrix
    are written into folder.

    INPUT
    folder  folder with model files
    folds   number of folds
    plot    set True to save a confusion matrix plot as conf_mat.pdf
    steps   vectorizer steps, defaults to ['hashing', 'tfidf']

    '''
    # start timer
    import time
    if steps is None:
        # build the default per call instead of sharing one mutable list
        steps = ['hashing', 'tfidf']
    t0 = time.time()

    # create bag of words representations
    # NOTE(review): the instance is not used below; the call is kept in case
    # constructing it has side effects -- confirm against Vectorizer.
    Vectorizer(steps=steps)

    # load data
    vec = Vectorizer(folder=folder)
    data = get_speech_text(folder=folder)
    # fix the label order once; it defines the numerical class ids
    labels = list(data.keys())
    for key in labels:
        data[key] = vec.transform(data[key])

    # create numerical labels
    Y = hstack([ones(data[key].shape[0]) * i
                for i, key in enumerate(labels)])

    # create data matrix
    X = vstack([data[key] for key in labels])

    # permute data
    fsize = len(Y) // folds
    randidx = permutation(len(Y))
    Y = Y[randidx]
    X = X[randidx, :]
    idx = reshape(arange(fsize * folds), (folds, fsize))
    # samples beyond fsize * folds are left out of the evaluation
    Y = Y[:fsize * folds]

    # allocate vector for predictions
    predicted = zeros(fsize * folds)

    # the regularization parameters to choose from
    parameters = {'C': (10.**arange(-4, 4, 1.)).tolist()}

    # do nested CV
    for ifold in range(folds):
        testidx = idx[ifold, :]
        trainidx = idx[setdiff1d(arange(folds), ifold), :].flatten()
        text_clf = LogisticRegression(class_weight='auto', dual=True)
        # for nested CV, do folds-1 CV for parameter optimization
        # within inner CV loop and use the outer testfold as held-out data
        # for model validation
        gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1,
                              cv=(folds - 1))
        gs_clf.fit(X[trainidx, :], Y[trainidx])
        predicted[testidx] = gs_clf.predict(X[testidx, :])
        print('************ Fold %d *************' % (ifold + 1))
        print(metrics.classification_report(Y[testidx], predicted[testidx],
                                            target_names=labels))

    t1 = time.time()
    total_time = t1 - t0
    timestr = 'Wallclock time: %f sec\n' % total_time
    dimstr = 'Vocabulary size: %d\n' % X.shape[-1]
    report = timestr + dimstr
    # extract some metrics
    print('********************************')
    print('************ Total *************')
    print('********************************')
    report += metrics.classification_report(Y, predicted,
                                            target_names=labels)
    # dump metrics to file
    suffix = '_'.join(sorted(steps))
    with open(folder + '/report_%s.txt' % suffix, 'w') as report_file:
        report_file.write(report)
    print(report)
    conf_mat = metrics.confusion_matrix(Y, predicted)
    with open(folder + '/conf_mat_%s.txt' % suffix, 'w') as conf_file:
        conf_file.write(json.dumps(conf_mat.tolist()))
    print(conf_mat)

    if plot:
        # print confusion matrix
        import pylab
        # decode byte-string keys only; text keys are used as they are
        tick_labels = [x.decode('utf-8') if isinstance(x, bytes) else x
                       for x in labels]
        # one tick per class instead of a hard-coded 4
        ticks = arange(len(labels))
        pylab.figure(figsize=(16, 16))
        pylab.imshow(conf_mat, interpolation='nearest')
        pylab.colorbar()
        pylab.xticks(ticks, tick_labels)
        pylab.yticks(ticks, tick_labels)
        pylab.xlabel('Predicted')
        pylab.ylabel('True')
        font = {'family': 'normal',
                'size': 30}
        pylab.rc('font', **font)
        pylab.savefig(folder + '/conf_mat.pdf', bbox_inches='tight')
class Classifier:

    def __init__(self, folder='model', train=False):
        '''
        Creates a classifier object
        if no model is found, or train is set True, a new classifier is
        learned

        INPUT
        folder  the root folder with the Bag-of-Word data, where the model
                is stored
        train   set True if you want to train

        '''
        self.folder = folder
        # load Bag-of-Word extractor
        self.bow_vectorizer = Vectorizer(self.folder)
        model_path = self.folder + '/classifier.pickle'
        # if there is no classifier file or training is invoked
        if (not os.path.isfile(model_path)) or train:
            print('Training classifier')
            self.train()
        print('Loading classifier')
        # The pickle is written in binary mode with the highest protocol, so
        # it has to be read in binary mode too.
        # SECURITY: unpickling executes arbitrary code; only load model
        # files from a trusted folder.
        with open(model_path, 'rb') as model_file:
            clfdict = cPickle.load(model_file)
        self.clf = clfdict['classifier']
        self.parties = clfdict['labels']

    def predict(self, text):
        '''
        Uses the Bag-of-Word extractor and the classifier loaded in the
        constructor and applies them to some text.

        INPUT
        text    a string to assign to a party

        Returns a dict with the input text and, per party, the predicted
        probability.
        '''
        # transform string into sparse matrix
        x = self.bow(text)
        # predict probabilities of each party
        probabilities = self.clf.predict_proba(x).flatten()
        # the labels are stored in the order of the numerical class ids,
        # so label i belongs to probability i
        prediction = [
            {'party': self.parties[pidx], 'probability': probabilities[pidx]}
            for pidx in range(len(self.parties))]
        # transform the predictions into json output
        return {'text': text, 'prediction': prediction}

    def bow(self, text):
        '''
        Transforms a string or a list of strings with the Bag-of-Word
        extractor; a single item is wrapped in a list first.
        '''
        if not isinstance(text, list):
            text = [text]
        return self.bow_vectorizer.transform(text)

    def train(self, folds=2):
        '''
        trains a classifier on the bag of word vectors and stores it as
        classifier.pickle in self.folder

        INPUT
        folds   number of cross-validation folds for optimizing the
                regularizer of the classifier

        '''
        try:
            # load the data
            data = get_speech_text(folder=self.folder)
            for key in data:
                data[key] = self.bow(data[key])
        except Exception:
            print('Could not load text data file in %s\n' % self.folder +
                  'Try executing [python downloader.py --download --parse]')
            raise

        # fix the label order once; it defines the numerical class ids
        labels = list(data.keys())
        # create numerical labels for each party
        Y = hstack([ones(data[key].shape[0]) * i
                    for i, key in enumerate(labels)])
        # create the data matrix
        X = vstack([data[key] for key in labels])
        # permute data indices for training
        randidx = permutation(len(Y))
        Y = Y[randidx]
        X = X[randidx, :]
        # the classifier, accounting for unbalanced classes
        text_clf = LogisticRegression(class_weight='auto', dual=True)
        # the regularizer
        parameters = {'C': (10.**arange(-5, 5, 1.)).tolist()}
        # perform gridsearch to get the best regularizer
        gs_clf = GridSearchCV(text_clf, parameters, cv=folds, n_jobs=-1,
                              verbose=2)
        gs_clf.fit(X, Y)
        print("Classifier reached mean %0.2f accuracy with regularizer: %f"
              % (gs_clf.best_score_, gs_clf.best_params_['C']))
        # dump classifier to pickle; `with` makes sure the file is flushed
        # and closed before the constructor reads it back
        with open(self.folder + '/classifier.pickle', 'wb') as model_file:
            cPickle.dump({'classifier': gs_clf, 'labels': labels},
                         model_file, -1)
class Trainer(object):
    """Trains the classifiers with training data and does the cross validation.

    Three classifiers (multinomial naive Bayes, Bernoulli naive Bayes and a
    linear SVC) are trained on the same vectorized data. During cross
    validation their predictions are combined by majority vote.
    """

    def __init__(self):
        """Initializes the datastructures required."""
        # The actual text extraction object (does text to vector mapping).
        self.vectorizer = Vectorizer()

        # A list of already hand classified tweets to train our classifier.
        self.data = None

        # A list containing the classification to each individual tweet
        # in the tweets list.
        self.classification = None

        self.classifier = None
        self.scores = None

    def initialize_training_data(self):
        """Loads the hand-classified tweet corpus into the trainer.

        Only ``full-corpus.csv`` is used. The IMDB review corpora were
        previously parsed here as well, but the results were never used,
        so that work has been dropped.
        """
        corpus_path = os.path.join(datasettings.DATA_DIRECTORY,
                                   'full-corpus.csv')
        # assumes parse_training_corpus reads the file eagerly -- TODO confirm
        with open(corpus_path) as corpus_file:
            classification, tweets = parse_training_corpus(corpus_file)

        self.data = tweets
        self.classification = classification

    def initial_fit(self):
        """Initializes the vectorizer by doing a fit and then a transform.

        Returns:
          A tuple (classification_vector, feature_vector).
        """
        # We map the sentiments to the values specified in the SENTIMENT_MAP.
        # For any sentiment that is not part of the map we give a value 0.
        # A list (not a lazy map object) keeps numpy.array working on
        # Python 3 as well.
        classification_vector = numpy.array(
            [SENTIMENT_MAP.get(s.lower(), 0) for s in self.classification])

        feature_vector = self.vectorizer.fit_transform(self.data)

        return (classification_vector, feature_vector)

    def build_word_dict(self):
        """Builds the sentiment dictionary and a vector of weights for tweets.

        Reads the tab-separated ``AFINN-96.txt`` and ``AFINN-111.txt``
        lexicons (entries of the later file override the earlier one).

        Returns:
          A list with one integer per tweet in self.data: the sum of the
          lexicon scores of its whitespace-separated words.
        """
        word_scores = {}
        for lexicon_name in ('AFINN-96.txt', 'AFINN-111.txt'):
            lexicon_path = os.path.join(datasettings.DATA_DIRECTORY,
                                        lexicon_name)
            with open(lexicon_path) as lexicon:
                for line in lexicon:
                    fields = line.split('\t')
                    if len(fields) < 2:
                        # Skip blank or malformed lines instead of crashing.
                        continue
                    word_scores[fields[0]] = int(fields[1])

        # dict.get is a constant-time lookup; unknown words score 0.
        return [sum(word_scores.get(word, 0) for word in tweet.split())
                for tweet in self.data]

    def transform(self, test_data):
        """Performs the transform using the already initialized vectorizer.

        Returns:
          The feature vector produced by the vectorizer for test_data.
        """
        return self.vectorizer.transform(test_data)

    def score_func(self, true, predicted):
        """Score function for the validation."""
        return metrics.precision_recall_fscore_support(
            true, predicted,
            pos_label=[
                SENTIMENT_MAP['positive'],
                SENTIMENT_MAP['negative'],
                SENTIMENT_MAP['neutral'],
                ],
            average='macro')

    @staticmethod
    def _majority_vote(predictions):
        """Returns the label predicted by more than half of the classifiers.

        Without a strict majority the first classifier's prediction wins.
        """
        for label in predictions:
            if predictions.count(label) * 2 > len(predictions):
                return label
        return predictions[0]

    def cross_validate(self, k=10):
        """Performs a k-fold cross validation of our training data.

        Requires train_and_validate() to have set up the feature and
        classification vectors and the three classifiers.

        Args:
          k: The number of folds for cross validation.
        """
        self.scores = []

        _, y = check_arrays(self.feature_vector,
                            self.classification_vector,
                            sparse_format='csr')
        cv = cross_validation.check_cv(
            k, self.feature_vector, self.classification_vector,
            classifier=True)

        classifiers = (self.classifier1, self.classifier2, self.classifier3)
        for train, test in cv:
            for classifier in classifiers:
                classifier.fit(self.feature_vector[train],
                               self.classification_vector[train])
            per_classifier = [
                classifier.predict(self.feature_vector[test])
                for classifier in classifiers]

            # One tuple of votes per test sample.
            classification = numpy.array(
                [self._majority_vote(votes)
                 for votes in zip(*per_classifier)])

            self.scores.append(self.score_func(y[test], classification))

    def train_and_validate(self, cross_validate=False, mean=False,
                           serialize=False):
        """Trains the classifiers and optionally cross validates them.

        Args:
          cross_validate: Falsy to fit on all data; otherwise passed on as
              the number of folds ``k`` to cross_validate().
          mean: Unused; kept for interface compatibility.
          serialize: If true, pickles the three classifiers and the
              vectorizer into the data directory.

        Returns:
          self.scores (None unless cross validation was run).
        """
        self.classification_vector, self.feature_vector = self.initial_fit()

        self.classifier1 = naive_bayes.MultinomialNB()
        self.classifier2 = naive_bayes.BernoulliNB()
        self.classifier3 = svm.LinearSVC(loss='l2', penalty='l1',
                                         C=1000, dual=False, tol=1e-3)

        if cross_validate:
            self.cross_validate(k=cross_validate)
        else:
            for classifier in (self.classifier1, self.classifier2,
                               self.classifier3):
                classifier.fit(self.feature_vector,
                               self.classification_vector)

        if serialize:
            # `with` guarantees the pickles are flushed and closed.
            classifiers_path = os.path.join(
                datasettings.DATA_DIRECTORY, 'classifiers.pickle')
            with open(classifiers_path, 'wb') as classifiers_file:
                cPickle.dump([self.classifier1,
                              self.classifier2,
                              self.classifier3], classifiers_file)

            vectorizer_path = os.path.join(
                datasettings.DATA_DIRECTORY, 'vectorizer.pickle')
            with open(vectorizer_path, 'wb') as vectorizer_file:
                cPickle.dump(self.vectorizer, vectorizer_file)

        return self.scores

    def build_ui(self, mean=False):
        """Prints out all the scores calculated.

        Prints nothing when no cross validation has been run yet.
        """
        # Single-argument print(...) behaves the same on Python 2 and 3.
        for i, score in enumerate(self.scores or []):
            print("Cross Validation: %d" % (i + 1))
            print("*" * 40)
            if mean:
                print("Mean Accuracy: %f" % (score))
            else:
                print("Precision\tRecall\t\tF-Score")
                print("~~~~~~~~~\t~~~~~~\t\t~~~~~~~")
                precision = score[0]
                recall = score[1]
                f_score = score[2]
                print("%f\t%f\t%f" % (precision, recall, f_score))

            print("")
class Classifier:

    def __init__(self, folder='model', train=False):
        '''
        Creates a classifier object
        if no model is found, or train is set True, a new classifier is
        learned

        INPUT
        folder  the root folder with the Bag-of-Word data, where the model
                is stored
        train   set True if you want to train

        '''
        self.folder = folder
        # load Bag-of-Word extractor
        self.bow_vectorizer = Vectorizer(self.folder)
        model_path = self.folder + '/classifier.pickle'
        # if there is no classifier file or training is invoked
        if (not os.path.isfile(model_path)) or train:
            print('Training classifier')
            self.train()
        print('Loading classifier')
        # The pickle is written in binary mode with the highest protocol, so
        # it has to be read in binary mode too.
        # SECURITY: unpickling executes arbitrary code; only load model
        # files from a trusted folder.
        with open(model_path, 'rb') as model_file:
            clfdict = cPickle.load(model_file)
        self.clf = clfdict['classifier']
        self.parties = clfdict['labels']

    def predict(self, text):
        '''
        Uses the Bag-of-Word extractor and the classifier loaded in the
        constructor and applies them to some text.

        INPUT
        text    a string to assign to a party

        Returns a dict with the input text and, per party, the predicted
        probability.
        '''
        # transform string into sparse matrix
        x = self.bow(text)
        # predict probabilities of each party
        probabilities = self.clf.predict_proba(x).flatten()
        # the labels are stored in the order of the numerical class ids,
        # so label i belongs to probability i
        prediction = [
            {'party': self.parties[pidx], 'probability': probabilities[pidx]}
            for pidx in range(len(self.parties))]
        # transform the predictions into json output
        return {'text': text, 'prediction': prediction}

    def bow(self, text):
        '''
        Transforms a string or a list of strings with the Bag-of-Word
        extractor; a single item is wrapped in a list first.
        '''
        if not isinstance(text, list):
            text = [text]
        return self.bow_vectorizer.transform(text)

    def train(self, folds=2):
        '''
        trains a classifier on the bag of word vectors and stores it as
        classifier.pickle in self.folder

        INPUT
        folds   number of cross-validation folds for optimizing the
                regularizer of the classifier

        '''
        try:
            # load the data
            data = get_speech_text(folder=self.folder)
            for key in data:
                data[key] = self.bow(data[key])
        except Exception:
            print('Could not load text data file in %s\n' % self.folder +
                  'Try executing [python downloader.py --download --parse]')
            raise

        # fix the label order once; it defines the numerical class ids
        labels = list(data.keys())
        # create numerical labels for each party
        Y = hstack([ones(data[key].shape[0]) * i
                    for i, key in enumerate(labels)])
        # create the data matrix
        X = vstack([data[key] for key in labels])
        # permute data indices for training
        randidx = permutation(len(Y))
        Y = Y[randidx]
        X = X[randidx, :]
        # the classifier, accounting for unbalanced classes
        text_clf = LogisticRegression(class_weight='auto', dual=True)
        # the regularizer
        parameters = {'C': (10.**arange(-5, 5, 1.)).tolist()}
        # perform gridsearch to get the best regularizer
        gs_clf = GridSearchCV(text_clf, parameters, cv=folds, n_jobs=-1,
                              verbose=2)
        gs_clf.fit(X, Y)
        print("Classifier reached mean %0.2f accuracy with regularizer: %f"
              % (gs_clf.best_score_, gs_clf.best_params_['C']))
        # dump classifier to pickle; `with` makes sure the file is flushed
        # and closed before the constructor reads it back
        with open(self.folder + '/classifier.pickle', 'wb') as model_file:
            cPickle.dump({'classifier': gs_clf, 'labels': labels},
                         model_file, -1)
def test_with_nested_CV(folder='model', folds=5, plot=True, steps=None):
    '''
    Evaluates the classifier by doing nested CV
    i.e. keeping 1/folds of the data out of the training and doing training
    (including model selection for regularizer) on the training set and
    testing on the held-out data

    Also prints some stats and figures; the report and the confusion matrix
    are written into folder.

    INPUT
    folder  folder with model files
    folds   number of folds
    plot    set True to save a confusion matrix plot as conf_mat.pdf
    steps   vectorizer steps, defaults to ['hashing', 'tfidf']

    '''
    # start timer
    import time
    if steps is None:
        # build the default per call instead of sharing one mutable list
        steps = ['hashing', 'tfidf']
    t0 = time.time()

    # create bag of words representations
    # NOTE(review): the instance is not used below; the call is kept in case
    # constructing it has side effects -- confirm against Vectorizer.
    Vectorizer(steps=steps)

    # load data
    vec = Vectorizer(folder=folder)
    data = get_speech_text(folder=folder)
    # fix the label order once; it defines the numerical class ids
    labels = list(data.keys())
    for key in labels:
        data[key] = vec.transform(data[key])

    # create numerical labels
    Y = hstack([ones(data[key].shape[0]) * i
                for i, key in enumerate(labels)])

    # create data matrix
    X = vstack([data[key] for key in labels])

    # permute data
    fsize = len(Y) // folds
    randidx = permutation(len(Y))
    Y = Y[randidx]
    X = X[randidx, :]
    idx = reshape(arange(fsize * folds), (folds, fsize))
    # samples beyond fsize * folds are left out of the evaluation
    Y = Y[:fsize * folds]

    # allocate vector for predictions
    predicted = zeros(fsize * folds)

    # the regularization parameters to choose from
    parameters = {'C': (10.**arange(-4, 4, 1.)).tolist()}

    # do nested CV
    for ifold in range(folds):
        testidx = idx[ifold, :]
        trainidx = idx[setdiff1d(arange(folds), ifold), :].flatten()
        text_clf = LogisticRegression(class_weight='auto', dual=True)
        # for nested CV, do folds-1 CV for parameter optimization
        # within inner CV loop and use the outer testfold as held-out data
        # for model validation
        gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1,
                              cv=(folds - 1))
        gs_clf.fit(X[trainidx, :], Y[trainidx])
        predicted[testidx] = gs_clf.predict(X[testidx, :])
        print('************ Fold %d *************' % (ifold + 1))
        print(metrics.classification_report(Y[testidx], predicted[testidx],
                                            target_names=labels))

    t1 = time.time()
    total_time = t1 - t0
    timestr = 'Wallclock time: %f sec\n' % total_time
    dimstr = 'Vocabulary size: %d\n' % X.shape[-1]
    report = timestr + dimstr
    # extract some metrics
    print('********************************')
    print('************ Total *************')
    print('********************************')
    report += metrics.classification_report(Y, predicted,
                                            target_names=labels)
    # dump metrics to file
    suffix = '_'.join(sorted(steps))
    with open(folder + '/report_%s.txt' % suffix, 'w') as report_file:
        report_file.write(report)
    print(report)
    conf_mat = metrics.confusion_matrix(Y, predicted)
    with open(folder + '/conf_mat_%s.txt' % suffix, 'w') as conf_file:
        conf_file.write(json.dumps(conf_mat.tolist()))
    print(conf_mat)

    if plot:
        # print confusion matrix
        import pylab
        # decode byte-string keys only; text keys are used as they are
        tick_labels = [x.decode('utf-8') if isinstance(x, bytes) else x
                       for x in labels]
        # one tick per class instead of a hard-coded 4
        ticks = arange(len(labels))
        pylab.figure(figsize=(16, 16))
        pylab.imshow(conf_mat, interpolation='nearest')
        pylab.colorbar()
        pylab.xticks(ticks, tick_labels)
        pylab.yticks(ticks, tick_labels)
        pylab.xlabel('Predicted')
        pylab.ylabel('True')
        font = {'family': 'normal',
                'size': 30}
        pylab.rc('font', **font)
        pylab.savefig(folder + '/conf_mat.pdf', bbox_inches='tight')