Code Example #1
    def run_experiment(self):
        path = '/home/hmayun/PycharmProjects/create-dataset-r/segment-level-7-categories/'
        database = 'mmhsct'
        # database = 'srft'

        for random_state in self.random_states:
            training_file = path + database + '_segments_train_' + str(
                random_state) + '.csv'
            test_file = path + database + '_segments_test_' + str(
                random_state) + '.csv'

            training_data = self.utilities.read_from_csv(training_file)
            X_train = []
            y_train = []

            for row in training_data:
                X_train.append(row[0])
                y_train.append(row[1])

            test_data = self.utilities.read_from_csv(test_file)
            X_test = []
            y_test = []

            for row in test_data:
                X_test.append(row[0])
                y_test.append(row[1])

            model = self.aspect_classifier.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            print(clsr(y_test, y_pred, digits=4))
            print(accuracy_score(y_test, y_pred))
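A note on the shared aliases: across these examples `clsr` is evidently sklearn's classification_report (the digits and target_names keyword arguments match that signature), `tts` is train_test_split, and `acc`/`cm` appear to be accuracy_score/confusion_matrix. A minimal sketch of the imports the snippets assume, with alias names inferred from usage:

# Hedged sketch: import aliases these examples appear to assume.
from sklearn.metrics import classification_report as clsr
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import confusion_matrix as cm
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import LabelEncoder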
Code Example #2
File: lib.py Project: RisenAgain/sms-classification
def with_wv(model, Xtr, ytr, Xte, yte):
    """
    Experimental method to test the word vector feature
    """
    word_vectors = KeyedVectors.load_word2vec_format(
        '~/GoogleNews-vectors-negative300.bin', binary=True)  # C binary format
    prep = NLTKPreprocessor(stem=False)
    vect = TfidfVectorizer(preprocessor=prep,
                           lowercase=True,
                           stop_words='english',
                           ngram_range=(1, 2))
    X_tr_mat = vect.fit_transform(Xtr["Message"])
    X_te_mat = vect.transform(Xte["Message"])
    vocab = vect.get_feature_names()
    imp_words = list(zip(*analysis(model, Xte, yte)))
    #imp_indices = list(imp_words[2])
    #vocab = np.array(vocab)
    words = list(imp_words[0])
    imp_names = extract_full_words(prep, words, vocab)
    X_tr_mat = model.named_steps['vect'].transform(Xtr["Message"])
    X_te_mat = model.named_steps['vect'].transform(Xte["Message"])

    vectors_train = generate_word_vectors_avg(word_vectors, imp_names,
                                              Xtr["Message"])
    vectors_test = generate_word_vectors_avg(word_vectors, imp_names,
                                             Xte["Message"])

    X_tr_mat = sp.sparse.hstack((X_tr_mat, vectors_train), format='csr')
    X_te_mat = sp.sparse.hstack((X_te_mat, vectors_test), format='csr')
    cls = model.named_steps['classif']
    cls.fit(X_tr_mat, ytr)
    y_pred = (cls.predict(X_te_mat))
    print(clsr(yte, y_pred))
Code Example #3
 def evaluate_on_test_split(self, X_test, y_test, verbose=True):
     if verbose:
         print("Classification Report:\n")
     y_pred = self.model.predict(X_test)
     print(clsr(y_test, y_pred))
     score = accuracy_score(y_test, y_pred)
     return y_pred, score
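For reference, the report-then-score pattern in this example can be run standalone on toy data (the labels below are made up for illustration):

from sklearn.metrics import classification_report, accuracy_score

y_test = ['pos', 'neg', 'pos', 'neg']
y_pred = ['pos', 'pos', 'pos', 'neg']
print(classification_report(y_test, y_pred))  # per-class precision/recall/F1/support
print(accuracy_score(y_test, y_pred))         # 0.75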
Code Example #4
def buildnEvaluateModel(X, y):
    '''
    Takes the training data, splits it further into training and
    cross-validation sets, and returns the fitted model.
    '''
    # Split the training data to hold out 20% as a cross-validation set
    # for model evaluation
    X_train, X_cv, y_train, y_cv = tts(X, y, test_size=0.2)

    # Convert the float-valued labels into booleans
    y_train = [bool(int(i)) for i in y_train]
    y_cv = [bool(int(i)) for i in y_cv]

    # Fit a label encoder so class names are available for the report
    labels = LabelEncoder()
    labels.fit_transform(y_train)

    # define classification model
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SVC(kernel='linear', probability=True)),
    ])

    # Train the model
    text_clf = text_clf.fit(X_train, y_train)
    '''
    Following section evaluates the model performance
    '''
    predicted = text_clf.predict(X_cv)
    print("Model Accuracy = " + str(np.mean(predicted == y_cv)))
    print(clsr(y_cv, predicted,
               target_names=[str(i) for i in labels.classes_]))

    return text_clf
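The CountVectorizer + TfidfTransformer pair in the pipeline above is exactly what TfidfVectorizer combines into one step, so an equivalent, more compact pipeline would be:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),  # == CountVectorizer followed by TfidfTransformer
    ('clf', SVC(kernel='linear', probability=True)),
])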
Code Example #5
def build_and_evaluate(X_train, X_test, y_train, y_test, outpath=None):
    # Create tokenizer
    tokenizer = BertTokenizer()
    tokenizer.setup()

    # SVM Classifier
    svm = Sklearn_SVM()
    svm = svm.setup()

    train_tokenized = tokenizer.process_batch(X_train)

    print("Training model......")
    svm.fit(train_tokenized, y_train)

    print("Predicting test data...")
    test_tokenized = tokenizer.process_batch(X_test)
    y_pred = svm.process_batch(test_tokenized)

    print("Classification Report:\n")
    print(clsr(y_test, y_pred, target_names=sorted(set(y_test))))  # class names in sorted label order

    if outpath:
        with open(outpath, "wb") as f:
            pickle.dump(svm, f)

        print("Model written out to {}".format(outpath))

    return svm
Code Example #6
def build_and_evaluate(X, y, classifier=svm.SVC, verbose=True):
    def build(classifier, X, y=None):
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            (
                'union',
                FeatureUnion(transformer_list=[
                    (
                        'bag_words',
                        Pipeline([
                            ('preprocessor', NLTKPreprocessor()),
                            #('tfidf', TfidfVectorizer(ngram_range=(1, 2), tokenizer=identity, preprocessor=None, lowercase=False)),
                            #('tfidf', TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, max_df=0.5, stop_words='english')),
                            (
                                'topics_and_ngrams',
                                FeatureUnion(transformer_list=[
                                    ('grams',
                                     Pipeline([(
                                         'ngram',
                                         TfidfVectorizer(ngram_range=(1, 2),
                                                         tokenizer=identity,
                                                         preprocessor=None,
                                                         lowercase=False)
                                     ), ('best',
                                         TruncatedSVD(n_components=50))])),
                                    #('topics', Pipeline([
                                    #	('tfid', TfidfVectorizer(ngram_range=(1, 1), tokenizer=identity, preprocessor=None, lowercase=False)),
                                    #	('topic', NMF(n_components=9, random_state=1,
                                    #	alpha=.1, l1_ratio=.5)),
                                    #	])),
                                ])),
                        ])),
                    # add other features here as an element in transformer list
                    ('capitalize',
                     Pipeline([('cap_words', CaptilizationExtractor())])),
                    ('punctuation', PuncuationExtractor())
                    #('emotion', Pipeline([
                    # ('emotion_words', EmotionExtractor())
                    #]))
                ])),
            ('svc', svm.SVC()),  # note: a fresh SVC is used here rather than the classifier argument
        ])
        model.fit(X, y)
        return model

    labels = LabelEncoder()
    y = labels.fit_transform(y)

    if verbose: print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        print("Classification Report:\n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred))
Code Example #7
def getTrainingErrors():
		filename=OUTPUT_DIR+'annotatedThreads.pickle'
		if os.path.exists(filename):
			print ("%s Reading already annotated posts from disk"%(datetime.now().strftime(tsFormat)))
			annotated = pickle.load(open(filename, 'rb'))
		else:
			print ("%s ERROR. Annotation file not foud"%(datetime.now().strftime(tsFormat)))
			exit()
		print("%s Predicting annotated"%datetime.now().strftime(tsFormat))
		X_test = list(annotated.keys())
		y_test = list(annotated.values())
		y_pred = model.predict(X_test)
		labels = model.labels_
		y_test = labels.transform(y_test)  # the encoder was already fitted at training time
		print (clsr(y_test, y_pred, target_names=labels.classes_))
		tn, fp, fn, tp= confusion_matrix(y_test,y_pred).ravel()
		print (" ---------------------")
		print ("        Predicted")
		print ("          'o' \t'p'")
		print ("        --------------")
		print ("Real 'o' | %s \t%s"%(tn,fp))
		print ("     'p' | %s \t%s"%(fn,tp))
		print()
		print ("FPR:%.3f"%(float(fp)/(fp+tp)))
		print ("TPR:%.3f"%(float(tp)/(tp+fn)))
		print ("ACC:%.3f"%((float(tp)+tn)/(tp+fn+tn+fp)))
Code Example #8
File: sentiment_training.py Project: BIotBot/BlotBot
def build_and_eval(chat_log_file, model_path, classifier=None):
    if classifier is None:
        classifier = SGDClassifier()

    print("preprocessing corpus")
    X, y = load_corpus(chat_log_file)

    vectorizer = create_vectorizer()
    vectorizer.fit(X)
    X = vectorizer.transform(X, copy=False)

    labels = LabelEncoder()
    y = labels.fit_transform(y)

    print("training test model")
    y_pred = cross_val_predict(classifier, X, y, cv=5)
    with open('data/y_pred.pkl', 'wb') as pred_file:
        pickle.dump(y_pred, pred_file)

    print(clsr(y, y_pred))

    print("training final model")
    model = classifier.fit(X, y)
    model.labels_ = labels

    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)

    return model
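The cross_val_predict call above yields out-of-fold predictions, so the classification report is computed on labels no fold's model was trained on. A self-contained sketch of the same idea on synthetic data:

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict

X, y = make_classification(n_samples=200, random_state=0)
y_pred = cross_val_predict(SGDClassifier(), X, y, cv=5)  # each prediction comes from a model that never saw that sample
print(classification_report(y, y_pred))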
Code Example #9
def build_and_evaluate(X,
                       y,
                       classifier=SGDClassifier,
                       outpath=None,
                       verbose=True):
    # @timeit
    def build(classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', classifier),
        ])

        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    secs = time()

    # Begin evaluation
    if verbose: print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(time() - secs))
        print("Classification Report:\n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    secs = time()
    if verbose:
        print("Building complete model and saving ...")
    model = build(classifier, X, y)
    model.labels_ = labels

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(time() - secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model
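Several of these pipelines pass tokenizer=identity with lowercase=False to TfidfVectorizer, which works because the preceding preprocessor step already emits a token list per document. identity is never defined in the snippets; a plausible definition consistent with that usage:

def identity(words):
    # Pass tokens through unchanged; the preprocessor step has already
    # tokenized (and typically lemmatized) each document.
    return words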
Code Example #10
File: lib.py Project: RisenAgain/sms-classification
def analysis(model, X_test, y_test):
    clf = model.named_steps['classif']
    voc = model.named_steps['vect'].vocabulary_

    y_pred = model.predict(X_test["Message"])
    print(confusion_matrix(y_test, y_pred))
    print(clsr(y_test, y_pred))

    sr = feature_importances(clf, voc)
    if sr is not None:
        print("top and bottom 7 features")
        print(sr)
        return sr
Code Example #11
def build_and_save_model(X, y, filepath):
    """
    This function does the following:
    - Build a classifier (SGD)
    - Fit our data to the classifier
    - Run cross validation to test the accuracy of our model
    """
    def build(classifier, X, y=None):
        """
        Build a model based on our process, a vectorizer and a linear classifier
        """
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            ('preprocessor', DataPreProcessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', classifier),
        ])

        model.fit(X, y)  # Fit the model to our data
        return model

    # Label encode the classes we chose
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Split data into train/test
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.1)
    model = build(SGDClassifier, X_train, y_train)

    # Predict the results of test data and calculate accuracy
    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    model.labels_ = labels

    with open(filepath, 'wb') as f:
        pickle.dump(model, f)

    return model
Code Example #12
    def evaluate_classifier(self, classifier, X, y):

        # Begin evaluation
        X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=5)
        model = classifier.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # *** save info for error analysis
        errors = []
        for index in range(len(X_test)):
            if y_test[index] != y_pred[index]:
                errors.append('"{}","{}","{}"'.format(X_test[index], y_test[index], y_pred[index]))

        str_out = "\n".join(errors)
        self.utilities.write_content_to_file('aspect_errors.csv', str_out)


        print(clsr(y_test, y_pred))
Code Example #13
def build_model(X, y, classifier, verbose=True):
    @timeit
    def build(classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', classifier),
        ])

        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if verbose: print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model, secs = build(classifier, X_train, y_train)

    if verbose: print("Evaluation model fit in {:0.3f} seconds".format(secs))
    if verbose: print("Classification Report:\n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    if verbose: print("Building complete model and saving ...")
    model, secs = build(classifier, X, y)
    model.labels_ = labels.inverse_transform(model.classes_)

    if verbose: print("Complete model fit in {:0.3f} seconds".format(secs))

    return model
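build is wrapped with @timeit here and unpacked as model, secs = build(...), so the decorator evidently returns both the result and the elapsed time. The decorator itself is not shown in these snippets; a sketch consistent with that calling convention:

import time
from functools import wraps

def timeit(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        return result, time.time() - start  # caller unpacks (result, seconds)
    return wrapper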
Code Example #14
def build_and_evaluate(text, leanings, classifier=SGDClassifier, verbose=True):
    def build(classifier, X, y=None):
        if isinstance(classifier, type):
            classifier = classifier()
        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', classifier),
        ])
        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    leanings = labels.fit_transform(leanings)

    # Build model on training data.
    text_train, text_test, leanings_train, leanings_test = tts(text,
                                                               leanings,
                                                               test_size=0.2)
    #build(classifier, text_train, leanings_train)

    model = build(classifier, text_train, leanings_train)

    leanings_pred = model.predict(text_test)
    leanings_pred_prob = model.predict_proba(text_test)  # requires a probabilistic loss, e.g. SGDClassifier(loss='log')
    print(clsr(leanings_test, leanings_pred, target_names=labels.classes_))

    # Build model on all data.
    model = build(classifier, text, leanings)
    model.labels_ = labels

    return leanings_test, leanings_pred_prob, model
Code Example #15
    def build(classifier, X, y=None, export=False):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            classifier = classifier()

        features = []
        tfidf = Pipeline([('preprocessor', NLTKPreprocessor()),
                          ('vectorizer',
                           TfidfVectorizer(tokenizer=identity,
                                           preprocessor=None,
                                           lowercase=False))])
        features.append(('tfidf', tfidf))

        abstract = Pipeline([
            ('abstract_feature', AbstractStats()),
            ('vectorizer',
             DictVectorizer()),  # list of dicts -> feature matrix
        ])
        features.append(('abstract', abstract))
        feature_union = FeatureUnion(features)
        feature_extractor = Pipeline([
            ('feature_union', feature_union),
        ])
        # Label encode the targets
        X = feature_extractor.fit_transform(X)
        labels = LabelEncoder()
        y = labels.fit_transform(y)

        overall_accuracy = 0
        overall_f1 = 0
        overall_precision = 0
        overall_recall = 0

        best_accuracy = 0
        best_model = None
        best_y_test = None
        best_y_pred = None

        stats = {
            'accuracy': [],
            'f1': [],
            'precision': [],
            'recall': [],
        }
        tests = numtests
        if export:
            tests = 1
        scaler = StandardScaler(with_mean=False)
        X = scaler.fit_transform(X)
        for i in range(tests):
            X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
            if classifier.__class__.__name__ == 'GaussianNB':
                classifier.fit(X_train.toarray(), y_train)
                y_pred = classifier.predict(X_test.toarray())
            else:
                classifier.fit(X_train, y_train)
                y_pred = classifier.predict(X_test)

            accuracy = metrics.accuracy_score(y_test, y_pred)
            f1 = metrics.f1_score(y_test, y_pred)
            precision = metrics.precision_score(y_test, y_pred)
            recall = metrics.recall_score(y_test, y_pred)

            stats['accuracy'].append(accuracy)
            stats['f1'].append(f1)
            stats['precision'].append(precision)
            stats['recall'].append(recall)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = classifier
                best_y_test = y_test
                best_y_pred = y_pred

        stats['avg_accuracy'] = sum(stats['accuracy']) / len(stats['accuracy'])
        stats['avg_f1'] = sum(stats['f1']) / len(stats['f1'])
        stats['avg_precision'] = sum(stats['precision']) / len(
            stats['precision'])
        stats['avg_recall'] = sum(stats['recall']) / len(stats['recall'])

        print('****** Results after', tests, 'iterations ******')
        print('accuracy'.ljust(15), stats['avg_accuracy'])
        print('f1'.ljust(15), stats['avg_f1'])
        print('precision'.ljust(15), stats['avg_precision'])
        print('recall'.ljust(15), stats['avg_recall'])

        if verbose:
            print("Classification Report:\n")
            print(clsr(best_y_test, best_y_pred, target_names=labels.classes_))

        return (best_model, labels, feature_extractor, stats)
Code Example #16
def train_report(model, P, N):
    print("Classification Report (on training, not on test data!):\n")
    y_pred = model.predict(np.concatenate((P, N)))
    print(clsr([1. for _ in P] + [0. for _ in N], y_pred))
    return
Code Example #17
def eval_model(model, X, y):
    if X is not None and y is not None:
        y_pred = np.round(model.predict(X))
        print("Accuracy:", accuracy_score(y, y_pred))
        print(clsr(y, y_pred))
    return
Code Example #18
# utilities.viewTable(features_df)

X = features_df.to_numpy().astype(float)

skf = StratifiedKFold(n_splits=5, shuffle=True)

skf.get_n_splits(X, y)

# Evaluate with 5-fold stratified cross-validation

for train_index, test_index in skf.split(X, y):

    X_train, X_test = X[train_index], X[test_index]

    y_train, y_test = y[train_index], y[test_index]

    modelRF = RandomForestClassifier(n_estimators=2000,
                                     max_depth=5,
                                     class_weight="balanced",
                                     n_jobs=16)
    modelRF.fit(X_train, y_train)

    y_pred = modelRF.predict(X_test)
    y_predTr = modelRF.predict(X_train)

    print(clsr(y_test, y_pred))
    print(clsr(y_train, y_predTr))
    print(cross_val_score(modelRF, X_test, y_test, scoring='f1'))
    print(cohen_kappa_score(y_pred, y_test))
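The manual StratifiedKFold loop above can also be driven by sklearn directly, reusing the same skf splits (a hypothetical continuation of the snippet's X, y):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

modelRF = RandomForestClassifier(n_estimators=2000, max_depth=5,
                                 class_weight="balanced", n_jobs=16)
print(cross_val_score(modelRF, X, y, cv=skf, scoring='f1'))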
Code Example #19
# Remove NaN entries that may be present

features_df = clean_dataset(features_df)
# Create the X and y arrays

y = features_df['Outcome'].to_numpy().astype(float)

print(len(features_df.columns))

# Remove the Outcome from the feature data
del features_df['Outcome']

print(len(features_df.columns))
X = features_df.to_numpy().astype(float)

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

modelRF = joblib.load("trainedSte3Model.pkl")

y_pred = modelRF.predict(X_test)

print(y_pred)

print(clsr(y_test, y_pred))
Code Example #20
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True, test_size=0.2):
	"""
	Builds a classifier for the given list of threads and targets. It uses a union of four
	feature extractions, namely tf-idf for heading and post content (using the NLTK preprocessor),
	thread metadata and content metadata.

	X: a list of integers corresponding to threadIDs
	y: a list or iterable of labels, which will be label encoded.

	Can specify the classifier to build with: if a class is specified then
	this will build the model with the Scikit-Learn defaults, if an instance
	is given, then it will be used directly in the build pipeline.

	If outpath is given, this function will write the model as a pickle.
	If verbose, this function will print out information to the command line.
	"""

	@timeit
	def build(classifier, X, y=None):
		"""
		Inner build function that builds a single model.
		"""
		if isinstance(classifier, type):
			classifier = classifier()


		model = Pipeline([
			# Extract the heading,content and numPosts
			('features', FeatureExtractor()),

			# Use FeatureUnion to combine the different features
			('union', FeatureUnion(
				transformer_list=[

					# Pipeline for pulling bag-of-words from the thread's heading, after tokenizing and lemmatizing
					# Then, it applies TF-IDF vectorization on the BoW (it can also be truncated using SVD if needed)
					('heading', Pipeline([
						('selector', ItemSelector(key='numPosts_heading')),
						('preprocessor', NLTKPreprocessor()),
						('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
						#('featureselector', TruncatedSVD(n_components=20)),
					])),

					# Pipeline for pulling bag-of-words from the thread's first post content, after tokenizing and lemmatizing
					# Then, it applies TF-IDF vectorization on the BoW and truncates using SVD
					('content', Pipeline([
						('selector', ItemSelector(key='content')),
						('preprocessor', NLTKPreprocessor()),
						('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
						('featureselector', TruncatedSVD(n_components=50)),
					])),

					# Pipeline for pulling ad hoc features from the thread's heading
					('threadStats', Pipeline([
						('selector', ItemSelector(key='numPosts_heading')),
						('stats', ThreadStats()),  # returns a list of dicts
						('vect', DictVectorizer()),  # list of dicts -> feature matrix
					])),
					# Pipeline for pulling ad hoc features from thread's first post content
					('postStats', Pipeline([
						('selector', ItemSelector(key='content')),
						('stats', PostsStats()),  # returns a list of dicts
						('vect', DictVectorizer()),  # list of dicts -> feature matrix
					])),

				],

				# weight components in FeatureUnion
				transformer_weights={
					'heading': 0.8,
					'content': 0.5,
					'threadStats': 1.0,
					'postStats': 1.0,
				},
			)),

			# Use a SVC classifier on the combined features
			('svc', SVC(kernel='linear')),
		])

		model.fit(X, y)
		return model

	# Label encode the targets
	labels = LabelEncoder()
	y = labels.fit_transform(y)

	# Begin evaluation
	if verbose: print("%s Building for evaluation"%datetime.now().strftime(tsFormat))
	X_train, X_test, y_train, y_test = tts(X, y, test_size=test_size)
	model, secs = build(classifier, X_train, y_train)
	if verbose: print("%s Evaluation model fit in %0.3f seconds"%(datetime.now().strftime(tsFormat),secs))

	if verbose:
		y_pred = model.predict(X_test)
		print("%s Classification Report:\n"%datetime.now().strftime(tsFormat))
		print(clsr(y_test, y_pred, target_names=labels.classes_))
		tn, fp, fn, tp= confusion_matrix(y_test,y_pred).ravel()
		print ("   CONFUSION MATRIX")
		print (" ---------------------")
		print ("        Predicted")
		print ("         'o' \t'p'")
		print ("        --------------")
		print ("Real 'o' |%s \t%s"%(tn,fp))
		print ("     'p' |%s \t%s"%(fn,tp))
		print ()
		print ("FPR:%.3f"%(float(fp)/(fp+tp)))
		print ("TPR:%.3f"%(float(tp)/(tp+fn)))
		print ("ACC:%.3f"%((float(tp)+tn)/(tp+fn+tn+fp)))
		print("%s Building complete model and saving ..."%datetime.now().strftime(tsFormat))

	model, secs = build(classifier, X, y)
	model.labels_ = labels

	if verbose: print("%s Complete model fit in %0.3f seconds"%(datetime.now().strftime(tsFormat),secs))

	if outpath:
		with open(outpath, 'wb') as f:
			pickle.dump(model, f)

		if verbose: print("%s Model written out to %s"%(datetime.now().strftime(tsFormat),outpath))

	return model
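The tn, fp, fn, tp unpacking above relies on sklearn's convention that confusion_matrix(y_true, y_pred).ravel() returns binary counts in (tn, fp, fn, tp) order. The derived rates, shown with toy counts for illustration:

tn, fp, fn, tp = 50, 5, 10, 35  # made-up values
fpr = fp / (fp + tn)                   # false positive rate
tpr = tp / (tp + fn)                   # true positive rate (recall)
acc = (tp + tn) / (tp + tn + fp + fn)  # accuracy
print("FPR:%.3f TPR:%.3f ACC:%.3f" % (fpr, tpr, acc))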
Code Example #21
def build_and_evaluate(X, y, outpath=None, verbose=True):
    """
    Builds a classifier for the given list of documents and targets in two
    stages: the first does a train/test split and prints a classifier report,
    the second rebuilds the model on the entire corpus and returns it for
    operationalization.
    X: a list or iterable of raw strings, each representing a document.
    y: a list or iterable of labels, which will be label encoded.
    Can specify the classifier to build with: if a class is specified then
    this will build the model with the Scikit-Learn defaults, if an instance
    is given, then it will be used directly in the build pipeline.
    If outpath is given, this function will write the model as a pickle.
    If verbose, this function will print out information to the command line.
    """
    @timeit
    def build(Classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if Classifier == "NB":
            classifier = MultinomialNB()
        elif Classifier == "SVC":
            classifier = SVC()
        elif Classifier == "LSVC":
            classifier = LinearSVC()
        elif Classifier == "LR":
            classifier = LogisticRegression()
        elif Classifier == "NN":
            classifier = MLPClassifier(solver='lbfgs',
                                       alpha=1e-5,
                                       hidden_layer_sizes=(10, 2),
                                       random_state=1)
        else:
            classifier = SGDClassifier()
        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False,
                             min_df=0.01,
                             max_df=0.95)),
            ('classifier', classifier),
        ])
        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if verbose: print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    # pdb.set_trace()
    print("Train size %s" % len(X_train))
    print("Y train size %s" % len(y_train))
    model, secs = build("NN", X_train, y_train)
    #model, secs = build("KK", X_train, y_train)

    if verbose: print("Evaluation model fit in {:0.3f} seconds".format(secs))
    if verbose: print("Classification Report:\n")

    y_pred = model.predict(X_test)
    tot = int((y_pred != y_test).sum())  # count misclassified samples
    print("Number of test samples %s, number of errors %s" % (len(y_pred), tot))
    print(clsr(y_test, y_pred, target_names=labels.classes_))
    accuracy = acc(y_test, y_pred)
    print('Accuracy: {}'.format(accuracy))

    if verbose: print("Building complete model and saving ...")
    print("size of the total data is %s and total label %s" % (len(X), len(y)))
    model, secs = build("NN", X, y)
    #model, secs = build("KK", X, y)
    model.labels_ = labels
    if verbose: print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model
Code Example #22
def build_and_evaluate(balanced,
                       X,
                       y,
                       classifier=LogisticRegression,
                       outpath=None,
                       verbose=True):
    def build(balanced, classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            # classifier = classifier()
            if balanced:
                class_weight = 'balanced'
                # neg_count = 0
                # neu_count = 0
                # pos_count = 0
                # for label in y:
                #     if label == 0:
                #         neg_count += 1
                #     elif label == 1:
                #         neu_count += 1
                #     elif label == 2:
                #         pos_count += 1
                #
                # if(len(set(y))) == 3:
                #     minimum = min(neg_count, neu_count, pos_count)
                #     class_weight = {0: minimum/neg_count, 1: minimum/neu_count, 2: minimum/pos_count}
                # elif (len(set(y))) == 2:
                #     pos_count = neu_count
                #     minimum = min(neg_count, pos_count)
                #     class_weight = {0: minimum/neg_count, 1: minimum/pos_count }
                # print('0:', neg_count, '1:', neu_count, '2:', pos_count)
                # print(class_weight)
            else:
                class_weight = None
            classifier = classifier(multi_class='multinomial',
                                    solver='saga',
                                    class_weight=class_weight)
            # classifier = classifier(max_iter=1000, class_weight = class_weight)
            # classifier = classifier(class_weight=class_weight, C=1)
            # classifier = classifier(class_weight = class_weight)

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            # ('vectorizer', CountVectorizer(tokenizer=identity,preprocessor=None,lowercase=None,ngram_range =(1,2))),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=None,
                             ngram_range=(1, 2))),
            # ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median')),
            # ('feature_selection', SelectPercentile(percentile=50)),
            ('feature_selection',
             SelectPercentile(score_func=chi2, percentile=90)),
            # ('to_dense', DenseTransformer()),
            # ('standardization', StandardScaler(with_mean=False)),
            # ('feature_selection', VarianceThreshold(threshold=(.8 * (1 - .8)))),
            ('classifier', classifier),
        ])
        # parameters = {
        # # 'vectorizer__max_features': [85000,100000,125000,150000]
        # # 'vectorizer__max_df': [0.5,0.6,0.7,0.8]
        # # 'classifier__loss': ['log', 'modified_huber', 'squared_hinge', 'perceptron']
        # 'classifier__multi_class': ['multinomial', 'ovr'],
        # 'classifier__solver': ['newton-cg', 'sag', 'saga', 'lbfgs']
        # }
        # grid = GridSearchCV(model,param_grid=parameters)
        # grid.fit(X,y)
        #
        # print("Best: %f using %s" % (grid.best_score_,
        # grid.best_params_))
        # means = grid.cv_results_['mean_test_score']
        # stds = grid.cv_results_['std_test_score']
        # params = grid.cv_results_['params']
        # for mean, stdev, param in zip(means, stds, params):
        #     print("%f (%f) with: %r" % (mean, stdev, param))
        # return grid
        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)
    model = build(balanced, classifier, X_train, y_train)

    y_pred = model.predict(X_test)
    y_actual = pd.Series(y_test, name='Actual')
    y_predicted = pd.Series(y_pred, name='Predicted')
    df_confusion = pd.crosstab(y_actual,
                               y_predicted,
                               rownames=['Actual'],
                               colnames=['Predicted'],
                               margins=True)

    if verbose:
        print("Confusion Matrix:\n")
    print(df_confusion)

    if verbose:
        print("Classification Report:\n")
    print(clsr(y_test, y_pred, target_names=labels.classes_, digits=4))
    print(accuracy_score(y_test, y_pred) * 100)
    # seed = 7
    # kfold = StratifiedKFold(n_splits=5)
    # scores = cross_val_score(model, X_train, y_train, cv=kfold)
    # print(scores)
    # print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    if verbose:
        print("Building complete model and saving ...")
    model = build(balanced, classifier, X, y)
    model.labels_ = labels

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model
Code Example #23
array = []
label = []
#Load trained model
model_dir = "./"
model_file = "model.pickle"
model_name = model_dir + model_file
with open(model_name, 'rb') as f:
    model = pickle.load(f)

#Load test data
filenames = ['test_BGS.neg', 'test_BGS.pos']
with open(filenames[1], "r") as pos, open(filenames[0]) as neg:  # the .pos file feeds the positive labels
    for line in pos:
        array.append(line)
        label.append(1)
    for line in neg:
        label.append(0)
        array.append(line)
labels = LabelEncoder()
y = labels.fit_transform(label)

y_pred = model.predict(array)
label = np.asarray(label)
tot = int((y_pred != label).sum())  # count misclassified samples
print(clsr(label, y_pred))
accuracy = acc(label, y_pred)
print("Accuracy:{}".format(accuracy))
Code Example #24
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True):
    """
    Builds a classifier for the given list of documents and targets in two
    stages: the first does a train/test split and prints a classifier report,
    the second rebuilds the model on the entire corpus and returns it for
    operationalization.
    X: a list or iterable of raw strings, each representing a document.
    y: a list or iterable of labels, which will be label encoded.
    Can specify the classifier to build with: if a class is specified then
    this will build the model with the Scikit-Learn defaults, if an instance
    is given, then it will be used directly in the build pipeline.
    If outpath is given, this function will write the model as a pickle.
    If verbose, this function will print out information to the command line.
    """

    @timeit
    def build(classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            classifier = classifier()

        #print ('TfidfVectorizer',TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False))

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            ('vectorizer', TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False)),
            ('classifier', classifier),
        ])

        model.fit(X, y)
        print('model created')
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if verbose: print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
    model, secs = build(classifier, X_train, y_train)

    if verbose: print("Evaluation model fit in {:0.3f} seconds".format(secs))
    if verbose: print("Classification Report:\n")

    y_pred = model.predict(X_test)
    #Evaluating the model
    print(clsr(y_test, y_pred, target_names=labels.classes_))

    if verbose: print("Building complete model and saving ...")
    model, secs = build(classifier, X, y)
    model.labels_ = labels

    if verbose: print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model
Code Example #25
def build_and_evaluate(X,
                       y,
                       n=None,
                       classifier=LogisticRegressionCV,
                       outpath=None,
                       verbose=True,
                       multiclass=False):
    """
    Builds a classifier for the given list of documents and targets in two
    stages: the first does a train/test split and prints a classifier report,
    the second rebuilds the model on the entire corpus and returns it for
    operationalization.
    X: a list or iterable of raw strings, each representing a document.
    y: a list or iterable of labels, which will be label encoded.
    Can specify the classifier to build with: if a class is specified then
    this will build the model with the Scikit-Learn defaults, if an instance
    is given, then it will be used directly in the build pipeline.
    If outpath is given, this function will write the model as a pickle.
    If verbose, this function will print out information to the command line.
    """
    @timeit
    def build(classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            if multiclass:
                classifier = classifier(cv=10,
                                        random_state=0,
                                        max_iter=1000,
                                        solver='newton-cg',
                                        multi_class="multinomial")
            else:
                classifier = classifier(cv=10,
                                        random_state=0,
                                        max_iter=1000,
                                        solver='newton-cg')

        # gridsearch_pipe = Pipeline([
        #     # ('preprocessor', TextNormalizer_lemmatize()),
        #     ('vectorizer', TfidfVectorizer(
        #         tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1,2))
        #         ),
        #     ('classifier', classifier)
        # ])

        # maxdf = [0.85, .90]
        # mindf = (4, 3, 2)
        # nfeat = [ 13500, 13600, 13700]
        # ngrams = [(1, 1), (1, 2), (1,3)]
        # param_grid = {'vectorizer__max_df':maxdf, 'vectorizer__min_df':mindf, 'vectorizer__ngram_range':ngrams,
        # 'vectorizer__max_features':nfeat}

        # C = np.logspace(0, 4, 10)
        # penalty = [ 'l1','l2' ]
        # param_grid = {'classifier__C':C, 'classifier__penalty':penalty}
        # grid_search = GridSearchCV(gridsearch_pipe, param_grid, cv=6)
        # grid_search.fit(X, y)
        # best_param = grid_search.best_params_
        # print(best_param)

        # vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False,
        # max_df=best_param['vectorizer__max_df'], min_df=best_param['vectorizer__min_df'],
        # ngram_range=best_param['vectorizer__ngram_range'], max_features=best_param['vectorizer__max_features'])
        # classifier = LogisticRegression( random_state=0, max_iter=1000, penalty=best_param['classifier__penalty'], C=best_param['classifier__C'])

        # vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1,2), max_features=13000,
        #                 max_df=0.85, min_df=2 )
        #form
        vectorizer = TfidfVectorizer(tokenizer=identity,
                                     preprocessor=None,
                                     lowercase=False,
                                     ngram_range=(1, 2),
                                     max_features=13500,
                                     max_df=0.85,
                                     min_df=23)

        model = Pipeline([
            # ('preprocessor', TextNormalizer_lemmatize()),
            ('vectorizer', vectorizer),
            ('classifier', classifier),
        ])

        model.fit(X, y)

        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if n:
        if verbose: print("splitting test and test set by: " + str(n))
        n_samples = len(y)
        indices = np.arange(n_samples)
        X_train, X_test, y_train, y_test, idx_train, idx_test = tts(
            X, y, indices, test_size=n, stratify=y)
        # X_train, X_test, y_train, y_test = X[:n], X[n:], y[:n], y[n:]
        from collections import Counter
        print('y_train', Counter(y_train))

        model, secs = build(classifier, X_train, y_train)
        model.labels_ = labels

        if verbose:
            print("Evaluation model fit in {:0.3f} seconds".format(secs))
        y_pred = model.predict(X_test)

        if verbose: print("Classification Report:\n")
        print(clsr(y_test, y_pred, target_names=labels.classes_))
        print(cm(y_test, y_pred))
        print('acc', accuracy_score(y_test, y_pred))
        print('f1', f1_score(y_test, y_pred, average='weighted'))

        if verbose: print("Evaluation of naive prediction ...")
        y_naive = [0] * len(y_test)
        print(type(y_test))
        print('acc naive', accuracy_score(y_test, y_naive))

    else:
        if verbose: print("Building for evaluation with full set")
        model, secs = build(classifier, X, y)
        model.labels_ = labels
        idx_test = None  # no hold-out split in this branch

        if verbose:
            print("Evaluation model fit in {:0.3f} seconds".format(secs))
        y_pred = model.predict(X)

        if verbose: print("Classification Report:\n")
        print(clsr(y, y_pred, target_names=labels.classes_))
        print(cm(y, y_pred))
        print(accuracy_score(y, y_pred))

        if verbose: print("Evaluation of naive prediction ...")
        y_naive = [0] * len(y)
        print(type(y))
        print('acc naive', accuracy_score(y, y_naive))

    if verbose: print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model, y_pred, idx_test
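Passing the row-index array as a third argument to train_test_split (aliased tts) is what lets this example recover which original rows landed in the test set; all arrays are split with the same permutation. A toy illustration:

import numpy as np
from sklearn.model_selection import train_test_split as tts

X = np.arange(10).reshape(5, 2)
y = np.array([0, 0, 1, 1, 1])
idx = np.arange(len(y))
X_tr, X_te, y_tr, y_te, idx_tr, idx_te = tts(X, y, idx, test_size=0.4, stratify=y)
print(idx_te)  # original row numbers of the held-out samples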
Code Example #26
File: Tweet-Filterv4.py Project: Mottl/Crypto-Tweet
def build_and_evaluate(body, tag, outpath=None):
    def build(body, tag):
        model = Pipeline([
            ('preprocessor', Preprocessor()),
            ('vectorizer',
             TfidfVectorizer(sublinear_tf=True,
                             min_df=5,
                             norm='l2',
                             tokenizer=identity,
                             preprocessor=None,
                             lowercase=False)),
            ('classifier', MultinomialNB()),
        ])
        model.fit(body, tag)
        return model

    print("Encoding labels. . .")
    labels = LabelEncoder()
    tags = labels.fit_transform(tag)

    print("Spitting training and testing data set . . .")
    body_train, body_test, tag_train, tag_test = tts(body, tags)
    print("Building model . . .")
    model = build(body_train, tag_train)

    print("Classification report:\n")
    y_prediction = model.predict(body_test)
    results = clsr(tag_test, y_prediction, target_names=labels.classes_)
    print(results)

    # Let's use cross-validation and GridSearchCV to see if we can improve upon our accuracy.
    # Define the parameters to be fine-tuned
    parameters = {
        'vectorizer__sublinear_tf': (True, False),
        'vectorizer__min_df': (4, 5, 7, 10),
        'classifier__alpha': (0.5, 0.2, 0.1, 0.01, 0.001)
    }

    #Create the GridSearchCV from our existing model
    grid_search = GridSearchCV(model, parameters, cv=5, n_jobs=-1)
    grid_search.fit(body_train, tag_train)
    # GridSearchCV exposes cv_results_, but several best_* attributes aren't included in that dictionary, so let's collect them in our own dict.
    data = {
        "best_estimator": grid_search.best_estimator_,
        "best_score": grid_search.best_score_,
        "best_index": grid_search.best_index_,
        "best_params": grid_search.best_params_,
    }

    # #lets print out the results and take a look at how we're doing
    # print("The best params:")
    # print(data)
    # print("The Cross validation Result:")
    # print(grid_search.cv_results_)

    # #Double check performance with the testing porition of our data set.
    # tag_real, body_predict = tag_test, grid_search.predict(body_test)

    # result = clsr(tag_real,body_predict)
    # print (result)

    # We have the optimal parameters, so let's grab the best estimator (refit on the training split by default) and re-check accuracy on the test set.
    model = grid_search.best_estimator_
    tag_real, body_predict = tag_test, grid_search.predict(body_test)

    result = clsr(tag_real, body_predict)
    print(result)

    return model
Code Example #27
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)


TRAIN_DATA = "C:\\Users\\Shobha Rani\\Desktop\\shobha\\NLP_544\\Project\\Data\\Train"
TEST_DATA = "C:\\Users\\Shobha Rani\\Desktop\\shobha\\NLP_544\\Project\\Data\\Test"

train = load_files(TRAIN_DATA)
X_train = train.data
y_train = train.target

test = load_files(TEST_DATA)
X_test = test.data
y_test = test.target

model = Pipeline([
    ('preprocessor', NLTKPreprocessor()),
    ('vectorizer', TfidfVectorizer(decode_error='ignore', tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1, 2))),
    ('classifier', RandomForestClassifier(n_estimators=100)),
])

model.fit(X_train, y_train)

print("Classification Report:\n")

y_pred = model.predict(X_test)
print(clsr(y_test, y_pred, target_names=test.target_names))

print("COMPLETE")
Code Example #28
def build_and_evaluate(X_train,
                       y_train,
                       X_test,
                       y_test,
                       X_val,
                       y_val,
                       args,
                       classifier=svm.LinearSVC(),
                       parameters=None):
    def build(classifier, X_train, y_train, X_test, y_test):
        """
        Inner build function that builds a single model using only ngrams.
        """
        model = Pipeline([
            ('vect',
             TfidfVectorizer(preprocessor=NLTKPreprocessor(stem=True),
                             lowercase=True,
                             stop_words=None,
                             ngram_range=(1, 2),
                             sublinear_tf=True)),
            ('classif', classifier),
        ])
        if parameters:
            clf = GridSearchCV(model, parameters)
            clf.fit(X_train, y_train)
            means = clf.cv_results_['mean_test_score']
            stds = clf.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds,
                                         clf.cv_results_['params']):
                print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
            print()
            return clf
        else:
            X = np.append(X_train, X_test, axis=0)
            y = np.append(y_train, y_test, axis=0)
            kf = KFold(n_splits=4, shuffle=True)
            for train, test in kf.split(X):
                model.fit(X[train], y[train])
                print(model.score(X[test], y[test]))
            model.fit(X_train, y_train)
            print(model.score(X_test, y_test))
        return model

    labels, y_train, y_test, y_val = generate_labels(y_train, y_test, y_val)
    #labels = LabelEncoder()
    #try:
    #    y_test = labels.fit_transform(y_test)
    #    y_val = labels.fit_transform(y_val)
    #except:
    #    type, value, tb = sys.exc_info()
    #    traceback.print_exc()
    #    pdb.post_mortem(tb)

    train_dependency_relations = pickle.load(
        open(args.data + "train_dependency_rel.p", "rb"))
    test_dependency_relations = pickle.load(
        open(args.data + "test_dependency_rel.p", "rb"))
    train_dependency_tree = pickle.load(
        open(args.data + "train_dependency_tree.p", "rb"))
    test_dependency_tree = pickle.load(
        open(args.data + "test_dependency_tree.p", "rb"))

    Xtr, Xte, ytr, yte = X_train, X_test, y_train, y_test

    # Xtr, ytr, train_dependency_relations, train_dependency_tree = balance_data(Xtr, ytr, train_dependency_relations, train_dependency_tree,labels,sampling = 2)

    prep = NLTKPreprocessor(stem=True)
    vect = TfidfVectorizer(preprocessor=prep,
                           lowercase=True,
                           stop_words=None,
                           ngram_range=(1, 2))
    vect.fit_transform(Xtr["Message"])

    X_tr_mat, f_tr_labels = generate_features(
        vect,
        Xtr["Message"],
        train_dependency_tree,
        train_dependency_relations,
        args.data + 'train',
    )
    # X_te_mat, f_te_labels = generate_features(vect, Xte["Message"], test_dependency_relations, args.data+'tune')
    X_val_mat, f_val_labels = generate_features(
        vect,
        X_val["Message"],
        test_dependency_tree,
        test_dependency_relations,
        args.data + 'test',
    )

    # write train to weka for feature analysis
    if args.to_weka:
        vectors_train = np.array(
            gen_msg_features(Xtr["Message"], args.data + 'train'))
        vectors_test = np.array(
            gen_msg_features(Xte["Message"], args.data + 'tune'))
        vectors_val = np.array(
            gen_msg_features(X_val["Message"], args.data + 'test'))

        header_train = f_tr_labels.tolist()
        header_train.append(labels.classes_)
        sk_to_weka(vectors_train[1], ytr, header_train,\
                   '_'.join(labels.classes_)+'_train_features.arff')
        sk_to_weka(vectors_test[1], y_test, header_train,\
                   '_'.join(labels.classes_)+'_tune_features.arff')
        sk_to_weka(vectors_val[1], y_val, header_train,\
                   '_'.join(labels.classes_)+'_test_features.arff')

    cls = classifier

    if args.gridsearch:
        logger.info("Performing GridSearch on train data")
        clf = GridSearchCV(estimator=cls, param_grid=parameters)
        clf.fit(X_tr_mat, ytr)
        best_parameters = clf.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))

    if args.clf >= 4:
        X_tr_mat = X_tr_mat.toarray()
        X_te_mat = X_te_mat.toarray()  # note: X_te_mat is only defined if the tune-set feature generation above is uncommented
        X_val_mat = X_val_mat.toarray()
    if args.cv != 0:
        cross_validate(cls, X_tr_mat, ytr, args)

    # Three level hierarchy classifiers
    # Classifying between general and non-general
    ytr_gen = np.array(
        [1 if labels.inverse_transform(i) == 'general' else 0 for i in ytr])
    y_val_gen = np.array(
        [1 if labels.inverse_transform(i) == 'general' else 0 for i in y_val])
    print("Classifying general cases : \n\n")
    X_tr_mat_o, ytr_gen_o = imbalance_sampling(X_tr_mat, ytr_gen, 0)
    cls.fit(X_tr_mat_o, ytr_gen_o)
    y_pred_gen = (cls.predict(X_val_mat))
    y_pred_overall = np.array([
        labels.transform(['general'])[0] if i == 1 else -1 for i in y_pred_gen
    ])
    print(confusion_matrix(y_val_gen, y_pred_gen))
    scores = clsr(y_val_gen, y_pred_gen)
    scores = list(map(lambda r: re.sub('\s\s+', '\t', r),\
                                scores.split("\n")))
    #scores[0] = '\t' + scores[0]
    scores[-2] = '\t' + scores[-2]
    scores = '\n'.join(scores)
    print(scores)
    missclassified(X_val["Message"], y_val_gen, y_pred_gen, "general",
                   "nonGeneral")

    # Classifying between (td_event,td_non_event) and (emergency,urgent)
    X_tr_mat_non_gen = X_tr_mat[(
        ytr != labels.transform(['general'])[0]).nonzero()[0], :]
    ytr_non_gen = ytr[ytr != labels.transform(['general'])[0]]
    ytr_non_gen = np.array([
        1 if (labels.inverse_transform(i) == 'td_event'
              or labels.inverse_transform(i) == 'td_non_event') else 0
        for i in ytr_non_gen
    ])

    X_val_mat_non_gen = X_val_mat[(y_pred_gen != 1).nonzero()[0], :]
    y_val_non_gen = y_val[y_pred_gen != 1]
    y_val_non_gen = np.array([
        1 if (labels.inverse_transform(i) == 'td_event'
              or labels.inverse_transform(i) == 'td_non_event') else 0
        for i in y_val_non_gen
    ])
    print("Classifying non-general cases : \n\n")
    X_tr_mat_non_gen_o, ytr_non_gen_o = imbalance_sampling(
        X_tr_mat_non_gen, ytr_non_gen, 0)
    cls.fit(X_tr_mat_non_gen_o, ytr_non_gen_o)
    y_pred_non_gen = (cls.predict(X_val_mat_non_gen))
    print(confusion_matrix(y_val_non_gen, y_pred_non_gen))
    scores = clsr(y_val_non_gen, y_pred_non_gen)
    scores = list(map(lambda r: re.sub('\s\s+', '\t', r),\
                                scores.split("\n")))
    #scores[0] = '\t' + scores[0]
    scores[-2] = '\t' + scores[-2]
    scores = '\n'.join(scores)
    print(scores)

    # Classifying between td_event and td_non_event
    X_tr_mat_td = X_tr_mat[np.array([
        y in labels.transform(['td_event', 'td_non_event']) for y in ytr
    ]).nonzero()[0]]
    ytr_td = ytr[[
        y in labels.transform(['td_event', 'td_non_event']) for y in ytr
    ]]
    ytr_td = np.array([
        1 if labels.inverse_transform(i) == 'td_event' else 0 for i in ytr_td
    ])

    X_val_mat_td = X_val_mat_non_gen[(y_pred_non_gen == 1).nonzero()[0], :]
    y_val_td = (y_val[y_pred_gen != 1])[y_pred_non_gen == 1]
    y_val_td = np.array([
        1 if labels.inverse_transform(i) == 'td_event' else 0 for i in y_val_td
    ])
    td_pred_indices = (y_pred_gen != 1).nonzero()[0][y_pred_non_gen == 1]
    print("Classifying td_event and td_non_event : \n\n")
    X_tr_mat_td_o, ytr_td_o = imbalance_sampling(X_tr_mat_td, ytr_td, 0)
    cls.fit(X_tr_mat_td_o, ytr_td_o)
    y_pred_td = (cls.predict(X_val_mat_td))
    y_pred_td_label = np.array([
        labels.transform(['td_event'])[0]
        if i == 1 else labels.transform(['td_non_event'])[0] for i in y_pred_td
    ])
    y_pred_overall[td_pred_indices] = y_pred_td_label
    print(confusion_matrix(y_val_td, y_pred_td))
    scores = clsr(y_val_td, y_pred_td)
    scores = list(map(lambda r: re.sub('\s\s+', '\t', r),\
                                scores.split("\n")))
    #scores[0] = '\t' + scores[0]
    scores[-2] = '\t' + scores[-2]
    scores = '\n'.join(scores)
    print(scores)

    # Classifying between emergency and urgent
    emer_mask = np.isin(ytr, labels.transform(['emergency', 'urgent']))
    X_tr_mat_emer = X_tr_mat[emer_mask.nonzero()[0], :]
    ytr_emer = (ytr[emer_mask] == labels.transform(['emergency'])[0]).astype(int)

    X_val_mat_emer = X_val_mat_non_gen[(y_pred_non_gen == 0).nonzero()[0], :]
    y_val_emer = (y_val[y_pred_gen != 1])[y_pred_non_gen == 0]
    y_val_emer = (y_val_emer == labels.transform(['emergency'])[0]).astype(int)
    emer_pred_indices = (y_pred_gen != 1).nonzero()[0][y_pred_non_gen == 0]
    print("Classifying emergency and urgent : \n\n")
    X_tr_mat_emer_o, ytr_emer_o = imbalance_sampling(X_tr_mat_emer, ytr_emer,
                                                     0)
    cls.fit(X_tr_mat_emer_o, ytr_emer_o)
    y_pred_emer = (cls.predict(X_val_mat_emer))
    y_pred_emer_label = np.array([
        labels.transform(['emergency'])[0]
        if i == 1 else labels.transform(['urgent'])[0] for i in y_pred_emer
    ])
    y_pred_overall[emer_pred_indices] = y_pred_emer_label
    print(confusion_matrix(y_val_emer, y_pred_emer))
    scores = clsr(y_val_emer, y_pred_emer)
    scores = list(map(lambda r: re.sub(r'\s\s+', '\t', r),\
                                scores.split("\n")))
    #scores[0] = '\t' + scores[0]
    scores[-2] = '\t' + scores[-2]
    scores = '\n'.join(scores)
    print(scores)

    print("Overall classification scores : ")
    print(confusion_matrix(y_val, y_pred_overall))
    scores = clsr(y_val, y_pred_overall)
    scores = list(map(lambda r: re.sub(r'\s\s+', '\t', r),\
                                scores.split("\n")))
    #scores[0] = '\t' + scores[0]
    scores[-2] = '\t' + scores[-2]
    scores = '\n'.join(scores)
    print(scores)

    # Three-level hierarchy classifiers end here

    # X_tr_mat, ytr = imbalance_sampling(X_tr_mat, ytr,2)
    # lda = LinearDiscriminantAnalysis()
    # X_tr_mat = lda.fit_transform(X_tr_mat.toarray(), ytr)

    cls.fit(X_tr_mat, ytr)

    y_pred = (cls.predict(X_val_mat))

    # y_pred = postProcess(y_pred,X_val["Message"],labels)

    print(confusion_matrix(y_val, y_pred))
    conf_f_name = '_'.join(labels.classes_) + '_cfm.tsv'

    np.savetxt(conf_f_name, confusion_matrix(y_val, y_pred), delimiter='\t',\
               fmt="%2.1d", header='\t'.join(labels.classes_))
    scores = clsr(y_val, y_pred)
    scores = list(map(lambda r: re.sub(r'\s\s+', '\t', r),\
                                scores.split("\n")))
    #scores[0] = '\t' + scores[0]
    scores[-2] = '\t' + scores[-2]
    scores = '\n'.join(scores)

    print(scores)
    with open('_'.join(labels.classes_) + '_scores.tsv', 'w') as f:
        f.write('\t' + scores)

    if args.save:
        fitted_model = {'vect': vect, 'cls': cls}
        logger.info("Saving model")
        model_path = '_'.join(sorted(labels.classes_)) + '_' + str(cls)[0:10] + '.model'
        with open(model_path, 'wb') as model_file:
            pickle.dump(fitted_model, model_file)
    misclassifications_class(cls, y_pred, X_val_mat, y_val, X_val, labels,
                             args.level, True)
    if args.with_graph:
        plot_feat(vectors_val[1], vectors_val[0], labels, y_val,
                  'Average Feature value for each class')
    if args.roc:
        roc_auc(X_val_mat, y_val, cls)
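The classification-report reformatting above is copy-pasted six times. A small
helper could replace each of those blocks; the following is a sketch, not part
of the original code, and format_report is a name introduced here:

import re
from sklearn.metrics import classification_report as clsr


def format_report(y_true, y_pred):
    """Collapse the report's column spacing into tabs for TSV-style output."""
    rows = clsr(y_true, y_pred).split("\n")
    rows = [re.sub(r'\s\s+', '\t', row) for row in rows]
    rows[-2] = '\t' + rows[-2]  # indent the averages row, as the original does
    return '\n'.join(rows)

With this helper, each scoring step above reduces to a single
print(format_report(...)) call on that level's labels and predictions.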
Code example #29
def write_txt(data, name):
    data = ''.join(str(word) for word in data)
    with open(base_dir + name, 'w') as out:
        out.write(data)


#preprocessing(inpath=path, name="data.csv", mix=True)
#print("Preprocessing the Test Data")
#preprocessing(inpath=test_path, name="imdb_te.csv", mix=True)
[xtrain, ytrain] = retrieve_data("reviews.csv")
#[xtest, ytest] = retrieve_data("imdb_te.csv")
xtrain, xtest, ytrain, ytest = tts(xtrain, ytrain, test_size=0.5)
labels = LabelEncoder()
ytrain = labels.fit_transform(ytrain)
# Transform only: refitting the encoder on the test labels can silently
# remap the classes (see the illustration below).
ytest = labels.transform(ytest)
print("--------------Vectorizing on Sample Data----------------")
tfidf_vector = tfidf_process(xtrain)
with open(base_dir + "vectorizer", 'wb') as f:
    pickle.dump(tfidf_vector, f)
xtrain_tf = tfidf_vector.transform(xtrain)
print("--------------Vectorizing on Test Data----------------")
xtest_tf = tfidf_vector.transform(xtest)
ypred = stochastic_descent(xtrain_tf, ytrain, xtest_tf)
print(ypred, labels.classes_)
write_txt(ypred, name="output.txt")
print("\nAccuracy Score : ", accuracy_score(ytest, ypred))
print("\nConfusion : \n", confusion_matrix(ytest, ypred))
print("\nCLSR----------------------\n",
      clsr(ytest, ypred, target_names=labels.classes_))
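The labels.transform(ytest) fix above matters because LabelEncoder.fit sorts
whatever classes it sees: refitting on a test split whose label set differs
silently changes the integer mapping. A standalone illustration (not from this
code):

from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
enc.fit(["neg", "pos"])        # classes_ == ['neg', 'pos'] -> neg=0, pos=1
print(enc.transform(["pos"]))  # [1]
enc.fit(["pos", "pos"])        # refit on a skewed split: classes_ == ['pos']
print(enc.transform(["pos"]))  # [0] -- same label, different integer code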
Code example #30
    def build_and_evaluate(self,
                           x,
                           y,
                           preprocessor=None,
                           classifier=None,
                           feature_booster=None,
                           outpath=None,
                           verbose=True,
                           test_size=0.2,
                           print_report=True,
                           Kfold_test=True,
                           n_splits=5):

        # Label encode the targets
        labels = LabelEncoder()
        y = labels.fit_transform(y)

        # Begin evaluation
        if verbose:
            log.info("Building for evaluation")

        if Kfold_test:
            log.info("Performing K-fold test...")
            # random_state only takes effect when shuffle=True (and recent
            # scikit-learn rejects it otherwise), so it is omitted here.
            skf = StratifiedKFold(n_splits=n_splits, shuffle=False)
            split_count = 1
            average_accuracy = 0
            accuracy_list = []
            average_precision = 0
            average_recall = 0
            average_f1_score = 0

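            # Train and score one model per stratified fold; the per-fold
            # metrics are folded into running means after each split.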
            for train_index, test_index in skf.split(x, y):
                # print("TRAIN:", train_index, "TEST:", test_index)
                x_train, x_test = x[train_index], x[test_index]
                y_train, y_test = y[train_index], y[test_index]

                model, secs = self.build(x=x_train,
                                         y=y_train,
                                         preprocessor=preprocessor,
                                         classifier=classifier,
                                         feature_booster=feature_booster)

                if verbose:
                    log.info(
                        "Evaluation model fit in {:0.3f} seconds".format(secs))
                    log.info("Classification Report for fold-{}:\n".format(
                        split_count))

                y_pred = model.predict(x_test)

                report = clsr(y_test, y_pred, target_names=labels.classes_)
                accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
                # y was integer-encoded above, so labels.classes_ (the original
                # class names) can never match y_test/y_pred; the weighted
                # averages are computed without a labels= argument.
                f1_s = f1_score(y_true=y_test, y_pred=y_pred,
                                average="weighted")
                precision = precision_score(y_true=y_test, y_pred=y_pred,
                                            average="weighted")
                recall = recall_score(y_true=y_test, y_pred=y_pred,
                                      average="weighted")
                confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)

                # roc_curve expects continuous scores and a binary target;
                # hard 0/1 predictions yield a degenerate two-point curve.
                fpr, tpr, _ = roc_curve(y_test, y_pred)
                area_under_curve = auc(fpr, tpr)
                print("Area under curve:", area_under_curve)

                accuracy_list.append(accuracy)

                # plt.plot(fpr, tpr)

                # plt.step(recall_, precision_, color='b', alpha=0.2, where='post')
                # plt.fill_between(recall_, precision_, step='post', alpha=0.2, color='b')

                # plt.xlabel('Recall')
                # plt.ylabel('Precision')
                # plt.show()

                if print_report:
                    print(report)
                    print("Accuracy: {}".format(accuracy))
                    print("F1-Score: {}".format(f1_s))
                    print("Precision: {}".format(precision))
                    print("Recall: {}".format(recall))
                    print("Confusion-Matrix: ")
                    print(confusion_mat)
                # Incremental mean: avg_k = (avg_{k-1} * (k-1) + x_k) / k
                average_accuracy = (average_accuracy * (split_count - 1) +
                                    accuracy) / split_count
                average_f1_score = (average_f1_score * (split_count - 1) +
                                    f1_s) / split_count
                average_precision = (average_precision * (split_count - 1) +
                                     precision) / split_count
                average_recall = (average_recall * (split_count - 1) +
                                  recall) / split_count

                split_count += 1
            std_dev_accuracy = np.std(accuracy_list)
            if print_report:
                print("\n************ K-fold results ************")
                print("Average-Accuracy: {}".format(average_accuracy))
                print("Average-F1-Score: {}".format(avearage_f1_score))
                print("Average-Precision: {}".format(average_precision))
                print("Average-Recall: {}".format(average_recall))
                print("*******************************************")

        else:
            x_train, x_test, y_train, y_test = tts(x, y, test_size=test_size)

            if verbose:
                log.info("Completed train test split")

            model, secs = self.build(x=x_train,
                                     y=y_train,
                                     preprocessor=preprocessor,
                                     classifier=classifier,
                                     feature_booster=feature_booster)

            if verbose:
                log.info(
                    "Evaluation model fit in {:0.3f} seconds".format(secs))
                log.info("Classification Report:\n")

            y_pred = model.predict(x_test)

            report = clsr(y_test, y_pred, target_names=labels.classes_)
            accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
            # As in the K-fold branch, y is integer-encoded, so no labels=
            # argument is passed to the weighted averages.
            f1_s = f1_score(y_true=y_test, y_pred=y_pred, average="weighted")
            precision = precision_score(y_true=y_test, y_pred=y_pred,
                                        average="weighted")
            recall = recall_score(y_true=y_test, y_pred=y_pred,
                                  average="weighted")
            confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)

            # roc_curve expects continuous scores and a binary target;
            # hard 0/1 predictions yield a degenerate two-point curve.
            fpr, tpr, _ = roc_curve(y_test, y_pred)
            area_under_curve = auc(fpr, tpr)
            print("Area under curve:", area_under_curve)
            # plt.step(recall_, precision_, color='b', alpha=0.2, where='post')
            # plt.fill_between(recall_, precision_, step='post', alpha=0.2, color='b')

            if print_report:
                print(report)
                print("Accuracy: {}".format(accuracy))
                print("F1-Score: {}".format(f1_s))
                print("Precision: {}".format(precision))
                print("Recall: {}".format(recall))
                print("Confusion-Matrix: ")
                print(confusion_mat)

        if verbose:
            log.info("Building complete model and saving ...")

        model, secs = self.build(x=x,
                                 y=y,
                                 preprocessor=preprocessor,
                                 classifier=classifier,
                                 feature_booster=feature_booster)

        model.labels_ = labels

        if verbose:
            log.info("Complete model fit in {:0.3f} seconds".format(secs))

        if outpath:
            with open(outpath, 'wb') as doc:
                pickle.dump(model, doc)
            log.info("Model written out to {}".format(outpath))

            # Write the classifier's hyperparameters alongside the pickled
            # model; only derive the JSON path when a model path was given.
            json_path = outpath.split(".")[0] + ".json"
            with open(json_path, 'w') as outfile:
                model_json = model.named_steps["classifier"].get_params()
                model_json.update(
                    {"model_name": str(type(model.named_steps["classifier"]))})
                json.dump(model_json, outfile)

        return model, average_accuracy, std_dev_accuracy
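The K-fold loop above maintains running means with the update
avg_k = (avg_{k-1} * (k - 1) + x_k) / k instead of averaging a stored list at
the end. A standalone sanity check that the two agree (the fold scores here
are made up):

import numpy as np

fold_scores = [0.81, 0.79, 0.84, 0.80, 0.83]  # illustrative per-fold accuracies
avg = 0.0
for k, score in enumerate(fold_scores, start=1):
    avg = (avg * (k - 1) + score) / k  # same update as in the K-fold loop
assert np.isclose(avg, np.mean(fold_scores))
print(avg)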