Beispiel #1
0
    def fit(self):
        """Train a TF-IDF + logistic-regression sentiment classifier.

        Reads pickled reviews and labels from 'rev_lab.pickle', builds
        count/TF-IDF features, evaluates on a 10% hold-out, and pickles the
        fitted classifier, vectorizer and transformer to 'regression.pickle'.
        """
        # The pickle file stores two objects back to back: reviews, labels.
        with open('rev_lab.pickle', 'rb') as f:
            reviews = pickle.load(f)
            labels = pickle.load(f)

        vectorizer = CountVectorizer(min_df=2, tokenizer=word_tokenize)
        counts = vectorizer.fit_transform(reviews)  # fitted with train data

        transformer = TfidfTransformer()
        tf_idf = transformer.fit_transform(counts)  # the same as vectorizer

        X_train, X_test, y_train, y_test = train_test_split(tf_idf,
                                                            labels,
                                                            train_size=0.9,
                                                            random_state=42)

        classifier = LogisticRegression(C=7, solver='liblinear')
        classifier.fit(X_train, y_train)

        # Predict once and reuse; the original recomputed the predictions
        # for every metric.
        y_pred = classifier.predict(X_test)
        print(accuracy_score(y_test, y_pred))
        print(recall_score(y_test, y_pred))

        with open('regression.pickle', 'wb') as f:
            pickle.dump(classifier, f)
            pickle.dump(vectorizer, f)
            pickle.dump(transformer, f)
Beispiel #2
0
def func1():
    """Demo: linear regression on synthetic data, then plain and
    cross-validated logistic regression on a two-moons dataset."""
    # n_features: how many feature dimensions each sample has
    # n_informative: how many of those features actually carry signal
    # n_targets: how many regression outputs
    #X,y = datasets.make_regression(n_samples=100, n_features=100, n_informative=10, n_targets=1, noise=0.0, bias=0.0, random_state=None)
    X, y = datasets.make_regression(n_samples=10, n_features=2)

    print('line X ', X)
    print('line y ', y)

    #plt.scatter(X[:,0],y)
    #plt.show()

    lm = LinearRegression()
    lm.fit(X, y)
    X_test = [[0.69803203, 0.62000084]]
    y_predict = lm.predict(X_test)
    print(y_predict)

    # Two-moons: a small binary-classification toy dataset.
    X, y = datasets.make_moons(10, noise=0.2)
    print('logis ', X)
    print('logis ', y)
    #X_train,X_test,y_train,y_test = train_test_split(X,y)
    logis_regre = LogisticRegression()
    #y = [[-140.66643209],[114.7982953],[103.11834249],[-177.27466722],[24.48139711],[-30.44916242],[38.96288527],[-57.62121771],[82.14111136],[90.54966151]]
    logis_regre.fit(X, y)
    print(logis_regre.predict(X_test))

    # Same task, but with built-in cross-validated regularization strength.
    logis_regre = LogisticRegressionCV()
    logis_regre.fit(X, y)
    print(logis_regre.predict(X_test))
Beispiel #3
0
def train(dirname1, dirname2, dirname3, dirname4):
    """Train two logistic-regression classifiers (single-character features
    vs. word features) and print each one's test accuracy.

    dirname1/dirname2 feed the training set loader, dirname3/dirname4 the
    test set loader. Returns (predictions_s, predictions_w, label_train,
    label_test).
    """
    s_list_train, w_list_train, label_train = load_train_set(
        dirname1, dirname2)
    s_list_test, w_list_test, label_test = load_train_set(dirname3, dirname4)

    classifer_single = LogisticRegression()
    classifer_word = LogisticRegression()

    classifer_single.fit(s_list_train, label_train)
    classifer_word.fit(w_list_train, label_train)

    predictions_s = classifer_single.predict(s_list_test)
    predictions_w = classifer_word.predict(w_list_test)

    length = len(predictions_s)
    # Count matches with zip/sum instead of the original index loop.
    count1 = sum(p == t for p, t in zip(predictions_s, label_test))
    count2 = sum(p == t for p, t in zip(predictions_w, label_test))

    print("the accuracy of classifier for single: " + str(count1 / length) +
          "\nthe accuracy of classifier for word: " + str(count2 / length))

    return predictions_s, predictions_w, label_train, label_test
def lr_training_and_test(X_train, X_test, y_train, y_test):
    """Fit logistic regression, evaluate on both splits via evaluate_model,
    and return the fitted model. (Python 2 print syntax.)"""
    print 'model: logistic regression.'
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # Column 1 of predict_proba is the positive-class probability.
    y_train_pred_prob = model.predict_proba(X_train)[:, 1]
    y_test_pred_prob = model.predict_proba(X_test)[:, 1]

    evaluate_model(y_train, y_train_pred, y_train_pred_prob, y_test, y_test_pred, y_test_pred_prob)
    return model
def answer(test_path):
    """Map the test data to a KMeans cluster id, then translate that id to a
    letter ('a'/'o') with a small logistic-regression lookup model.

    Returns [predicted_letter, elapsed_seconds].
    """
    import warnings
    warnings.filterwarnings("ignore")

    import time
    t0 = time.time()

    from learning import process_test_data, training_data, training_answers
    # Public import paths: sklearn.cluster.k_means_ and
    # sklearn.linear_model.logistic are private modules removed in
    # scikit-learn 0.24.
    from sklearn.cluster import KMeans
    from sklearn.linear_model import LogisticRegression

    test_data = process_test_data(test_path)

    km = KMeans()
    km.fit(training_data, training_answers)  # y is ignored by KMeans.fit

    myNum = km.predict(test_data).item()

    numX = [1, 2, 4, 2, 7, 0, 2, 7, 4, 3, 2, 1, 4, 5, 5, 1, 3, 0, 4, 2]
    numbers = [[num] for num in numX]
    letX = [
        'a', 'a', 'o', 'a', 'o', 'o', 'a', 'a', 'o', 'a', 'a', 'o', 'a', 'o',
        'o', 'o', 'a', 'a', 'o', 'a'
    ]

    lr = LogisticRegression()
    # Fit with a flat label vector; the original wrapped each label in a
    # one-element list, which triggers a DataConversionWarning.
    lr.fit(numbers, letX)

    # predict expects a 2-D array of samples; the original passed the bare
    # scalar, which current scikit-learn rejects.
    ans = lr.predict([[myNum]]).item()

    t1 = time.time()
    return [ans, t1 - t0]
Beispiel #6
0
def mlogistic():
	"""Tiny TF-IDF + logistic-regression demo. (Python 2 print syntax.)"""
	X = []

	# The first three sentences are the training samples
	X.append("f**k you")
	X.append("f**k you all")
	X.append("hello everyone")

	# The last two sentences are the test samples
	X.append("f**k me")
	X.append("hello boy")

	# y holds the labels for the three training samples
	y = [1,1,0]

	vectorizer = TfidfVectorizer()

	# Fit the TF-IDF transform on the first three sentences only
	X_train = vectorizer.fit_transform(X[:-2])
	print X_train
	# Transform the last two sentences with the already-fitted TF-IDF
	X_test = vectorizer.transform(X[-2:])
	print X_test

	# Train the logistic-regression model
	classifier = LogisticRegression()
	classifier.fit(X_train, y)

	# Predict the test samples
	predictions = classifier.predict(X_test)
	print predictions
Beispiel #7
0
def score(id):
    """Evaluate logistic regression on the CSV at path `id` with ten random
    95/5 splits.

    Each CSV row is feature values followed by the label. Returns
    [auc, acc, f1], where each list holds the ten per-split scores followed
    by their mean (all rounded to 3 decimals).
    """
    data, mark = [], []
    with open(id, 'r', encoding='utf-8_sig') as f:
        for row in csv.reader(f):
            data.append([float(v) for v in row[0:-1]])
            mark.append(float(row[-1]))

    acc, auc, f1 = [], [], []
    for seed in range(10):
        # A different random_state per round gives ten distinct splits.
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            data, mark, test_size=0.05, random_state=seed)
        clf = LogisticRegression(C=4.8, random_state=1113)
        clf.fit(X_train, y_train)
        acc.append(round(clf.score(X_test, y_test), 3))
        y_pred = clf.predict(X_test)
        auc.append(round(metrics.roc_auc_score(y_test, y_pred), 3))
        f1.append(round(metrics.f1_score(y_test, y_pred), 3))

    # Append each metric's mean as the final element.
    for bucket in (acc, auc, f1):
        bucket.append(round(sum(bucket) / len(bucket), 3))
    return [auc, acc, f1]
def run_logistic_regression_multiclass_classification(train, train_labels,
                                                      validate,
                                                      validate_labels):
    """Fit a logistic-regression model on the training split and return its
    accuracy on the validation split."""
    model = LogisticRegression()
    model.fit(train, train_labels)
    return metrics.accuracy_score(validate_labels, model.predict(validate))
Beispiel #9
0
def logic_pca_standard(y, n):
    """Logistic regression with PCA reduction to n components plus feature
    standardization; prints accuracy, a precision/recall report and the AUC.

    NOTE(review): reads the module-level `train` data — confirm it is
    defined before calling.
    """
    # Logistic regression + dimensionality reduction + standardization
    pa = PCA(n_components=n)
    data = pa.fit_transform(train)
    # Split the data
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # Standardize: fit the scaler on the train split, reuse it on the test split
    std = StandardScaler()
    print(std)
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    # estimator
    logic = LogisticRegression()
    logic.fit(x_train, y_train)
    # Evaluate on the held-out split
    pre_score = logic.score(x_test, y_test)
    print("准确率(逻辑回归+降维+标准化):{}".format(pre_score))
    print(
        "精确率和召回率:",
        classification_report(y_test,
                              logic.predict(x_test),
                              labels=[0, 1],
                              target_names=["非高收入", "高收入"]))
    # Class-membership probabilities; column 1 = positive class
    predictions = logic.predict_proba(x_test)
    # Compute Receiver operating characteristic (ROC)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("auc值为:{}".format(auc_value))
Beispiel #10
0
class LogisticRegressionImpl():
    """Thin wrapper that forwards a fixed hyperparameter set to the wrapped
    scikit-learn model (SKLModel) and delegates the estimator API to it."""

    def __init__(self, penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight='balanced', random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=None):
        # Keep the hyperparameters around for introspection, then build the
        # underlying model from them.
        self._hyperparams = dict(
            penalty=penalty,
            dual=dual,
            tol=tol,
            C=C,
            fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight,
            random_state=random_state,
            solver=solver,
            max_iter=max_iter,
            multi_class=multi_class,
            verbose=verbose,
            warm_start=warm_start,
            n_jobs=n_jobs,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; y is forwarded only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's predict."""
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's predict_proba."""
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        """Delegate to the wrapped model's decision_function."""
        return self._wrapped_model.decision_function(X)
Beispiel #11
0
 def method(self):
     """Fit logistic regression on this object's split and report the
     accuracy on the training data itself."""
     features, targets = self.split()
     clf = LogisticRegression()
     clf.fit(features, targets)
     predictions = clf.predict(features)
     print('Train data accuracy:', accuracy_score(targets, predictions))
     self.model = clf
Beispiel #12
0
def lrModel(xtrain, xtest, y):
    """Fit logistic regression on (xtrain, y) and return its predictions
    for xtest."""
    clf = LogisticRegression()
    clf.fit(xtrain, y)
    return clf.predict(xtest)
Beispiel #13
0
def Logistic_Regression_cls(preprocessing='PCA',
                            pre_kernel='rbf',
                            plot_result=False):
    """Classify the iris data with multinomial logistic regression after an
    optional dimensionality-reduction step.

    preprocessing: one of 'PCA', 'KPCA', 'LDA', 'None'
    pre_kernel:    kernel name forwarded to KPCA
    plot_result:   scatter-plot gold labels vs. predictions (2-D data only)

    Returns (predictions, accuracy), or None for an unknown preprocessing
    name (after printing usage help).
    """
    if preprocessing == 'PCA':
        X, y = use_PCA('iris_data.txt')
    elif preprocessing == 'KPCA':
        X, y = use_KPCA('iris_data.txt', kernel=pre_kernel)
    elif preprocessing == 'LDA':
        X, y = use_LDA('iris_data.txt')
    elif preprocessing == 'None':
        loader = datasets.load_iris()
        X, y = loader['data'], loader['target']
    else:
        print(
            'Please choose a data preprocessing method from the following method:\n'
        )
        print('1.PCA, 2.KPCA, 3.LDA, 4.None')
        return

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        train_size=0.8)
    classifier = LogisticRegression(multi_class="multinomial",
                                    solver="newton-cg")
    classifier.fit(X_train, y_train)

    predict = classifier.predict(X_test)
    total = np.size(y_test)
    # Count correct predictions (loop variable renamed so it no longer
    # shadows the label array y).
    correct = sum(1 for guess, gold in zip(predict, y_test) if guess == gold)
    accuracy = correct / total
    print("正确样本数为{}, 正确率为{:.4f}".format(correct, accuracy))

    if plot_result and preprocessing != 'None':
        color_of = {0: 'r', 1: 'g', 2: 'b'}

        fig1 = plt.subplot(1, 2, 1)
        fig1.set_title('raw data with label')
        for idx, gold in enumerate(y_test):
            if gold in color_of:
                fig1.scatter(X_test[idx][0], X_test[idx][1], c=color_of[gold])

        fig2 = plt.subplot(1, 2, 2)
        fig2.set_title('classification result')
        for idx, label in enumerate(predict):
            if label in color_of:
                fig2.scatter(X_test[idx][0], X_test[idx][1], c=color_of[label])
        plt.show()

    return predict, accuracy
Beispiel #14
0
    def lr_model(self):
        """Fit logistic regression on the stored training split and record
        its test accuracy in model_dict under the key 'lr'."""
        clf = LogisticRegression()
        clf.fit(self.x_train, self.y_train)
        predictions = clf.predict(self.x_test)

        score = accuracy_score(self.y_test, predictions)
        print("Accuracy Score for LR Model :", score)
        self.model_dict.update({'lr': score})
def main():
    """Repeated 10-fold CV for CWI-2016 where each instance's training label
    is drawn at random from one of 20 annotators' label sets."""
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    default_pool = scriptdir+"/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    parser.add_argument('--iterations',type=int,default=5)

    args = parser.parse_args()


    all_feats = []
    all_labels = defaultdict(list)
    scores = defaultdict(list)




    # Collect, per instance index, the label assigned by each of the 20
    # annotator files (features are not generated on this pass).
    for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
#    for idx in "01".split(" "):
        current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+idx+".lbl.conll"
        f_current, labels_current, v_current = feats_and_classify.collect_features(current_single_ann,vectorize=False,generateFeatures=False)
        for instance_index,l in enumerate(labels_current):
            all_labels[instance_index].append(l)
    # Features are taken from a single annotator file; only labels differ
    # between annotators.
    current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_01.lbl.conll"
    feats, labels_current, v_current = feats_and_classify.collect_features(current_single_ann,vectorize=True,generateFeatures=True)

    for it in range(args.iterations):
        for TrainIndices, TestIndices in cross_validation.KFold(n=feats.shape[0], n_folds=10, shuffle=True, random_state=None):
            maxent = LogisticRegression(penalty='l2')

            TrainX_i = feats[TrainIndices]
            # For every instance, pick one of the 20 annotators' labels at
            # random (both for training and for evaluation).
            Trainy_i = [all_labels[x][random.randrange(0,20)] for x in TrainIndices]

            TestX_i = feats[TestIndices]
            Testy_i =  [all_labels[x][random.randrange(0,20)] for x in TestIndices]

            maxent.fit(TrainX_i,Trainy_i)
            ypred_i = maxent.predict(TestX_i)

            acc = accuracy_score(ypred_i, Testy_i)
            pre = precision_score(ypred_i, Testy_i)
            rec = recall_score(ypred_i, Testy_i)
            # shared task uses f1 of *accuracy* and recall!
            f1 = 2 * acc * rec / (acc + rec)

            scores["Accuracy"].append(acc)
            scores["F1"].append(f1)
            scores["Precision"].append(pre)
            scores["Recall"].append(rec)
        #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
        print("--")

    # Report mean +/- std of each metric over all folds and iterations.
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
    print("--")

    sys.exit(0)
Beispiel #16
0
def lg(X, y, model_path):
    """Fit logistic regression on (X, y), print resubstitution metrics
    (evaluated on the training data itself), and persist the model to
    model_path via joblib."""
    model = LogisticRegression()
    model.fit(X, y)
    print(model)
    predicted = model.predict(X)
    print(metrics.classification_report(y, predicted))
    print(metrics.confusion_matrix(y, predicted))
    joblib.dump(model, model_path)
Beispiel #17
0
def classify_logistic(train_features, train_labels, test_features):
    """Train a logistic-regression classifier and return its predictions
    for test_features; pickles the model when SAVE is set and TEST is not."""
    global SAVE
    model = LogisticRegression()
    model.fit(train_features, train_labels)

    should_persist = SAVE and not TEST
    if should_persist:
        save_pickle("logistic", model)

    return model.predict(test_features)
Beispiel #18
0
def classify_logistic(train_features, train_labels, test_features):
    """Fit LogisticRegression on the training data and predict labels for
    test_features. Unless running in TEST mode, the fitted model is saved
    via save_pickle when SAVE is enabled."""
    global SAVE
    clf = LogisticRegression().fit(train_features, train_labels)

    if not TEST:
        if SAVE:
            save_pickle("logistic", clf)

    return clf.predict(test_features)
Beispiel #19
0
def LRClassifier(data, y):
    """Split the data, fit logistic regression, print every prediction next
    to its gold label, and print the overall correction rate."""
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(data, y)
    classifier = LogisticRegression(fit_intercept=True,
                                    intercept_scaling=0.0001)
    classifier.fit(X_train_raw, y_train)
    predictions = classifier.predict(X_test_raw)

    correct = 0
    for predicted, actual in zip(predictions, y_test):
        print('prediction:%s. ActualY:%s' % (predicted, actual))
        if actual == predicted:
            correct += 1
    print("The correction rate:  ", (correct / len(y_test)) * 100, "%")
Beispiel #20
0
def logistic_regression(X_train, Y_train, X_test, Y_test):
	"""Fit a liblinear logistic-regression model on the training data and
	measure its accuracy on the test data.

	Returns (accuracy, fitted_classifier).
	"""
	from sklearn.linear_model.logistic import LogisticRegression
	model = LogisticRegression(solver='liblinear')
	model.fit(X_train, Y_train)

	# Accuracy = fraction of test predictions matching the gold labels.
	predictions = model.predict(X_test)
	accuracy = np.mean(predictions == Y_test)

	return accuracy, model
Beispiel #21
0
def spamRecog(descr):
	"""Train an SMS spam classifier from SMSSpamCollection.csv and return
	the predicted label array for the single message `descr`."""
	df = pd.read_csv('./SMSSpamCollection.csv', delimiter='\t',header=None)

	# Column 1 holds the message text, column 0 the ham/spam label.
	X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1],df[0])

	vectorizer = TfidfVectorizer()
	train_matrix = vectorizer.fit_transform(X_train_raw)
	model = LogisticRegression()
	model.fit(train_matrix, y_train)

	# Vectorize the query with the already-fitted TF-IDF and classify it.
	query = vectorizer.transform( [descr] )
	return model.predict(query)
def test_liblinear_decision_function_zero():
    # Test negative prediction when decision_function values are zero.
    # Liblinear predicts the positive class when decision_function values
    # are zero. This is a test to verify that we do not do the same.
    # See Issue: https://github.com/scikit-learn/scikit-learn/issues/3600
    # and the PR https://github.com/scikit-learn/scikit-learn/pull/3623
    # Seed the data generator so the test is deterministic (the other copy
    # of this test in this file already passes random_state=0).
    X, y = make_classification(n_samples=5, n_features=5, random_state=0)
    clf = LogisticRegression(fit_intercept=False)
    clf.fit(X, y)

    # Dummy data such that the decision function becomes zero.
    X = np.zeros((5, 5))
    assert_array_equal(clf.predict(X), np.zeros(5))
Beispiel #23
0
def Logistic_Regression(x_train, y_train, x_test, y_test):
    """Fit logistic regression (labels cast to int) and print/return the
    accuracy on the test split."""
    model = LogisticRegression()
    model.fit(x_train, y_train.astype('int'))
    predictions = model.predict(x_test)

    total = len(predictions)
    right = sum(1 for guess, truth in zip(predictions, y_test)
                if guess == truth)
    acc = float(right / total)
    print('Logistic Regression val accuarcy: ' + str(acc))
    return acc
Beispiel #24
0
def test_liblinear_decision_function_zero():
    # Regression test: liblinear predicts the positive class when the
    # decision function is exactly zero; this implementation must predict
    # the negative class instead.
    # See Issue: https://github.com/scikit-learn/scikit-learn/issues/3600
    # and the PR https://github.com/scikit-learn/scikit-learn/pull/3623
    X, y = make_classification(n_samples=5, n_features=5, random_state=0)
    clf = LogisticRegression(fit_intercept=False)
    clf.fit(X, y)

    # All-zero inputs with no intercept drive the decision function to zero.
    X = np.zeros((5, 5))
    assert_array_equal(clf.predict(X), np.zeros(5))
def crossval(features, labels, variant):
    """10-fold cross-validation of an L2 logistic regression against a
    most-frequent-class dummy baseline; prints a metrics summary, a
    classification report and a labelled confusion matrix."""
    maxent = LogisticRegression(penalty='l2')
    # Baseline that always predicts the most frequent training class.
    dummyclass = DummyClassifier("most_frequent")
    scores = defaultdict(list)

    preds = []
    dummypreds = []
    shuffled_gold = []  # gold labels re-ordered to match the shuffled folds

    for TrainIndices, TestIndices in cross_validation.KFold(
            n=features.shape[0], n_folds=10, shuffle=True):
        # print(TestIndices)
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]

        TestX_i = features[TestIndices]
        Testy_i = labels[TestIndices]

        shuffled_gold.extend(Testy_i)

        dummyclass.fit(TrainX_i, Trainy_i)
        maxent.fit(TrainX_i, Trainy_i)

        ypred_i = maxent.predict(TestX_i)
        ydummypred_i = dummyclass.predict(TestX_i)
        dummypreds.extend(ydummypred_i)
        acc = accuracy_score(y_true=Testy_i, y_pred=ypred_i)
        f1 = f1_score(y_true=Testy_i, y_pred=ypred_i)
        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
        # NOTE(review): accuracy is stored under "Recall" here — looks like
        # a copy/paste slip; confirm the intended metric.
        scores["Recall"].append(acc)
        scores["Accuracy_dummy"].append(
            accuracy_score(y_true=Testy_i, y_pred=ydummypred_i))
        scores["F1_dummy"].append(f1_score(y_true=Testy_i,
                                           y_pred=ydummypred_i))
        preds.extend(ypred_i)

    print("summary %s %.3f %.3f %.3f %.3f" %
          (variant, np.array(scores["Accuracy"]).mean(), np.array(
              scores["F1"]).mean(), np.array(scores["Accuracy_dummy"]).mean(),
           np.array(scores["F1_dummy"]).mean()))
    print(classification_report(y_pred=preds, y_true=shuffled_gold))

    # Confusion matrix with one printed row per class label.
    labels_to_print = sorted(set(shuffled_gold))
    CM = confusion_matrix(y_pred=preds,
                          y_true=shuffled_gold,
                          labels=labels_to_print)
    print(sorted(set(shuffled_gold)))
    for l, r in zip(labels_to_print, CM):
        print(l, "\t".join([str(x) for x in r]))
    scores = None
Beispiel #26
0
class DetectMalicious():
    """Logistic-regression detector for malicious JavaScript based on which
    known API names appear in the source text. (Python 2 print syntax.)"""

    def __init__(self):
        # Each CSV row: integer feature columns followed by the class label.
        benign_data = np.loadtxt('javascript-collection/benignjs.csv',
                                 delimiter=',',
                                 dtype=np.int32)
        evil_data = np.loadtxt('javascript-collection/eviljs.csv',
                               delimiter=',',
                               dtype=np.int32)
        train_data = np.concatenate((benign_data, evil_data), axis=0)
        self.score_template = 'TPR %(TPR)f\tFPR %(FPR)f\tAccuracy %(Accuracy)f\tAUC %(AUC)f'
        self.D = LogisticRegression()
        # Features = all columns but the last; label = the last column.
        self.D.fit(train_data[:, :-1], train_data[:, -1])
        # One known JavaScript API name per line of jsapi.txt.
        self.jsapi = []
        for line in open('javascript-collection/jsapi.txt'):
            self.jsapi.append(line.strip('\n'))

    def predict(self, X):
        """Classify the raw JS source string X: build a 0/1 vector marking
        which known API names occur in it, then classify that vector."""
        flag = [0 for x in range(len(self.jsapi))]
        for i in range(len(self.jsapi)):
            if X.find(self.jsapi[i]) != -1:
                flag[i] = 1
        print self.D.predict([flag])
        return self.D.predict([flag])
def main():
    """Single 10-fold CV over the pooled annotations of all 20 annotator
    files for the CWI-2016 task."""
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    default_pool = scriptdir+"/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016--optimisation of threshhold")
    args = parser.parse_args()


    all_feats = []
    all_labels = []
    scores = defaultdict(list)

    # Pool features and labels from every annotator file.
    for idx in "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20".split(" "):
#    for idx in "01".split(" "):
        current_single_ann = scriptdir+"/../data/cwi_training/cwi_training_"+idx+".lbl.conll"
        f_current, labels_current, v_current = feats_and_classify.collect_features(current_single_ann,vectorize=False)
        all_feats.extend(f_current)
        all_labels.extend(labels_current)

    # Vectorize the pooled feature dicts into one dense matrix.
    feats = DictVectorizer().fit_transform(all_feats).toarray()
    all_labels = np.asarray(all_labels)
    for TrainIndices, TestIndices in cross_validation.KFold(n=feats.shape[0], n_folds=10, shuffle=True, random_state=None):
        maxent = LogisticRegression(penalty='l2')
        TrainX_i = feats[TrainIndices]
        Trainy_i = all_labels[TrainIndices]

        TestX_i = feats[TestIndices]
        Testy_i =  all_labels[TestIndices]

        maxent.fit(TrainX_i,Trainy_i)
        ypred_i = maxent.predict(TestX_i)

        acc = accuracy_score(ypred_i, Testy_i)
        pre = precision_score(ypred_i, Testy_i)
        rec = recall_score(ypred_i, Testy_i)
        # shared task uses f1 of *accuracy* and recall!
        f1 = 2 * acc * rec / (acc + rec)

        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
        scores["Precision"].append(pre)
        scores["Recall"].append(rec)
    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("--")

    # Mean +/- std of each metric over the 10 folds.
    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
    print("--")

    sys.exit(0)
Beispiel #28
0
    def classify(self, input_file, output_file):
        """Train a TF-IDF + logistic-regression classifier on answer bodies
        and write the test-split predictions to output_file.

        The label is sign(Score) clamped to {0, 1}. Prints the dataset size
        and the test accuracy.
        """
        df = pd.read_csv(input_file)
        df.columns = [
            'Id', 'OwnerUserId', 'CreationDate', 'ParentId', 'Score',
            'IsAcceptedAnswer', 'Body'
        ]
        df['Class'] = np.sign(df['Score'])
        print("Whole size: " + str(len(df)))

        # Build the stemmer once instead of once per row.
        stemmer = SnowballStemmer("english")
        for i in range(len(df['Body'])):
            if df['Class'][i] <= 0:
                # .loc avoids pandas chained assignment, which may silently
                # write to a temporary copy.
                df.loc[i, 'Class'] = 0

            # Strip HTML-like tags, then stem every token.
            change = re.sub(r"(</.*>)", "", df['Body'][i])
            change = re.sub(r"(<.*>)", "", change)
            singles = [stemmer.stem(word) for word in re.split(r'\W+', change)]
            df.loc[i, 'Body'] = " ".join(singles)

        coef_test = 0.1
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            df['Body'], df['Class'], test_size=coef_test)

        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(X_train_raw)
        classifier = LogisticRegression()
        classifier.fit(X_train, y_train)
        X_test = vectorizer.transform(X_test_raw)
        predictions = classifier.predict(X_test)

        # Compare predictions against the actual test labels. The original
        # compared against the LAST rows of df['Class'], but train_test_split
        # shuffles, so those rows did not correspond to the predictions.
        y_true = np.asarray(y_test)
        cnt_right = 0
        with open(output_file, 'w') as file:
            for pred, gold in zip(predictions, y_true):
                file.write(str(pred))
                if pred == gold:
                    cnt_right += 1

        right_predictions = cnt_right / len(predictions)
        # The value is a fraction in [0, 1] (the original appended "%" to it).
        print("Accuracy of predictions: " + str(right_predictions))
Beispiel #29
0
def clazzify(train_mat, test_mat, true_train_labels):
    """Fit an L1-regularized logistic regression and classify test_mat.

    Returns (predicted_test_labels, fitted_model).
    """
    logging.info('learning...')
    clf = LogisticRegression(random_state=17, penalty='l1')
    clf.fit(train_mat, true_train_labels)
    logging.info('finished learning.')

    logging.info('testing')
    predictions = clf.predict(test_mat)
    logging.info('finished testing')

    return predictions, clf
Beispiel #30
0
def clazzify(train_mat, test_mat, true_train_labels):
    """Train an L1 logistic-regression model on train_mat and predict the
    labels of test_mat; returns (predictions, model)."""
    logging.info('learning...')
    estimator = LogisticRegression(random_state=17, penalty='l1')
    estimator.fit(train_mat, true_train_labels)
    logging.info('finished learning.')

    logging.info('testing')
    labels_out = estimator.predict(test_mat)
    logging.info('finished testing')

    return labels_out, estimator
Beispiel #31
0
def answer(test_path):
    """Predict a single answer for the data at test_path with a logistic
    regression trained on the shared learning-module data.

    Returns [prediction, elapsed_seconds].
    """
    import time
    started = time.time()

    from learning import process_test_data, training_data, training_answers
    from sklearn.linear_model.logistic import LogisticRegression

    test_data = process_test_data(test_path)

    model = LogisticRegression()
    model.fit(training_data, training_answers)

    prediction = model.predict(test_data).item()
    finished = time.time()
    return [prediction, finished - started]
Beispiel #32
0
class LogisticClassifier(Model):
    """Multi-label logistic classifier class."""
    def __init__(self, epochs):
        # epochs bounds the solver's iterations via max_iter.
        super(LogisticClassifier, self).__init__("logistic regression")
        self.max_epochs = epochs
        self.lr = LogisticRegression(max_iter=epochs)

    def train(self, train_x, train_y):
        """Fit the wrapped model. (Python 2 print syntax.)"""
        print "Training {} model.......".format(self.name)
        self.lr.fit(train_x, train_y)
        print "Training complete!!"

    def test(self, test_x):
        """Return the wrapped model's predictions for test_x."""
        test_y = self.lr.predict(test_x)
        print "Successfully generated predictions for test data."
        return test_y
Beispiel #33
0
def test_predict_iris():
    """Test logistic regression with the iris dataset"""
    n_samples, _ = iris.data.shape

    # Use the string class names ('setosa', ...) as targets.
    target = iris.target_names[iris.target]
    clf = LogisticRegression(C=len(iris.data))
    clf.fit(iris.data, target)
    assert_array_equal(np.unique(target), clf.classes_)

    hard_pred = clf.predict(iris.data)
    assert_greater(np.mean(hard_pred == target), .95)

    probabilities = clf.predict_proba(iris.data)
    # Each row of predict_proba must be a probability distribution.
    assert_array_almost_equal(probabilities.sum(axis=1), np.ones(n_samples))

    soft_pred = iris.target_names[probabilities.argmax(axis=1)]
    assert_greater(np.mean(soft_pred == target), .95)
Beispiel #34
0
def test_predict_iris():
    """Test logistic regression with the iris dataset"""
    n_samples = iris.data.shape[0]

    target = iris.target_names[iris.target]  # labels as class-name strings
    clf = LogisticRegression(C=len(iris.data)).fit(iris.data, target)
    assert_array_equal(np.unique(target), clf.classes_)

    predictions = clf.predict(iris.data)
    assert_greater(np.mean(predictions == target), .95)

    proba = clf.predict_proba(iris.data)
    # Probabilities in every row must sum to one.
    assert_array_almost_equal(proba.sum(axis=1), np.ones(n_samples))

    predictions = iris.target_names[proba.argmax(axis=1)]
    assert_greater(np.mean(predictions == target), .95)
Beispiel #35
0
def train_and_predics(featuredicts, labels, trainsize):
    """Vectorize the feature dicts, fit an L2 logistic regression on the
    first `trainsize` examples, and return one tab-separated
    "label<TAB>proba..." line per remaining example.
    """
    vec = DictVectorizer()
    y_train = labels[:trainsize]
    X_train = vec.fit_transform(featuredicts[:trainsize])
    X_test = vec.transform(featuredicts[trainsize:])
    maxent = LogisticRegression(penalty='l2')
    maxent.fit(X_train, y_train)
    predictions = []
    #header = "\t".join(["prediction"]+[str(c) for c in maxent.classes_])
    #predictions.append(header)
    # Loop variable renamed from `list`, which shadowed the builtin.
    for probs, label in zip(maxent.predict_proba(X_test),
                            maxent.predict(X_test)):
        line = "\t".join([label] + ["{0:.2f}".format(k) for k in probs])
        predictions.append(line)

    return predictions
def generate_submission():
    """Train on the module-level `train` frame and write Kaggle-format
    predictions for `test` to kaggle.csv.

    NOTE(review): depends on module-level `train`, `test` and `predictors`
    being defined, and rebinds the globals alg/predictions/submission —
    confirm callers rely on those side effects.
    """
    global alg, predictions, submission
    # The columns we'll use to predict the target
    # Initialize the algorithm class
    alg = LogisticRegression(random_state=1)
    # Train the algorithm using all the training data
    alg.fit(train[predictors], train["Survived"])
    # Make predictions using the test set.
    predictions = alg.predict(test[predictors])
    # Create a new dataframe with only the columns Kaggle wants from the dataset.
    submission = pandas.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": predictions
    })
    submission.to_csv("kaggle.csv", index=False)
    print("kaggele.csv is generated")
def Logistic_Regression(train, test):
    """Fit logistic regression on `train` and print/return its accuracy on
    `test`; in both arrays the last column holds the label."""
    x_train, y_train = train[:, :-1], train[:, -1]
    x_test, y_test = test[:, :-1], test[:, -1]

    model = LogisticRegression()
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    total = len(predictions)
    right = sum(1 for guess, truth in zip(predictions, y_test)
                if guess == truth)
    acc = float(right / total)
    print('Logistic Regression train accuarcy: ' + str(acc))
    return acc
def test_multinomial_binary():
    """Test multinomial LR on a binary problem."""
    # Binarize iris (setosa vs. everything else) with string labels.
    target = (iris.target > 0).astype(np.intp)
    target = np.array(["setosa", "not-setosa"])[target]

    clf = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    clf.fit(iris.data, target)

    # A binary multinomial model still exposes a single coefficient row.
    assert_equal(clf.coef_.shape, (1, iris.data.shape[1]))
    assert_equal(clf.intercept_.shape, (1,))
    assert_array_equal(clf.predict(iris.data), target)

    mlr = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                             fit_intercept=False)
    mlr.fit(iris.data, target)
    # NOTE(review): `pred` is computed from `clf`, so the no-intercept model
    # `mlr` is fitted but never asserted — this line probably should use
    # `mlr`; confirm against the upstream test.
    pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data), axis=1)]
    assert_greater(np.mean(pred == target), .9)
Beispiel #39
0
class mentoryWEB:
    """TF-IDF + logistic-regression text classifier built from a
    tab-separated file of (label, text) rows."""

    def __init__(self, file):
        self.vect = TfidfVectorizer(max_df=0.25, stop_words=None, max_features=2500, ngram_range=(1,2), use_idf=True, norm='l2')
        frame = pd.read_csv(file, delimiter='\t', header=None)
        # Column 1 is the text, column 0 the label.
        raw_texts, labels = frame[1], frame[0]

        matrix = self.vect.fit_transform(raw_texts)

        self.clf = LogisticRegression(penalty='l2', C=10)
        self.clf.fit(matrix, labels)

    def test(self, string):
        """Return the predicted label for a single input string."""
        vectorized = self.vect.transform([string])
        return self.clf.predict(vectorized)[0]
Beispiel #40
0
def LogisticRegressionSMSFilteringExample():
    """Train a TF-IDF + logistic-regression spam filter on the SMS Spam
    Collection and print the first five test messages with their predicted
    labels.

    Python 2 only (uses ``xrange`` and print statements).
    """
    import numpy as np
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    # NOTE(review): both import paths below were removed in scikit-learn 0.20
    # (sklearn.linear_model.logistic, sklearn.cross_validation); this code
    # requires an older scikit-learn.
    from sklearn.linear_model.logistic import LogisticRegression
    from sklearn.cross_validation import train_test_split, cross_val_score
    # Hard-coded local path; column 0 is the ham/spam label, column 1 the text.
    df = pd.read_csv('C:/Users/Ahmad/Documents/Mastering ML with Scikitlearn/ml/DataSets/smsspamcollection/SMSSpamCollection', delimiter='\t',header=None)
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1],df[0])
    vectorizer = TfidfVectorizer()
    # Fit vocabulary/IDF on the training split only, then reuse for test.
    X_train = vectorizer.fit_transform(X_train_raw)
    X_test = vectorizer.transform(X_test_raw)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Show the first five raw test messages next to their predictions.
    for i in xrange(0,5):
        print X_test_raw.values.tolist()[i],"\r\n Classification: ", predictions[i]
def makeClassificationAndMeasureAccuracy(genre_wise_train_data, genre_wise_test_data, meta_dict):
    """Fit one logistic-regression model per genre and measure test accuracy.

    `genre_wise_train_data` / `genre_wise_test_data` map each genre to a
    (file_names, labels) pair; `meta_dict` maps genre -> file_name -> metadata,
    whose TAGS entry supplies the feature values for each file.

    Returns a dict mapping each genre to its test-set accuracy.
    """
    accuracy_for_genre = dict()
    for genre in genre_wise_train_data:
        meta_dict_for_genre = meta_dict[genre]
        train_data, train_result = genre_wise_train_data[genre]
        test_data, test_result = genre_wise_test_data[genre]
        # Feature vector for each file: the values of its TAGS metadata entry.
        train_data = [list(meta_dict_for_genre[file_name][TAGS].values()) for file_name in train_data]
        test_data = [list(meta_dict_for_genre[file_name][TAGS].values()) for file_name in test_data]
        log_r = LogisticRegression()
        log_r.fit(train_data, train_result)
        # FIX: predict the whole test matrix in one call instead of calling
        # predict() once per sample with a 1-D row — a single batch predict is
        # faster, and 1-D inputs are rejected by modern scikit-learn.
        predictions = log_r.predict(test_data)
        correct = sum(1 for label, expected in zip(predictions, test_result)
                      if int(label) == expected)
        accuracy_for_genre[genre] = float(correct) / len(test_data)
    return accuracy_for_genre
def test_multinomial_binary():
    # Test multinomial LR on a binary problem.
    # Binarise iris into setosa vs. the rest, using string labels.
    target = (iris.target > 0).astype(np.intp)
    target = np.array(["setosa", "not-setosa"])[target]

    for solver in ['lbfgs', 'newton-cg', 'sag']:
        clf = LogisticRegression(solver=solver, multi_class='multinomial',
                                 random_state=42, max_iter=2000)
        clf.fit(iris.data, target)

        # Binary multinomial model collapses to one coefficient row/intercept.
        assert_equal(clf.coef_.shape, (1, iris.data.shape[1]))
        assert_equal(clf.intercept_.shape, (1,))
        assert_array_equal(clf.predict(iris.data), target)

        mlr = LogisticRegression(solver=solver, multi_class='multinomial',
                                 random_state=42, fit_intercept=False)
        mlr.fit(iris.data, target)
        # BUG FIX: the original computed predictions from `clf`, so the
        # fit_intercept=False model `mlr` was fitted but never checked.
        pred = mlr.classes_[np.argmax(mlr.predict_log_proba(iris.data),
                                      axis=1)]
        assert_greater(np.mean(pred == target), .9)
Beispiel #43
0
def test_multinomial_logistic_regression_string_inputs():
    # Test with string labels for LogisticRegression(CV)
    n_samples, n_features, n_classes = 50, 5, 3
    X_ref, y = make_classification(n_samples=n_samples, n_features=n_features,
                                   n_classes=n_classes, n_informative=3,
                                   random_state=0)
    # Map the integer classes 0/1/2 onto the string labels bar/baz/foo.
    y_str = LabelEncoder().fit(['bar', 'baz', 'foo']).inverse_transform(y)
    # For numerical labels, let y values be taken from set (-1, 0, 1)
    y = np.array(y) - 1
    # Test for string labels
    lr = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    lr_cv = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')
    lr_str = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    lr_cv_str = LogisticRegressionCV(solver='lbfgs', multi_class='multinomial')

    lr.fit(X_ref, y)
    lr_cv.fit(X_ref, y)
    lr_str.fit(X_ref, y_str)
    lr_cv_str.fit(X_ref, y_str)

    # Numeric and string targets must yield identical coefficients.
    assert_array_almost_equal(lr.coef_, lr_str.coef_)
    assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
    assert_array_almost_equal(lr_cv.coef_, lr_cv_str.coef_)
    # NOTE(review): the next line repeats the lr_str.classes_ check above.
    assert_equal(sorted(lr_str.classes_), ['bar', 'baz', 'foo'])
    assert_equal(sorted(lr_cv_str.classes_), ['bar', 'baz', 'foo'])

    # The predictions should be in original labels
    assert_equal(sorted(np.unique(lr_str.predict(X_ref))),
                 ['bar', 'baz', 'foo'])
    assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))),
                 ['bar', 'baz', 'foo'])

    # Make sure class weights can be given with string labels
    # Weight 0 for 'foo' removes it from the predictions checked below.
    lr_cv_str = LogisticRegression(
        solver='lbfgs', class_weight={'bar': 1, 'baz': 2, 'foo': 0},
        multi_class='multinomial').fit(X_ref, y_str)
    assert_equal(sorted(np.unique(lr_cv_str.predict(X_ref))), ['bar', 'baz'])
Beispiel #44
0
def classify(data_set_df, user_info_df, feat_set_name, features=None, label='gender',
             classifier=None, reg_param=1.0, selection=False, num_feat=20, sel_method='LR',
             cv=10):
    """Cross-validated classification of `data_set_df` instances against the
    target derived from `user_info_df` for the given `label`.

    Appends per-fold results and confusion matrices to experiment CSV files
    and returns (score_mean, miss_clf_rate): the mean accuracy over the folds
    that fitted successfully, and the fraction of instances lost to filtering.
    Python 2 code (print statement).
    """
    instance_num = len(data_set_df.columns)
    # Project the feature matrix and target vector for the requested label.
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered if features is None else df_filtered.loc[features]

    # Drop rows/columns that are entirely NaN.
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    # Impute only when NaN or +/-inf values remain after the drops.
    if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
        x_imp = pc.fill_nan_features(x)
        # x_imp = dense_df.loc[x.index, x.columns]
    else:
        x_imp = x
    # Keep targets only for surviving columns (columns are instance ids —
    # presumably user ids; verify against pc.get_filtered_x_y).
    y_filtered = y_v[(map(int, x.columns.values))]

    clf = LogisticRegression(C=reg_param) if classifier is None else classifier
    cv_num = min(len(y_filtered), cv)
    score_mean = 0.0
    miss_clf_rate = 1.0
    # Need at least 2 folds and at least 2 distinct classes to cross-validate.
    if cv_num > 1 and len(y_filtered.unique()) > 1:
        # Old-style KFold signature (scikit-learn < 0.18): KFold(n, n_folds=...).
        kf = KFold(y_filtered.shape[0], n_folds=cv_num, shuffle=True)
        # skf = StratifiedKFold(y_filtered, n_folds=cv_num, shuffle=True)
        fold = 0
        result_str = ""
        matrix_str = ""
        for tr_index, te_index in kf:
            fold += 1
            # Instances are columns of x_imp, so transpose before row-indexing.
            x_train, x_test = x_imp.T.iloc[tr_index], x_imp.T.iloc[te_index]
            y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]

            if selection:
                # Feature selection is done on the training fold only.
                if sel_method == 'LR' or 'RF' in sel_method:
                    feat_index = fimp.feature_selection(x_train.T, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                else:
                    x_tr_df, x_te_df = x.T.iloc[tr_index].T, x.T.iloc[te_index].T
                    feat_index = fimp.feature_selection(x_tr_df, user_info_df, num_feat,
                                                        method=sel_method, label=label)
                x_train = x_train.loc[:, feat_index].values
                x_test = x_test.loc[:, feat_index].values

            try:
                clf.fit(x_train, y_train)
                score = clf.score(x_test, y_test)
                score_mean += score

                # One CSV row per fold describing this experiment configuration.
                result_str += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" \
                              % (label, True if param.FILL_SUFFIX in feat_set_name else False,
                                 True if param.SCALING_SUFFIX in feat_set_name else False, selection, 'LR',
                                 reg_param, cv, fold, x_train.shape[1], score)
                cf_mat = confusion_matrix(y_test, clf.predict(x_test),
                                          labels=range(len(info.LABEL_CATEGORY[label])))
                matrix_str += np.array_str(cf_mat) + "\n"
            except ValueError:
                # Best-effort: a fold that fails to fit is silently skipped.
                pass
                # traceback.print_exc()
                # print i, "why error? skip!"

        print result_str
        # Append per-fold results and confusion matrices to the experiment logs.
        file_name = "%s/new_%s.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(result_str)

        file_name = "%s/new_%s_mat.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(matrix_str)

        if fold > 0:
            score_mean = score_mean / fold
            miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
    return score_mean, miss_clf_rate
def run():
    """10-fold cross-validated evaluation of a logistic-regression paragraph
    classifier.

    Prints per-fold and averaged accuracy/precision/recall/F1 and writes
    correctly / incorrectly classified sentences to results files.
    Python 2 code (print statements).
    """
    paras, sents = create_dataset()

    X = np.array(get_features(paras))
    Y = np.array(get_ys(paras))


    print len(X[0])

    sents = np.array(sents)

    # Old-style StratifiedKFold signature (scikit-learn < 0.18).
    skf = StratifiedKFold(Y, n_folds=10)

    # NOTE(review): these handles are never closed; consider `with open(...)`.
    f = open('results/correct.txt','w')
    f2 = open('results/wrong.txt','w')

    accs = []
    precs = []
    recs = []
    f1s = []

    for train_index, test_index in skf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        sent_train = sents[train_index]
        sent_test = sents[test_index]

#         cv = CountVectorizer(stop_words="english", ngram_range=(1,1), min_df = 5)
#         sent_train_counts = cv.fit_transform(sent_train)
#         
#         tf_transformer = TfidfTransformer(use_idf=True).fit(sent_train_counts)
#         sent_train_counts = tf_transformer.transform(sent_train_counts)
#         
#         sent_train_counts = sent_train_counts.toarray()
#         
#         print sent_train_counts.shape
#         print X_train.shape
# 
#         new_train = []
#         for i,j in zip(X_train, sent_train_counts):
#             new_train.append(np.append(i,j))

        #fs = SelectKBest(chi2, k=24)
        #X_train = fs.fit_transform(X_train, y_train)

        clf = LogisticRegression()

        clf.fit(X_train, y_train)

        print clf.coef_

#         
#         sent_test_counts = cv.transform(sent_test)
#         sent_test_counts = tf_transformer.transform(sent_test_counts)
#         
#         sent_test_counts = sent_test_counts.toarray()
#         
#         new_test = []
#         for i,j in zip(X_test, sent_test_counts):
#             new_test.append(np.append(i,j))

        #X_test = fs.transform(X_test)

        y_pred = clf.predict(X_test)

        # Per-fold metrics.
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        accs.append(acc)
        precs.append(prec)
        recs.append(rec)
        f1s.append(f1)

        print 'Acc \t %s' % acc
        print 'Prec \t %s' % prec
        print 'Recall \t %s' % rec
        print 'F1 \t %s' % f1

        # Log each test sentence to correct.txt or wrong.txt with its label.
        for (index,test),(y_t, y_p) in zip(zip(test_index, X_test), zip(y_test, y_pred)):
            if y_t == y_p:
#                 if paras[index]['prev_para']:
#                     f.write('%s\n' % paras[index]['prev_para']['sents'])
                f.write('%s\n' % sents[index])
                f.write('%s\n' % (y_t))
            else:
#                 if paras[index]['prev_para']:
#                     f2.write('%s\n' % paras[index]['prev_para']['sents'])
                f2.write('%s\n' % sents[index])
                f2.write('%s\n' % (y_t))

    # Averages over the 10 folds.
    print 'Avg Acc \t %s \t ' % np.mean(accs)
    print 'Avg Prec \t %s' % np.mean(precs)
    print 'Avg Recall \t %s' % np.mean(recs)
    print 'Avg F1 \t %s' % np.mean(f1s)
def train_model(clf_factory, X, Y, name, plot=False):
    """
        Trains and saves model to disk.

        Returns (mean train error, mean test error, array of confusion
        matrices).  Python 2 code.
    """
    # NOTE(review): `clf_factory` is accepted but never used; a fresh
    # LogisticRegression is created inside the CV loop instead.
    labels = np.unique(Y)
    # Very old scikit-learn ShuffleSplit signature (n_iterations /
    # test_fraction / indices); modern versions use n_splits / test_size.
    cv = ShuffleSplit( n=len(X), n_iterations=1, test_fraction=0.3, indices=True, random_state=0)
    #print "cv = ",cv
    train_errors = []
    test_errors = []

    scores = []

    pr_scores, precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list)

    roc_scores, tprs, fprs = defaultdict(list), defaultdict(list) ,defaultdict(list)

    clfs = []  # just to later get the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        # NOTE(review): `global clf` leaks the last fold's model so the
        # joblib.dump at the end can save it.
        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        # One-vs-rest PR and ROC curves per class.
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            # Indexes probability columns by label value — assumes labels are
            # integers 0..k-1 (TODO confirm against callers).
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            #print("Plotting %s"%genre_list[label])
            scores_to_sort = roc_scores[label]
            # Python 2 integer division selects the median-scoring fold.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
            desc = "%s %s" % (name, genre_list[label])
            #plot_pr(pr_scores[label][median], desc, precisions[label][median],recalls[label][median], label='%s vs rest' % genre_list[label])
            #plot_roc(roc_scores[label][median], desc, tprs[label][median],fprs[label][median], label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(pr_scores.values()).flatten()
    summary = (np.mean(scores), np.std(scores),np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    #save the trained model to disk
    joblib.dump(clf, 'saved_model_fft/my_model.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
from sklearn.cross_validation import train_test_split,cross_val_score

# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# modern code imports these from sklearn.model_selection.

# SMS Spam Collection: column 0 is the ham/spam label, column 1 the message.
df = pd.read_csv('SMSSpamCollection',delimiter = '\t',header = None)
# print(df.head)
print('Number of spam messages :',df[df[0]=='spam'][0].count())
print('Number of ham messages:',df[df[0]=='ham'][0].count())

x_train_raw,x_test_raw,y_train,y_test = train_test_split(df[1],df[0])

# Fit the TF-IDF vocabulary on the training split only, then reuse it for test.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train_raw)
x_test = vectorizer.transform(x_test_raw)

classifier = LogisticRegression()
classifier.fit(x_train,y_train)
predictions = classifier.predict(x_test)


# for i, prediction in enumerate(predictions[:5]):
#     print(prediction,x_test_raw[:i])

from sklearn.metrics import accuracy_score
print('Accuracy scores:',accuracy_score(y_test,predictions))


from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Visualise the confusion matrix as an image.
conf_matrix=confusion_matrix(y_test,predictions)
print(conf_matrix)
plt.matshow(conf_matrix)
Beispiel #48
0
def crossval(features, labels, vec):
    """10-fold cross-validation of an L1 logistic-regression classifier.

    Prints per-metric means/stds, the feature names that stay in the top/bottom
    20 coefficients across folds, and the extreme coefficients of a final model
    fitted on all data.  `vec` must expose `feature_names_`.
    """
    maxent = LogisticRegression(penalty='l1')
    #maxent = SGDClassifier(penalty='l1')
    #maxent = Perceptron(penalty='l1')
    maxent.fit(features,labels) # only needed for feature inspection, crossvalidation calls fit(), too
    coeffcounter = Counter(vec.feature_names_)
    negfeats = set(vec.feature_names_)
    posfeats = set(vec.feature_names_)

    scores = defaultdict(list)
    TotalCoeffCounter = Counter()

    for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]

        TestX_i = features[TestIndices]
        Testy_i =  labels[TestIndices]

        maxent.fit(TrainX_i,Trainy_i)
        ypred_i = maxent.predict(TestX_i)
        coeffs_i = list(maxent.coef_[0])
        coeffcounter_i = Counter(vec.feature_names_)
        for value,name in zip(coeffs_i,vec.feature_names_):
            coeffcounter_i[name] = value

        acc = accuracy_score(ypred_i, Testy_i)
        pre = precision_score(ypred_i, Testy_i)
        rec = recall_score(ypred_i, Testy_i)
        # shared task uses f1 of *accuracy* and recall!
        f1 = 2 * acc * rec / (acc + rec)

        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
        scores["Precision"].append(pre)
        scores["Recall"].append(rec)

        # NOTE(review): these intersections read `coeffcounter` (initialised
        # once, with all values = 1, and never updated), not the per-fold
        # `coeffcounter_i` built above — the per-fold coefficients are unused.
        posfeats = posfeats.intersection(set([key for (key,value) in coeffcounter.most_common()[:20]]))
        negfeats = negfeats.intersection(set([key for (key,value) in coeffcounter.most_common()[-20:]]))

    print("Pervasive positive: ", posfeats)
    print("Pervasive negative: ",negfeats)

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
    print("--")

    maxent.fit(features,labels) # fit on everything

    # Coefficients of the model fitted on the full data set.
    coeffs_total = list(maxent.coef_[0])
    for value,name in zip(coeffs_total,vec.feature_names_):
            TotalCoeffCounter[name] = value

    for (key,value) in TotalCoeffCounter.most_common()[:20]:
        print(key,value)
    print("---")
    for (key,value) in TotalCoeffCounter.most_common()[-20:]:
        print(key,value)
    print("lowest coeff:",coeffcounter.most_common()[-1])
    print("highest coeff",coeffcounter.most_common()[0])

    # NOTE(review): the names below are not defined anywhere in this function;
    # this return looks pasted from a different function and will raise
    # NameError if reached.
    return docs, t_docs, t_docsCategories


# readData returns (test texts, train texts, train labels) — see the
# index usage below; TODO confirm against readData's definition.
data = readData('hackerrank/documentClassification.txt')
X_train = np.array(data[1])
y_train = np.array(data[2])
X_test = np.array(data[0])
print("Extracting features from the training dataset using a sparse vectorizer")
#vectorizer = HashingVectorizer(stop_words='english', non_negative=True)
vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 2), 
 stop_words='english', 
 strip_accents='unicode', 
 norm='l2')
X_train = vectorizer.fit_transform(X_train)
#vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
#                                 stop_words='english')
#X2_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(X_test)

# Train three classifiers on the same TF-IDF features.
nb_classifier = MultinomialNB().fit(X_train, y_train)
svm_classifier = LinearSVC().fit(X_train, y_train)
maxent_classifier = LogisticRegression().fit(X_train, y_train)

# NOTE(review): y_nb_predicted is reused for all three models' predictions,
# so the name is misleading for the SVM and LogisticRegression outputs.
y_nb_predicted = nb_classifier.predict(X_test)
print(y_nb_predicted)
y_nb_predicted = svm_classifier.predict(X_test)
print(y_nb_predicted)
y_nb_predicted = maxent_classifier.predict(X_test)
print(y_nb_predicted)
Beispiel #50
0
    return lambda train, test: feat6_generic(train, test, tw_pos, blog_pos)
def feat6_tw():
    """Return a (train, test) callable that delegates to feat6_generic with
    the twitter train / twitter test POS resources bound in."""
    def _extract(train, test):
        return feat6_generic(train, test, tw_pos, twitter_test_pos)
    return _extract


# Run experiment 6 on three train -> test data-set pairings.
# Python 2 code (print statements).
print "Experiment 6: valence + punctuation + key POS word counts blog(80%) -> blog(20%)"
experiment6_b = experiment_svm_sigK(blog_80, blog_20, feat6_b())
print "Experiment 6: valence + punctuation + key POS word counts twitter+wiki -> blog"
experiment6_twb = experiment_svm_sigK(tw, blog, feat6_tw_b())
print "Experiment 6: valence + punctuation + key POS word counts twitter+wiki -> twitter(test)"
experiment6_tw = experiment_svm_sigK(tw, twitter_test, feat6_tw())


# Cross validation for blog -> blog experiment with best accuracy (to compare to original paper)
# Old-style KFold signature (scikit-learn < 0.18): KFold(n=..., n_folds=...).
folds = KFold(n = len(blog), n_folds= 10, random_state = 1)
test_accuracies = []
for train_indices, test_indices in folds:
    train_data = get_elems_at(blog, train_indices)
    test_data = get_elems_at(blog, test_indices)
    # Build feature matrices per fold with feature set 4.
    data = Features.make_experiment_matrices(train_data, test_data, feat4)
    model = LogisticRegression()
    model.fit(data['train_X'], data['train_Y'])
    predictions = model.predict(data['test_X'])
    accuracy = accuracy_score(data['test_Y'], predictions)
    test_accuracies.append(accuracy)


print "10-CV accuracy blog on blog:%.2f[+/-%.2f]" % (numpy.mean(test_accuracies), numpy.std(test_accuracies))

def train_model(X, Y, name, plot=False):
    """
        train_model(vector, vector, name[, plot=False])

        Trains and saves model to disk.

        Returns (mean train error, mean test error, array of confusion
        matrices).  Python 2 code (print statement).
    """
    labels = np.unique(Y)
    print labels

    # Single 70/30 shuffle split (n_iter=1).
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)


    train_errors = []
    test_errors = []

    scores = []
    # NOTE(review): pr_scores is never appended to, so all_pr_scores below is
    # always empty (np.mean of it yields nan); only ROC stats are collected.
    pr_scores = defaultdict(list)
    precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # for the median

    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        # One-vs-rest ROC curve per class.
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            # Indexes probability columns by label value — assumes labels are
            # integers 0..k-1 (TODO confirm against callers).
            proba_label = proba[:, label]

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Python 2 integer division selects the median-scoring split.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median], desc, tprs[label][median],fprs[label][median], label='%s vs rest' % genre_list[label])

    all_pr_scores = np.asarray(pr_scores.values()).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores), np.std(all_pr_scores))
    #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    #save the trained model to disk
    # Saves only the model from the last CV split.
    joblib.dump(clf, 'saved_model/model_ceps.pkl')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
# X_test, y_test = X[N:], np.array(y[N:])


# 60/20/20 train/validation/test split by position.
N_train = int(len(X)*6/10)
N_valid = int(len(X)*8/10)
X_train, y_train = X[:N_train], y[:N_train]
X_valid, y_valid = X[N_train:N_valid],y[N_train:N_valid]
X_test, y_test = X[N_valid:], np.array(y[N_valid:])

# Grid of regularisation strengths to try.
Cs = np.logspace(-2, 5, 10)
valid_predict = []

# Pick C by accuracy on the validation split.
# NOTE(review): class_weight='auto' was renamed to 'balanced' and the old
# spelling removed in scikit-learn 0.19.
for C in Cs:
    estimator = LogisticRegression(class_weight='auto', C=C)
    estimator.fit(X_train, y_train)
    y_predict_val = estimator.predict(X_valid)
    valid_predict.append(1.0 * np.sum(y_predict_val == y_valid) / len(y_valid))

valid_predict = np.array(valid_predict)
C = Cs[np.argmax(valid_predict)]

print("C:",C, "Accurary(valid):", np.max(valid_predict))
# estimator = RandomForestClassifier(n_estimators=200)


# Refit with the selected C and report accuracy on the held-out test split.
estimator = LogisticRegression(class_weight='auto', C=C)
estimator.fit(X_train, y_train)
y_predict = estimator.predict(X_test)
print(y_predict)
print(y_test)
print ("Accurary(test):",1.0 * np.sum(y_predict == y_test) / len(y_test))
        ## 5:21 100-400 Hz
        X.append(2.0 / N * np.abs(yf[:N/2])[:21])
        # print xf[np.argmax(2.0/N * np.abs(yf[:N/2]))] ## pitch
        # plt.plot(xf, 2.0/N * np.abs(yf[:N/2]))
        # plt.show(block=True)
    return X


# Build a binary data set from two folders of audio samples:
# label 1 = nasalized, label 0 = normal.  Python 2 code (print statement,
# integer division for N).
nasal = glob('/Users/lxy/Dropbox/Voice Autism Vocal Samples/Nasalized/*')
normal = glob('/Users/lxy/Dropbox/Voice Autism Vocal Samples/Normal/*')
# Shuffle file order within each class (classes themselves stay grouped).
random.shuffle(nasal)
random.shuffle(normal)
X, y = [], []
for filename in nasal:
    data = parseData(filename)
    X += data
    y += [1] * len(data)
for filename in normal:
    data = parseData(filename)
    X += data
    y += [0] * len(data)

# 90/10 positional split.  NOTE(review): X holds all nasal samples before all
# normal ones, so the last 10% used for testing is drawn only from the normal
# class — consider shuffling X and y together before splitting.
N = len(X) * 9 / 10
X_train, y_train = X[:N], y[:N]
X_test, y_test = X[N:], np.array(y[N:])

# estimator = RandomForestClassifier(n_estimators=200)
# NOTE(review): class_weight='auto' was removed in scikit-learn 0.19
# (renamed to 'balanced').
estimator = LogisticRegression(class_weight='auto', C=8.0)
estimator.fit(X_train, y_train)
y_predict = estimator.predict(X_test)
print 1.0 * np.sum(y_predict == y_test) / len(y_test)
def main():
    """Train one logistic-regression model per feature variant ("bcd", "cd")
    on the AMT data, predict labels/probabilities for a dump file, and print
    the dump rows joined with both variants' predictions as TSV."""
    parser = argparse.ArgumentParser(description="""Export AMT""")
    parser.add_argument('--input', default="../res/dga_extendedamt_simplemajority.tsv")
    parser.add_argument('--dump_to_predict', default="../res/dga_data_october2016.tsv")
    parser.add_argument('--embeddings', default="/Users/hmartine/data/glove.6B/glove.6B.50d.txt")
    args = parser.parse_args()

    E = load_embeddings(args.embeddings)

    predarrays = {}

    variants = ["bcd","cd"]
    for variant in variants:
    #1 collect features for train
        trainfeatures, labels, vec = collect_features(args.input,embeddings=E,variant=variant,vectorize=False)
        maxent = LogisticRegression(penalty='l2')
        #TODO collect features for new data
        #TODO proper vectorization
        dumpfeatdicts = features_from_dump(args.dump_to_predict,variant=variant,embeddings=E,bowfilter=trainingbow)
        #dumpfeats = vec.fit_transform(dumpfeatdicts)
        # NOTE(review): the `vec` returned by collect_features above is
        # discarded and replaced by a fresh DictVectorizer here.
        vec = DictVectorizer()
        X_train = vec.fit_transform(trainfeatures)

        maxent.fit(X_train,labels)
        X_test = vec.transform(dumpfeatdicts)

        # Label 0 -> "SAME", anything else -> "OMISSION"; probability column
        # is the second entry of predict_proba (positive class).
        predarrays[variant+"_pred_label"] = ["SAME" if x == 0 else "OMISSION" for x in maxent.predict(X_test)]
        predarrays[variant + "_pred_prob"] = ['{:.2}'.format(y) for x,y in maxent.predict_proba(X_test)]


    #maxent.fit(np.array(allfeatures[:len(labels)]),labels)
    #print(maxent.predict(allfeatures[len(labels):]))
    # predict using {features, features without lenght} --> instance 'variants' properly
        #TODO compare prediction similarity
        #TODO provide an output format with labels and probs for both feature templates
    frame = read_dump(args.dump_to_predict)
    # Sorted keys are bcd_pred_label, bcd_pred_prob, cd_pred_label, cd_pred_prob,
    # matching the BCD_label BCD_prob CD_label CD_prob header columns.
    keyindices = sorted(predarrays.keys())

    header = "Index Ref TitleRef URLRef Target TitleTarget URLTarget Source Contains BCD_label BCD_prob CD_label CD_prob".replace(" ","\t")

    print(header)
    # NOTE(review): the zip order (Ref, Target, TitleRef, URLRef, ...) does not
    # match the header order (Ref, TitleRef, URLRef, Target, ...); the printed
    # columns appear shifted relative to the header.
    for a in zip([str(x) for x in range(len(frame.Ref))],list(frame.Ref),list(frame.Target),list(frame.TitleRef),list(frame.URLRef),list(frame.TitleTarget),list(frame.URLTarget),list(frame.Source),list(frame.Contains),predarrays[keyindices[0]],predarrays[keyindices[1]],predarrays[keyindices[2]],predarrays[keyindices[3]]):
        print("\t".join(a))
def crossval(features, labels,variant,printcoeffs=False):
    """10-fold cross-validation of an L2 logistic-regression classifier;
    prints "<variant> <mean accuracy> (<mean F1>)" on one line.

    A most-frequent DummyClassifier is also fitted per fold, but its
    predictions are never scored (the scoring block is commented out).
    """
    maxent = LogisticRegression(penalty='l2')
    dummyclass = DummyClassifier("most_frequent")
    #maxent = SGDClassifier(penalty='l1')
    #maxent = Perceptron(penalty='l1')
    maxent.fit(features,labels) # only needed for feature inspection, crossvalidation calls fit(), too


    scores = defaultdict(list)
    # NOTE(review): TotalCoeffCounter is never populated, so the printcoeffs
    # branch at the bottom iterates an empty counter and prints nothing.
    TotalCoeffCounter = Counter()

    for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]

        TestX_i = features[TestIndices]
        Testy_i =  labels[TestIndices]
        dummyclass.fit(TrainX_i,Trainy_i)
        maxent.fit(TrainX_i,Trainy_i)

        ypred_i = maxent.predict(TestX_i)
        # Dummy predictions are computed but unused (scoring commented out).
        ydummypred_i = dummyclass.predict(TestX_i)
        #coeffs_i = list(maxent.coef_[0])
        #coeffcounter_i = Counter(vec.feature_names_)
        #for value,name in zip(coeffs_i,vec.feature_names_):
        #    coeffcounter_i[name] = value

        acc = accuracy_score(ypred_i, Testy_i)
        #pre = precision_score(ypred_i, Testy_i,pos_label=1)
        #rec = recall_score(ypred_i, Testy_i,pos_label=1)
        f1 = f1_score(ypred_i, Testy_i,pos_label=1)

        scores["Accuracy"].append(acc)
        scores["F1"].append(f1)
        #scores["Precision"].append(pre)
        #scores["Recall"].append(rec)

        #
        # acc = accuracy_score(ydummypred_i, Testy_i)
        # pre = precision_score(ydummypred_i, Testy_i,pos_label=1)
        # rec = recall_score(ydummypred_i, Testy_i,pos_label=1)
        # f1 = f1_score(ydummypred_i, Testy_i,pos_label=1)
        #
        # scores["dummy-Accuracy"].append(acc)
        # scores["dummy-F1"].append(f1)
        # scores["dummy-Precision"].append(pre)
        # scores["dummy-Recall"].append(rec)

        #posfeats = posfeats.intersection(set([key for (key,value) in coeffcounter.most_common()[:20]]))
        #negfeats = negfeats.intersection(set([key for (key,value) in coeffcounter.most_common()[-20:]]))

    #print("Pervasive positive: ", posfeats)
    #print("Pervasive negative: ",negfeats)

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    #print("--")
    #for key in sorted(scores.keys()):
    #    currentmetric = np.array(scores[key])
        #print("%s : %0.2f (+/- %0.2f)" % (key,currentmetric.mean(), currentmetric.std()))
        #print("%s : %0.2f" % (key,currentmetric.mean()))
    print("%s %.2f (%.2f)" % (variant,np.array(scores["Accuracy"]).mean(),np.array(scores["F1"]).mean()))
    if printcoeffs:

        maxent.fit(features,labels) # fit on everything

        # NOTE(review): coeffs_total is computed but never stored into
        # TotalCoeffCounter, so the loops below iterate an empty counter.
        coeffs_total = list(maxent.coef_[0])
        for (key,value) in TotalCoeffCounter.most_common()[:20]:
            print(key,value)
        print("---")
        for (key,value) in TotalCoeffCounter.most_common()[-20:]:
            print(key,value)
def test_fit_credit_backupsklearn():
    """Compare h2o4gpu's LogisticRegression wrapper against scikit-learn on
    the credit-card data set: fit both with the same hyper-parameters and
    assert that coefficients, intercept, iteration counts and all prediction
    outputs agree."""
    df = pd.read_csv("./open_data/creditcard.csv")
    # Last column is the target; the rest are features (C-ordered float32).
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    Solver = h2o4gpu.LogisticRegression

    # Smoke-test the native h2o4gpu solver on its own first.
    enet_h2o4gpu = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet_h2o4gpu.fit(X, y)
    print("h2o4gpu predict()")
    print(enet_h2o4gpu.predict(X))
    print("h2o4gpu score()")
    print(enet_h2o4gpu.score(X,y))

    # Exercise the full scikit-learn-compatible API of the wrapper.
    enet = Solver(dual=True, max_iter=100, tol=1E-4, intercept_scaling=0.99, random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    enet.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet.predict(X))
    print("h2o4gpu scikit wrapper predict_proba()")
    print(enet.predict_proba(X))
    print("h2o4gpu scikit wrapper predict_log_proba()")
    print(enet.predict_log_proba(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet.score(X,y))
    print("h2o4gpu scikit wrapper decision_function()")
    print(enet.decision_function(X))
    print("h2o4gpu scikit wrapper densify()")
    print(enet.densify())
    print("h2o4gpu scikit wrapper sparsify")
    print(enet.sparsify())

    # NOTE(review): deprecated import path; modern scikit-learn exposes
    # LogisticRegression from sklearn.linear_model.
    from sklearn.linear_model.logistic import  LogisticRegression
    # Reference model: plain scikit-learn with identical hyper-parameters.
    enet_sk = LogisticRegression(dual=True, max_iter=100, tol=1E-4, intercept_scaling=0.99, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit predict_proba()")
    print(enet_sk.predict_proba(X))
    print("Scikit predict_log_proba()")
    print(enet_sk.predict_log_proba(X))
    print("Scikit score()")
    print(enet_sk.score(X,y))
    print("Scikit decision_function()")
    print(enet_sk.decision_function(X))
    print("Scikit densify()")
    print(enet_sk.densify())
    print("Sciki sparsify")
    print(enet_sk.sparsify())

    # Coefficients are compared after a float32 round-trip through csr_matrix.
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet.coef_)
    print(enet_sk.intercept_)
    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet.coef_, enet_sk_coef)
    assert np.allclose(enet.intercept_, enet_sk.intercept_)
    assert np.allclose(enet.n_iter_, enet_sk.n_iter_)
    print("Preds should match")
    assert np.allclose(enet.predict_proba(X), enet_sk.predict_proba(X))
    assert np.allclose(enet.predict(X), enet_sk.predict(X))
    assert np.allclose(enet.predict_log_proba(X), enet_sk.predict_log_proba(X))
#decmap = defaultdict(int)
#for x,y in zip(test_labels,dec_pred1):
#    decmap[(x,y)] += 1
#for x,y in decmap.keys():
#    print 'Actual Decade : ' + str(x) + ' Decade Predicted ' + str(y) + ' Count : ' + str(decmap[(x,y)])

accuracy = zero_one_score(y_test, dec_pred1)
print 'Accuracy with SVM : ' + str(accuracy)

clf3 = MultinomialNB().fit(X_train,y_train)
dec_pred3 = clf3.predict(X_test)
accuracy = zero_one_score(y_test, dec_pred1)
print 'Accuracy with Naive Bayes : ' + str(accuracy)

n_neighbors = 15
clf2 = neighbors.KNeighborsClassifier(n_neighbors)
clf2 = clf2.fit(X_train,y_train)
dec_pred2 = clf2.predict(X_test)
accuracy = zero_one_score(y_test, dec_pred2)
print 'Accuracy with Nearest Neighbors : ' + str(accuracy)

clf2 = LogisticRegression().fit(X_train,y_train)
dec_pred2 = clf2.predict(X_test)
accuracy = zero_one_score(y_test, dec_pred2)
print 'Accuracy with Logistic Regression : ' + str(accuracy)


lyricsfile.close()    


Beispiel #58
0
def logreg_score(X, y):
    logreg = LogisticRegression()
    logreg.fit(X, y)
    y_pred = logreg.predict(X)
    print  "LogReg accuracy_score: {}".format(metrics.accuracy_score(y, y_pred))
def MyClassifier():
    """Train and cross-validate a logistic-regression complex-word classifier.

    Reads CoNLL-style CWI training data (path given via ``--train``),
    featurizes every labelled token with WordInContext, runs 10-fold cross
    validation, prints per-fold metrics, and reports the most and least
    influential features. Exits the process when done.
    """
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    defaultdata = scriptdir+"/../data/cwi_training/cwi_training.txt.lbl.conll"
    parser = argparse.ArgumentParser(description="Skeleton for features and classifier for CWI-2016")
    parser.add_argument('--train', help="parsed-and-label input format", default=defaultdata)
    args = parser.parse_args()

    labels = []
    featuredicts = []

    print("Collecting features...")
    count = 0
    for s in readSentences(args.train):
        print("\r"+str(count), end="")  # progress indicator, overwritten in place
        count += 1
        for l, i in zip(s["label"], s["idx"]):
            if l != "-":  # "-" marks tokens without a CWI annotation
                w = WordInContext(s, i, s["form"][i], s["lemma"][i], s["pos"][i],
                                  s["ne"][i], l, s["head"], s["deprel"])
                featuredicts.append(w.featurize())
                labels.append(w.label)
    print()
    vec = DictVectorizer()
    features = vec.fit_transform(featuredicts).toarray()
    labels = np.array(labels)

    maxent = LogisticRegression(penalty='l1')
    #maxent = SGDClassifier(penalty='l1')
    #maxent = Perceptron(penalty='l1')
    maxent.fit(features, labels)  # only needed for feature inspection, crossvalidation calls fit(), too
    coeffcounter = Counter(vec.feature_names_)
    negfeats = set(vec.feature_names_)
    posfeats = set(vec.feature_names_)

    scores = defaultdict(list)
    TotalCoeffCounter = Counter()

    for TrainIndices, TestIndices in cross_validation.KFold(n=features.shape[0], n_folds=10, shuffle=False, random_state=None):
        TrainX_i = features[TrainIndices]
        Trainy_i = labels[TrainIndices]

        TestX_i = features[TestIndices]
        Testy_i = labels[TestIndices]

        maxent.fit(TrainX_i, Trainy_i)
        ypred_i = maxent.predict(TestX_i)
        # Record this fold's coefficient per feature name.
        coeffs_i = list(maxent.coef_[0])
        coeffcounter_i = Counter(vec.feature_names_)
        for value, name in zip(coeffs_i, vec.feature_names_):
            coeffcounter_i[name] = value

        scores["Accuracy"].append(accuracy_score(ypred_i, Testy_i))
        scores["F1"].append(f1_score(ypred_i, Testy_i))
        scores["Precision"].append(precision_score(ypred_i, Testy_i))
        scores["Recall"].append(recall_score(ypred_i, Testy_i))

        # BUG FIX: intersect with THIS fold's coefficients (coeffcounter_i).
        # Previously the static pre-CV counter `coeffcounter` was used, whose
        # contents never change inside the loop, so the per-fold coefficients
        # had no effect and coeffcounter_i was dead code.
        posfeats = posfeats.intersection(set([key for (key, value) in coeffcounter_i.most_common()[:20]]))
        negfeats = negfeats.intersection(set([key for (key, value) in coeffcounter_i.most_common()[-20:]]))


    print("Pervasive positive: ", posfeats)
    print("Pervasive negative: ", negfeats)

    #scores = cross_validation.cross_val_score(maxent, features, labels, cv=10)
    print("--")

    for key in sorted(scores.keys()):
        currentmetric = np.array(scores[key])
        print("%s : %0.2f (+/- %0.2f)" % (key, currentmetric.mean(), currentmetric.std()))
    print("--")


    maxent.fit(features, labels)  # final fit on all data for the coefficient report

    coeffs_total = list(maxent.coef_[0])
    for value, name in zip(coeffs_total, vec.feature_names_):
            TotalCoeffCounter[name] = value

    for (key, value) in TotalCoeffCounter.most_common()[:20]:
        print(key, value)
    print("---")
    for (key, value) in TotalCoeffCounter.most_common()[-20:]:
        print(key, value)
    # BUG FIX: report extremes from TotalCoeffCounter (the final-fit
    # coefficients). `coeffcounter` was never updated with coefficient values,
    # so all its counts are 1 and its most_common() ordering is meaningless.
    print("lowest coeff:", TotalCoeffCounter.most_common()[-1])
    print("highest coeff", TotalCoeffCounter.most_common()[0])

    sys.exit(0)
Beispiel #60
0
# Fill missing embarkation ports with the most common port ("S"), then
# encode the port letters as integer codes expected by the model.
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")

for port, code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic_test.loc[titanic_test["Embarked"] == port, "Embarked"] = code

predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Cross-validate a logistic-regression model on the training data and
# report the mean accuracy over the three folds.
alg = LogisticRegression()
scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
print(scores.mean())

# Refit on the full training set, then predict survival for the test set.
alg.fit(titanic[predictors], titanic["Survived"])
predictions = alg.predict(titanic_test[predictors])

# Emit only the two columns Kaggle expects.
submission = pandas.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions,
    })
submission.to_csv("kaggle.csv", index=False)