def run_script(tarfname, c=1000):
    """Read the sentiment corpus and fit two classifiers at strength *c*.

    One model is L1-regularised (liblinear solver), the other L2-regularised
    (lbfgs solver); both run with a 10000-iteration budget.

    Returns a tuple ``(l1_model, l2_model, corpus)``.
    """
    from . import classify

    corpus = read_files(tarfname)
    configs = [('l1', 'liblinear'), ('l2', 'lbfgs')]
    models = [
        classify.train_classifier(corpus.trainX, corpus.trainy, c, penalty, solver, 10000)
        for penalty, solver in configs
    ]
    return models[0], models[1], corpus
def training_and_evaluation(sentiment, iteration, confidence):
    """Self-training experiment: grow the training set with confidently
    predicted unlabeled examples in 10% increments and re-evaluate.

    sentiment  -- corpus object with trainX/trainy/devX/devy/dev_data.
    iteration  -- number of 10% steps (e.g. 10 means 10%..100% of the pool).
    confidence -- probability threshold for keeping a pseudo-label.
    Returns the classifier from the final round.
    """
    # Build the fractions [0.1, 0.2, ..., iteration * 0.1].
    l = list(range(iteration + 1))
    l = l[1:]
    l[:] = [x * 0.1 for x in l]
    # NOTE(review): `tarfname` is not a parameter of this function — it is
    # presumably a module-level global; confirm it is set before calling.
    unlabeled = read_unlabeled(tarfname, sentiment)
    unlabeled_size = unlabeled.X.shape[0]
    # training the classifier only on the training data
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    # increase the proportion of unlabeled data by 10%, 20%, ... 100%
    for i in l:
        print('\nUnlabeled Data: ' + str(i * 100) + '%')
        unlabeled_y = write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)
        # find the instances of unlabeled data which have been predicted with more than confidence%
        class_probabilities = cls.predict_proba(
            unlabeled.X[0:int(i * unlabeled_size)])
        idx = np.where(class_probabilities > confidence)
        C = unlabeled.X[0:int(i * unlabeled_size)]
        D = C.tocsr()
        # Keep only the rows whose prediction cleared the threshold.
        D = D[idx[0], :]
        # build the new training set
        new_trainX = vstack((sentiment.trainX, D))
        new_trainy = np.concatenate((sentiment.trainy, unlabeled_y[idx[0]]), axis=0)
        print(new_trainX.shape)
        print(new_trainy.shape)
        # train the classifier on the expanded data
        cls = classify.train_classifier(new_trainX, new_trainy)
        print("Evaluating")
        yp_train = classify.evaluate(new_trainX, new_trainy, cls, 'train')
        yp_dev = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    # NOTE(review): original formatting was lost; the interpretation call and
    # the error-printing loop below are assumed to run once, after the loop.
    interpretation(cls, sentiment, yp_train, yp_dev)
    # Print the first 10 dev examples the final classifier got wrong.
    # NOTE(review): if there are fewer than 10 mismatches this loop walks
    # past the end of yp_dev and raises IndexError — verify against callers.
    i = 0
    j = 0
    while i < 10:
        if (yp_dev[j] != sentiment.devy[j]):
            print(sentiment.dev_data[j])
            i += 1
        j += 1
    return cls
def semi_supervised_learning(unlabeled, sentiment, f, iters):
    """Self-training over *iters* partitions of the unlabeled pool.

    Each round predicts one slice of ``unlabeled`` of size *f*, keeps the
    predictions whose decision-function magnitude exceeds 3.5, appends them
    (with their pseudo-labels) to the training set, re-vectorizes, retrains,
    and evaluates. Mutates ``sentiment`` and ``unlabeled`` in place.

    Returns the classifier from the final round.
    """
    import classify
    import numpy as np
    from sklearn.utils import shuffle
    import matplotlib.pyplot as plt
    cls = classify.train_classifier(
        sentiment.trainX, sentiment.trainy)  # initial train with 0 unlabelled predicted
    initial_preds = cls.predict(unlabeled.X)
    factor = f  # roughly about 10% of the corpus
    # print(type(sentiment.trainX))
    # print(type(sentiment.trainy))
    # Keep a stable snapshot of the raw text so slicing stays aligned with X.
    unlabeled.data_temp = unlabeled.data
    for i in range(iters):
        end_index = min(len(unlabeled.data), (i * factor) + factor)
        partition = unlabeled.data_temp[i * factor: end_index]  # create partition of data
        #partition_matrix = sentiment.tfidf_vect.transform(partition) # create tfidf features on corpus
        partition_matrix = unlabeled.X[i * factor:end_index]
        yp = cls.predict(
            partition_matrix
        )  # predict on this partition of unseen data to create labels
        decisions = cls.decision_function(partition_matrix)  # predict on unseen portion of data
        #for j in range(len(decisions)):
        #    print(decisions[j])
        #print(decisions)
        #print(decisions)
        # append this data to the train to create new train with labels
        for j in range(len(partition)):
            # check the confidence on each prediction before appending
            # (3.5 is a hand-tuned decision-margin threshold)
            if (abs(decisions[j]) > 3.5):
                #print("HI")
                # print(partition[j])
                # print(yp[j])
                sentiment.train_data.append(partition[j])
                sentiment.trainy = np.append(sentiment.trainy, yp[j])
        #print(len(sentiment.train_data))
        #print(sentiment.trainy.shape)
        sentiment.trainX = sentiment.tfidf_vect.transform(
            sentiment.train_data
        )  # transform new training data with partition addition
        cls = classify.train_classifier(
            sentiment.trainX, sentiment.trainy)  # train a new classifier
        classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
        classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')  # evaluate on dev portion
    return cls  # return this new classifier
def train_part1_model():
    """Fit the part-1 sentiment classifier and persist it with its vectorizer."""
    print("Reading data")
    archive = "data/sentiment.tar.gz"
    corpus = read_files(archive)

    print("\nTraining classifier")
    model = classify.train_classifier(corpus.trainX, corpus.trainy)

    artifacts = {'part1_vect.pk': corpus.tfidf_vect, 'part1_model.pk': model}
    save_model(artifacts)
def run(vector_size, window, iter, min_df, max_df):
    """Train and evaluate a classifier over doc2vec-style features, write the
    Kaggle prediction file, and print a summary of the hyper-parameters.

    (`iter` shadows the builtin but is part of the public signature.)
    Always returns 0.
    """
    print("Reading data")
    data_loc = "data/"
    corpus = read_files(data_loc, vector_size, window, iter, min_df, max_df)

    print("Training classifier")
    model = classify.train_classifier(corpus.train_doc_vec, corpus.trainy)

    print("Evaluating")
    train_acc = classify.evaluate(corpus.train_doc_vec, corpus.trainy, model)
    dev_acc = classify.evaluate(corpus.dev_doc_vec, corpus.devy, model)

    print("Writing Kaggle pred file")
    write_pred_kaggle_file(model, "data/speech-pred.csv", corpus)

    banner = "================================="
    print(banner)
    print("size: " + str(vector_size) + " window: " + str(window) + " iter: " + str(iter))
    print("min_df: " + str(min_df) + " max_df: " + str(max_df))
    print("train_acc: " + str(train_acc))
    print("dev_acc: " + str(dev_acc))
    print(banner)
    return 0
def create_label(test_doc):
    """Predict a label for each document path in *test_doc*.

    test_doc -- iterable of document paths, e.g. ['travel-nontravel/tr3.txt'].
    Returns a list with one prediction result per document.
    """
    # The classifier does not depend on the document being labelled, so train
    # it once up front — the original retrained it inside the loop, repeating
    # the same (expensive) fit for every document.
    model1 = train_classifier()
    list_of_label = []
    for doc_path in test_doc:
        doc_matrix = extract_features_for_single_doc(doc_path)
        result = model1.predict(doc_matrix)
        list_of_label.append(result)
    return list_of_label
def getMyModel():
    """Train the sentiment classifier and display its most telling features."""
    import classify

    print("Reading data")
    archive_path = "data/sentiment.tar.gz"
    corpus = read_files(archive_path)

    print("\nTraining classifier")
    model = classify.train_classifier(corpus.trainX, corpus.trainy)

    feature_names = np.array(corpus.tfidf_vect.get_feature_names())
    display_features(feature_names, model)
def expand(sentiment, unlabeled, percent):
    """One round of self-training: train on the labeled data, pseudo-label the
    unlabeled pool, keep every row whose class probability exceeds *percent*,
    append those rows to the training set, and retrain.

    sentiment -- corpus with trainX/trainy and a fitted count_vect.
    unlabeled -- object whose .data holds the raw unlabeled texts.
    percent   -- probability threshold in (0, 1) for keeping a pseudo-label.
    Returns the retrained classifier.
    """
    print("Expanding {} * 10 percent...".format(percent))
    import classify
    from scipy.sparse import vstack
    import numpy as np

    labeled_data = sentiment.trainX
    labeled_label = sentiment.trainy
    print("Iteration when prediction {}: ".format(percent))

    # Train on the labeled data only.
    print("Training...")
    cls = classify.train_classifier(labeled_data, labeled_label)

    # Predict labels and class probabilities for the unlabeled pool.
    print("Predicting...")
    to_predict = sentiment.count_vect.transform(unlabeled.data)
    unlabeled_prediction = cls.predict(to_predict)
    prediction_prob = cls.predict_proba(to_predict)

    # choose most confident prediction p > percent
    print("Choosing most confident predictions...")
    change_list = [
        i for i in range(len(prediction_prob))
        if prediction_prob[i][0] > percent or prediction_prob[i][1] > percent
    ]

    # Append all confident rows in a single stack: the original called
    # vstack/np.append once per row, rebuilding the whole matrix each time
    # (quadratic in the number of accepted rows).
    print("Expanding...")
    if change_list:
        labeled_data = vstack([labeled_data, to_predict[change_list, :]])
        labeled_label = np.append(labeled_label, unlabeled_prediction[change_list])

    # Retrain on the expanded set.
    print("Training again...")
    cls = classify.train_classifier(labeled_data, labeled_label)
    return cls
def train_part2_model():
    """Fit the part-2 classifier on tf-idf features and persist the artifacts."""
    train_columns = preprocess("train.csv")
    train_texts, y_train = generate_data(train_columns, 'Train')

    test_columns = preprocess("test.csv")
    test_label_columns = preprocess("test_labels.csv")
    # The labels file has no text column; carry it over from test.csv.
    test_label_columns["comment_text"] = test_columns["comment_text"]
    test_texts, y_test = generate_data(test_label_columns, 'Test')

    # Fit the vocabulary on the combined text, vectorize the training split.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_texts + test_texts)
    X_train = vectorizer.transform(train_texts)

    model = classify.train_classifier(X_train, y_train)
    save_model({'part2_vect.pk': vectorizer, 'part2_model.pk': model})
def expand_data(speech):
    """Self-training loop: repeatedly train, pseudo-label a fresh batch of
    shuffled unlabeled rows, fold the batch into the training set, and keep
    whichever classifier scored best on the dev split.

    Returns the best-scoring classifier (None if no round beat accuracy 0).
    """
    pool = sklearn.utils.shuffle(speech.unlabeledX)
    cur_X = speech.trainX
    cur_y = speech.trainy

    batch_size = 100
    n_rounds = int(pool.shape[0] / batch_size)
    dev_scores = dict()
    best_clf = None
    best_acc = 0
    best_round = 0

    print("Doing ", n_rounds, " iterations, with a sample size of ", batch_size)
    for round_idx in range(n_rounds):
        clf = classify.train_classifier(cur_X, cur_y)
        # Pseudo-label the next batch and absorb it into the training data.
        batch_X = pool[:batch_size]
        pool = pool[batch_size:]
        batch_y = clf.predict(batch_X)
        cur_X = scipy.sparse.vstack([cur_X, batch_X])
        cur_y = numpy.concatenate([cur_y, batch_y])

        acc = classify.evaluate(speech.devX, speech.devy, clf)
        dev_scores[(round_idx + 1) * batch_size] = acc
        if acc > best_acc:
            best_acc = acc
            best_clf = clf
            best_round = round_idx
        print("Iteration: ", round_idx, " Accuracy: ", acc)

    util.print_dict_tofile(dev_scores)
    print("Best accuracy: ", best_acc, " samples of unlabeled data used",
          (best_round + 1) * batch_size)
    return best_clf
def train(self):
    """Build the news-sentiment model: derive stop-word lists, vectorize the
    training data, fit a logistic-regression classifier (C=3.7), and cache
    the model intercept plus the ordered vectorizer vocabulary on *self*.

    Returns ``(sentiment, cls)``.
    """
    importlib.reload(sentimentinterface)
    print("Reading data")
    tarfname = "data/news.tar.gz"
    sentiment = sentimentinterface.read_data(tarfname)
    # Corpus-driven stop words; diff=0.4 tunes how aggressive the list is.
    sentiment.stop_words = sentimentinterface.generate_stop_words(
        sentiment, diff=0.4)
    from sklearn.feature_extraction.text import CountVectorizer
    # Terms dropped for appearing in fewer than 3 documents.
    sentiment.cv = CountVectorizer(min_df=3)
    sentiment.cv.fit_transform(sentiment.train_data)
    sentiment.mindf_stop_words = sentiment.cv.stop_words_
    # Terms dropped for appearing in more than 20% of documents.
    sentiment.cv = CountVectorizer(max_df=0.2)
    sentiment.cv.fit_transform(sentiment.train_data)
    sentiment.maxdf_stop_words = sentiment.cv.stop_words_
    # Full, unfiltered training vocabulary.
    sentiment.cv = CountVectorizer()
    sentiment.cv.fit_transform(sentiment.train_data)
    sentiment.training_set_vocabulary = sentiment.cv.vocabulary_
    sentimentinterface.vectorize_data(sentiment,
                                      stop_words=sentiment.stop_words,
                                      max_df=0.2,
                                      min_df=3)
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy, C=3.7)
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    # print("\nReading unlabeled data")
    # unlabeled = sentimentinterface.read_unlabeled(tarfname, sentiment)
    # print("Writing predictions to a file")
    # sentimentinterface.write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)
    # Logistic regression intercept (copied so later refits don't alias it).
    self.intercept = copy.deepcopy(cls.intercept_)[0]
    # Vectorizer vocabulary as a list ordered by feature index.
    cv = sentiment.count_vect.vocabulary_
    cv = [(v, w) for w, v in cv.items()]
    cv.sort()
    cv = [x[1] for x in cv]
    self.cv = cv
    return sentiment, cls
def main():
    """Load-or-build the RCV1 dictionary and LSI model, then train and
    evaluate an SGD classifier on LSI features.
    """
    # Narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit and hide real bugs.
    try:
        dictionary = Dictionary.load_from_text("dictionary.txt")
    except Exception:
        # No cached dictionary — build one from the training corpus.
        dictionary = Dictionary(rcv1_train)
        dictionary.filter_extremes()
        dictionary.save_as_text("dictionary.txt")

    class RCV1BowCorpus(object):
        # Streaming bag-of-words view over the training documents.
        def __iter__(self):
            for document in rcv1_train:
                yield dictionary.doc2bow(document)

    ln.debug("Training model on %s documents" % len(rcv1_train))
    try:
        vector_model = LsiModel.load("lsi_model")
    except Exception:
        # No cached LSI model — train a 100-topic model and persist it.
        vector_model = LsiModel(corpus=RCV1BowCorpus(),
                                num_topics=100,
                                id2word=dictionary)
        vector_model.save("lsi_model")

    def get_lsi_features(text):
        """
        Must return either numpy array or dictionary
        """
        res = vector_model[dictionary.doc2bow(text)]
        return dict(res)

    def get_bow_features(text):
        return dict(dictionary.doc2bow(text))

    clf = train_classifier(train_samples=rcv1_train,
                           train_targets=rcv1_train_target,
                           get_features=get_lsi_features,
                           classifier="sgd")
    evaluate_classifier(clf, rcv1_test, rcv1_test_target,
                        get_features=get_lsi_features)
def supervised():
    """Fully supervised pipeline: train, evaluate, predict the unlabeled
    pool to a Kaggle file, and report the most decisive features."""
    import classify
    from scipy.sparse import vstack

    print("Reading data")
    archive = "data/sentiment.tar.gz"
    corpus = read_files(archive)

    print("\nTraining classifier")
    model = classify.train_classifier(corpus.trainX, corpus.trainy)

    print("\nEvaluating")
    classify.evaluate(corpus.trainX, corpus.trainy, model, 'train')
    classify.evaluate(corpus.devX, corpus.devy, model, 'dev')

    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(archive, corpus)
    # Sanity check: report the combined train+unlabeled matrix shape.
    stacked = vstack([corpus.trainX, unlabeled.X])
    print(stacked.shape)

    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, model, "data/sentiment-pred.csv", corpus)
    decisive_features(model, corpus)
# NOTE(review): this chunk starts mid-function — the enclosing def (a Kaggle
# file writer looping over input lines) is outside this view, so the leading
# statements' indentation is reconstructed approximately.
        (label, review) = line.strip().split("\t")
        i += 1
        # Every row is written with the constant POSITIVE label.
        f.write(str(i))
        f.write(",")
        f.write("POSITIVE")
        f.write("\n")
    f.close()


if __name__ == "__main__":
    # Train on the sentiment corpus, evaluate, and write Kaggle predictions.
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)
    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)
    #write_basic_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-basic.csv")
    # You can't run this since you do not have the true labels
    # print "Writing gold file"
    # write_gold_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-gold.csv")
# NOTE(review): chunk starts mid-function — the writer loop and its def are
# outside this view; indentation of the first statements is approximate.
        f.write(",")
        f.write("POSITIVE")
        f.write("\n")
    f.close()


if __name__ == "__main__":
    # Expect exactly one CLI argument selecting the mode.
    # NOTE(review): the message says "two arguments" but the check requires a
    # single argument besides the script name — wording looks stale.
    if(len(sys.argv) != 2):
        print("Please enter two arguments")
        sys.exit(1)
    if(sys.argv[1] == "run_model"):
        print("Reading data")
        tarfname = "data/sentiment.tar.gz"
        sentiment = read_files(tarfname)
        print("\nTraining classifier")
        import classify
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
        print("\nEvaluating")
        classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
        classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        print("\nReading unlabeled data")
        unlabeled = read_unlabeled(tarfname, sentiment)
        # NOTE(review): `lexicon_stuff` is not defined in this chunk —
        # presumably a module-level global; verify before running.
        print(lexicon_stuff)
        # Replace the supervised model with a semi-supervised one.
        cls = semi_supervised_learning(unlabeled, sentiment)
        print("Writing predictions to a file")
        write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)
        #write_basic_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-basic.csv")
        # You can't run this since you do not have the true labels
        # print "Writing gold file"
        # write_gold_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-gold.csv")
def semi_supervise(sentiment, unlabeled, iter, num_conf):
    """Iterative self-training: each round re-vectorizes, trains, samples
    *num_conf* unlabeled examples (probability-weighted), appends them with
    sampled labels to the training set, and removes them from the pool.

    Returns ``(unlabeled, cls, sentiment, best dev accuracy after round 0)``.
    NOTE(review): parameter `iter` shadows the builtin — kept, it is part of
    the public signature.
    """
    import classify
    best_dev = []
    # from scipy.sparse import vstack
    for i in range(iter):
        print("\nTraining classifier")
        sentiment = tfidfvectorizer_feat(sentiment)
        # reference: https://stackoverflow.com/questions/45232671/obtain-tf-idf-weights-of-words-with-sklearn
        # Invert the vocabulary: column index -> term.
        index_value = {
            i[1]: i[0] for i in sentiment.count_vect.vocabulary_.items()
        }
        # Map each term to its (last seen) weight, print the 10 heaviest.
        fully_indexed = {}
        for row in sentiment.trainX:
            for (column, value) in zip(row.indices, row.data):
                fully_indexed[index_value[column]] = value
        print(
            sorted(fully_indexed.items(), key=lambda x: x[1],
                   reverse=True)[:10])
        unlabeled.X = sentiment.count_vect.transform(unlabeled.data)
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy,
                                        1000)
        acc = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        if i != 0:
            # Round 0 is the supervised baseline; exclude it from best_dev.
            best_dev.append(acc)
        # preds = cls.predict(unlabeled.X)
        # conf_score = np.max(cls.predict_proba(unlabeled.X), axis=1)
        # Stochastic confidence: sample one class probability per row,
        # weighted by the predicted distribution itself.
        conf_score = np.apply_along_axis(
            lambda x: np.random.choice(x, 1, p=x)[0], 1,
            cls.predict_proba(unlabeled.X))
        # NOTE(review): the comprehension variable `i` shadows the outer
        # loop index — harmless here but easy to misread.
        preds = np.array([int(i >= 0.5) for i in conf_score])
        # conf_score = np.absolute(cls.decision_function(unlabeled.X))
        # conf_idx = np.argsort(conf_score)
        '''
    reference: https://stackoverflow.com/questions/2566412/find-nearest-value-in-numpy-array
    '''
        # def find_nearest(array, value):
        #     array = np.asarray(array)
        #     idx = (np.abs(array - value)).argmin()
        #     return idx
        # Normalise the scores into a sampling distribution, draw num_conf
        # row indices without an explicit confidence threshold.
        sum_conf = np.sum(conf_score)
        conf_score = conf_score / sum_conf
        conf_idx = np.random.choice(list(range(len(conf_score))), num_conf,
                                    p=conf_score)
        # conf_idx = []
        # for i in conf_tmp:
        #     conf_idx.append(find_nearest(conf_score,i))
        # conf_idx = np.nonzero(conf_score > 0.99)[0]
        # print(len(conf_idx))
        # if len(conf_idx) < 1000:
        #     return unlabeled, cls, sentiment
        # new_labeled_X = np.array(unlabeled.data)[conf_idx[-num_conf:]]
        # new_labeled_y = preds[conf_idx[-num_conf:]]
        new_labeled_X = np.array(unlabeled.data)[conf_idx]
        new_labeled_y = preds[conf_idx]
        # Indices that stay in the unlabeled pool for the next round.
        tmp_idx = [i for i in range(len(conf_score)) if i not in conf_idx]
        sentiment.train_data = np.concatenate(
            (sentiment.train_data, new_labeled_X))
        sentiment.trainy = np.concatenate((sentiment.trainy, new_labeled_y))
        # unlabeled.data = np.array(unlabeled.data)[conf_idx[:-num_conf]]
        unlabeled.data = np.array(unlabeled.data)[tmp_idx]
    # NOTE(review): original formatting was lost; the return is assumed to
    # sit after the loop (returning mid-loop would make `iter` pointless).
    return unlabeled, cls, sentiment, max(best_dev)
# Hyper-parameter sweep over solver / C / tf-idf for the speech corpus.
tarfname = "data/speech.tar.gz"
speech = read_files(tarfname)
print("Training classifier")
import classify
# NOTE(review): C_range and solvers enumerate the full search space but the
# loops below iterate hard-coded single-element lists — the sweep is
# currently pinned to saga / C=10 / tf-idf on.
C_range = [1, 10, 100, 1000]
solvers = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
for solver in ["saga"]:
    print("Using " + solver)
    for c in [10]:
        print("Evaluating at C=" + str(c))
        for tfidf in [True]:
            print("With tfidf" if tfidf else "Without tfidf")
            cls = classify.train_classifier(
                speech.trainX_tfidf if tfidf else speech.trainX,
                speech.trainy,
                c=c,
                solver=solver)
            print("Acc on Training Data")
            classify.evaluate(
                speech.trainX_tfidf if tfidf else speech.trainX,
                speech.trainy, cls)
            print("Acc on Dev Data")
            classify.evaluate(speech.devX_tfidf if tfidf else speech.devX,
                              speech.devy, cls)
            print("\n")
print("Reading unlabeled data")
unlabeled = read_unlabeled(tarfname, speech)
# numBatches = 10
# labeledXBatches = np.split(speech.trainX_tfidf.toarray(), numBatches)
# NOTE(review): chunk starts mid-function — this f.close() belongs to a
# writer whose def is outside this view.
    f.close()


def read_instance(tar, ifname):
    """Return the stripped raw contents of member *ifname* from the open
    tarfile *tar*."""
    inst = tar.getmember(ifname)
    ifile = tar.extractfile(inst)
    content = ifile.read().strip()
    return content


if __name__ == "__main__":
    # Train on the speech corpus, evaluate, and write Kaggle predictions.
    print("Reading data")
    tarfname = "data/speech.tar.gz"
    speech = read_files(tarfname)
    print("Training classifier")
    import classify
    cls = classify.train_classifier(speech.trainX, speech.trainy)
    print("Evaluating")
    classify.evaluate(speech.trainX, speech.trainy, cls)
    classify.evaluate(speech.devX, speech.devy, cls)
    print("Reading unlabeled data")
    unlabeled = read_unlabeled(tarfname, speech)
    print("Writing pred file")
    write_pred_kaggle_file(unlabeled, cls, "data/speech-pred.csv", speech)
    # You can't run this since you do not have the true labels
    # print "Writing gold file"
    # write_gold_kaggle_file("data/speech-unlabeled.tsv", "data/speech-gold.csv")
    # write_basic_kaggle_file("data/speech-unlabeled.tsv", "data/speech-basic.csv")
# NOTE(review): chunk starts mid-function — this f.close() belongs to a
# writer whose def is outside this view.
    f.close()


if __name__ == "__main__":
    # Hyper-parameter setup for the sentiment run.
    tarfname = "data/sentiment.tar.gz"
    maxdf = 1.0
    mindf = 1
    solve_name = 'sag'
    penalty = 'l2'
    print("Reading data")
    # NOTE(review): duplicate assignment — tarfname was already set above.
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname, min_df=mindf, max_df=maxdf)
    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)
    #probability =[0.6,0.7,0.75,0.8,0.85,0.9,0.95,0.98]
    #for p in probability:
    # NOTE(review): unqualified train_classifier here (vs classify.* above) —
    # assumes a module-level import of that name; confirm it resolves.
    cls = train_classifier(sentiment.trainX, sentiment.trainy,
                           penalty=penalty, solver=solve_name)
# NOTE(review): chunk starts mid-loop — the for/if that pair with this elif
# (and the definitions of X, Y, x, y, data) are outside this view.
        elif y == 'FALSE':
            X.append(x)
            Y.append(0)

# The first 90% are train data, the last 10% are test data
n = int(len(X) * .9)
XY_train = list(zip(X, Y))[:n]
XY_test = list(zip(X, Y))[n:]
data_train, y_train = [x for x, y in XY_train], [y for x, y in XY_train]
data_test, y_test = [x for x, y in XY_test], [y for x, y in XY_test]
print("Train data has %d positive reviews" % y_train.count(1))
print("Train data has %d negative reviews" % y_train.count(0))
print("Test data has %d positive reviews" % y_test.count(1))
print("Test data has %d negative reviews" % y_test.count(0))
# Testing
# Compare raw-count features against tf-idf features on the same split.
# NOTE(review): `data` (the full text corpus the vectorizers are fit on) is
# not defined in this chunk — presumably built in the loop above; confirm.
print("Testing CountVectorizer...")
count_vect = CountVectorizer()
count_vect.fit(data)
X_train = count_vect.transform(data_train)
X_test = count_vect.transform(data_test)
cls = classify.train_classifier(X_train, y_train)
classify.evaluate(X_test, y_test, cls, 'test')
print("Testing TfidfVectorizer...")
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(data)
X_train = tfidf_vect.transform(data_train)
X_test = tfidf_vect.transform(data_test)
cls = classify.train_classifier(X_train, y_train)
classify.evaluate(X_test, y_test, cls, 'test')
# NOTE(review): chunk opens on the continuation of a print/.format call whose
# opening line (and the enclosing explanation function's def) is outside this
# view; indentation is reconstructed approximately.
          .format(prediction_result, prob_prediction, top_n, top_words,
                  top_coef))
    print('Let\'s see it in a bar chart!')
    # NOTE(review): plt.figure is referenced but not called — likely meant
    # plt.figure(); as written it is a no-op attribute access.
    plt.figure
    plt.barh(top_words, top_coef)
    # Scale the x-axis to the extreme coefficient on the relevant side.
    if do_we_reverse:
        plt.xlim(0, np.max(coefficients))
    else:
        plt.xlim(0, np.min(coefficients))
    plt.title('Weights assigned among different features')
    plt.ylabel('token')
    plt.xlabel('weight')


if __name__ == "__main__":
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy,
                                    C = 0.625)
    # make prediction on a new sample and then try to make explanations
    index = 1  # randomly select an index
    encoded_sentence = sentiment.trainX[index, :]
    # sentence = sentiment.train_data[index].split()
    classifier_explanation(cls, sentiment, encoded_sentence, top_n = 10)
#!/bin/python ''' Train a vanilla logistic regression. ''' import classify import matplotlib.pyplot as plt import seaborn as sns import numpy as np import sentiment as sent USE_BOG = True if __name__ == "__main__": print("Reading data") datafile = "data/bayzick_clean.csv" sentiment = sent.read_files(datafile, use_bow=USE_BOG) print("\nTraining supervised classifier") cls, cv_results, c_list = classify.train_classifier(sentiment.trainX, sentiment.trainy, rtn_cv_results=True) import pickle pickle.dump(cls, open("lin_reg_unsup.pkl", "wb")) pickle.dump(sentiment, open("sen.pkl", "wb")) print("\nEvaluating Supervised") classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train') classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
# NOTE(review): chunk starts mid-function — the writer loop and its def are
# outside this view; indentation of the first statements is approximate.
        f.write("POSITIVE")
        f.write("\n")
    f.close()


if __name__ == "__main__":
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    # Grid search over C and penalty; per-penalty accuracy curves are kept in
    # (test_acc, dev_acc) for l1 and (testacc, devacc) for l2, and the best
    # dev accuracy with its (C, penalty) pair is tracked.
    test_acc, dev_acc, max_dev_acc, best_c, best_p = [], [], 0.0, 0.0, 'l2'
    testacc, devacc = [], []
    for c in [0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0]:
        for p in ['l1', 'l2']:
            cls = classify.train_classifier(sentiment.trainX,
                                            sentiment.trainy, c, p)
            print("\nEvaluating at C = ", c, " , Penalty = ", p)
            t_acc = classify.evaluate(sentiment.trainX, sentiment.trainy,
                                      cls, 'train')
            d_acc = classify.evaluate(sentiment.devX, sentiment.devy, cls,
                                      'dev')
            if p == 'l1':
                test_acc.append(t_acc)
                dev_acc.append(d_acc)
            else:
                testacc.append(t_acc)
                devacc.append(d_acc)
            if d_acc > max_dev_acc:
                best_c = c
                best_p = p
                max_dev_acc = d_acc