Beispiel #1
0
def run(vector_size, window, iter, min_df, max_df):
    """Train and evaluate one hyper-parameter combination end to end.

    Reads the corpus, trains a classifier on the doc vectors, evaluates on
    train and dev, writes the Kaggle prediction file, and prints a summary
    banner.  Always returns 0.
    """
    print("Reading data")
    corpus_dir = "data/"
    speech = read_files(corpus_dir, vector_size, window, iter, min_df, max_df)

    print("Training classifier")
    cls = classify.train_classifier(speech.train_doc_vec, speech.trainy)

    print("Evaluating")
    train_acc = classify.evaluate(speech.train_doc_vec, speech.trainy, cls)
    dev_acc = classify.evaluate(speech.dev_doc_vec, speech.devy, cls)

    print("Writing Kaggle pred file")
    write_pred_kaggle_file(cls, "data/speech-pred.csv", speech)

    # Summary banner for this hyper-parameter combination.
    print("=================================")
    print(f"size: {vector_size}   window: {window}   iter: {iter}")
    print(f"min_df: {min_df}   max_df: {max_df}")
    print(f"train_acc: {train_acc}")
    print(f"dev_acc: {dev_acc}")
    print("=================================")

    return 0
def semi_supervised_learning(unlabeled, sentiment, f, iters):
    """Self-training loop over partitions of the unlabeled corpus.

    Starting from a classifier trained on the labelled data only, each
    iteration labels the next partition of ``f`` unlabeled documents,
    keeps only confidently-classified examples (|decision margin| > 3.5),
    folds them into the training set, re-vectorises and retrains.

    Parameters
    ----------
    unlabeled : object with ``.data`` (raw texts) and ``.X`` (feature rows)
    sentiment : object with train/dev splits and a fitted ``tfidf_vect``
    f : int — partition size per round (roughly 10% of the corpus)
    iters : int — number of self-training rounds

    Returns the final retrained classifier.

    Fixes over the original: removed unused imports (``shuffle``,
    ``matplotlib``), the unused ``initial_preds`` computation, and dead
    commented-out debug code.
    """
    import classify
    import numpy as np

    # Initial train with zero pseudo-labelled examples.
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    factor = f

    unlabeled.data_temp = unlabeled.data

    for i in range(iters):
        # Slice the next chunk of unseen text and its feature rows.
        end_index = min(len(unlabeled.data), (i * factor) + factor)
        partition = unlabeled.data_temp[i * factor:end_index]
        partition_matrix = unlabeled.X[i * factor:end_index]

        # Predict labels and decision margins for this partition.
        yp = cls.predict(partition_matrix)
        decisions = cls.decision_function(partition_matrix)

        # Keep only confident predictions as new pseudo-labelled examples.
        for j in range(len(partition)):
            if abs(decisions[j]) > 3.5:
                sentiment.train_data.append(partition[j])
                sentiment.trainy = np.append(sentiment.trainy, yp[j])

        # Re-vectorise the grown training corpus and retrain.
        sentiment.trainX = sentiment.tfidf_vect.transform(sentiment.train_data)
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
        classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
        classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    return cls
    def train(self):
        """Load the news corpus, derive stop-word sets, train a logistic
        regression classifier, and cache its intercept and ordered vocabulary
        on ``self``.

        Returns
        -------
        (sentiment, cls) : the populated data object and trained classifier.
        """
        # Reload so interactive edits to the interface module take effect.
        importlib.reload(sentimentinterface)
        print("Reading data")
        tarfname = "data/news.tar.gz"
        sentiment = sentimentinterface.read_data(tarfname)

        sentiment.stop_words = sentimentinterface.generate_stop_words(
            sentiment, diff=0.4)

        from sklearn.feature_extraction.text import CountVectorizer

        # Record which terms each pruning rule would drop, for analysis:
        # min_df=3 prunes rare terms, max_df=0.2 prunes very common ones.
        sentiment.cv = CountVectorizer(min_df=3)
        sentiment.cv.fit_transform(sentiment.train_data)
        sentiment.mindf_stop_words = sentiment.cv.stop_words_
        sentiment.cv = CountVectorizer(max_df=0.2)
        sentiment.cv.fit_transform(sentiment.train_data)
        sentiment.maxdf_stop_words = sentiment.cv.stop_words_
        # Unrestricted vectorizer gives the full training vocabulary.
        sentiment.cv = CountVectorizer()
        sentiment.cv.fit_transform(sentiment.train_data)
        sentiment.training_set_vocabulary = sentiment.cv.vocabulary_

        sentimentinterface.vectorize_data(sentiment,
                                          stop_words=sentiment.stop_words,
                                          max_df=0.2,
                                          min_df=3)
        cls = classify.train_classifier(sentiment.trainX,
                                        sentiment.trainy,
                                        C=3.7)

        classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        # print("\nReading unlabeled data")
        # unlabeled = sentimentinterface.read_unlabeled(tarfname, sentiment)
        # print("Writing predictions to a file")
        # sentimentinterface.write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)

        # Logistic Regression Interception
        self.intercept = copy.deepcopy(cls.intercept_)[0]

        # Vectorizer vocabulary list, ordered by feature index.
        # NOTE(review): reads sentiment.count_vect — presumably set by
        # vectorize_data() above; verify, since only sentiment.cv is set here.
        cv = sentiment.count_vect.vocabulary_
        cv = [(v, w) for w, v in cv.items()]
        cv.sort()
        cv = [x[1] for x in cv]
        self.cv = cv

        return sentiment, cls
Beispiel #4
0
def semi_supervised():
    """Grid-search the self-training confidence percentage, retrain with the
    best one, and write Kaggle predictions.

    Fixes over the original: the best result's list index is tracked
    directly, and the winning *percentage* — not its list index — is both
    printed and passed back to expand() for the final model.
    """
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)

    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)

    # Candidate fractions of unlabeled data to treat as confident.
    percent_list = [0.5, 0.6, 0.7, 0.8, 0.9, 1]
    results = []
    for percent in percent_list:
        # Self-train with this percentage and score on dev.
        cls = expand(sentiment, unlabeled, percent)
        print("\nEvaluating")
        import classify
        acc = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        results.append(acc)

    # Pick the percentage with the highest dev accuracy (first on ties,
    # matching the original strictly-greater update rule).
    best_index = max(range(len(results)), key=results.__getitem__)
    best = results[best_index]
    best_percent = percent_list[best_index]
    print("Best result is {} when processing {} percent as confident".format(
        best, best_percent))

    # BUG FIX: retrain with the winning percentage; the original passed the
    # list index to expand() instead of the percentage itself.
    cls = expand(sentiment, unlabeled, best_percent)
    print("\nEvaluating on best percentage...")
    import classify
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv",
                           sentiment)
Beispiel #5
0
def supervised():
    """Plain supervised baseline: train on the labelled split, evaluate on
    train and dev, stack train + unlabeled features (shape check only),
    then write Kaggle predictions and dump the most decisive features."""
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify

    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)

    # Sanity check: the combined labelled + unlabeled feature matrix shape.
    from scipy.sparse import vstack
    stacked = vstack([sentiment.trainX, unlabeled.X])
    print(stacked.shape)
    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv",
                           sentiment)
    decisive_features(cls, sentiment)
Beispiel #6
0
def expand_data(speech):
    """Self-training over the unlabeled pool in fixed-size batches.

    Each iteration trains on the current training set, self-labels the next
    batch of 100 unlabeled rows, folds them in, and records dev accuracy.
    Returns the classifier with the best dev accuracy seen.
    """
    pool = speech.unlabeledX
    X = speech.trainX
    y = speech.trainy

    # Randomise the pool so each batch is an unbiased sample.
    pool = sklearn.utils.shuffle(pool)
    pool_size = pool.shape[0]

    best_clf = None
    best_acc = 0
    best_i = 0
    unlabeled_results = dict()
    n_samples = 100
    n_iterations = int(pool_size / n_samples)

    print("Doing ", n_iterations, " iterations, with a sample size of ",
          n_samples)

    for i in range(n_iterations):
        clf = classify.train_classifier(X, y)

        # Take the next batch off the front of the pool and self-label it
        # with the classifier trained so far.
        batch = pool[:n_samples]
        pool = pool[n_samples:]
        batch_labels = clf.predict(batch)

        X = scipy.sparse.vstack([X, batch])
        y = numpy.concatenate([y, batch_labels])

        # Score the classifier trained *before* this batch was added.
        acc = classify.evaluate(speech.devX, speech.devy, clf)
        unlabeled_results[(i + 1) * n_samples] = acc

        if acc > best_acc:
            best_acc = acc
            best_clf = clf
            best_i = i

        print("Iteration: ", i, " Accuracy: ", acc)

    util.print_dict_tofile(unlabeled_results)
    print("Best accuracy: ", best_acc, " samples of unlabeled data used",
          (best_i + 1) * n_samples)

    return best_clf
Beispiel #7
0
def training_and_evaluation(sentiment, iteration, confidence):
    """Incrementally fold confident predictions on unlabeled data into the
    training set, retraining after each 10% slice.

    Parameters
    ----------
    sentiment : data object with train/dev splits
    iteration : int — number of 10% slices of the unlabeled data to use
    confidence : float — predict_proba threshold for accepting a pseudo-label

    NOTE(review): `tarfname` is read as a free (global) name here — confirm
    it is defined at module level before this is called.
    """
    # Build the fractions [0.1, 0.2, ..., iteration * 0.1].
    l = list(range(iteration + 1))
    l = l[1:]
    l[:] = [x * 0.1 for x in l]

    unlabeled = read_unlabeled(tarfname, sentiment)
    unlabeled_size = unlabeled.X.shape[0]

    # training the classifier only on the training data
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)

    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    # increase the proportion of unlabeled data by 10%, 20%, ... 100%
    for i in l:
        print('\nUnlabeled Data: ' + str(i * 100) + '%')
        unlabeled_y = write_pred_kaggle_file(unlabeled, cls,
                                             "data/sentiment-pred.csv",
                                             sentiment)

        # find the instances of unlabeled data which have been predicted with more than confidence%
        class_probabilities = cls.predict_proba(
            unlabeled.X[0:int(i * unlabeled_size)])
        idx = np.where(class_probabilities > confidence)

        # Slice the same prefix of rows and keep only the confident ones.
        C = unlabeled.X[0:int(i * unlabeled_size)]
        D = C.tocsr()
        D = D[idx[0], :]

        # build the new training set
        new_trainX = vstack((sentiment.trainX, D))
        new_trainy = np.concatenate((sentiment.trainy, unlabeled_y[idx[0]]),
                                    axis=0)
        print(new_trainX.shape)
        print(new_trainy.shape)

        # train the classifier on the expanded data
        cls = classify.train_classifier(new_trainX, new_trainy)
        print("Evaluating")
        yp_train = classify.evaluate(new_trainX, new_trainy, cls, 'train')
        yp_dev = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    interpretation(cls, sentiment, yp_train, yp_dev)
    # Print the first 10 dev examples the final classifier got wrong.
    i = 0
    j = 0
    while i < 10:
        if (yp_dev[j] != sentiment.devy[j]):
            print(sentiment.dev_data[j])
            i += 1
        j += 1
    return cls
    # NOTE(review): everything below is unreachable (it follows the return).
    # It looks like a pasted second variant using a sklearn Pipeline and
    # gensim Word2Vec, and the final LogisticRegression(...) call at the
    # bottom is truncated mid-statement in this file.
    # Define a pipeline
    text_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))),
                         ('tfidf',
                          TfidfTransformer(use_idf=True,
                                           smooth_idf=False,
                                           sublinear_tf=True)),
                         ('clf',
                          LogisticRegression(random_state=0,
                                             C=512,
                                             solver='saga',
                                             max_iter=1000))])

    print("\nTraining Supervised classifier")
    text_clf.fit(sentiment.train_data, sentiment.trainy)
    classify.evaluate(sentiment.train_data, sentiment.trainy, text_clf,
                      'train')
    classify.evaluate(sentiment.dev_data, sentiment.devy, text_clf, 'dev')

    print('\nTraining Word2Vec')
    w2v = gensim.models.Word2Vec(list(unlabeled.data),
                                 size=200,
                                 window=10,
                                 min_count=3,
                                 iter=20)
    train_data = [sentence_vector(sent, w2v) for sent in sentiment.train_data]
    dev_data = [sentence_vector(sent, w2v) for sent in sentiment.dev_data]
    print("\nTraining Word2Vec Supervised classifier")

    clf = LogisticRegression(random_state=0,
                             C=100,
                             solver='saga',
Beispiel #9
0

if __name__ == "__main__":
    # Grid-search over regularisation strength C and penalty type for the
    # sentiment classifier, tracking the best dev accuracy.
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    # test_acc/dev_acc collect the l1 accuracy curves; testacc/devacc the l2
    # curves. best_c starts at 0.0 so any measured accuracy beats it.
    test_acc, dev_acc, max_dev_acc, best_c, best_p = [], [], 0.0, 0.0, 'l2'
    testacc, devacc = [], []
    for c in [0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0]:
        for p in ['l1', 'l2']:
            cls = classify.train_classifier(sentiment.trainX, sentiment.trainy,
                                            c, p)
            print("\nEvaluating at C = ", c, " , Penalty = ", p)
            t_acc = classify.evaluate(sentiment.trainX, sentiment.trainy, cls,
                                      'train')
            d_acc = classify.evaluate(sentiment.devX, sentiment.devy, cls,
                                      'dev')
            if p == 'l1':
                test_acc.append(t_acc)
                dev_acc.append(d_acc)
            else:
                testacc.append(t_acc)
                devacc.append(d_acc)
            # Keep the best (c, p) combination by dev accuracy.
            if d_acc > max_dev_acc:
                best_c = c
                best_p = p
                max_dev_acc = d_acc

    print("\nBest c: ", best_c, ", Best penalty: ", best_p, " | Accuracy: ",
          max_dev_acc)
    # NOTE(review): the two lines below are stray residue from a different
    # snippet — `f` is undefined here and the indentation is inconsistent.
            f.write("\n")
    f.close()

if __name__ == "__main__":
    # CLI entry point: expects exactly one mode argument ("run_model" or
    # "final"); sys.argv includes the script name, hence the != 2 check.
    if(len(sys.argv) != 2):
        print("Please enter two arguments")
        sys.exit(1)
    if(sys.argv[1] == "run_model"):
        print("Reading data")
        tarfname = "data/sentiment.tar.gz"
        sentiment = read_files(tarfname)
        print("\nTraining classifier")
        import classify
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
        print("\nEvaluating")
        classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
        classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

        print("\nReading unlabeled data")
        unlabeled = read_unlabeled(tarfname, sentiment)
        # NOTE(review): `lexicon_stuff` is not defined anywhere visible here —
        # this will raise NameError at runtime; confirm against the full file.
        print(lexicon_stuff)
        # NOTE(review): called with 2 args — verify the arity of the
        # semi_supervised_learning definition actually in scope.
        cls = semi_supervised_learning(unlabeled, sentiment)
        print("Writing predictions to a file")
        write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv", sentiment)
        #write_basic_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-basic.csv")

        # You can't run this since you do not have the true labels
        # print "Writing gold file"
        # write_gold_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-gold.csv")
    if(sys.argv[1] == "final"):
        print("Reading data")
        # NOTE(review): this branch is truncated in this file.
Beispiel #11
0
    # NOTE(review): headerless fragment — the enclosing function/loop header
    # is not visible in this file. Appears to be one fold of a leave-one-clip-
    # out evaluation accumulating weighted accuracies.
    x_train, x_test, y_train, y_test = preprocess.run(votes, test_clip)

    # Classify using SVM with different kernels
    y_pred_linear = classify.SVM_linear(x_train, x_test, y_train)

    y_pred_rbf = classify.SVM_rbf(x_train, x_test, y_train)

    # Combine classifiers
    y_final = classify.combine(y_pred_linear, y_pred_rbf)

    # Make very engaged -> engaged.
    y_final = ignore_very(y_final)
    y_test = ignore_very(y_test)

    # Evaluate model (accuracy weighted by fold size).
    accuracy_svm += classify.evaluate(y_final, y_test, "SVM",
                                      test_clip) * len(y_test)

    # Classify using dummy to get baseline
    y_dummy = classify.dummy(x_train, x_test, y_train)

    # Evaluate dummy
    accuracy_dummy += classify.evaluate(y_dummy, y_test, "Dummy",
                                        test_clip) * len(y_test)

    # Count number of clips
    number_of_clips += len(y_test)

    # matrix
    classify.matrix(y_test, y_final)

print("Final results:")
Beispiel #12
0
def greedy_searchpara(text_clf, sentiment, tarfname):
    """Grid-search the pipeline's LogisticRegression C over powers of two,
    plot train/test accuracy against C, evaluate the refit model, and write
    Kaggle predictions for the unlabeled data.

    Parameters
    ----------
    text_clf : sklearn Pipeline ending in a 'clf' LogisticRegression step
    sentiment : data object with train/dev splits
    tarfname : path to the tarball for read_unlabeled()
    """
    # Greedy Search Parameter
    parameters = {
        # 'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (5, 5)],      # (1, 3) is best
        # 'tfidf__use_idf': [(True, False), (True, True), (False, True), ((False, False))],
        'clf__C': [2**(i) for i in range(-10, 15)],  # 512 is best
        # 'clf__class_weight': [None, 'balanced'],  # None is better
        # 'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],    # 'saga' is better
        # 'clf__max_iter': [10**i for i in range(2, 8)],    # iteration 1000
    }
    from sklearn.metrics import make_scorer
    from sklearn.metrics import accuracy_score
    scoring = {'Accuracy': make_scorer(accuracy_score)}
    # NOTE(review): the `iid` parameter was removed in scikit-learn 0.24 —
    # this call requires an older sklearn; confirm the pinned version.
    gs_clf = GridSearchCV(text_clf,
                          parameters,
                          cv=5,
                          iid=False,
                          n_jobs=-1,
                          scoring=scoring,
                          refit='Accuracy',
                          return_train_score=True)
    gs_clf = gs_clf.fit(sentiment.train_data, sentiment.trainy)
    print(gs_clf.best_score_)
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

    results = gs_clf.cv_results_

    # plotting the result
    plt.figure(figsize=(13, 13))
    # plt.title("GridSearchCV evaluating using multiple scorers simultaneously",
    # fontsize=16)

    plt.xlabel(
        "the inverse of regularization strength for LogisticRegression Model")
    plt.ylabel("Score")

    ax = plt.gca()

    # Get the regular numpy array from the MaskedArray
    X_axis = np.array(results['param_clf__C'].data, dtype=float)

    # One curve pair (train/test) per scorer, with a +/- std band.
    for scorer, color in zip(sorted(scoring), ['g', 'k']):
        for sample, style in (('train', '--'), ('test', '-')):
            sample_score_mean = results['mean_%s_%s' % (sample, scorer)]
            sample_score_std = results['std_%s_%s' % (sample, scorer)]
            ax.fill_between(X_axis,
                            sample_score_mean - sample_score_std,
                            sample_score_mean + sample_score_std,
                            alpha=0.1 if sample == 'test' else 0,
                            color=color)
            ax.plot(X_axis,
                    sample_score_mean,
                    style,
                    color=color,
                    alpha=1 if sample == 'test' else 0.7,
                    label="%s (%s)" % (scorer, sample))

        best_index = np.nonzero(results['rank_test_%s' % scorer] == 1)[0][0]
        best_score = results['mean_test_%s' % scorer][best_index]

        # Plot a dotted vertical line at the best score for that scorer marked by x
        ax.plot([
            X_axis[best_index],
        ] * 2, [0, best_score],
                linestyle='-.',
                color=color,
                marker='x',
                markeredgewidth=3,
                ms=8)

        # Annotate the best score for that scorer
        ax.annotate("%0.2f" % best_score,
                    (X_axis[best_index], best_score + 0.005))

        # NOTE(review): these four calls sit inside the scorer loop, so
        # plt.show() fires once per scorer — possibly intended to be outside.
        plt.xscale('log')
        plt.legend(loc="best")
        plt.grid(False)
        plt.show()

    # Evaluate on the refit model
    classify.evaluate(sentiment.train_data, sentiment.trainy, gs_clf, 'train')
    classify.evaluate(sentiment.dev_data, sentiment.devy, gs_clf, 'dev')

    # Evaluate on the unlabeled data
    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)
    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, gs_clf, "data/sentiment-pred.csv",
                           sentiment)
Beispiel #13
0
    # NOTE(review): headerless fragment — the enclosing function/__main__
    # header is not visible. Sweeps solver/C/tfidf combinations (currently
    # narrowed to saga, C=10, tfidf=True) for the speech classifier.
    C_range = [1, 10, 100, 1000]
    solvers = ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
    for solver in ["saga"]:
        print("Using " + solver)
        for c in [10]:
            print("Evaluating at C=" + str(c))
            for tfidf in [True]:
                print("With tfidf" if tfidf else "Without tfidf")
                cls = classify.train_classifier(
                    speech.trainX_tfidf if tfidf else speech.trainX,
                    speech.trainy,
                    c=c,
                    solver=solver)
                print("Acc on Training Data")
                classify.evaluate(
                    speech.trainX_tfidf if tfidf else speech.trainX,
                    speech.trainy, cls)
                print("Acc on Dev Data")
                classify.evaluate(speech.devX_tfidf if tfidf else speech.devX,
                                  speech.devy, cls)
                print("\n")

    print("Reading unlabeled data")
    unlabeled = read_unlabeled(tarfname, speech)
    # numBatches = 10
    # labeledXBatches = np.split(speech.trainX_tfidf.toarray(), numBatches)
    # labeledYBatches = np.split(speech.trainy, numBatches)
    # unlabeledXBatches = np.split(
    #     unlabeled.X.toarray()[:-2], numBatches)
    # trainXBatches = [None] * numBatches
    # trainYBatches = [None] * numBatches
Beispiel #14
0
def semi_supervise(sentiment, unlabeled, iter, num_conf):
    """Stochastic self-training: per round, sample pseudo-labels from the
    predicted class distribution, draw ``num_conf`` unlabeled rows with
    probability proportional to their sampled score, fold them into the
    training set, and remove them from the unlabeled pool.

    Parameters
    ----------
    sentiment : data object with train/dev splits and a count vectorizer
    unlabeled : object with ``.data`` (raw texts); ``.X`` is (re)computed here
    iter : int — number of rounds (shadows the builtin; kept for callers).
        Must be >= 2, otherwise max(best_dev) raises on an empty list.
    num_conf : int — unlabeled rows to pseudo-label per round

    Returns (unlabeled, cls, sentiment, best dev accuracy after round 0).
    """
    import classify
    best_dev = []
    for i in range(iter):
        print("\nTraining classifier")
        sentiment = tfidfvectorizer_feat(sentiment)

        # Debug: show the 10 highest-weighted tf-idf terms in the training
        # matrix (invert the vocabulary: feature index -> term).
        index_value = {
            i[1]: i[0]
            for i in sentiment.count_vect.vocabulary_.items()
        }
        fully_indexed = {}
        for row in sentiment.trainX:
            for (column, value) in zip(row.indices, row.data):
                fully_indexed[index_value[column]] = value
        print(
            sorted(fully_indexed.items(), key=lambda x: x[1],
                   reverse=True)[:10])

        unlabeled.X = sentiment.count_vect.transform(unlabeled.data)
        cls = classify.train_classifier(sentiment.trainX, sentiment.trainy,
                                        1000)
        acc = classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')
        # Round 0 is the purely-supervised baseline; exclude it from best_dev.
        if i != 0:
            best_dev.append(acc)

        # Sample one pseudo-probability per row from its predicted class
        # distribution, then threshold at 0.5 for a stochastic label.
        conf_score = np.apply_along_axis(
            lambda x: np.random.choice(x, 1, p=x)[0], 1,
            cls.predict_proba(unlabeled.X))
        preds = np.array([int(p >= 0.5) for p in conf_score])

        # Draw num_conf rows with probability proportional to their score.
        sum_conf = np.sum(conf_score)
        conf_score = conf_score / sum_conf
        conf_idx = np.random.choice(list(range(len(conf_score))),
                                    num_conf,
                                    p=conf_score)

        new_labeled_X = np.array(unlabeled.data)[conf_idx]
        new_labeled_y = preds[conf_idx]
        # PERF FIX: set membership is O(1); the original `i not in conf_idx`
        # scanned the whole index array for every row.
        chosen = set(conf_idx.tolist())
        tmp_idx = [j for j in range(len(conf_score)) if j not in chosen]

        sentiment.train_data = np.concatenate(
            (sentiment.train_data, new_labeled_X))
        sentiment.trainy = np.concatenate((sentiment.trainy, new_labeled_y))
        # Keep only the rows that were not pseudo-labelled this round.
        unlabeled.data = np.array(unlabeled.data)[tmp_idx]
    return unlabeled, cls, sentiment, max(best_dev)
Beispiel #15
0
if __name__ == "__main__":
    # Train a speech classifier at C=1, record train/dev accuracy, then print
    # the 10 highest-weighted vocabulary features for each class.
    print("Reading data")
    tarfname = "data/speech.tar.gz"
    speech = read_files(tarfname)
    print("Training classifier")
    import classify
    train_accs = []
    test_accs = []
    c = 1
    cls = classify.train_classifier(speech.trainX, speech.trainy, c)
    confusion_mtrx, train_acc = classify.evaluate(speech.trainX, speech.trainy,
                                                  cls)
    train_accs.append(train_acc)
    confusion_mtrx, test_acc = classify.evaluate(speech.devX, speech.devy, cls)
    test_accs.append(test_acc)

    # PERF FIX: invert the vocabulary once (feature index -> word) instead of
    # scanning the whole dict for every top-10 index, which was
    # O(classes * 10 * |vocabulary|).
    index_to_word = {
        idx: word
        for word, idx in speech.count_vect.vocabulary_.items()
    }
    # Most important features per class: largest coefficients in each row.
    for i in range(cls.coef_.shape[0]):
        top10_indices = np.argsort(cls.coef_[i])[-10:]
        print(speech.le.classes_[i])
        top10_feature = [
            index_to_word[idx] for idx in top10_indices if idx in index_to_word
        ]
        print(top10_feature)
Beispiel #16
0
if __name__ == "__main__":
    # Hyper-parameter setup for the sentiment pipeline.
    tarfname = "data/sentiment.tar.gz"
    maxdf = 1.0
    mindf = 1
    solve_name = 'sag'
    penalty = 'l2'

    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname, min_df=mindf, max_df=maxdf)
    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)

    #probability =[0.6,0.7,0.75,0.8,0.85,0.9,0.95,0.98]
    #for p in probability:

    # NOTE(review): these calls are unqualified (train_classifier/evaluate),
    # unlike the classify.* calls above — confirm they are imported at module
    # level in the full file.
    cls = train_classifier(sentiment.trainX,
                           sentiment.trainy,
                           penalty=penalty,
                           solver=solve_name)

    acc = evaluate(sentiment.devX, sentiment.devy, cls, 'dev data')
    # NOTE(review): the final print statement is truncated mid-format-call in
    # this file.
    print('when using using min_df = {}, MAX_DF ={},acc : {}'.format(
Beispiel #17
0
    f.close()


def read_instance(tar, ifname):
    """Return the whitespace-stripped bytes of member *ifname* in *tar*.

    Parameters
    ----------
    tar : tarfile.TarFile — an open archive
    ifname : str — member name inside the archive

    Raises KeyError if the member does not exist.
    """
    inst = tar.getmember(ifname)
    ifile = tar.extractfile(inst)
    try:
        # .read() yields bytes; strip() trims leading/trailing whitespace.
        return ifile.read().strip()
    finally:
        # FIX: close the extracted file object — the original leaked the
        # handle on every call.
        ifile.close()


if __name__ == "__main__":
    # Standard supervised pipeline for the speech dataset: train, evaluate on
    # train and dev, then write Kaggle predictions for the unlabeled split.
    print("Reading data")
    tarfname = "data/speech.tar.gz"
    speech = read_files(tarfname)
    print("Training classifier")
    import classify
    cls = classify.train_classifier(speech.trainX, speech.trainy)
    print("Evaluating")
    classify.evaluate(speech.trainX, speech.trainy, cls)
    classify.evaluate(speech.devX, speech.devy, cls)

    print("Reading unlabeled data")
    unlabeled = read_unlabeled(tarfname, speech)
    print("Writing pred file")
    write_pred_kaggle_file(unlabeled, cls, "data/speech-pred.csv", speech)

    # You can't run this since you do not have the true labels
    # print "Writing gold file"
    # write_gold_kaggle_file("data/speech-unlabeled.tsv", "data/speech-gold.csv")
    # write_basic_kaggle_file("data/speech-unlabeled.tsv", "data/speech-basic.csv")
    # NOTE(review): the indented f.write lines below are stray residue from a
    # different snippet — `f` is undefined at this point.
            f.write(",")
            f.write("POSITIVE")
            f.write("\n")
    f.close()


if __name__ == "__main__":
    # Ensemble setup: train logistic regression, naive Bayes and SVM
    # classifiers, then incrementally add agreed-upon unlabeled examples.
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    test_acc, dev_acc, max_dev_acc, best_c, best_p = [], [], 0.0, 0.0, 'l2'
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy, 5.0,
                                    'l2')
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    cls_nb = classify.train_classifier_2(sentiment.trainX, sentiment.trainy)

    cls_svm = classify.train_classifier_3(sentiment.trainX, sentiment.trainy)

    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)
    print("Unlabeled data***", len(unlabeled.data))
    # Add unlabeled examples the three classifiers agree on.
    lab, unlab = add_unlabeled(unlabeled, cls, cls_nb, cls_svm, sentiment)
    print("Len labeled data: ", len(lab))
    test_acc, dev_acc = [], []
    lens_ = []
    val_10 = 9152
    # NOTE(review): this loop is truncated in this file — its body is missing.
    for i in range(10):
Beispiel #19
0
            # NOTE(review): orphan fragment — the enclosing `with`/`for`
            # headers are not visible. Writes a baseline Kaggle file that
            # labels every row POSITIVE (id,POSITIVE per line).
            (label, review) = line.strip().split("\t")
            i += 1
            f.write(str(i))
            f.write(",")
            f.write("POSITIVE")
            f.write("\n")
    f.close()


if __name__ == "__main__":
    # Standard supervised pipeline for the sentiment dataset: train, evaluate
    # on train and dev, then write Kaggle predictions for the unlabeled split.
    print("Reading data")
    tarfname = "data/sentiment.tar.gz"
    sentiment = read_files(tarfname)
    print("\nTraining classifier")
    import classify
    cls = classify.train_classifier(sentiment.trainX, sentiment.trainy)
    print("\nEvaluating")
    classify.evaluate(sentiment.trainX, sentiment.trainy, cls, 'train')
    classify.evaluate(sentiment.devX, sentiment.devy, cls, 'dev')

    print("\nReading unlabeled data")
    unlabeled = read_unlabeled(tarfname, sentiment)
    print("Writing predictions to a file")
    write_pred_kaggle_file(unlabeled, cls, "data/sentiment-pred.csv",
                           sentiment)
    #write_basic_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-basic.csv")

    # You can't run this since you do not have the true labels
    # print "Writing gold file"
    # write_gold_kaggle_file("data/sentiment-unlabeled.tsv", "data/sentiment-gold.csv")
Beispiel #20
0
        # NOTE(review): fragment — the loop that builds X/Y from (x, y) pairs
        # starts above the visible region; 'FALSE' labels map to class 0.
        elif y == 'FALSE':
            X.append(x)
            Y.append(0)

    # The first 90% are train data, the last 10% are test data
    n = int(len(X) * .9)
    XY_train = list(zip(X, Y))[:n]
    XY_test = list(zip(X, Y))[n:]
    data_train, y_train = [x for x, y in XY_train], [y for x, y in XY_train]
    data_test, y_test = [x for x, y in XY_test], [y for x, y in XY_test]
    print("Train data has %d positive reviews" % y_train.count(1))
    print("Train data has %d negative reviews" % y_train.count(0))
    print("Test data has %d positive reviews" % y_test.count(1))
    print("Test data has %d negative reviews" % y_test.count(0))

    # Testing: compare raw counts vs tf-idf features on the same split.
    print("Testing CountVectorizer...")
    count_vect = CountVectorizer()
    count_vect.fit(data)
    X_train = count_vect.transform(data_train)
    X_test = count_vect.transform(data_test)
    cls = classify.train_classifier(X_train, y_train)
    classify.evaluate(X_test, y_test, cls, 'test')

    print("Testing TfidfVectorizer...")
    tfidf_vect = TfidfVectorizer()
    tfidf_vect.fit(data)
    X_train = tfidf_vect.transform(data_train)
    X_test = tfidf_vect.transform(data_test)
    cls = classify.train_classifier(X_train, y_train)
    classify.evaluate(X_test, y_test, cls, 'test')