Example #1
0
    def create(self):
        """Vectorize the labeled and unlabeled corpora as binary unigram features.

        Every entry of ``self.labeled`` is expected to carry a space-separated
        ``<negative>`` or ``<positive>`` tag.  Entries tagged ``<neutral>`` are
        dropped, and so are entries without any recognised tag: keeping the
        latter would add a document without a matching label and leave the
        corpus and the label list out of step.

        Returns
        -------
        tuple
            ``(X_labeled, y_labels, X_unlabeled, X_train_corpus,
            X_test_corpus)``: the vectorized labeled texts, their labels
            (-1 for negative, 1 for positive), the vectorized
            ``self.unlabeled`` texts, and the two raw corpora.
        """
        X_train_corpus = []
        y_labels = []
        for line in self.labeled:
            tokens = line.split(" ")
            if "<negative>" in tokens:
                label = -1
            elif "<positive>" in tokens:
                label = 1
            else:
                # Neutral or untagged entry: skip both the text and the label.
                continue
            y_labels.append(label)
            # Keep only the text in front of the first '<'.
            X_train_corpus.append(line[:line.find('<')])

        X_test_corpus = self.unlabeled
        # Tokens are runs of word characters, apostrophes and slashes.
        token = r"(?u)\b[\w\'/]+\b"
        tf_vectorizer = CountVectorizer(lowercase=True,
                                        max_df=1.0,
                                        min_df=1,
                                        binary=True,
                                        token_pattern=token)
        tf_vectorizer.set_params(ngram_range=(1, 1))
        # The vocabulary is learned from the labeled texts only.
        X_labeled = tf_vectorizer.fit_transform(X_train_corpus)
        X_unlabeled = tf_vectorizer.transform(X_test_corpus)
        return X_labeled, y_labels, X_unlabeled, X_train_corpus, X_test_corpus
def extract_text_features(dataset_movieIDs):
    """Build bag-of-words features for the plots of the given movies.

    For every id in *dataset_movieIDs* that is not yet a key of the
    module-level ``movie_with_features`` mapping, the plot from
    ``movie_plot`` is split on whitespace and the resulting words are passed
    to a ``CountVectorizer`` (Treebank tokenizer, English stop words, 1- and
    2-grams, ``max_df=0.5``, ``min_df=1``), one word per document.  The
    resulting matrix is stored in ``movie_bag_of_words`` and, together with
    the movie's title, genres and year, in ``movie_with_features``.

    Ids that are already present are skipped without being vectorized again.
    """
    tokenizer = TreebankWordTokenizer()

    for each_movie_id in dataset_movieIDs:
        if each_movie_id in movie_with_features:
            # Already processed; the result of a new fit would be discarded.
            continue

        plot_words = movie_plot[each_movie_id].split()
        count_vect = CountVectorizer(tokenizer=tokenizer.tokenize,
                                     stop_words='english',
                                     ngram_range=(1, 2),
                                     max_df=0.5,
                                     min_df=1)
        movie_words = count_vect.fit_transform(plot_words)

        movie_with_features[each_movie_id] = [
            movie_words, movie_title[each_movie_id],
            movie_genres[each_movie_id], movie_year[each_movie_id]
        ]
        movie_bag_of_words[each_movie_id] = movie_words
Example #3
0
 def set_params(self, **kwargs):
     """Apply *kwargs*, then rebuild the base vectorizer configuration.

     After the parameters are set, ``CountVectorizer.__init__`` is re-run so
     that the preprocessor, the character n-gram range and the binary flag
     are derived again from ``self.column``, ``self.size``,
     ``self.terminator`` and ``self.binary``.

     NOTE(review): re-running the base ``__init__`` presumably resets every
     other base parameter to its default -- confirm this is intended.

     Returns
     -------
     self
         The estimator itself, as scikit-learn's ``set_params`` contract
         requires, so that calls can be chained.
     """
     CountVectorizer.set_params(self, **kwargs)
     CountVectorizer.__init__(self,
                              preprocessor=get_preprocessor(
                                  self.column, self.size, self.terminator),
                              ngram_range=(1, self.size),
                              analyzer='char',
                              binary=self.binary)
     return self
Example #4
0
 def set_params(self, **kwargs):
     """Apply *kwargs*, then rebuild the base vectorizer configuration.

     After the parameters are set, ``CountVectorizer.__init__`` is re-run so
     that the preprocessor, the character n-gram range and the binary flag
     are derived again from ``self.column``, ``self.size``,
     ``self.terminator`` and ``self.binary``.

     NOTE(review): re-running the base ``__init__`` presumably resets every
     other base parameter to its default -- confirm this is intended.

     Returns
     -------
     self
         The estimator itself, as scikit-learn's ``set_params`` contract
         requires, so that calls can be chained.
     """
     CountVectorizer.set_params(self, **kwargs)
     CountVectorizer.__init__(self,
                              preprocessor=get_preprocessor(self.column,
                                                            self.size,
                                                            self.terminator),
                              ngram_range=(1, self.size),
                              analyzer='char',
                              binary=self.binary)
     return self
Example #5
0
def stratifiedSplitFixed(ngram_range=(1, 2), n_features=(85000,)):
    """Score a bag-of-n-grams logistic regression over stratified splits.

    For every vocabulary size in *n_features*, a ``CountVectorizer`` +
    ``LogisticRegression`` pipeline is fitted on 10 stratified 80/20 shuffle
    splits of the module-level ``x`` / ``y`` data and scored with the
    micro-averaged F1 over the labels 2.0, 3.0 and 4.0.  The per-split scores
    are written to ``./data/nonBinary<N>Grams.csv`` (tab separated, ``<N>``
    being the upper n-gram bound) and the elapsed time is printed.

    Parameters
    ----------
    ngram_range : tuple of (int, int)
        Lower and upper n-gram bounds handed to the vectorizer.
    n_features : iterable of int
        Vocabulary sizes (``max_features``) to evaluate.

    Returns
    -------
    list of tuple
        One ``(n, mean F1 over the splits)`` pair per entry of *n_features*.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import f1_score
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.pipeline import Pipeline

    ngramStartTime = time.time()
    result = []
    results_dataset = pandas.DataFrame()

    for n in n_features:
        sss = StratifiedShuffleSplit(n_splits=10,
                                     test_size=0.2,
                                     random_state=3000)

        f1_scores = []

        for train_index, test_index in sss.split(x, y):
            # A fresh vectorizer and classifier per split, so nothing learned
            # on one split carries over to the next.
            cvec = CountVectorizer()
            cvec.set_params(stop_words=None,
                            max_features=n,
                            ngram_range=ngram_range,
                            encoding="utf-8")
            clf = LogisticRegression(solver='liblinear', multi_class="auto")
            pipeline = Pipeline([('vectorizer', cvec), ('classifier', clf)])

            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

            sentiment_fit = pipeline.fit(x_train, y_train)
            y_pred = sentiment_fit.predict(x_test)

            f1_scores.append(f1_score(y_test,
                                      y_pred,
                                      labels=[2.0, 3.0, 4.0],
                                      average="micro"))

        averageF1 = np.sum(f1_scores) / len(f1_scores)

        result.append((n, averageF1))
        # One column of per-split scores per vocabulary size.
        results_dataset[n] = pandas.Series(f1_scores)

    pathString = "./data/nonBinary" + str(ngram_range[1]) + "Grams.csv"
    results_dataset.to_csv(pathString, sep="\t")
    elapsed = time.time() - ngramStartTime
    print("time for " + str(ngram_range[1]) + ": " + str(elapsed))
    return result
Example #6
0
def amazon_data():
    """Load the Amazon Instant Video reviews and build binary unigram features.

    Ratings below 3 become label 0, ratings above 3 become label 1, and
    reviews rated exactly 3 are discarded.  The remaining data is split
    67/33 (``random_state=42``), passed through
    ``update_corpus_contraction`` and vectorized with a binary
    ``CountVectorizer`` (``min_df=100``).  Word-list based "agreement"
    labels are then produced by ``load_unigrams`` and
    ``generate_appearance``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train_agreement, y_test_agreement, y_train,
        y_test)``.
    """
    path = r"..\..\data\reviews_Amazon_Instant_Video_5.json.gz"
    X, y = extract_review_amazon(path, 'reviewText')

    y_label = np.asarray(y)
    # Locate the neutral reviews before the ratings are overwritten below.
    neutral_indices = np.where(y_label == 3)[0]
    y_label[y_label < 3] = 0
    y_label[y_label > 3] = 1

    X_discarded = np.delete(X, neutral_indices)
    y_discarded = np.delete(y_label, neutral_indices)

    # Release the full copies before the next large allocations.
    del X
    del y_label

    # split
    print('Split train-test...')
    X_train_split, X_test_split, y_train, y_test = train_test_split(
        X_discarded, y_discarded, test_size=0.33, random_state=42)

    # preprocessing
    print('preprocess the data...')
    X_train_corpus_update = update_corpus_contraction(X_train_split)
    X_test_corpus_update = update_corpus_contraction(X_test_split)

    # count vectorizer
    print('perform count vectorizer...')
    token = r"(?u)\b[\w\'/]+\b"
    cv = CountVectorizer(lowercase=True,
                         max_df=1.0,
                         min_df=100,
                         binary=True,
                         token_pattern=token)
    cv.set_params(ngram_range=(1, 1))

    # NOTE(review): the vocabulary is learned from the raw split while the
    # transforms below run on the contraction-updated corpora -- confirm
    # that this asymmetry is intended.
    cv.fit(X_train_split)

    X_train = cv.transform(X_train_corpus_update)
    X_test = cv.transform(X_test_corpus_update)

    print('load word list...')
    word_list, connotation = load_unigrams('./amazon-video-unigrams.txt',
                                           X_train_corpus_update, y_train)

    print('Generate appearance agreement...')
    y_train_agreement, y_test_agreement = generate_appearance(
        X_train_corpus_update, X_test_corpus_update, word_list, connotation)

    return X_train, X_test, y_train_agreement, y_test_agreement, y_train, y_test
Example #7
0
def q04_count_vectors(path, ranges=(1, 2), max_df=0.5, min_df=2):
    """Return count-vectorized train and test texts loaded from *path*.

    The data comes from ``q01_load_data``; only its training and test texts
    are used.  The vectorizer tokenizes with the Treebank tokenizer, drops
    English stop words and is fitted on the training texts only.

    Parameters
    ----------
    path : passed through to ``q01_load_data``.
    ranges : n-gram range for the vectorizer.
    max_df, min_df : document-frequency bounds for the vectorizer.

    Returns
    -------
    tuple
        ``(train_transformed, test_transformed)``.
    """
    _, train_texts, test_texts, _, _ = q01_load_data(path)
    train_series = pd.Series(train_texts)

    vectorizer = CountVectorizer(decode_error='ignore')
    vectorizer.set_params(
        tokenizer=TreebankWordTokenizer().tokenize,
        stop_words='english',
        ngram_range=ranges,
        max_df=max_df,
        min_df=min_df,
    )

    train_matrix = vectorizer.fit_transform(train_series)
    return train_matrix, vectorizer.transform(test_texts)
Example #8
0
def train(dimension=yV):
    """Fit a bag-of-n-grams Naive Bayes pipeline for one target dimension.

    The training rows are selected from the module-level ``x`` and from
    *dimension* with the module-level ``trainInd`` indices.

    Parameters
    ----------
    dimension : target values indexed like ``x``; defaults to the
        module-level ``yV``.

    Returns
    -------
    The fitted pipeline (1- and 2-gram ``CountVectorizer`` capped at 85000
    features, followed by ``MultinomialNB``).
    """
    cvec = CountVectorizer()
    cvec.set_params(stop_words=None, max_features=85000, ngram_range=(1, 2))
    pipeline = make_pipeline(cvec, MultinomialNB())

    # Only the training rows are needed here; evaluation happens elsewhere.
    x_train = x[trainInd]
    y_train = dimension[trainInd]

    return pipeline.fit(x_train, y_train)
Example #9
0
def trainSentenceModel():
    """Fit and return a SMOTE-balanced n-gram logistic regression pipeline.

    The pipeline chains a ``CountVectorizer`` (1- to 3-grams, at most 200000
    features), ``SMOTE`` seeded with the module-level ``SEED``, and a
    liblinear ``LogisticRegression``.  It is fitted on the module-level
    ``x_train`` / ``y_train``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.feature_extraction.text import CountVectorizer
    from imblearn.pipeline import make_pipeline
    from imblearn.over_sampling import SMOTE

    vectorizer = CountVectorizer()
    vectorizer.set_params(
        stop_words=None,
        max_features=200000,
        ngram_range=(1, 3),
    )
    classifier = LogisticRegression(solver='liblinear', multi_class="auto")
    balanced_pipeline = make_pipeline(
        vectorizer, SMOTE(random_state=SEED), classifier)

    return balanced_pipeline.fit(x_train, y_train)
def modelF1(ngram_range=(1, 2), n_features=85000, dimension="v"):
    """Return the mean micro-F1 of a Naive Bayes n-gram model over 10 splits.

    Parameters
    ----------
    ngram_range : n-gram range for the ``CountVectorizer``.
    n_features : ``max_features`` for the ``CountVectorizer``.
    dimension : ``"v"`` selects the module-level ``v`` as target, ``"a"``
        selects ``a`` and anything else selects ``d``.

    The texts come from the module-level ``x``.  The time elapsed since the
    module-level ``start_time`` is printed before returning.
    """
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import f1_score
    from imblearn.pipeline import make_pipeline

    FOLDS = 10
    splitter = StratifiedShuffleSplit(n_splits=FOLDS,
                                      test_size=0.2,
                                      random_state=3000)

    # Only the selected target global is looked up.
    if dimension == "v":
        y = v
    else:
        y = a if dimension == "a" else d

    fold_scores = []
    for train_index, test_index in splitter.split(x, y):
        vectorizer = CountVectorizer()
        vectorizer.set_params(max_features=n_features,
                              ngram_range=ngram_range)
        pipeline = make_pipeline(vectorizer, MultinomialNB())

        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        fitted = pipeline.fit(x_train, y_train)
        fold_scores.append(
            f1_score(y_test,
                     fitted.predict(x_test),
                     labels=[2.0, 3.0, 4.0],
                     average="micro"))

    total_score = sum(fold_scores)

    print("elapsed time: " + str(time.time() - start_time))

    return total_score / FOLDS
def sample(ngram_range=(1, 2), n_features=85000, methods=()):
    """Compare resampling methods for a Naive Bayes n-gram model.

    Each entry of *methods* is evaluated on 10 stratified 80/20 shuffle
    splits of the module-level ``x`` / ``y`` data with the micro-averaged F1
    over the labels 2.0, 3.0 and 4.0.

    Parameters
    ----------
    ngram_range : n-gram range for the ``CountVectorizer``.
    n_features : ``max_features`` for the ``CountVectorizer``.
    methods : iterable whose entries are either ``0`` (no resampling, the
        base case) or a sampler object that is inserted between the
        vectorizer and the classifier.

    Returns
    -------
    pandas.DataFrame
        One column of per-split F1 scores per method, named ``"Base Case"``
        or after the sampler's class.  Two samplers of the same class share
        a column name, so the later one overwrites the earlier one.
    """
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import f1_score
    from imblearn.pipeline import make_pipeline

    methodsResults = pandas.DataFrame()
    for osm in methods:
        is_base_case = (osm == 0)
        sss = StratifiedShuffleSplit(n_splits=10,
                                     test_size=0.2,
                                     random_state=3000)
        result = []
        for train_index, test_index in sss.split(x, y):
            cvec = CountVectorizer()
            cvec.set_params(max_features=n_features, ngram_range=ngram_range)
            clf = MultinomialNB()

            if is_base_case:
                pipeline = make_pipeline(cvec, clf)
            else:
                pipeline = make_pipeline(cvec, osm, clf)

            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

            sentiment_fit = pipeline.fit(x_train, y_train)
            y_pred = sentiment_fit.predict(x_test)

            result.append(f1_score(y_test,
                                   y_pred,
                                   labels=[2.0, 3.0, 4.0],
                                   average="micro"))

        column = "Base Case" if is_base_case else type(osm).__name__
        methodsResults[column] = result
    return methodsResults
Example #12
0
def test_countvectorizer_stop_words():
    """Check how CountVectorizer resolves the ``stop_words`` parameter.

    ``"english"`` must resolve to the built-in list, any other string must
    raise ``ValueError``, and a custom collection must be handed back with
    the same members.
    """
    cv = CountVectorizer()
    cv.set_params(stop_words="english")
    assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS)

    for unknown_name in ("_bad_str_stop_", "_bad_unicode_stop_"):
        cv.set_params(stop_words=unknown_name)
        assert_raises(ValueError, cv.get_stop_words)

    stoplist = ["some", "other", "words"]
    cv.set_params(stop_words=stoplist)
    # get_stop_words() may hand back the list itself or a (frozen)set built
    # from it, depending on the scikit-learn version; compare the members.
    assert_equal(set(cv.get_stop_words()), set(stoplist))
Example #13
0
def test_countvectorizer_stop_words():
    """Check how CountVectorizer resolves the ``stop_words`` parameter.

    ``'english'`` must resolve to the built-in list, any other string must
    raise ``ValueError``, and a custom list must come back as the equivalent
    set.
    """
    vectorizer = CountVectorizer()

    vectorizer.set_params(stop_words='english')
    assert_equal(vectorizer.get_stop_words(), ENGLISH_STOP_WORDS)

    for unknown_name in ('_bad_str_stop_', '_bad_unicode_stop_'):
        vectorizer.set_params(stop_words=unknown_name)
        assert_raises(ValueError, vectorizer.get_stop_words)

    custom_words = ['some', 'other', 'words']
    vectorizer.set_params(stop_words=custom_words)
    assert_equal(vectorizer.get_stop_words(), set(custom_words))
Example #14
0
def test_countvectorizer_stop_words():
    """Check how CountVectorizer resolves the ``stop_words`` parameter.

    ``'english'`` must resolve to the built-in list, any other string must
    raise ``ValueError``, and a custom collection must be handed back with
    the same members.
    """
    cv = CountVectorizer()
    cv.set_params(stop_words='english')
    assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS)

    for unknown_name in ('_bad_str_stop_', '_bad_unicode_stop_'):
        cv.set_params(stop_words=unknown_name)
        assert_raises(ValueError, cv.get_stop_words)

    stoplist = ['some', 'other', 'words']
    cv.set_params(stop_words=stoplist)
    # get_stop_words() may hand back the list itself or a (frozen)set built
    # from it, depending on the scikit-learn version; compare the members.
    assert_equal(set(cv.get_stop_words()), set(stoplist))
Example #15
0
    def cosine_sim(self):
        '''
        Score the document at ``self.doc`` against its online references.

        The corpus is the text of the document followed by one text per
        reference returned by ``onlRefs``.  The return value is
        ``(1 - s) * 100`` where ``s`` is the cosine similarity between the
        count vectors of the first two corpus entries.
        '''
        vectorizer = CountVectorizer()
        vectorizer.set_params(
            tokenizer=TreebankWordTokenizer().tokenize,
            stop_words='english',
        )

        document = Document(self.doc)
        relationships = document.part.rels

        texts = [readFile(document)]
        texts.extend(
            working_with_mySQL(readContent(link))
            for link in onlRefs(relationships)
        )

        term_counts = vectorizer.fit_transform(texts)
        similarity = cosine_similarity(term_counts)[0][1]
        return (1 - similarity) * 100
Example #16
0
def test_countvectorizer_stop_words():
    """Check how CountVectorizer resolves the ``stop_words`` parameter.

    ``'english'`` must resolve to the built-in list, any other string must
    raise ``ValueError``, and a custom list must come back as the equivalent
    set.
    """
    vectorizer = CountVectorizer()

    vectorizer.set_params(stop_words='english')
    assert vectorizer.get_stop_words() == ENGLISH_STOP_WORDS

    for unknown_name in ('_bad_str_stop_', '_bad_unicode_stop_'):
        vectorizer.set_params(stop_words=unknown_name)
        with pytest.raises(ValueError):
            vectorizer.get_stop_words()

    custom_words = ['some', 'other', 'words']
    vectorizer.set_params(stop_words=custom_words)
    assert vectorizer.get_stop_words() == set(custom_words)
Example #17
0
def get_bag_of_words(X, fit=True, col=None, vectorizer=None, **params):
    """Append bag-of-words count columns for text column *col* to *X*.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame; it is returned unchanged when *col* is not one of its
        columns.
    fit : bool
        When true, a new ``CountVectorizer`` is configured from *params*
        (defaulting the tokenizer to ``WordPunctTokenizer``), fitted on the
        column, and pinned to the learned vocabulary.  When false, the given
        *vectorizer* is only applied.
    col : name of the text column.
    vectorizer : previously fitted vectorizer, required when ``fit`` is
        false.
    **params : forwarded to ``CountVectorizer.set_params``.

    Returns
    -------
    tuple
        ``(X_with_features, {'col': col, 'vectorizer': vectorizer})``, or
        ``(X, None)`` when the column or the vectorizer is missing.  The new
        columns are named ``<col>_w_<term>``.
    """
    if col not in X.columns:
        logger.warning('attempted to compute bag of word features for non-existent column: %s', col)
        return X, None

    if fit:
        vectorizer = CountVectorizer()

        # if not specified, setting default tokenizer to nltk.WordPunctTokenizer
        if 'tokenizer' not in params:
            vectorizer.set_params(tokenizer=WordPunctTokenizer().tokenize)

        vectorizer.set_params(**params)
        texts = X[col].str.strip().fillna('')
        word_count_matrix = vectorizer.fit_transform(texts)
        # Pin the learned vocabulary so later transforms stay aligned.
        vectorizer.set_params(vocabulary=vectorizer.vocabulary_)
    else:
        if vectorizer is None:
            logger.warning('attempted to transform feature: %s with non-existent vectorizer', col)
            return X, None
        texts = X[col].str.strip().fillna('')
        word_count_matrix = vectorizer.transform(texts)

    # Newer scikit-learn releases only provide get_feature_names_out();
    # older ones only provide get_feature_names().
    feature_names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if feature_names_getter is None:
        feature_names_getter = vectorizer.get_feature_names

    bag_of_words_feature_names = [col + '_w_' + feature_name for feature_name in feature_names_getter()]
    bag_of_word_features_df = pd.DataFrame(word_count_matrix.todense(), columns=bag_of_words_feature_names, index=X.index.values)

    X = pd.merge(X, bag_of_word_features_df, right_index=True, left_index=True)
    transform_params = {'col': col, 'vectorizer': vectorizer}
    return X, transform_params
Example #18
0
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)

    # 返回的四个list的含义
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data,
        movie_reviews.target,
        test_size=0.3,
        random_state=10)

    print 'Feature selection...'
    print 'fs method:' + fs_method, ' fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary=True)

    # 切词并得到term set feature select
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [
        word_tokenizer(doc_str) for doc_str in doc_str_list_train
    ]
    term_set_fs = feature_selection(doc_terms_list_train, doc_class_list_train,
                                    fs_method)[:fs_num]

    print 'Building VSM model...with'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))

    vectorizer.set_params(vocabulary=term_dict)

    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec,
                              doc_class_list_train)  # 调用MultinomialNB分类器
    doc_test_predicted = clf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print 'Accuracy: ', acc

    return acc
Example #19
0
    def create(self):
        """
        Vectorize the labeled and unlabeled data as binary unigram features.

        Returns
        -------
        X_labeled: matrix of int
            vectorized labeled EDUs
        y_labels: list of int
            labels of labeled EDUs
        X_unlabeled: matrix of int
            vectorized unlabeled EDUs
        """
        X_train_corpus = []
        y_labels = []
        for entry in self.labeled:
            # The label key sits at index -2 and the text is everything in
            # front of it; presumably the final character is a line
            # terminator -- TODO confirm.
            text, label_key = entry[:-2], entry[-2]
            y_labels.append(self._labels[label_key])
            X_train_corpus.append(text)

        X_test_corpus = self.unlabeled

        # Tokens are runs of word characters, apostrophes and slashes.
        tf_vectorizer = CountVectorizer(
            lowercase=True,
            max_df=1.0,
            min_df=1,
            binary=True,
            token_pattern=r"(?u)\b[\w\'/]+\b",
        )
        tf_vectorizer.set_params(ngram_range=(1, 1))

        # The vocabulary is learned from the labeled texts only.
        X_labeled = tf_vectorizer.fit_transform(X_train_corpus)
        return X_labeled, y_labels, tf_vectorizer.transform(X_test_corpus)
Example #20
0
    def cosine_sim(self):
        '''
        Score the document at ``self.document`` against stored contents.

        References of the document that are not yet among the stored links
        are pushed with ``push_ref``.  The corpus is the document text
        followed by the contents returned by ``get_data``; the contents are
        read before any new reference is pushed.  The return value is
        ``(1 - s) * 100`` where ``s`` is the cosine similarity between the
        count vectors of the first two corpus entries.
        '''
        vectorizer = CountVectorizer()
        vectorizer.set_params(
            tokenizer=TreebankWordTokenizer().tokenize,
            stop_words='english',
        )

        doc = Document(self.document)
        relationships = doc.part.rels
        db_links, db_contents = get_data()

        texts = [readFile(doc)]

        for link in onlRefs(relationships):
            if link not in db_links:
                push_ref(link)

        texts.extend(db_contents)

        term_counts = vectorizer.fit_transform(texts)
        similarity = cosine_similarity(term_counts)[0][1]
        return (1 - similarity)*100
Example #21
0
    def __init__(self):
        """Load the pickled state and fit the corpus vectorizer.

        Four objects are read, in order, from ``newscore/data.pkl`` into
        ``self.corpus``, ``self.nf``, ``self.no`` and ``self.X``.  A
        ``CountVectorizer`` using ``self._tokenizeText`` (unigrams,
        ``min_df=3``) is then fitted on the corpus and kept as ``self.vect``.

        NOTE(review): ``pickle.load`` must only be used on trusted files.
        """
        with open('newscore/data.pkl', 'rb') as infile:
            for attribute in ('corpus', 'nf', 'no', 'X'):
                setattr(self, attribute, pickle.load(infile))

        vectorizer = CountVectorizer()
        vectorizer.set_params(
            tokenizer=self._tokenizeText,
            ngram_range=(1, 1),
            min_df=3,
        )

        print('Loading corpus')
        self.vect = vectorizer.fit(self.corpus)
        print('Done loading corpus')
Example #22
0
def overSampleBinary(ngram_range=(1, 3), n_features=(200000,)):
    """Evaluate a SMOTE-balanced n-gram logistic regression and print metrics.

    For every vocabulary size in *n_features*, the pipeline is fitted on 10
    stratified 80/20 shuffle splits of the module-level ``x`` / ``y`` data.
    The confusion matrices over the labels 2.0, 3.0 and 4.0 are summed, and
    the summed matrix, the per-label ``[precision, recall]`` pairs, the
    per-label F1 scores and ``avgIgnore0`` of those scores are printed.

    NOTE(review): the name says "binary" but three labels are scored --
    confirm which is intended.

    Parameters
    ----------
    ngram_range : n-gram range for the ``CountVectorizer``.
    n_features : iterable of ``max_features`` values to evaluate.

    Returns
    -------
    None
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import confusion_matrix
    from imblearn.pipeline import make_pipeline
    from imblearn.over_sampling import SMOTE

    SEED = 777
    labels = [2.0, 3.0, 4.0]

    for n in n_features:
        sss = StratifiedShuffleSplit(n_splits=10,
                                     test_size=0.2,
                                     random_state=3000)
        # The accumulator must match the shape of the per-split matrices.
        confusion_sum = np.zeros((len(labels), len(labels)), dtype=int)

        for train_index, test_index in sss.split(x, y):
            cvec = CountVectorizer()
            cvec.set_params(stop_words=None,
                            max_features=n,
                            ngram_range=ngram_range)
            clf = LogisticRegression(solver='liblinear')

            SMOTE_pipeline = make_pipeline(cvec, SMOTE(random_state=SEED), clf)

            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

            sentiment_fit = SMOTE_pipeline.fit(x_train, y_train)
            y_pred = sentiment_fit.predict(x_test)

            confusion_sum = confusion_sum + np.array(
                confusion_matrix(y_test, y_pred, labels=labels))

        print(confusion_sum)

        precision_recall = []
        f1_scores = []
        for i in range(len(labels)):
            true_pos = confusion_sum[i][i]
            if true_pos == 0:
                # A zero diagonal also guards against dividing by zero.
                precision = 0
                recall = 0
            else:
                # Rows are true labels and columns are predicted labels, so
                # precision divides by the column total and recall by the
                # row total.
                precision = true_pos / confusion_sum[:, i].sum()
                recall = true_pos / confusion_sum[i, :].sum()
            precision_recall.append([precision, recall])
            if (precision + recall) == 0:
                f1_scores.append(0)
            else:
                f1_scores.append(2 * (precision * recall) / (precision + recall))
        print(precision_recall)
        print(f1_scores)
        print("Average F1")
        print(avgIgnore0(f1_scores))
Example #23
0
        num = 0
        for w in text:
            if w == word:
                num += 1
        if num:
            vector[i] = num
    return vector


#
# for t in texts:
#     print(vectorize(t))
"""Используя алгоритмы вроде Вag of Words, мы теряем порядок слов в тексте. 
 Чтобы избежать этой проблемы, можно сделать шаг назад и изменить подход к токенизации:
  например, использовать N-граммы (комбинации из N последовательных терминов).
"""

# N-gram demo: a plain bag of words loses word order, so two sentences made
# of the same words get identical vectors.  Adding 2-grams keeps part of the
# order.

# Unigrams only: both sentences map to the same vector.
vect = CountVectorizer(ngram_range=(1, 1))
print(vect.fit_transform(['no i have cows', 'i have no cows']).toarray())
# [[1 1 1]
#  [1 1 1]]
print(vect.vocabulary_)
# {'no': 2, 'have': 1, 'cows': 0}

# Unigrams plus bigrams: the two sentences now differ in their bigram columns.
vect.set_params(ngram_range=(1, 2))
print(vect.fit_transform(['no i have cows', 'i have no cows']).toarray())
# [[1 1 1 0 1 0 1]
#  [1 1 0 1 1 1 0]]
print(vect.vocabulary_)
# {'no': 4, 'have': 1, 'cows': 0, 'no have': 6, 'have cows': 2, 'have no': 3, 'no cows': 5}
def stratifiedSplitFixed():
    """Fit an n-gram logistic regression per split and attempt a coefficient table.

    Uses ``emoBank.sentence`` as texts and ``emoBank.Valence`` as target, and
    fits a ``CountVectorizer`` + ``LogisticRegression`` pipeline on each of
    10 stratified 80/20 shuffle splits.  For every split the code then tries
    to derive standard errors, t values and p-values and prints them.

    Returns the pipeline fitted on the last split.

    NOTE(review): as written the statistics section cannot complete --
    ``params`` is read before it is assigned anywhere in this function (see
    the notes inside the loop).
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import accuracy_score, confusion_matrix
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.pipeline import Pipeline
    from scipy import stats

    x = emoBank.sentence
    y = emoBank.Valence

    # NOTE(review): result, myTest, count and confusion_sum are never used
    # below.
    result = []
    myTest = 0
    count = 0

    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=3000)
    confusion_sum = [[0,0],[0,0]]

    for train_index, test_index in sss.split(x, y):
        cvec = CountVectorizer()
        cvec.set_params(stop_words=None, max_features=200000, ngram_range=(1,3))
        # X = cvec.fit_transform(x)
        x_train, x_test = x[train_index], x[test_index]
        # X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = LogisticRegression(solver='liblinear')

        pipeline = Pipeline([
            ('vectorizer', cvec),
            ('classifier', clf)
        ])

        sentiment_fit = pipeline.fit(x_train, y_train)
        predictions = sentiment_fit.predict(x_test)

        # Design matrix: a constant column joined with x_test.
        # NOTE(review): x_test holds the raw sentences here, not the
        # vectorized features -- confirm what the design matrix should be.
        newX = pandas.DataFrame({"Constant": np.ones(len(x_test))}).join(pandas.DataFrame(x_test))
        # print(y_test - predictions)
        # print(sum((y_test - predictions) ** 2))
        # print((len(newX) - len(newX.columns)))
        # Residual sum of squares divided by (rows - columns).
        MSE = (sum((y_test - predictions) ** 2)) / (len(newX) - len(newX.columns))

        print(MSE)
        print(newX)
        print(np.dot(newX.T, newX))
        print(np.linalg.inv(np.dot(newX.T, newX)).diagonal())
        # Coefficient variances: MSE times the diagonal of (X'X)^-1.
        var_b = MSE * (np.linalg.inv(np.dot(newX.T, newX)).diagonal())
        sd_b = np.sqrt(var_b)
        # NOTE(review): params is only assigned further down (the np.round
        # line), which makes it a local name that is still unbound here.
        # Presumably it was meant to hold the fitted intercept and
        # coefficients -- confirm and initialise it before this line.
        ts_b = params / sd_b

        # Two-sided p-values from the t distribution with len(newX) - 1
        # degrees of freedom.
        p_values = [2 * (1 - stats.t.cdf(np.abs(i), (len(newX) - 1))) for i in ts_b]

        sd_b = np.round(sd_b, 3)
        ts_b = np.round(ts_b, 3)
        p_values = np.round(p_values, 3)
        params = np.round(params, 4)

        myDF3 = pandas.DataFrame()
        myDF3["Coefficients"], myDF3["Standard Errors"], myDF3["t values"], myDF3["Probabilites"] = [params, sd_b, ts_b,
                                                                                                     p_values]
        print(myDF3)

    return sentiment_fit
Example #25
0
class SubSpaceEnsemble3(BaseEstimator, TransformerMixin):

    """Weighted-vote ensemble driven by nearest neighbours.

    A sample is classified by a vote between (a) the true labels of its
    nearest training neighbours in a shared bag-of-words space and, for every
    base model that is accurate enough on the sample's neighbourhood, (b)
    that model's predictions for those neighbours, (c) its prediction for
    the sample itself and (d) its predictions for the sample's neighbours in
    the model's own representation.

    Parameters
    ----------
    models : dict
        Non-empty mapping of name -> fitted pipeline.  The first pipeline
        step must provide ``transform`` and the pipeline ``predict``.
    k : int
        Number of neighbours to query.
    weights : sequence of four numbers
        ``weights[0]`` scales the votes of a model's sample prediction and
        of its own-space neighbours, ``weights[1]`` scales the votes of its
        predictions for the shared-space neighbours, ``weights[2]`` scales
        the votes of the true neighbour labels, and ``weights[3]`` is the
        accuracy a model must exceed to vote at all.
    """

    def __init__(self, models, k=3, weights=(2, 1, 3, 0.7)):
        from sklearn.feature_extraction.text import CountVectorizer

        if not models:
            raise AttributeError(
                'models expects a non-empty dictionary of fitted models, '
                'one per classifier.')

        self.models = models
        self.k = k
        self.weights = weights
        # Fixed index -> name mapping; every per-model list below is kept in
        # this order.
        self.ind2names = {}
        for i, name in enumerate(models.keys()):
            self.ind2names[i] = name
        self.counter = CountVectorizer()
        self.representations = []
        self.model_trees = []
        self.meta = None
        self.predictions = []
        self.true = []
        self.doc_terms = None
        self.tree = None
        self.experts = []

    def fit(self, X_cv, y_true=None, weights=None):
        """Index *X_cv* in the shared space and in every model's own space.

        Parameters
        ----------
        X_cv : raw documents accepted by ``CountVectorizer``.
        y_true : labels for *X_cv*; required.
        weights : unused, kept for interface compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If *y_true* is None.
        """
        from sklearn.neighbors import BallTree

        if y_true is None:
            raise ValueError('we need y labels to supervise-fit!')

        parameters = {
            'input': 'content',
            'encoding': 'utf-8',
            'decode_error': 'ignore',
            'analyzer': 'word',
            'stop_words': 'english',
            'max_df': 1.0,
            'min_df': 1,
            'max_features': None
        }
        t0 = time.time()
        self.counter.set_params(**parameters)
        self.doc_terms = self.counter.fit_transform(X_cv).toarray()
        self.tree = BallTree(self.doc_terms, leaf_size=20)

        # Start from empty lists so that fitting again does not pile new
        # representations on top of the ones from an earlier fit.
        predictions = []
        representations = []
        model_trees = []
        for model_i in range(len(self.ind2names)):
            model = self.models[self.ind2names[model_i]]
            predictions.append(model.predict(X_cv))
            transf = model.steps[0][1].transform(X_cv)
            if hasattr(transf, "toarray"):
                transf = transf.toarray()
            representations.append(transf)
            # Built once here rather than once per predicted sample.
            model_trees.append(BallTree(transf))
        self.predictions = predictions
        self.representations = representations
        self.model_trees = model_trees
        self.true = y_true
        print('Fit took: %0.3f seconds' % (time.time() - t0))
        return self

    def predict(self, X):
        """Return the voted label for every sample of *X* as a list."""
        X_transformed = self.counter.transform(X).toarray()
        y_pred = []
        t0 = time.time()
        for i in range(X_transformed.shape[0]):
            dist, neigbors_indexes = self.tree.query(
                X_transformed[i, :].reshape(1, -1), self.k)
            y_pred.append(
                self.expert_decision(neigbors_indexes[0], dist, X[i]))
        print('Predict took: %0.3f seconds' % (time.time() - t0))
        return y_pred

    def score(self, X, y, sample_weight=None):
        """Return the accuracy of ``predict(X)`` against *y*."""
        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X), normalize=True)

    def expert_decision(self, neigbors_indexes, dist, x_sample):
        """Return the most voted label for one sample.

        Parameters
        ----------
        neigbors_indexes : indices of the sample's neighbours in the shared
            bag-of-words space.
        dist : distances to those neighbours; currently unused.
        x_sample : the raw sample.
        """
        from sklearn.metrics import accuracy_score
        from collections import Counter

        neigbors_true = [self.true[n_i] for n_i in neigbors_indexes]
        total_pred = []

        for model_i in range(len(self.ind2names)):
            model = self.models[self.ind2names[model_i]]
            model_predictions = self.predictions[model_i]

            # Neighbours of the sample in this model's own representation.
            temp_trans = model.steps[0][1].transform([x_sample])
            if hasattr(temp_trans, 'toarray'):
                temp_trans = temp_trans.toarray()
            _, model_neig = self.model_trees[model_i].query(temp_trans, self.k)
            model_neig_pred = [model_predictions[n_i]
                               for n_i in model_neig[0].tolist()]

            # The model's predictions for the shared-space neighbours.
            model_pred = [model_predictions[n_i] for n_i in neigbors_indexes]

            acc = accuracy_score(neigbors_true, model_neig_pred,
                                 normalize=True)
            if acc > self.weights[3]:
                # The smaller the local error, the more votes the model gets.
                error_margin = float((1 - acc) + 0.01)
                neighbours_weight = int(self.weights[1] / error_margin)
                sample_weight = int(self.weights[0] / error_margin)

                total_pred.extend(model_pred * neighbours_weight)
                # The sample is wrapped in a list, matching the transform
                # call above: the pipeline expects a collection of samples.
                sample_prediction = model.predict([x_sample])[0]
                total_pred.extend([sample_prediction] * sample_weight)
                total_pred.extend(model_neig_pred * sample_weight)

        # True neighbour labels vote with a weight that decreases with rank.
        true_weight = self.weights[2]
        for rank, label in enumerate(neigbors_true):
            total_pred.extend([label] * int(true_weight * (self.k - rank)))

        return Counter(total_pred).most_common(1)[0][0]
Example #26
0
# Test texts as unicode strings.
x_test = test_data_frame.text.astype('U')

#Regressiontype
#linear_classifier = SGDClassifier()
linear_classifier = LogisticRegression()
#linear_classifier = Ridge()

#Create count vectorizer and fit it to data
# NOTE(review): this fit sees train and test texts together and runs before
# set_params below changes max_features / ngram_range; the pipeline.fit call
# further down presumably refits the vectorizer on x_train alone -- confirm
# whether this initial fit is still needed.
count_vectorizer = CountVectorizer()
count_vectorizer.fit(pd.Series.append(x_train, x_test))

#n_gram_range = (1, 1)
#n_gram_range = (1, 2)
n_gram_range = (1, 3)
maximum_number_of_words = 100000
count_vectorizer.set_params(max_features=maximum_number_of_words, ngram_range=n_gram_range)

print("Training model...")
pipeline = Pipeline([('vectorizer', count_vectorizer),('classifier', linear_classifier)])
sentiment_fit = pipeline.fit(x_train, y_train)

print("Predicting sentiments...")
y_pred = sentiment_fit.predict(x_test)
# Predictions equal to 0 are rewritten as -1 before export.
y_pred[y_pred==0] = -1

# 1-based ids, one per prediction.
ids = np.linspace(1, len(y_pred), num=len(y_pred),dtype=int)
with open('predictions.csv', 'w') as csvfile:
    fieldnames = ['Id', 'Prediction']
    writer = CSV.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
    writer.writeheader()
    for r1, r2 in zip(ids, y_pred):
Example #27
0
def vectorizeSummaries():
	"""Create word count matrices from game summaries.

	The vocabulary is read from ``Vocabulary.txt`` in the config directory and
	the remaining vectorizer parameters from the ``'Text'`` section of the
	config.  For each of the sets listed in ``Sets.json`` ('train', 'valid',
	'test', 'rank_train', 'rank_test') the games that have a ``'summary'``
	entry are vectorized and written to ``<set>_text.csv`` in the database
	directory.  Each row holds the game id followed by its word counts.

	The vectorizer is fitted on the 'train' summaries only; every other set is
	transformed with that fitted vectorizer.
	"""
	with open(cfg.configPath() + '/Vocabulary.txt', 'r') as inFile:
		vocab = [x.rstrip() for x in inFile]
	config = cfg.readConfig()
	params = config['Text']
	vectorizer = CountVectorizer(**params)
	vectorizer.set_params(vocabulary=vocab)
	with open(cfg.databasePath() + '/Sets.json', 'r') as setsFile:
		sets = json.load(setsFile)
	# 'train' has to stay first: it is the set the vectorizer is fitted on.
	for setName in ('train', 'valid', 'test', 'rank_train', 'rank_test'):
		ids = []
		summaries = []
		for gameId in sets[setName]:
			with open(cfg.databasePath() + '/Games/{}.json'.format(gameId), 'r') as inFile:
				gameData = json.load(inFile)
			# Games without a summary are left out of the matrix entirely.
			if 'summary' in gameData:
				ids.append(gameId)
				summaries.append(gameData['summary'])
		if setName == 'train':
			vectorizer.fit(summaries)
		# vocabulary_ is used for the width instead of get_feature_names(),
		# which newer scikit-learn releases no longer provide.
		counts = np.zeros((len(ids), 1 + len(vectorizer.vocabulary_)))
		counts[:, 0] = ids
		counts[:, 1:] = vectorizer.transform(summaries).todense()
		np.savetxt(cfg.databasePath() + '/{}_text.csv'.format(setName), counts, fmt='%d')
class Neigbors_DS(BaseEstimator, TransformerMixin):

    """Dynamic selection over an ensemble, based on the k nearest neighbors.

    For every sample all base models are asked for a prediction first.  If
    they agree, that prediction is returned.  Otherwise the models are
    compared on the sample's nearest neighbors among the data given to
    ``fit`` and the answer is picked according to ``scheme``.

    Args:
        - models: dictionary mapping a model name to a fitted model that
          exposes ``predict``.
        - models_tr: dictionary mapping the same names to fitted objects
          exposing ``transform``; each one defines the space in which the
          neighbors of that model are searched.
        - k: number of neighbors used to judge a model.
        - scheme: String flag. Can be one of the following:
            - 'LCA': Local Class Accuracy (see ``predict_lca``)
            - 'OLA': Overall Local Accuracy (see ``predict_ola``)
            - 'KNE': K_Neighbors Elimination (see ``predict_kne``)
            - 'KNU': one vote per correctly classified neighbor
              (see ``predict_knu``)
          Any other value falls back to 'OLA'.
        - common_neigh: when True a single neighborhood, built from a
          word-count representation of the raw samples, is shared by all
          models instead of one neighborhood per transformation.
    Returns:
        - The ensemble model. It needs to be fitted before predicting.

    """

    def __init__(self, models, models_tr, k=5, scheme='LCA', common_neigh=False):

        if (not models) or (not models_tr):
            raise AttributeError('Models expexts a dictonary of models \
              containg the predictions of y_true for each classifier.\
              cv_score expects a list len(models.keys()) with the\
              cross validation scores of each model')
        self.models = models
        self.models_tr = models_tr
        self.k = k
        # Position of each model in iteration order -> model name.
        self.ind2names = dict(enumerate(models.keys()))
        self.predictions = {}
        self.true = []
        self.trees = {}
        self.scheme = scheme
        self.common_neigh = common_neigh
        self.gt_tree = None
        if common_neigh:
            from sklearn.feature_extraction.text import CountVectorizer

            self.counter = CountVectorizer()
            parameters = {
                'input': 'content',
                'encoding': 'utf-8',
                'decode_error': 'ignore',
                'analyzer': 'word',
                'stop_words': 'english',
                'max_df': 1.0,
                'min_df': 1,
                'max_features': None
            }
            self.counter.set_params(**parameters)
        else:
            self.counter = None
        schemes = {
            'LCA': self.predict_lca,
            'KNE': self.predict_kne,
            'OLA': self.predict_ola,
            'KNU': self.predict_knu,
        }
        # Unknown schemes fall back to OLA.
        self.predictor = schemes.get(self.scheme, self.predict_ola)

    def fit(self, X_cv, y_true=None, weights=None):
        """Build the neighbor search trees and cache every model's predictions.

        Args:
            - X_cv: samples used as the neighborhood pool.
            - y_true: true labels of ``X_cv``; required.
            - weights: unused, kept for interface compatibility.
        Returns:
            - self
        Raises:
            - ValueError: if ``y_true`` is None.
        """
        from sklearn.neighbors import BallTree
        import time

        if y_true is None:
            raise ValueError('we need y labels to supervise-fit!')
        t0 = time.time()
        if self.common_neigh:
            # The shared tree does not depend on the model, so build it once
            # instead of once per model.
            X_tr = self.counter.fit_transform(X_cv)
            self.gt_tree = BallTree(X_tr.toarray(), leaf_size=20)
        for name, model in self.models.items():
            if not self.common_neigh:
                X_tr = self.models_tr[name].transform(X_cv)
                if hasattr(X_tr, "toarray"):
                    X_tr = X_tr.toarray()
                self.trees[name] = BallTree(X_tr, leaf_size=20)
            self.predictions[name] = model.predict(X_cv)
        self.true = y_true
        print('Fitting time %0.2f' % (time.time() - t0))
        return self

    def predict(self, X):
        """Return a list with one prediction per sample of ``X``."""
        return [self.predictor(x) for x in X]

    def score(self, X, y, sample_weight=None):
        """Return the accuracy of ``predict(X)`` against ``y``.

        ``sample_weight`` is accepted but not used.
        """
        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X), normalize=True)

    @staticmethod
    def _local_accuracy(neigh_true, model_neig_pred):
        """Fraction of neighbors on which the model prediction is correct."""
        # Imported here so the predict_* methods do not rely on a
        # module-level import of accuracy_score.
        from sklearn.metrics import accuracy_score
        return accuracy_score(neigh_true, model_neig_pred, normalize=True)

    def _base_predictions(self, sample):
        """Prediction of every base model for ``sample``, in model order."""
        return [model.predict([sample])[0] for model in self.models.values()]

    def _query_neighbors(self, name, sample, n_neighbors):
        """Indexes of the ``n_neighbors`` fitted samples closest to ``sample``.

        Uses the shared tree when ``common_neigh`` is set, otherwise the tree
        built in the transformation space of model ``name``.
        """
        if self.common_neigh:
            sample_trans = self.counter.transform([sample])
            tree = self.gt_tree
        else:
            sample_trans = self.models_tr[name].transform([sample])
            tree = self.trees[name]
        if hasattr(sample_trans, "toarray"):
            sample_trans = sample_trans.toarray()
        _, model_neig = tree.query(sample_trans, n_neighbors)
        return model_neig[0].tolist()

    def _neighbor_labels(self, name, neigh_indexes):
        """Return (true labels, cached predictions of ``name``) for neighbors."""
        neigh_true = [self.true[i] for i in neigh_indexes]
        model_neig_pred = [self.predictions[name][i] for i in neigh_indexes]
        return neigh_true, model_neig_pred

    def predict_lca(self, sample):
        """Local Class Accuracy.

        Each model is judged on the (up to) k nearest neighbors whose true
        label equals the label the model predicts for ``sample``; the
        prediction of the best model is returned.
        """
        preds = self._base_predictions(sample)
        if len(set(preds)) == 1:
            return preds[0]
        lca = [0] * len(preds)
        for model_ind, name in enumerate(self.models):
            n_train = len(self.predictions[name])
            # Never ask for more neighbors than there are fitted samples.
            step = min(50, n_train)
            while True:
                neighbors = self._query_neighbors(name, sample, step)
                # Rebuilt from scratch on every pass so widening the search
                # cannot add the same neighbor twice.
                neigh_indexes = [i for i in neighbors
                                 if preds[model_ind] == self.true[i]][:self.k]
                # Stop once k neighbors are found or the whole pool was
                # searched; otherwise a rare class would loop forever.
                if len(neigh_indexes) >= self.k or step >= n_train:
                    break
                step = min(step * 2, n_train)
            # With no neighbor of the predicted class the score stays 0.
            if neigh_indexes:
                neigh_true, model_neig_pred = self._neighbor_labels(name, neigh_indexes)
                lca[model_ind] = self._local_accuracy(neigh_true, model_neig_pred)
        return preds[lca.index(max(lca))]

    def predict_ola(self, sample):
        """Overall Local Accuracy.

        Returns the prediction of the model with the highest accuracy on the
        k nearest neighbors of ``sample``.
        """
        preds = self._base_predictions(sample)
        if len(set(preds)) == 1:
            return preds[0]
        ola = [0] * len(preds)
        for model_ind, name in enumerate(self.models):
            neighbors = self._query_neighbors(name, sample, self.k)
            neigh_true, model_neig_pred = self._neighbor_labels(name, neighbors)
            ola[model_ind] = self._local_accuracy(neigh_true, model_neig_pred)
        return preds[ola.index(max(ola))]

    def predict_kne(self, sample):
        """K-Neighbors Elimination.

        Starting from k neighbors and shrinking the neighborhood one at a
        time, the first model that classifies every neighbor correctly
        provides the prediction.  If no model manages that even for a single
        neighbor, the model with the best accuracy on the full k-neighborhood
        is used.
        """
        preds = self._base_predictions(sample)
        if len(set(preds)) == 1:
            return preds[0]
        possible_experts = []
        ola_scores = []
        k = self.k
        while k > 0 and not possible_experts:
            for model_ind, name in enumerate(self.models):
                neighbors = self._query_neighbors(name, sample, k)
                neigh_true, model_neig_pred = self._neighbor_labels(name, neighbors)
                # Fallback scores are only taken on the full neighborhood.
                if k == self.k:
                    ola_scores.append(self._local_accuracy(neigh_true, model_neig_pred))
                if neigh_true == model_neig_pred:
                    possible_experts.append(preds[model_ind])
            k -= 1
        if not possible_experts:
            return preds[ola_scores.index(max(ola_scores))]
        return possible_experts[0]

    def predict_knu(self, sample):
        """One vote per correctly classified neighbor.

        Every model casts a vote for its own prediction for each of the k
        nearest neighbors it classifies correctly.  The votes are combined by
        ``most_common`` (a helper defined outside this class; presumably it
        returns the most frequent element).  With no votes at all the model
        with the best accuracy on the neighborhood is used.
        """
        preds = self._base_predictions(sample)
        if len(set(preds)) == 1:
            return preds[0]
        possible_experts = []
        ola_scores = []
        # enumerate keeps model_ind in step with the model being judged, so
        # each vote is credited to the right model.
        for model_ind, name in enumerate(self.models):
            neighbors = self._query_neighbors(name, sample, self.k)
            neigh_true, model_neig_pred = self._neighbor_labels(name, neighbors)
            for truth, guess in zip(neigh_true, model_neig_pred):
                if guess == truth:
                    possible_experts.append(preds[model_ind])
            ola_scores.append(self._local_accuracy(neigh_true, model_neig_pred))
        if not possible_experts:
            return preds[ola_scores.index(max(ola_scores))]
        return most_common(possible_experts)
print('List of newsgroup categories:')
print(list(newsgroups.target_names))
print("\n")

# Uncomment to show the first lines of one data file:
#print('Sample file')
#print("\n".join(newsgroups.data[2].split("\n")[:10]))
#print("\n")

# Treebank tokenizer: regular-expression based, splits contractions and
# separates punctuation.
tokenizer = TreebankWordTokenizer()

# Bag-of-words vectorizer: converts a collection of documents into a matrix
# of token counts.  Each option below is worth trying with and without.
count_vect = CountVectorizer(
    tokenizer=tokenizer.tokenize,
    # remove English stop words
    stop_words='english',
    # include 1-grams and 2-grams
    ngram_range=(1, 2),
    # ignore terms that appear in >50% of the documents
    max_df=0.5,
    # ignore terms that appear in only 1 document
    min_df=2,
)
print(stop_words.ENGLISH_STOP_WORDS)

# Next step: transform text to a bag-of-words vector using these parameters.
Example #30
0
def stratifiedSplitFixed(ngram_range=(1, 3), n_features=(200000,)):
    """Score a bag-of-words logistic regression for several vocabulary sizes.

    For every value in ``n_features`` a CountVectorizer + LogisticRegression
    pipeline is trained on 10 stratified shuffle splits (20% test) of the
    module-level ``x`` / ``y`` data.  The confusion matrices of the splits
    are summed and the F1 score of label 1 is computed from that sum.

    Args:
        ngram_range: n-gram range handed to the CountVectorizer.
        n_features: iterable of ``max_features`` values to evaluate.

    Returns:
        A list of ``(n, f1)`` tuples, one per entry of ``n_features``.  F1 is
        also printed.  Precision, recall and F1 are reported as 0.0 when
        their denominator is zero.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import confusion_matrix
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.pipeline import Pipeline

    result = []
    for n in n_features:
        sss = StratifiedShuffleSplit(n_splits=10,
                                     test_size=0.2,
                                     random_state=4000)
        # Rows are true labels, columns predicted labels, ordered [0, 1].
        confusion_sum = np.zeros((2, 2), dtype=int)

        for train_index, test_index in sss.split(x, y):
            cvec = CountVectorizer(stop_words=None,
                                   max_features=n,
                                   ngram_range=ngram_range)
            y_train, y_test = y[train_index], y[test_index]
            x_train, x_test = x[train_index], x[test_index]

            clf = LogisticRegression(solver='liblinear')
            pipeline = Pipeline([('vectorizer', cvec), ('classifier', clf)])

            sentiment_fit = pipeline.fit(x_train, y_train)
            y_pred = sentiment_fit.predict(x_test)

            confusion_sum = confusion_sum + np.array(
                confusion_matrix(y_test, y_pred, labels=[0, 1]))

        false_pos = confusion_sum[0][1]
        false_neg = confusion_sum[1][0]
        true_pos = confusion_sum[1][1]

        # Guard every ratio: a model that never predicts (or never sees)
        # label 1 would otherwise divide by zero.
        predicted_pos = true_pos + false_pos
        actual_pos = true_pos + false_neg
        precision = true_pos / predicted_pos if predicted_pos else 0.0
        recall = true_pos / actual_pos if actual_pos else 0.0

        if precision + recall:
            f1 = 2 * (precision * recall) / (precision + recall)
        else:
            f1 = 0.0
        print(f1)
        result.append((n, f1))

    return result
Example #31
0
# Print the most common words and the first 50 entries of the least common words
# (most_common is printed in full; presumably it already holds 50 entries)
print(most_common)
print(least_common[0:50])

import numpy as np

# Create the list of scores and the different max_df values to measure
tpAccuracy = []
count = 1  # not used in the visible part of this script
max_df = np.arange(0.01, 0.99, 0.01)

# Loop through all the different max_df values and add the respective score to the list
for i in max_df:
    # Fresh vectorizer per iteration, ignoring terms whose document frequency exceeds i
    # NOTE(review): newer scikit-learn may reject an integer min_df of 0 - confirm
    count_vectorizer = CountVectorizer()
    count_vectorizer = count_vectorizer.set_params(max_df = i, min_df = 0)
    emails_vector2 = count_vectorizer.fit_transform(emails)

    # Stratified 75/25 train/test split with a fixed seed
    X_train2, X_test2, y_train2, y_test2 = train_test_split(emails_vector2, email_labels, random_state=1, test_size=0.25, stratify = email_labels)

    # transform() is defined elsewhere; its return value is discarded here,
    # so it presumably modifies the label containers in place - TODO confirm
    transform(y_train2,y_test2)
    
    mnb = MultinomialNB()

    mnb.fit(X_train2, y_train2)

    y_pred = mnb.predict(X_test2)

    # ravel() of the 2x2 confusion matrix yields tn, fp, fn, tp in that order
    tn, fp, fn, tp = confusion_matrix(y_test2,y_pred).ravel()

    # NOTE(review): despite the list name, tn/(fp+tn) is the true-negative
    # rate (specificity), not an accuracy or true-positive rate
    tpAccuracy.append(tn/(fp+tn))
Example #32
0
                  value=0,
                  padding='post',
                  maxlen=max_word_count))

from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the model on the test data
pred = model.predict(tx_test)
# Threshold the model outputs at 0.5: values <= 0.5 become 0, the rest 1
# (assumes each element of pred is a single score - TODO confirm)
binpred = [0 if n <= 0.5 else 1 for n in pred]
# labels=[1, 0] puts label 1 in the first row/column, matching the
# 'positive' / 'predicted_positive' names given to the DataFrame below
conmat = np.array(confusion_matrix(y_test, binpred, labels=[1, 0]))
confusion = pd.DataFrame(conmat,
                         index=['positive', 'negative'],
                         columns=['predicted_positive', 'predicted_negative'])
print(confusion)

###

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Bag-of-words baseline: up to 200000 uni-/bi-/tri-gram count features
# feeding a logistic regression.
lr = LogisticRegression()
cvec = CountVectorizer(max_features=200000, ngram_range=(1, 3))
checker_pipeline = Pipeline([
    ('vectorizer', cvec),
    ('classifier', lr),
])

# Train on the training split, then score the predictions on the test split.
sentiment_fit = checker_pipeline.fit(x_train, y_train)
y_pred = sentiment_fit.predict(x_test)
accuracy_score(y_test, y_pred)
Example #33
0
def test_vectorizer():
    """Exercise CountVectorizer, TfidfTransformer and TfidfVectorizer together.

    Checks counting with and without a fixed vocabulary, tf-idf weighting on
    train and unseen data, equivalence of TfidfVectorizer with the
    count + transform combination, and the errors raised for an unfitted
    vectorizer and for invalid ``strip_accents`` / ``analyzer`` values.
    """
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words of extraction
        # artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert len(t1.idf_) == len(v1.vocabulary_)
    assert tfidf.shape == (n_train, len(v1.vocabulary_))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert not hasattr(t2, "idf_")

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    with pytest.raises(ValueError):
        t3.transform(counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    with pytest.raises(ValueError):
        t3.transform(X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert not tv.fixed_vocabulary_
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    with pytest.raises(ValueError):
        v3.transform(train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    processor = v3.build_preprocessor()
    text = ("J'ai mangé du kangourou  ce midi, "
            "c'était pas très bon.")
    expected = strip_accents_ascii(text)
    result = processor(text)
    assert expected == result

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    with pytest.raises(ValueError):
        v3.build_preprocessor()

    # error with bad analyzer type
    # set_params must be called, not assigned over.  strip_accents is reset
    # to a valid value so that the ValueError checked below can only come
    # from the invalid analyzer and not from the bad strip_accents set above.
    v3.set_params(analyzer='_invalid_analyzer_type_', strip_accents=None)
    with pytest.raises(ValueError):
        v3.build_analyzer()
Example #34
0
def stratifiedSplitFixed3Class(ngram_range=(1, 3), n_features=(200000,)):
    """Score a 3-class bag-of-words logistic regression per vocabulary size.

    For every value in ``n_features`` a CountVectorizer + LogisticRegression
    pipeline is trained on 10 stratified shuffle splits (20% test) of the
    module-level ``x`` / ``y`` data.  The confusion matrices of the splits
    (labels 2.0, 3.0, 4.0) are summed, and per-class precision, recall and
    F1 are computed from that sum and printed, followed by the average F1 as
    returned by ``avgIgnore0`` (defined elsewhere in this file).

    Args:
        ngram_range: n-gram range handed to the CountVectorizer.
        n_features: iterable of ``max_features`` values to evaluate.

    Returns:
        A list of ``(n, average_f1)`` tuples, one per entry of ``n_features``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import confusion_matrix
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.pipeline import Pipeline

    result = []
    for n in n_features:
        sss = StratifiedShuffleSplit(n_splits=10,
                                     test_size=0.2,
                                     random_state=3000)
        # Rows are true labels, columns are predicted labels.
        confusion_sum = np.zeros((3, 3), dtype=int)

        for train_index, test_index in sss.split(x, y):
            cvec = CountVectorizer(stop_words=None,
                                   max_features=n,
                                   ngram_range=ngram_range,
                                   encoding="utf-8")
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y[train_index], y[test_index]

            clf = LogisticRegression(solver='liblinear')
            pipeline = Pipeline([('vectorizer', cvec), ('classifier', clf)])

            sentiment_fit = pipeline.fit(x_train, y_train)
            y_pred = sentiment_fit.predict(x_test)

            confusion_sum = confusion_sum + np.array(
                confusion_matrix(y_test, y_pred, labels=[2.0, 3.0, 4.0]))

        print(confusion_sum)

        precision_recall = []
        f1_scores = []
        for i in range(len(confusion_sum)):
            true_pos = confusion_sum[i][i]
            # Precision divides by everything predicted as class i (column
            # sum); recall divides by everything that truly is class i (row
            # sum).  A zero diagonal yields 0, which also avoids dividing
            # by zero.
            if true_pos == 0:
                precison = 0
                recall = 0
            else:
                precison = true_pos / confusion_sum[:, i].sum()
                recall = true_pos / confusion_sum[i].sum()
            precision_recall.append([precison, recall])
            if (precison + recall) == 0:
                f1_scores.append(0)
            else:
                f1_scores.append(2 * (precison * recall) / (precison + recall))
        print(precision_recall)
        print(f1_scores)
        print("Average F1")
        average_f1 = avgIgnore0(f1_scores)
        print(average_f1)
        result.append((n, average_f1))

    return result
Example #35
0
def test_vectorizer():
    """Exercise CountVectorizer, TfidfTransformer and TfidfVectorizer together.

    All documents in ``ALL_FOOD_DOCS`` except the last are used for fitting;
    the last one is the held-out test sample.  The test checks raw counts,
    vocabulary reuse, tf-idf shapes and normalisation, equivalence of the
    one-step ``TfidfVectorizer`` with the two-step pipeline, and the
    ``ValueError`` raised for unfitted or misconfigured vectorizers.
    """
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary_["pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert_equal(counts_test[0, vocabulary["salad"]], 1)
        assert_equal(counts_test[0, vocabulary["tomato"]], 1)
        assert_equal(counts_test[0, vocabulary["water"]], 1)

        # stop word from the fixed list
        assert_false("the" in vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words or extraction
        # artifacts)
        assert_false("copyright" in vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, vocabulary["coke"]], 0)
        assert_equal(counts_test[0, vocabulary["burger"]], 0)
        assert_equal(counts_test[0, vocabulary["beer"]], 0)
        assert_equal(counts_test[0, vocabulary["pizza"]], 0)

    # test tf-idf
    t1 = TfidfTransformer(norm='l1')
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert_equal(len(t1.idf_), len(v1.vocabulary_))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary_)))

    # test tf-idf with new data
    # (counts_test is the value left over from the last loop iteration above)
    tfidf_test = t1.transform(counts_test).toarray()
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary_)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert_equal(t2.idf_, None)

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    assert_raises(ValueError, t3.transform, counts_train)

    # test idf transform with incompatible n_features
    X = [[1, 1, 5],
         [1, 1, 0]]
    t3.fit(X)
    X_incompt = [[1, 3],
                 [1, 3]]
    assert_raises(ValueError, t3.transform, X_incompt)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    # the first iterator was consumed by v1.fit_transform, so build a new one
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm='l1')
    assert_false(tv.fixed_vocabulary)

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents='ascii', lowercase=False)
    assert_equal(v3.build_preprocessor(), strip_accents_ascii)

    # error on bad strip_accents param
    v3.set_params(strip_accents='_gabbledegook_', preprocessor=None)
    assert_raises(ValueError, v3.build_preprocessor)

    # error with bad analyzer type
    # strip_accents is reset to a valid value so that the ValueError can only
    # come from the invalid analyzer, not from the bad strip_accents above
    v3.set_params(strip_accents=None, analyzer='_invalid_analyzer_type_')
    assert_raises(ValueError, v3.build_analyzer)
Example #36
0
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is 0."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def _class_similarities(class_df):
    """Return ``(ave_pos, ave_neg, ave_between)`` cosine similarities.

    The positive subset is ``utils.get_class_based_data(class_df, 1)`` and the
    negative subset is ``utils.get_class_based_data(class_df, -1.0)``.  The
    ``y`` column is removed from each subset before its values are passed to
    ``utils.get_average_cos_sim``.
    """
    pos_df = utils.get_class_based_data(class_df, 1)
    pos_df.pop('y')
    ave_pos = utils.get_average_cos_sim(pos_df.values)
    neg_df = utils.get_class_based_data(class_df, -1.0)
    neg_df.pop('y')
    ave_neg = utils.get_average_cos_sim(neg_df.values)
    ave_between = utils.get_average_cos_sim(pos_df.values, neg_df.values)
    return ave_pos, ave_neg, ave_between


def main(*args):
    """Run the word-cloud / cosine-similarity analysis of the lyric datasets.

    Each positional argument is a ``key=value`` string.  Recognised keys:
    ``plot``, ``print``, ``max_features``, ``stop_words``, ``random_state``,
    ``order``, ``wordcloud``, ``wordcloud_n``, ``cos_sim``, ``font_size``,
    ``even_distrib``, ``pre_vec``, ``limit_size``, ``min_df``, ``max_df`` and
    ``param_compare``.  Unknown keys and arguments without ``=`` are reported
    on stdout and ignored.

    Depending on the flags, the function draws word clouds, prints average
    cosine similarities per class, plots similarity against ``min_df`` /
    ``max_df`` sweeps, and finally shows the figures when ``plot`` is true.
    """
    # load stop words
    stop_words = get_stop_words()

    plot = const.PLOT_DEFAULT
    print_ = const.PRINT_DEFAULT
    max_features = None
    random_state = const.RANDOM_STATE_DEFAULT
    order = -1  # default descending order
    wordcloud_n = None
    wordcloud_ = False
    cos_sim = False
    even_distrib = const.EVEN_DISTRIB_DEFAULT
    plt.rcParams.update({'font.size': const.FONT_SIZE_DEFAULT})
    pre_vec = False
    limit_size = False
    min_df = 1
    max_df = 1.0
    param_compare = False

    # parse command line arguments
    for arg in args:
        # partition keeps any further '=' inside the value and never raises
        # when the separator is missing
        k, sep, v = arg.partition("=")
        if not sep:
            print("Ignoring malformed argument (expected key=value): {}".format(arg))
            continue
        if k == 'plot':
            plot = utils.str_to_bool(v)
        elif k == 'print':
            print_ = utils.str_to_bool(v)
        elif k == 'max_features':
            max_features = int(v)
        elif k == 'stop_words':
            if not utils.str_to_bool(v):
                stop_words = None
        elif k == 'random_state':
            random_state = int(v)
        elif k == 'order':
            order = int(v)
        elif k == 'wordcloud':
            wordcloud_ = utils.str_to_bool(v)
        elif k == 'wordcloud_n':
            wordcloud_n = int(v)
        elif k == 'cos_sim':
            cos_sim = utils.str_to_bool(v)
        elif k == 'font_size':
            plt.rcParams.update({'font.size': int(v)})
        elif k == 'even_distrib':
            even_distrib = utils.str_to_bool(v)
        elif k == 'pre_vec':
            pre_vec = utils.str_to_bool(v)
        elif k == 'limit_size':
            limit_size = utils.str_to_bool(v)
        elif k == 'min_df':
            min_df = int(v)
        elif k == 'max_df':
            max_df = float(v)
            # values above 1 are passed on as an absolute integer count
            if max_df > 1:
                max_df = int(max_df)
        elif k == 'param_compare':
            param_compare = utils.str_to_bool(v)
        else:
            print("Unknown param: {}".format(k))

    if print_:
        print()
        print("-- Analysis config --")
        print("even_distrib: {}".format(even_distrib))
        print("stop_words: {}".format(stop_words is not None))
        print("max_features: {}".format(max_features))
        print("random_state: {}".format(random_state))
        print("wordcloud: {}".format(wordcloud_))
        print("wordcloud_n: {}".format(wordcloud_n))
        print("order: {}".format(order))
        print("cos_sim: {}".format(cos_sim))
        print("param_compare: {}".format(param_compare))
        print("pre_vec: {}".format(pre_vec))
        print("limit_size: {}".format(limit_size))
        print("min_df: {}".format(min_df))
        print("max_df: {}".format(max_df))
        print("plot: {}".format(plot))
        print("--------------------")
        print()

    # pick the CSV for the requested class distribution so that each dataset
    # is read from disk exactly once
    if even_distrib:
        spotify_path, deezer_path = const.CLEAN_SPOTIFY, const.CLEAN_DEEZER
    else:
        spotify_path = const.CLEAN_UNEVEN_SPOTIFY
        deezer_path = const.CLEAN_UNEVEN_DEEZER
    clean_spotify_df = pd.read_csv(spotify_path)
    clean_deezer_df = pd.read_csv(deezer_path)

    datasets = [
        (const.SPOTIFY, clean_spotify_df),
        (const.DEEZER, clean_deezer_df),
    ]
    vectorizer = CountVectorizer(
        stop_words=stop_words,
        ngram_range=(1, 1),
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        binary=True,
    )

    # word clouds
    if wordcloud_:
        shared_ratios = []
        for name, dataset in datasets:
            top_n = gen_word_cloud_grid(
                name,
                dataset,
                vectorizer=vectorizer,
                n=wordcloud_n,
                order=order,
                random_state=random_state,
                print_=print_
            )
            shared, unique = get_shared_words(top_n)
            # NaN instead of ZeroDivisionError when there are no unique words
            shared_ratios.append(_safe_ratio(len(shared), len(unique)))

        if print_:
            print()
            for label, ratio in zip(("Spotify", "Deezer"), shared_ratios):
                print("{}: count shared={}".format(label, ratio))
            print()

    # cosine similarity
    if cos_sim:
        for name, dataset in datasets:
            if pre_vec:
                dataset = utils.get_vectorized_df(dataset, vectorizer)

            print("{} class data similarity analysis...".format(name))
            for i in dataset.y.unique():
                class_df = utils.get_class_based_data(
                    dataset,
                    i,
                    random_state=random_state,
                    include_other_classes=True,
                    even_distrib=False,
                    limit_size=limit_size,
                    print_=True,
                )
                if not pre_vec:
                    class_df = utils.get_vectorized_df(class_df, vectorizer)
                ave_pos, ave_neg, ave_between = _class_similarities(class_df)
                print("class {}".format(i))
                print("data shape: {}".format(class_df.shape))
                print("average positive cosine similarity: {}".format(ave_pos))
                print("average negative cosine similarity: {}".format(ave_neg))
                print("average between cosine similarity: {}".format(ave_between))
                print("(pos - between )+ (neg - between) percentage = {} ".format(
                    _safe_ratio(ave_pos - ave_between, ave_pos)
                    + _safe_ratio(ave_neg - ave_between, ave_neg)
                ))
                print()

    if param_compare:
        # min_df / max_df vs pos_sim, neg_sim, between_sim
        params_grid = {
            'min_df': list(range(1, 15)),
            'max_df': np.arange(0.1, 1.0, 0.1),
        }

        for name, dataset in datasets:
            for i in dataset.y.unique():
                df = utils.get_class_based_data(
                    dataset,
                    i,
                    random_state=random_state,
                    include_other_classes=True,
                    even_distrib=False,
                    limit_size=limit_size,
                )
                for p, values in params_grid.items():
                    print("Comparing cosine similarity vs {} for {} Class {} data...".format(p, name, i))
                    # fresh vectorizer per swept parameter, so the previous
                    # sweep's last value does not leak into this one
                    sweep_vectorizer = CountVectorizer(
                        stop_words=stop_words,
                        ngram_range=(1, 1),
                        min_df=min_df,
                        max_df=max_df,
                        max_features=max_features,
                        binary=True,
                    )
                    pos_sim = []
                    neg_sim = []
                    between_sim = []
                    diff = []
                    for value in values:
                        sweep_vectorizer.set_params(**{p: value})
                        class_df = utils.get_vectorized_df(df, sweep_vectorizer)
                        ave_pos, ave_neg, ave_between = _class_similarities(class_df)
                        pos_sim.append(ave_pos)
                        neg_sim.append(ave_neg)
                        between_sim.append(ave_between)
                        diff.append(
                            _safe_ratio(ave_pos - ave_between, ave_pos)
                            + _safe_ratio(ave_neg - ave_between, ave_neg)
                        )

                    plt.figure()
                    plt.title("{} Class {}: {} vs cosine similarity".format(name, i, p))
                    plt.plot(values, pos_sim, label='pos sim')
                    plt.plot(values, neg_sim, label='neg sim')
                    plt.plot(values, between_sim, label='between sim')
                    plt.plot(values, diff, label='sim difference (%)')
                    plt.xlabel(p)
                    plt.legend()

    # render every figure created above
    if plot:
        plt.draw()
        plt.show()