Code example #1
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

# get_classifiers, teach_classifiers, test_classifiers, print_top_features
# and app_data are project-specific helpers, not shown here.
def classify_lyrics_pos(genre_lyrics_map):

    vectorizer = DictVectorizer()

    all_lyrics_pos_tags = []
    all_lyrics_genres   = []

    for genre, genre_lyrics in genre_lyrics_map.items():

        for song_lyrics in genre_lyrics:

            pos_tags_map = song_lyrics["features"]["pos_tags_map"]

            all_lyrics_pos_tags.append(pos_tags_map)
            all_lyrics_genres.append(genre)

    pos_train, pos_test, genres_train, genres_test = train_test_split(all_lyrics_pos_tags, all_lyrics_genres, test_size=0.33)

    # Fit the vocabulary on every song (train and test) before handing the
    # fitted vectorizer to the helpers below.
    vectorizer.fit(all_lyrics_pos_tags)
    vect = vectorizer.transform(all_lyrics_pos_tags)
    print("vect = " + str(vect))

    classifiers_to_use      = get_classifiers()
    partial_fit_classifiers = classifiers_to_use["partial"]
    full_fit_classifiers    = classifiers_to_use["full"]

    teach_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer, pos_train, genres_train, app_data.LYRICS_GENRES_METAL)

    test_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer, pos_test, genres_test)

    print_top_features(partial_fit_classifiers + full_fit_classifiers, vectorizer, app_data.LYRICS_GENRES_METAL)
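A minimal, self-contained sketch of the input this function expects. The nested structure and the toy POS counts are assumptions for illustration; only the DictVectorizer calls mirror the example above.

from sklearn.feature_extraction import DictVectorizer

# Hypothetical input: each song carries a dict mapping POS tags to counts.
genre_lyrics_map = {
    "black_metal": [{"features": {"pos_tags_map": {"NN": 12, "VB": 5, "JJ": 3}}}],
    "doom_metal":  [{"features": {"pos_tags_map": {"NN": 7, "VB": 9, "RB": 2}}}],
}

pos_dicts = [song["features"]["pos_tags_map"]
             for songs in genre_lyrics_map.values() for song in songs]

vectorizer = DictVectorizer()
X = vectorizer.fit_transform(pos_dicts)    # sparse matrix, one column per POS tag
print(vectorizer.get_feature_names_out())  # ['JJ' 'NN' 'RB' 'VB']
print(X.toarray())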
Code example #2
File: titanic.py  Project: smivv/kaggle-titanic
from sklearn.feature_extraction import DictVectorizer

def cat_vectorize(train_data, test_data, num_cols):
    # categorical attributes
    cat_train_data = train_data.drop(num_cols, axis=1)
    cat_test_data = test_data.drop(num_cols, axis=1)

    cat_train_data.fillna('NA', inplace=True)
    cat_test_data.fillna('NA', inplace=True)

    # .T.to_dict().values() yields one {column: value} dict per row,
    # the input format DictVectorizer expects.
    cat_train_data_values = cat_train_data.T.to_dict().values()
    cat_test_data_values = cat_test_data.T.to_dict().values()

    # vectorize (encode as one hot)
    vectorizer = DictVectorizer(sparse=False)
    vec_train_data = vectorizer.fit_transform(cat_train_data_values)
    vec_test_data = vectorizer.transform(cat_test_data_values)

    return vec_train_data, vec_test_data
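A hedged usage sketch with toy pandas data; the column names are invented for illustration:

import pandas as pd

train = pd.DataFrame({"Age": [22, 38], "Sex": ["male", "female"], "Embarked": ["S", None]})
test = pd.DataFrame({"Age": [26], "Sex": ["male"], "Embarked": ["Q"]})

vec_train, vec_test = cat_vectorize(train, test, num_cols=["Age"])
# vec_train is a dense array with one-hot columns such as Sex=male, Embarked=S, ...
# vec_test is encoded against the same columns, so the two matrices line up.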
Code example #3
from sklearn.base import ClassifierMixin
from sklearn.feature_extraction import DictVectorizer
# SparsePU_SL is a project-specific PU-learning estimator, not part of scikit-learn.

class EntityDetectionPU(ClassifierMixin):
    def __init__(self, prior=.5, sigma=.1, lam=1, basis='gauss', n_basis=200):
        self.clf = SparsePU_SL(prior=prior,
                               sigma=sigma,
                               lam=lam,
                               basis=basis,
                               n_basis=n_basis)
        # self.clf = PU_SL(prior=prior, sigma=sigma, lam=lam, basis=basis, n_basis=n_basis)
        # self.clf = SVC()
        self.featureizer = DictVectorizer(sparse=True)

    def fit(self, X, y):
        x_feat = self.featureizer.fit_transform(X)
        self.clf.fit(x_feat, y)
        return self

    def predict(self, X):
        x_feat = self.featureizer.transform(X)
        return self.clf.predict(x_feat)

    def predict_sent(self, X_sent):
        # Predict one item at a time, yielding a length-1 array per item.
        for xi in X_sent:
            yield self.predict([xi])
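Since SparsePU_SL is project-specific, here is the same wrapper pattern with a stock scikit-learn classifier standing in for it (a sketch, not the original model):

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

class EntityDetection(BaseEstimator, ClassifierMixin):
    """Same pattern: vectorize dict features, delegate to an inner classifier."""

    def __init__(self):
        self.clf = LogisticRegression()  # stand-in for SparsePU_SL
        self.featureizer = DictVectorizer(sparse=True)

    def fit(self, X, y):
        self.clf.fit(self.featureizer.fit_transform(X), y)
        return self

    def predict(self, X):
        return self.clf.predict(self.featureizer.transform(X))

X = [{"word": "Paris", "cap": True}, {"word": "dog", "cap": False}]
y = [1, 0]
print(EntityDetection().fit(X, y).predict(X))  # expected: [1 0]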
Code example #4
# Excerpt: assumes pandas DataFrames X_train/X_test and a numeric_cols list
# defined earlier in the script.
# categorical attributes

cat_train = X_train.drop(numeric_cols, axis=1)
cat_test = X_test.drop(numeric_cols, axis=1)

cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)

x_cat_train = cat_train.T.to_dict().values()
x_cat_test = cat_test.T.to_dict().values()

# vectorize (encode as one hot)

vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test)

# build the feature vector

x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))


#clfLR = LogisticRegression().fit(x_train, y_train.values)
#pred = clfLR.predict(x_test)
#print(classification_report(y_test.values, pred, digits=4))
#print(accuracy_score(y_test.values, pred))

clfTree = tree.DecisionTreeClassifier().fit(x_train, y_train)
pred = clfTree.predict(x_test)
#print(classification_report(y_test.values, pred, digits=4))
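The excerpt assumes x_num_train and x_num_test already exist; one plausible way to produce the numeric half it stacks with (an assumption, not shown in the original) is:

# Hypothetical: the numeric half that np.hstack combines with the one-hot half.
x_num_train = X_train[numeric_cols].fillna(0).values
x_num_test = X_test[numeric_cols].fillna(0).values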
Code example #5
class Model(object):
    """
    Class for abstracting the different classification models.
    """
    
    def __init__(self, train_tweets, train_targets, vect_options, tfidf_options, extra_params):
        self.grid_params = {
#                            'vect__ngram_range': [(1,1),(1,2),(2,2)],
#                      'tfidf__use_idf': (True,False),
#                      'tfidf__smooth_idf': (True, False),
#                      'tfidf__sublinear_tf': (True, False),
                      }
        
        self.grid_params = {**self.grid_params, **extra_params}
        self.vect_options = vect_options
        self.tfidf_options = tfidf_options
        self.feature_set = {}
        self.train_tweets = train_tweets
        self.train_targets = train_targets
        self.only_text_features = False
        
    def train_on_feature_set(self, cross_validate=True, use_tfidf=True):
        """
        Performs training with the given model using the given feature set
        """
        #Establish document text feature vectors
        print "Vectorizing"
#        self.tokenizer = CountVectorizer().build_tokenizer()
        
        
        self.vect = CountVectorizer(**self.vect_options)
        self.tfidf_transformer = TfidfTransformer(**self.tfidf_options)
        self.dict_transformer = TfidfTransformer(**self.tfidf_options)
#        train_counts_tf = tfidf_transformer.fit_transform(train_counts)
        
        count_vector = self.vect.fit_transform([t.text for t in self.train_tweets])
        tfidf_count = self.tfidf_transformer.fit_transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            self.dict_vectorizer = DictVectorizer()
            dict_vector = self.dict_vectorizer.fit_transform(self.feature_set)
            
            f = codecs.open("feature_set.txt", "w", "utf8")
            for d in dict_vector:
                f.write(str(d))
            f.close()
            tfidf_dict = self.dict_transformer.fit_transform(dict_vector)
            f = codecs.open("feature_set_tdidf.txt", "w", "utf8")
            for d in tfidf_dict:
                f.write(str(d))
            f.close()
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
#        combined_features = FeatureUnion()
        #Crossvalidation
        # Pre-0.18 scikit-learn API: labels are passed to StratifiedKFold directly.
        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        
        #Build a Pipeline with TFidfVectorizer and classifier
        pipeline_classifier = Pipeline([
#                                        ('vect', self.vect),
#                                    ('tfidf', self.tfidf_transformer),
                                    ('clf', self.classifier)
                                    ])
        
        #Perform grid search
        print "Performing grid search with classifier of instance ",str(self.classifier.__class__.__name__)
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params, cv=cross_validation, refit=True, n_jobs=-1,verbose=1)

        self.grid.fit(combined_vector, self.train_targets)
        
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_
        
        
        print "Results for ",self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        
        print "Storing estimator... "
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score)
        return self.grid
        
    def grid_search_on_text_features(self, cross_validate=True, file_postfix=""):
        """
        Performs a grid search using text features on the given dataset. Stores the parameters for the optimal classifier.
        """
        
        self.grid_params = {
                    'vect__ngram_range': [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3),(1,4)],
              'vect__use_idf': (True,False),
              'vect__smooth_idf': (True, False),
              'vect__sublinear_tf': (True, False),
              'vect__max_df': (0.5,),
              }
        self.vect = TfidfVectorizer()

        # Pre-0.18 scikit-learn API, as above.
        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        
        #Build a Pipeline with TFidfVectorizer and classifier
        pipeline_classifier = Pipeline([
                                        ('vect', self.vect),
                                    ('clf', self.classifier)]
                                       )
        
        #Perform grid search
        print "Performing grid search with classifier of instance ",str(self.classifier.__class__.__name__)
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params, cv=cross_validation, refit=True, n_jobs=-1,verbose=1)

        self.grid.fit([t.text for t in self.train_tweets], self.train_targets)
        
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_
        
        
        print "Results for ",self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        
        print "Storing estimator... "        
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score, file_postfix=file_postfix)
        return self.grid

    def classify(self, tweets, sentimentvalues=None):
        """
        Performs the classification process on list of tweets.
        """
        if sentimentvalues is not None:
            self.test_words_and_values = sentimentvalues
        count_vector = self.vect.transform([t.text for t in tweets])
        tfidf_count = self.tfidf_transformer.transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            dict_vector = self.dict_vectorizer.transform([features.get_feature_set(t, self.featureset, v) for t,v in zip(tweets, self.test_words_and_values)])
            tfidf_dict = self.dict_transformer.transform(dict_vector)
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
                
        predictions = self.best_estimator.predict(combined_vector)

        return predictions

    def classify_text(self, texts):
        """
        Performs classification with only text features.
        """
        
        count_vector = self.vect.transform([t for t in texts])
        text_vector = self.tfidf_transformer.transform(count_vector)
        predictions = self.best_estimator.predict(text_vector)

        return predictions
        
    def test_and_return_results(self, test_tweets, test_targets, sentimentvalues):
        """
        Tests the classifier on a given test set, and returns the accuracy, precision, recall, and f1 score.
        """
        self.test_words_and_values = sentimentvalues
        predictions = self.classify(test_tweets)
        binary_predictions = utils.reduce_targets(predictions)
        binary_test_targets = utils.reduce_targets(test_targets)
        
        accuracy = metrics.accuracy_score(binary_test_targets, binary_predictions)
        precision = metrics.precision_score(binary_test_targets, binary_predictions)
        recall = metrics.recall_score(binary_test_targets, binary_predictions)
        f1_score = metrics.f1_score(binary_test_targets, binary_predictions)
        print "Scores:  ", accuracy, precision, recall, f1_score
        
        return accuracy, precision, recall, f1_score
    
    def get_correctly_classified_tweets(self, tweets_and_sentiment):
        """
        Classifies the given set of tweets and returns the ones that were correctly classified.
        """
        tweets, sentimentvalues = zip(*tweets_and_sentiment)
        if sentimentvalues is not None:
            self.test_words_and_values = sentimentvalues
        count_vector = self.vect.transform([t.text for t in tweets])
        tfidf_count = self.tfidf_transformer.transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            dict_vector = self.dict_vectorizer.transform([features.get_feature_set(t, self.featureset, v) for t,v in zip(tweets, self.test_words_and_values)])
            tfidf_dict = self.dict_transformer.transform(dict_vector)
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
                
        predictions = self.best_estimator.predict(combined_vector)
        tweets, targets = utils.make_subjectivity_targets(tweets)
        #return the tweets where the target match prediction
        correct_tweets = []
        correct_sentimentvalues = []
        for i in range(len(tweets)):
            if predictions[i]==targets[i]:
                correct_tweets.append(tweets[i])
                correct_sentimentvalues.append(sentimentvalues[i])
        return correct_tweets, correct_sentimentvalues
    
    def set_feature_set(self, featureset, sentimentvalues):
        """
        Extracts and stores the given feature set for classification.
        """
        self.featureset = featureset
        if featureset in ('SA', 'PA'):
            self.only_text_features = True
            self.feature_set = {}
        else:
            words_and_values = sentimentvalues
            self.feature_set = [features.get_feature_set(t, self.featureset, v) for t,v in zip(self.train_tweets,words_and_values)]
        
                
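The core move in train_on_feature_set is stacking a tf-idf text matrix with a tf-idf-weighted DictVectorizer matrix. A minimal standalone sketch of just that step, with invented toy data:

import scipy.sparse as sp
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

texts = ["good happy day", "bad sad day"]
extra = [{"n_exclaims": 3, "emoticon": "pos"}, {"n_exclaims": 0, "emoticon": "neg"}]

tfidf_text = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(texts))
tfidf_dict = TfidfTransformer().fit_transform(DictVectorizer().fit_transform(extra))

combined = sp.hstack([tfidf_text, tfidf_dict])  # one row per tweet: text + dict features
print(combined.shape)  # (2, n_text_terms + n_dict_features)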
Code example #6
File: data_loader.py  Project: afshinrahimi/telstra
# Excerpt: assumes numpy as np, sklearn.preprocessing as prep, and the id_* dicts
# defined earlier in data_loader.py.
train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]
labels = {'0':0, '1':1, '2':2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
test_fake_labels = [train_labels[0]] * len(test_ids)
vectorizer = DictVectorizer()

X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()
save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt', X_train.toarray(), delimiter=',', header=','.join(features))

X_test = vectorizer.transform(test_features)

#scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False
if do_feature_elimination:
    estimator =  RandomForestClassifier(n_estimators=2000, criterion='entropy', max_depth=None, 
                                 min_samples_split=16, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                 max_features='auto', max_leaf_nodes=None, bootstrap=False, oob_score=False, 
                                 n_jobs=10, random_state=None, verbose=0, warm_start=False, class_weight=None)
    selector = RFECV(estimator, step=1, cv=5, scoring='log_loss')  # spelled 'neg_log_loss' in newer scikit-learn
    X_train = selector.fit_transform(X_train, train_labels)
    print('after feature elimination', X_train.shape)
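A self-contained sketch of the same elimination step on synthetic data (assuming a current scikit-learn, hence the 'neg_log_loss' spelling; the sizes are toy values):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

X, y = make_classification(n_samples=60, n_features=8, random_state=0)
estimator = RandomForestClassifier(n_estimators=50, random_state=0)
selector = RFECV(estimator, step=1, cv=3, scoring='neg_log_loss')
X_sel = selector.fit_transform(X, y)
print('after feature elimination', X_sel.shape)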