def classify_lyrics_pos(genre_lyrics_map):
    """Train, test and report genre classifiers using only POS-tag features.

    Args:
        genre_lyrics_map: Mapping of genre -> iterable of song records. Each
            record is read as ``record["features"]["pos_tags_map"]``, and that
            value is used as one sample for a ``DictVectorizer``.

    Returns:
        None. Results are produced by the helper functions called below and
        by a debug print of the vectorized samples.
    """
    pos_tag_samples = []
    genre_labels = []

    # Build two parallel lists: one POS-tag dict per song and its genre label.
    for genre, songs in genre_lyrics_map.items():
        tag_maps = [song["features"]["pos_tags_map"] for song in songs]
        pos_tag_samples.extend(tag_maps)
        genre_labels.extend([genre] * len(tag_maps))

    # Hold out 33% of the samples for testing.
    pos_train, pos_test, genres_train, genres_test = train_test_split(
        pos_tag_samples, genre_labels, test_size=0.33
    )

    # The vectorizer is fitted on the full sample list (train and test), so
    # its feature vocabulary covers every key seen in the data set.
    vectorizer = DictVectorizer()
    vectorizer.fit(pos_tag_samples)
    vect = vectorizer.transform(pos_tag_samples)
    print("vect = " + str(vect))

    classifier_groups = get_classifiers()
    partial_fit_classifiers = classifier_groups["partial"]
    full_fit_classifiers = classifier_groups["full"]

    teach_classifiers(
        partial_fit_classifiers,
        full_fit_classifiers,
        vectorizer,
        pos_train,
        genres_train,
        app_data.LYRICS_GENRES_METAL,
    )

    test_classifiers(
        partial_fit_classifiers,
        full_fit_classifiers,
        vectorizer,
        pos_test,
        genres_test,
    )

    # NOTE(review): the two classifier groups are concatenated with `+`, so
    # both are presumably lists - confirm against get_classifiers().
    print_top_features(
        partial_fit_classifiers + full_fit_classifiers,
        vectorizer,
        app_data.LYRICS_GENRES_METAL,
    )
Example #2
0
def load_vocab_vectorizer(train_set, pickle=True, extra_label="default"):
    """Fit a ``DictVectorizer`` on the combined attributes of a training set.

    The ``attr_map`` items of every object returned by ``train_set.objects()``
    are merged into a single dict (on duplicate keys the later object wins),
    and the vectorizer is fitted on that one merged sample.

    Args:
        train_set: Object exposing ``objects()``, which yields items that each
            have an ``attr_map`` mapping.
        pickle: When truthy, the fitted vectorizer is handed to ``pickle_dv``.
            Note that inside this function the name shadows any module called
            ``pickle``.
        extra_label: Passed through to ``pickle_dv`` unchanged.

    Returns:
        The fitted ``DictVectorizer``.
    """
    vocab_vectorizer = DictVectorizer()

    # Flatten every object's (key, value) pairs into one combined mapping.
    attr_pairs = itertools.chain.from_iterable(
        obj.attr_map.items() for obj in train_set.objects()
    )
    merged_attrs = dict(attr_pairs)

    # Fit on a single-sample list so the vocabulary is the union of all keys.
    vocab_vectorizer.fit([merged_attrs])
    print("vocab length is {}".format(len(vocab_vectorizer.feature_names_)))

    # The merged dict is no longer needed once fitting is done; drop the
    # references before the optional pickling step.
    del attr_pairs, merged_attrs

    if pickle:
        pickle_dv(vocab_vectorizer, extra_label)

    return vocab_vectorizer
def classify_lyrics_mixed_features(genre_lyrics_map):
    """Train, test and report genre classifiers using all song features.

    For each song, the ``"pos_tags_map"`` feature (a dict) is merged into the
    sample, and every other feature is copied in under its own name.

    Args:
        genre_lyrics_map: Mapping of genre -> iterable of song records. Each
            record is read as ``record["features"]``, a mapping of feature
            name to value.

    Returns:
        None. Results are produced by the helper functions called below and
        by a debug print of each song's merged feature dict.
    """
    feature_samples = []
    genre_labels = []

    for genre, songs in genre_lyrics_map.items():
        for song in songs:
            merged = {}

            # Iteration order is kept as-is: on a key collision between a POS
            # tag and another feature name, whichever comes later wins.
            for name, value in song["features"].items():
                if name != "pos_tags_map":
                    # Plain feature (presumably numeric, per the original
                    # author's note): store it under its own name.
                    merged[name] = value
                    continue

                # pos_tags_map is itself a dict: fold its entries into the
                # flat per-song feature dict.
                merged.update(value)

            print("Features: " + str(merged))
            feature_samples.append(merged)
            genre_labels.append(genre)

    # Hold out 33% of the samples for testing.
    features_train, features_test, genres_train, genres_test = train_test_split(
        feature_samples, genre_labels, test_size=0.33
    )

    # The vectorizer is fitted on the full sample list (train and test), so
    # its feature vocabulary covers every key seen in the data set.
    vectorizer = DictVectorizer()
    vectorizer.fit(feature_samples)

    classifier_groups = get_classifiers()
    partial_fit_classifiers = classifier_groups["partial"]
    full_fit_classifiers = classifier_groups["full"]

    teach_classifiers(
        partial_fit_classifiers,
        full_fit_classifiers,
        vectorizer,
        features_train,
        genres_train,
        app_data.LYRICS_GENRES_METAL,
    )

    test_classifiers(
        partial_fit_classifiers,
        full_fit_classifiers,
        vectorizer,
        features_test,
        genres_test,
    )

    print_top_features(
        partial_fit_classifiers + full_fit_classifiers,
        vectorizer,
        app_data.LYRICS_GENRES_METAL,
    )

    # Only the first full-fit classifier gets a detailed report.
    print_classification_report(
        full_fit_classifiers[0], vectorizer, features_test, genres_test
    )