def classify_lyrics_pos(genre_lyrics_map):
    """Train and evaluate genre classifiers on part-of-speech tag features.

    Args:
        genre_lyrics_map: Mapping of genre -> iterable of song entries. Each
            entry is indexed as ``song["features"]["pos_tags_map"]``, which is
            handed to a ``DictVectorizer`` as one sample.

    The samples are split with ``test_size=0.33``; the vectorizer itself is
    fitted on *all* samples (train and test) before the classifiers returned
    by ``get_classifiers()`` are taught, tested and their top features printed.
    """
    tag_samples = []
    genre_labels = []
    # One sample (POS-tag dict) and one label (genre) per song.
    for genre, songs in genre_lyrics_map.items():
        for song in songs:
            tag_samples.append(song["features"]["pos_tags_map"])
            genre_labels.append(genre)

    pos_train, pos_test, genres_train, genres_test = train_test_split(
        tag_samples, genre_labels, test_size=0.33
    )

    # The vocabulary is learned from the full data set, not only the
    # training split.
    vectorizer = DictVectorizer()
    vectorizer.fit(tag_samples)

    # Debug output: the vectorized form of every sample.
    vect = vectorizer.transform(tag_samples)
    print("vect = " + str(vect))

    classifiers = get_classifiers()
    partial_fit_classifiers = classifiers["partial"]
    full_fit_classifiers = classifiers["full"]

    teach_classifiers(
        partial_fit_classifiers,
        full_fit_classifiers,
        vectorizer,
        pos_train,
        genres_train,
        app_data.LYRICS_GENRES_METAL,
    )
    test_classifiers(
        partial_fit_classifiers,
        full_fit_classifiers,
        vectorizer,
        pos_test,
        genres_test,
    )
    print_top_features(
        partial_fit_classifiers + full_fit_classifiers,
        vectorizer,
        app_data.LYRICS_GENRES_METAL,
    )
def load_vocab_vectorizer(train_set, pickle=True, extra_label="default"):
    """Fit a ``DictVectorizer`` on the merged attribute maps of ``train_set``.

    Args:
        train_set: Object exposing ``objects()``; every yielded item must have
            an ``attr_map`` dict.
        pickle: When truthy, the fitted vectorizer is passed to
            ``pickle_dv`` together with ``extra_label``. (The name is kept
            for backward compatibility with keyword callers.)
        extra_label: Forwarded unchanged to ``pickle_dv``.

    Returns:
        The fitted ``DictVectorizer``.
    """
    train_dv = DictVectorizer()

    # All attr_maps are merged into ONE dict, so for a key that appears in
    # several objects only the last value survives.
    # NOTE(review): for string-valued attributes this means only the last
    # seen value per key enters the vocabulary - confirm this is intended.
    merged_attrs = dict(
        itertools.chain.from_iterable(
            train_set_obj.attr_map.items()
            for train_set_obj in train_set.objects()
        )
    )

    # fit the dv first
    train_dv.fit([merged_attrs])
    print("vocab length is {}".format(len(train_dv.feature_names_)))

    # Drop the (potentially large) merged dict before pickling.
    del merged_attrs

    if pickle:
        pickle_dv(train_dv, extra_label)
    return train_dv
def classify_lyrics_mixed_features(genre_lyrics_map):
    """Train and evaluate genre classifiers on POS tags plus other features.

    Args:
        genre_lyrics_map: Mapping of genre -> iterable of song entries. Each
            entry's ``song["features"]`` dict is flattened into one sample:
            the nested ``"pos_tags_map"`` dict is merged in key by key, every
            other feature is copied under its own name.

    The samples are split with ``test_size=0.33``; the vectorizer is fitted on
    *all* samples before the classifiers from ``get_classifiers()`` are
    taught and tested. Top features are printed for every classifier and a
    classification report for the first "full" classifier.
    """
    samples = []
    labels = []
    for genre, songs in genre_lyrics_map.items():
        for song in songs:
            flat_features = {}
            for name, value in song["features"].items():
                if name != "pos_tags_map":
                    # Plain feature (numeric, per the original comment):
                    # stored under its own name.
                    flat_features[name] = value
                else:
                    # pos_tags_map is itself a dict - merge its entries.
                    flat_features.update(value)
            print("Features: " + str(flat_features))
            samples.append(flat_features)
            labels.append(genre)

    features_train, features_test, genres_train, genres_test = train_test_split(
        samples, labels, test_size=0.33
    )

    # The vocabulary is learned from the full data set, not only the
    # training split.
    vectorizer = DictVectorizer()
    vectorizer.fit(samples)

    classifiers = get_classifiers()
    partial_fit_classifiers = classifiers["partial"]
    full_fit_classifiers = classifiers["full"]

    teach_classifiers(
        partial_fit_classifiers,
        full_fit_classifiers,
        vectorizer,
        features_train,
        genres_train,
        app_data.LYRICS_GENRES_METAL,
    )
    test_classifiers(
        partial_fit_classifiers,
        full_fit_classifiers,
        vectorizer,
        features_test,
        genres_test,
    )
    print_top_features(
        partial_fit_classifiers + full_fit_classifiers,
        vectorizer,
        app_data.LYRICS_GENRES_METAL,
    )
    print_classification_report(
        full_fit_classifiers[0], vectorizer, features_test, genres_test
    )