def predict(self, lyrics: str) -> Optional[Tuple[np.ndarray, np.ndarray]]:
    """Predict a label for raw lyrics by scoring their fragments.

    The lyrics are split with ``fragmentize_text``; every fragment is then
    passed through ``preprocess`` and, depending on the instance flags
    ``_removing_stop_words`` and ``_lemmatization``, through
    ``remove_stop_words`` and ``lemmatize_text``.

    Args:
        lyrics: Raw lyrics text.

    Returns:
        ``None`` when no fragment is left after ``remove_empty_fragments``;
        otherwise a ``(label, scores)`` pair of numpy arrays, where
        ``scores`` is the squeezed model output and ``label`` is its argmax
        over the last dimension (kept as a one-element dimension).
    """
    fragments = [
        preprocess(piece,
                   remove_punctuation=True,
                   remove_text_in_brackets=True)
        for piece in fragmentize_text(lyrics)
    ]
    # Optional stages run one after another over the whole fragment list.
    if self._removing_stop_words:
        fragments = list(map(remove_stop_words, fragments))
    if self._lemmatization:
        fragments = list(map(lemmatize_text, fragments))

    # Called for its effect on ``fragments``; its return value is not used.
    remove_empty_fragments(fragments)
    if not fragments:
        return None

    embeddings, lengths = self._get_embeddings(fragments)
    scores = torch.squeeze(self(embeddings, lengths))
    predicted = scores.argmax(dim=-1, keepdim=True)
    return predicted.data.numpy(), scores.data.numpy()
def _get_fragmentized_lyrics_and_emotion_data(
        song_df: pd.DataFrame, removing_stop_words: bool,
        lemmatization: bool) -> Tuple[np.ndarray, List[List[str]]]:
    """Build fragmentized lyrics and encoded emotion labels from a frame.

    Args:
        song_df: Frame providing the ``'lyrics'`` and ``'emotion_4Q'``
            columns.
        removing_stop_words: Apply ``remove_stop_words`` to every fragment.
        lemmatization: Apply ``lemmatize_text`` to every fragment.

    Returns:
        ``(emotion_data, lyrics_data)``: the result of
        ``label_encoder.transform`` on the remaining labels, and the
        per-song lists of processed fragments.
    """
    labels = list(song_df['emotion_4Q'].values)
    songs = [fragmentize_text(text) for text in song_df['lyrics'].values]

    def _clean(fragment):
        return preprocess(fragment,
                          remove_punctuation=True,
                          remove_text_in_brackets=True)

    # Stages are applied one at a time across every song, in this order.
    stages = [_clean]
    if removing_stop_words:
        stages.append(remove_stop_words)
    if lemmatization:
        stages.append(lemmatize_text)
    for stage in stages:
        songs = [[stage(piece) for piece in song] for song in songs]

    # Both helpers below are called only for their effect on the lists
    # passed in; their return values are not used.
    for song in songs:
        remove_empty_fragments(song)
    FragmentizedLyricsDataset._remove_records_without_fragments(labels, songs)

    return label_encoder.transform(labels), songs
def _preprocess_lyrics_in_df(self, song_df: pd.DataFrame, lemmatization: bool,
                             removing_stop_words: bool):
    """Preprocess the ``'lyrics'`` column and drop rows left empty.

    The ``'lyrics'`` column of ``song_df`` is overwritten in place after
    every stage, so the frame passed by the caller is modified.

    Args:
        song_df: Frame with a ``'lyrics'`` column.
        lemmatization: Apply ``lemmatize_text`` to every lyric.
        removing_stop_words: Apply ``remove_stop_words`` to every lyric.

    Returns:
        The selection of ``song_df`` whose ``'lyrics'`` value is not the
        empty string.
    """
    def _clean(text):
        return preprocess(text,
                          remove_punctuation=True,
                          remove_text_in_brackets=True)

    stages = [_clean]
    if removing_stop_words:
        stages.append(remove_stop_words)
    if lemmatization:
        stages.append(lemmatize_text)

    # Write the column back after each stage, as the stages build on it.
    for stage in stages:
        song_df['lyrics'] = song_df['lyrics'].apply(stage)

    has_text = song_df['lyrics'] != ''
    return song_df[has_text]
def predict(self, lyrics: str) -> Optional[Tuple[np.ndarray, np.ndarray]]:
    """Predict a label for raw lyrics treated as one sequence.

    The text goes through ``preprocess`` and, depending on the instance
    flags ``_removing_stop_words`` and ``_lemmatization``, through
    ``remove_stop_words`` and ``lemmatize_text``.

    Args:
        lyrics: Raw lyrics text.

    Returns:
        ``None`` when the processed text equals the empty string; otherwise
        a ``(label, probs)`` pair of numpy arrays, where ``probs`` is the
        softmax of the squeezed model output over the last dimension and
        ``label`` is its argmax (kept as a one-element dimension).
    """
    text = preprocess(lyrics,
                      remove_punctuation=True,
                      remove_text_in_brackets=True)
    if self._removing_stop_words:
        text = remove_stop_words(text)
    if self._lemmatization:
        text = lemmatize_text(text)

    if text == '':
        return None

    embeddings, seq_length = \
        self._get_padded_embeddings_sequence_and_length(text)
    logits = torch.squeeze(self(embeddings, seq_length))
    probabilities = torch.softmax(logits, dim=-1)
    predicted = probabilities.argmax(dim=-1, keepdim=True)
    return predicted.data.numpy(), probabilities.data.numpy()
def create_fasttext_model(large_dataset: bool, remove_stopwords: bool,
                          lemmatization: bool, dim: int = 200) -> None:
    """Train an unsupervised fastText model on lyrics and save it.

    The lyrics are read from a CSV under ``PROJECT_DIR/datasets``,
    preprocessed, dumped to ``TEMP_LYRICS_FILENAME`` and fed to
    ``fasttext.train_unsupervised``. The trained model is moved to
    ``PROJECT_DIR/models/word_embedding/saved_models``.

    The output file name is ``fasttext_model_<dim>`` followed by ``_large``,
    ``_stopwords_removed`` and ``_lemmatization`` for each enabled option,
    then ``.bin``.

    Args:
        large_dataset: Read ``lyrics-data/lyrics-data.csv`` (rows whose
            ``'Idiom'`` is ``'ENGLISH'``, column ``'Lyric'``) instead of
            ``train_dataset.csv`` (column ``'lyrics'``).
        remove_stopwords: Apply ``remove_stop_words`` to every lyric.
        lemmatization: Apply ``lemmatize_text`` to every lyric.
        dim: Dimension passed to ``fasttext.train_unsupervised``.
    """
    model_filename = 'fasttext_model_' + str(dim)
    if large_dataset:
        train_dataset_filepath = os.path.join(PROJECT_DIR, 'datasets',
                                              'lyrics-data', 'lyrics-data.csv')
        df = pd.read_csv(train_dataset_filepath, index_col=0)
        df = df[df['Idiom'] == 'ENGLISH']
        lyric_column_name = 'Lyric'
        model_filename += '_large'
    else:
        train_dataset_filepath = os.path.join(PROJECT_DIR, 'datasets',
                                              'train_dataset.csv')
        df = pd.read_csv(train_dataset_filepath, index_col=0)
        lyric_column_name = 'lyrics'

    # Only the lyric column is transformed, so apply on the column itself
    # rather than row-wise over the whole frame.
    df[lyric_column_name] = df[lyric_column_name].apply(
        lambda text: preprocess(text,
                                remove_punctuation=True,
                                remove_text_in_brackets=True))
    if remove_stopwords:
        df[lyric_column_name] = df[lyric_column_name].apply(remove_stop_words)
        model_filename += '_stopwords_removed'
    if lemmatization:
        df[lyric_column_name] = df[lyric_column_name].apply(lemmatize_text)
        model_filename += '_lemmatization'
    model_filename += '.bin'

    lyrics_data = df[lyric_column_name].values
    try:
        with open(TEMP_LYRICS_FILENAME, 'w', encoding='utf-8') as f:
            for lyric in lyrics_data:
                if not lyric:
                    continue
                f.write(lyric)
                # Terminate every lyric with a newline; otherwise the last
                # word of one lyric is glued to the first word of the next
                # and the corpus contains words that never occurred.
                if not lyric.endswith('\n'):
                    f.write('\n')
        model = fasttext.train_unsupervised(TEMP_LYRICS_FILENAME,
                                            dim=dim, minn=2)
    finally:
        # The corpus dump is only needed during training; remove it even
        # when writing or training fails.
        if os.path.exists(TEMP_LYRICS_FILENAME):
            os.remove(TEMP_LYRICS_FILENAME)

    model_output = os.path.join(PROJECT_DIR, 'models', 'word_embedding',
                                'saved_models', model_filename)
    # NOTE(review): the model is saved to the working directory and then
    # moved; presumably this works around a path limitation in
    # ``save_model`` - confirm before saving to ``model_output`` directly.
    model.save_model(model_filename)
    shutil.move(model_filename, model_output)