def test_trigram_count(self):
    df = build_corpus(pd.DataFrame(data=self.raw_data))
    ngram_extractor = NGram()
    ngram_extractor.extract_features(df, trigram_threshold=2)
    self.assertEqual([0, 0, 0, 0], df["hate_speech_trigrams"].tolist())
def test_corpus(self):
    corpus = build_corpus(pd.DataFrame(data=self.raw_data))
    self.assertTrue("p.m." in corpus.iloc[0]["tokens"])  # check correct punctuation removal
    self.assertTrue("will" in corpus.iloc[1]["tokens"])  # check correct lemmatization
    self.assertTrue(
        "http://www.spamlaws.com/state/summary.html" in corpus.iloc[2]["tokens"]
    )  # check URLs survive tokenization as single tokens
def test_pattern_count(self):
    df = build_corpus(pd.DataFrame(data=self.raw_data))
    pattern_extractor = Pattern(min_pattern_size=2, max_pattern_size=2, threshold=2)
    pattern_extractor.extract_features(df)
    self.assertEqual([2, 2, 2, 1], df["pattern_count"].tolist())
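# The assertions above pin down the fixture: four documents, one containing
# "p.m.", one whose verb lemmatizes to "will", and one containing a URL.
# A hypothetical setUp consistent with those checks (the texts and the column
# name are assumptions, not the project's actual fixture):
def setUp(self):
    self.raw_data = {
        "tweet": [
            "Meet me at 5 p.m. sharp",
            "He will be there tomorrow",
            "Report spam at http://www.spamlaws.com/state/summary.html",
            "Just a fourth plain document",
        ]
    }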
def run_feature_extraction_create_corpus(run_from_scratch, df_preprocessed):
    """Build the corpus from scratch if run_from_scratch is True and cache it
    to disk; otherwise load the previously extracted corpus from the cache."""
    corpus_path = str(get_project_root()) + "/data/extracted_features/corpus.csv"
    if run_from_scratch:
        df_corpus = build_corpus(df_preprocessed)
        df_corpus.to_csv(corpus_path)
    else:
        df_corpus = pd.read_csv(corpus_path)
    return df_corpus
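# Minimal usage sketch for the helper above; df_preprocessed and its "tweet"
# column are hypothetical stand-ins for the project's real preprocessed frame.
df_preprocessed = pd.DataFrame({"tweet": ["Meet me at 5 p.m.", "He will come"]})
df_corpus = run_feature_extraction_create_corpus(True, df_preprocessed)   # build and cache
df_corpus = run_feature_extraction_create_corpus(False, df_preprocessed)  # reuse the cache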
import pandas as pd
from gensim import corpora
from gensim.models import LdaMulticore


class LDATopic:
    def __init__(self, num_topics=2, num_workers=4):
        self.num_topics = num_topics
        self.num_workers = num_workers

    def extract_features(self, df):
        """Extract LDA topics and assign each document its most probable topic."""
        id2word = corpora.Dictionary(df["tokens"])
        corpus = [id2word.doc2bow(doc) for doc in df["tokens"]]
        lda_model = LdaMulticore(
            corpus=corpus,
            id2word=id2word,
            num_topics=self.num_topics,
            workers=self.num_workers,
        )
        # pick the topic id with the highest probability; lda_model[bow] may omit
        # low-probability topics, so take the max over the (topic, prob) pairs
        # rather than argmax over the (possibly filtered) probability list
        df["topic"] = df["tokens"].apply(
            lambda tokens: max(
                lda_model[id2word.doc2bow(tokens)], key=lambda pair: pair[1]
            )[0]
        )
        return df


if __name__ == "__main__":
    df_dataset = pd.read_csv("../../data/preprocessed/dataset.csv", index_col=0)
    df_dataset = build_corpus(df_dataset)  # build_corpus comes from the project's corpus module
    lda_topic_extractor = LDATopic(2)
    lda_topic_extractor.extract_features(df_dataset)
    print(df_dataset)
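# A quick sanity check that could be appended to the __main__ block above:
# per-topic document counts, plus a few sample token lists per topic
# (plain pandas; no project-specific assumptions).
print(df_dataset["topic"].value_counts())
for topic_id, group in df_dataset.groupby("topic"):
    print(topic_id, group["tokens"].head(3).tolist())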