def test_remove_keyword_via_file(self):
    """Removing a keyword (case-insensitively) must persist to the backing file."""
    self.word = KeyWords('./temp.csv')
    self.word.add_keyword('hello')
    self.word.remove_keyword('Hello')
    # Re-read the file through a fresh instance to prove the removal was saved.
    reloaded = KeyWords('./temp.csv')
    self.assertEqual(reloaded.get_keywords(), [],
                     "KeyWords couldn't remove keywords from a file")
class KeyWordUnit(unittest.TestCase):
    """Exercises KeyWords: construction, argument validation, file
    persistence, keyword add/remove, and occurrence counting."""

    def setUp(self):
        # Every test starts with a blank KeyWords and an empty scratch file.
        self.word = KeyWords()
        self.file = open('temp.csv', 'w')

    def tearDown(self):
        # Drop the instance and delete the scratch file.
        self.word = None
        self.file.close()
        os.remove('temp.csv')

    # ---- construction --------------------------------------------------

    def test_blank_init(self):
        self.assertEqual(self.word.get_keywords(), [],
                         "KeyWords doesn't initialize empty correctly.")

    def test_bad_path(self):
        with self.assertRaises(FileNotFoundError):
            self.word = KeyWords('./BAD_PATH.NONEXISTENT')

    def test_init_type_error(self):
        with self.assertRaises(TypeError):
            self.word = KeyWords(69)

    # ---- argument validation -------------------------------------------

    def test_add_type_error(self):
        with self.assertRaises(TypeError):
            self.word.add_keyword(69)

    def test_remove_type_error(self):
        with self.assertRaises(TypeError):
            self.word.remove_keyword(69)

    def test_occurrence_type_error(self):
        with self.assertRaises(TypeError):
            self.word.occurrence(69)

    # ---- file-backed loading -------------------------------------------

    def test_good_path(self):
        self.word = KeyWords('./temp.csv')
        self.assertEqual(self.word.get_keywords(), [],
                         "KeyWords couldn't access the newly made file correctly.")

    def test_single_word(self):
        self.file.write("hello")
        self.file.close()
        self.word = KeyWords('./temp.csv')
        self.assertEqual(self.word.get_keywords(), ['hello'],
                         "KeyWords couldn't get keywords from the file correctly.")

    def test_multiple_words(self):
        self.file.write("hello,world")
        self.file.close()
        self.word = KeyWords('./temp.csv')
        self.assertEqual(self.word.get_keywords(), ['hello', 'world'],
                         "KeyWords couldn't get keywords from the file correctly.")

    # ---- add / remove ---------------------------------------------------

    def test_added_keyword(self):
        self.word.add_keyword('hello')
        self.assertEqual(self.word.get_keywords(), ['hello'],
                         "KeyWords couldn't add keywords correctly")

    def test_add_keyword_via_file(self):
        self.word = KeyWords('./temp.csv')
        self.word.add_keyword('hello')
        # A second instance reading the same file must see the addition.
        reloaded = KeyWords('./temp.csv')
        self.assertEqual(reloaded.get_keywords(), ['hello'],
                         "KeyWords couldn't add keywords correctly to a file")

    def test_removed_keyword(self):
        self.word.add_keyword('hello')
        # Removal appears to be case-insensitive ('Hello' removes 'hello').
        self.word.remove_keyword('Hello')
        self.assertEqual(self.word.get_keywords(), [],
                         "KeyWords couldn't remove a keyword correctly")

    def test_remove_keyword_via_file(self):
        self.word = KeyWords('./temp.csv')
        self.word.add_keyword('hello')
        self.word.remove_keyword('Hello')
        reloaded = KeyWords('./temp.csv')
        self.assertEqual(reloaded.get_keywords(), [],
                         "KeyWords couldn't remove keywords from a file")

    # ---- occurrence counting -------------------------------------------

    def test_single_occurrence(self):
        self.word.add_keyword('hello')
        self.assertEqual(
            self.word.occurrence(
                "Hello None of this hello text makes yhello much hElLo"),
            [('hello', 3)],
            "Couldn't count all instances of a keyword")

    def test_multi_occurrence(self):
        self.word.add_keyword('hello')
        self.word.add_keyword('world')
        self.assertEqual(
            self.word.occurrence(
                "Hello None world this helloworld text WoRld yhello much hElLo"
            ),
            [('hello', 2), ('world', 2)],
            "Couldn't count all instances of multiple keyword")

    def test_empty_occurrenceA(self):
        self.word.add_keyword('hello')
        self.assertEqual(self.word.occurrence(""), [('hello', 0)],
                         "Couldn't handle empty text")

    def test_empty_occurrenceB(self):
        self.assertEqual(self.word.occurrence("This is a fun test text"), [],
                         "Couldn't handle empty KeyWords")
# -*- coding: utf-8 -*-
"""Score 'script.txt' against a transcript corpus and print its top keywords."""
import sys
import codecs

# Re-wrap stdout/stderr so console output is encoded as cp850 unless that
# is already the active encoding (presumably a Windows console code page).
if sys.stdout.encoding != 'cp850':
    sys.stdout = codecs.getwriter('cp850')(sys.stdout.buffer, 'strict')
if sys.stderr.encoding != 'cp850':
    sys.stderr = codecs.getwriter('cp850')(sys.stderr.buffer, 'strict')

from keywords import KeyWords
from nltk.corpus import stopwords

with open('script.txt', 'r') as f:
    data = f.read()

# NOTE(review): corpus_2 and corpus_3 are loaded but never used below —
# only corpus_1 feeds the extractor. Confirm whether they are still needed.
with open('transcript_1.txt', 'r', encoding="utf8") as f1:
    corpus_1 = f1.read()
with open('transcript_2.txt', 'r', encoding="utf8") as f2:
    corpus_2 = f2.read()
with open('transcript_3.txt', 'r', encoding="utf8") as f3:
    corpus_3 = f3.read()

stopWords = stopwords.words('english')
keyword = KeyWords(corpus=corpus_1, stop_words=stopWords, alpha=0.8)

# Print the 20 highest-scoring (keyword, score) pairs.
for term, score in keyword.get_keywords(data, n=20):
    print("Keyword : %s \n Score : %f" % (term, score))
def test_add_keyword_via_file(self):
    """A keyword added through one instance must be visible when the
    backing file is re-read by a second instance."""
    self.word = KeyWords('./temp.csv')
    self.word.add_keyword('hello')
    reloaded = KeyWords('./temp.csv')
    self.assertEqual(reloaded.get_keywords(), ['hello'],
                     "KeyWords couldn't add keywords correctly to a file")
# Keep roughly the top ~36% (1/2.8) most frequent tokens as the base vocabulary.
# (Dead intermediate `top_k_words = fdist.most_common(k)` removed — it was
# immediately overwritten by the zip/unzip below.)
k = int(len(fdist) / 2.8)
top_k_words, _ = zip(*fdist.most_common(k))
top_k_words = set(top_k_words)

dfToList = df['text'].tolist()
final_list = []

# Loop-invariant setup hoisted out of the loop: the corpus file and the
# stop-word list were re-read/re-built on every sampled iteration even
# though they are identical each time.
with open('testcorpus.txt', 'r', encoding="utf8") as f1:
    corpus_1 = f1.read()
stopWords = stopwords.words('english')

# Mine the top-2 keywords from every 9th document.
for i in range(0, len(dfToList), 9):
    # A fresh extractor per document, in case KeyWords keeps internal state
    # across get_keywords calls (matches the original per-iteration build).
    keyword = KeyWords(corpus=corpus_1, stop_words=stopWords, alpha=0.8)
    d = keyword.get_keywords(str(dfToList[i]), n=2)
    for pair in d:
        # pair[0] is the keyword phrase; split it into individual tokens.
        final_list.extend(word_tokenize(pair[0]))

# Stem the mined keywords and fold them into the vocabulary set.
ps = PorterStemmer()
for token in final_list:
    top_k_words.add(ps.stem(token))

# Restrict each document's tokens to the vocabulary and record lengths.
df['tokenized'] = df['tokenized'].apply(keep_top_k_words)
df['doc_len'] = df['tokenized'].apply(len)
doc_lengths = list(df['doc_len'])