def test_corpus_stop_list_entropy(self):
    """Check the stoplist built from ``self.test_corpus`` with basis 'entropy'.

    Requests ten entries without values and compares the result against a
    fixed list of ten words.
    """
    expected = [
        'ac', 'ad', 'atque', 'cum', 'et',
        'in', 'mihi', 'qui', 'rerum', 'vel',
    ]
    builder = LatinCorpusStoplist()
    actual = builder.build_stoplist(
        self.test_corpus,
        size=10,
        basis='entropy',
        inc_values=False,
    )
    self.assertEqual(actual, expected)
def main():
    """Build a stop word list from a corpus and write it to ``../res/stopwords.txt``.

    Command-line arguments:
        -d / --dataset: path to the input dataset (required).
        -l / --length: integer passed as ``size`` to
            ``CorpusStoplist.build_stoplist`` (required).

    Each document returned by ``Corpus.read`` is lower-cased, runs of
    non-word characters and runs of digits are each replaced by a single
    space, repeated spaces are collapsed, and the result is passed through
    ``LemmatizerLatin.preprocess`` before the stoplist is built. The
    resulting entries are written one per line.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-d", "--dataset", required=True,
                    help="path to input dataset")
    # type=int makes argparse reject a malformed length immediately instead
    # of failing only after the corpus has been read and lemmatized.
    ap.add_argument("-l", "--length", required=True, type=int,
                    help="length of sw list")
    args = vars(ap.parse_args())

    print("[INFO] Importing data...")
    corpus = Corpus(args["dataset"])
    data, _ = corpus.read()

    # Text normalisation: the three substitutions are applied in the same
    # order as before (non-word runs, then digit runs, then space runs).
    non_word = re.compile(r"\W+", flags=re.MULTILINE)
    digits = re.compile(r"\d+", flags=re.MULTILINE)
    spaces = re.compile(r" +", flags=re.MULTILINE)
    cleaned = []
    for doc in data:
        doc = non_word.sub(" ", doc.lower())
        doc = digits.sub(" ", doc)
        cleaned.append(spaces.sub(" ", doc))

    print("[INFO] Lemmatization...")
    le = LemmatizerLatin(token=False)
    cleaned = [le.preprocess(doc) for doc in cleaned]

    sw_list = CorpusStoplist().build_stoplist(cleaned, size=args["length"])

    print("[INFO] Writing list to file...")
    # exist_ok=True replaces the check-then-create sequence, which could
    # fail if the directory appeared between the check and the mkdir call.
    os.makedirs("../res", exist_ok=True)
    # An explicit encoding keeps the output independent of the platform
    # locale.
    with open("../res/stopwords.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{word}\n" for word in sw_list)
def test_corpus_stop_list_freq_sort_words(self):
    """Check the 'frequency' stoplist when ``sort_words`` is disabled.

    Builds a ten-entry stoplist from ``self.test_corpus`` without values and
    with ``sort_words=False``, then compares it, order included, against the
    fixed list below (which is not in alphabetical order).
    """
    expected = [
        'in', 'et', 'vel', 'ac', 'cum',
        'qui', 'atque', 'mihi', 'ad', 'neque',
    ]
    options = {
        'size': 10,
        'basis': 'frequency',
        'inc_values': False,
        'sort_words': False,
    }
    actual = LatinCorpusStoplist().build_stoplist(self.test_corpus, **options)
    self.assertEqual(actual, expected)
def test_corpus_latin(self):
    """Check the stoplist built from ``self.latin_test_corpus`` with basis 'zou'.

    Requests ten entries without values and compares the result against a
    fixed list of ten words.
    """
    expected = [
        'ac', 'atque', 'cum', 'et', 'in',
        'mihi', 'neque', 'qui', 'rerum', 'vel',
    ]
    builder = LatinCorpusStoplist()
    actual = builder.build_stoplist(
        self.latin_test_corpus,
        size=10,
        basis='zou',
        inc_values=False,
    )
    self.assertEqual(actual, expected)
def test_corpus_stop_list_freq_inc_values(self):
    """Check the 'frequency' stoplist when values are included.

    With ``inc_values=True`` the result is compared against a list of
    ``(word, value)`` tuples rather than bare words.
    """
    expected = [
        ('ac', 8),
        ('ad', 5),
        ('atque', 6),
        ('cum', 8),
        ('et', 15),
        ('in', 18),
        ('mihi', 6),
        ('neque', 5),
        ('qui', 7),
        ('vel', 9),
    ]
    builder = LatinCorpusStoplist()
    actual = builder.build_stoplist(
        self.test_corpus,
        size=10,
        basis='frequency',
        inc_values=True,
    )
    self.assertEqual(actual, expected)
def test_corpus_stop_list_variance(self):
    """Check the stoplist built from ``self.test_corpus`` with basis 'variance'.

    Requests ten entries and compares the result against a fixed list of
    ten words.
    """
    expected = [
        'ac', 'atque', 'cum', 'et', 'in',
        'mihi', 'neque', 'qui', 'rerum', 'vel',
    ]
    builder = LatinCorpusStoplist()
    actual = builder.build_stoplist(
        self.test_corpus,
        size=10,
        basis='variance',
    )
    self.assertEqual(actual, expected)
def test_corpus_stop_list_freq_include(self):
    """Check the 'frequency' stoplist when ``include=['est']`` is passed.

    The call requests ``size=10``; the expected list has eleven entries,
    one of which is 'est'.
    """
    expected = [
        'ac', 'ad', 'atque', 'cum', 'est', 'et',
        'in', 'mihi', 'neque', 'qui', 'vel',
    ]
    forced_words = ['est']
    builder = LatinCorpusStoplist()
    actual = builder.build_stoplist(
        self.test_corpus,
        size=10,
        basis='frequency',
        include=forced_words,
    )
    self.assertEqual(actual, expected)
def test_corpus_stop_list_variance(self):
    """Verify the ten-entry stoplist produced with basis 'variance'.

    The stoplist is built from ``self.test_corpus`` and must equal the
    fixed word list below.
    """
    build_options = {'size': 10, 'basis': 'variance'}
    produced = LatinCorpusStoplist().build_stoplist(
        self.test_corpus, **build_options
    )
    wanted = [
        'ac',
        'atque',
        'cum',
        'et',
        'in',
        'mihi',
        'neque',
        'qui',
        'rerum',
        'vel',
    ]
    self.assertEqual(produced, wanted)
def test_corpus_stop_list_freq_include(self):
    """Verify the 'frequency' stoplist built with an ``include`` list.

    ``build_stoplist`` is called with ``size=10`` and ``include=['est']``;
    the result must equal the eleven-entry list below, which contains 'est'.
    """
    build_options = {
        'size': 10,
        'basis': 'frequency',
        'include': ['est'],
    }
    produced = LatinCorpusStoplist().build_stoplist(
        self.test_corpus, **build_options
    )
    wanted = [
        'ac',
        'ad',
        'atque',
        'cum',
        'est',
        'et',
        'in',
        'mihi',
        'neque',
        'qui',
        'vel',
    ]
    self.assertEqual(produced, wanted)