def test_case_7(self):
    '''
    Valid Case: the factory must return an EnglishCelex instance when it
    is called with language code 0 and version 0
    '''
    lang_code, lang_name, celex_version = 0, "english", 0
    celex = build_celex(TEST_CELEX_PATH, lang_code, celex_version)
    expected_cls = EnglishCelex
    # Failure message names the language code and the celex it should map to.
    failure_msg = ("Language code " + str(lang_code) +
                   " should produce " + lang_name + " celex")
    self.assertIsInstance(celex, expected_cls, failure_msg)
def test_case_7(self):
    '''
    Valid Case: build_celex called with language 0 and version 0 is
    expected to produce an instance of EnglishCelex
    '''
    requested_language = 0
    requested_version = 0
    language_label = "english"
    built = build_celex(TEST_CELEX_PATH, requested_language,
                        requested_version)
    # Pieces of the failure message, joined without a separator.
    message_parts = [
        "Language code ",
        str(requested_language),
        " should produce ",
        language_label,
        " celex",
    ]
    self.assertIsInstance(built, EnglishCelex, "".join(message_parts))
def test_case_8(self):
    '''
    Valid Case: Assert EnglishCelex used by factory when language is 0
    and version is 1

    Builds a celex through build_celex and checks that the result is an
    EnglishCelex, that its version attribute equals the requested
    version, and that its language attribute equals "english".
    '''
    language = 0
    language_str = "english"
    version = 1
    obj = build_celex(TEST_CELEX_PATH, language, version)
    cls = EnglishCelex
    self.assertIsInstance(
        obj, cls,
        "Language code " + str(language) + " should produce " +
        language_str + " celex")
    # assertEqual is the supported name; the assertEquals alias is
    # deprecated and no longer exists in recent unittest versions.
    self.assertEqual(
        version, obj.version,
        "Expected: " + str(version) + " Found: " + str(obj.version))
    self.assertEqual(
        language_str, obj.language,
        "Expected: " + language_str + " Found: " + str(obj.language))
def test_case_12(self):
    '''
    Valid Case: Assert GermanCelex used by factory when language is 1
    and version is 2

    Builds a celex through build_celex and checks that the result is a
    GermanCelex, that its version attribute equals the requested
    version, and that its language attribute equals "german".
    '''
    language = 1
    language_str = "german"
    version = 2
    obj = build_celex(TEST_CELEX_PATH, language, version)
    cls = GermanCelex
    self.assertIsInstance(
        obj, cls,
        "Language code " + str(language) + " should produce " +
        language_str + " celex")
    # assertEqual is the supported name; the assertEquals alias is
    # deprecated and no longer exists in recent unittest versions.
    self.assertEqual(
        version, obj.version,
        "Expected: " + str(version) + " Found: " + str(obj.version))
    self.assertEqual(
        language_str, obj.language,
        "Expected: " + language_str + " Found: " + str(obj.language))
def test_case_8(self):
    '''
    Valid Case: Assert EnglishCelex used by factory when language is 0
    and version is 1

    Three assertions are made on the object returned by build_celex:
    it is an EnglishCelex, obj.version matches the requested version,
    and obj.language matches "english".
    '''
    language = 0
    language_str = "english"
    version = 1
    obj = build_celex(TEST_CELEX_PATH, language, version)
    cls = EnglishCelex
    self.assertIsInstance(
        obj, cls,
        "Language code " + str(language) + " should produce " +
        language_str + " celex")
    # Use assertEqual rather than the deprecated assertEquals alias so
    # the test keeps running on unittest versions that dropped the alias.
    self.assertEqual(
        version, obj.version,
        "Expected: " + str(version) + " Found: " + str(obj.version))
    self.assertEqual(
        language_str, obj.language,
        "Expected: " + language_str + " Found: " + str(obj.language))
def test_case_12(self):
    '''
    Valid Case: Assert GermanCelex used by factory when language is 1
    and version is 2

    Three assertions are made on the object returned by build_celex:
    it is a GermanCelex, obj.version matches the requested version,
    and obj.language matches "german".
    '''
    language = 1
    language_str = "german"
    version = 2
    obj = build_celex(TEST_CELEX_PATH, language, version)
    cls = GermanCelex
    self.assertIsInstance(
        obj, cls,
        "Language code " + str(language) + " should produce " +
        language_str + " celex")
    # Use assertEqual rather than the deprecated assertEquals alias so
    # the test keeps running on unittest versions that dropped the alias.
    self.assertEqual(
        version, obj.version,
        "Expected: " + str(version) + " Found: " + str(obj.version))
    self.assertEqual(
        language_str, obj.language,
        "Expected: " + language_str + " Found: " + str(obj.language))
for word in bigram: word = word.lower() if word in english_words: text += (word + ' ') * int(line[1]) else: current_non_english_words.add(word) return (text, current_non_english_words) except IOError: print >> sys.stderr, "IOError while reading file: " + fname return None # get the dictionary english_words = build_celex(DROPBOX_FOLDER + "celex2/", 0, 0) non_english_words = set() # expand corpus files for filee in sorted(os.listdir(ARTICLES_FOLDER + "bigrams/")): if not filee.endswith(".CSV"): continue fname = ARTICLES_FOLDER + "bigrams/" + filee print (fname) (text, current_non_eng_words) = expand_file(fname, english_words) non_english_words |= current_non_eng_words