Ejemplo n.º 1
0
 def test_case_7(self):
     '''
     Valid case: the factory must hand back an EnglishCelex instance when
     it is called with language code 0 and version 0.
     '''
     lang_code, lang_name, celex_version = 0, "english", 0

     # Build through the factory first, then resolve the expected class.
     celex = build_celex(TEST_CELEX_PATH, lang_code, celex_version)
     expected_type = EnglishCelex

     failure_msg = ("Language code " + str(lang_code) + " should produce "
                    + lang_name + " celex")
     self.assertIsInstance(celex, expected_type, failure_msg)
Ejemplo n.º 2
0
 def test_case_7(self):
     '''
     Valid case: language code 0 combined with version 0 must make the
     factory produce an EnglishCelex object.
     '''
     requested_language = 0
     requested_version = 0
     language_label = "english"

     built = build_celex(
         TEST_CELEX_PATH, requested_language, requested_version)
     wanted_class = EnglishCelex

     # Message shown when the factory returns an object of the wrong class.
     explanation = "Language code " + str(requested_language)
     explanation = explanation + " should produce " + language_label
     explanation = explanation + " celex"

     self.assertIsInstance(built, wanted_class, explanation)
Ejemplo n.º 3
0
    def test_case_8(self):
        '''
        Valid Case: Assert EnglishCelex used by factory when language is 0 and version is 1

        Also checks that the built object exposes the requested version and
        the "english" language name through its ``version`` and ``language``
        attributes.
        '''
        language = 0
        language_str = "english"
        version = 1

        obj = build_celex(TEST_CELEX_PATH, language, version)
        cls = EnglishCelex

        self.assertIsInstance(obj, cls, "Language code " + str(language) + " should produce " + language_str + " celex")
        # assertEqual is used instead of the deprecated assertEquals alias,
        # which no longer exists in unittest as of Python 3.12.
        self.assertEqual(version, obj.version, "Expected: " + str(version) + " Found: " + str(obj.version))
        self.assertEqual(language_str, obj.language, "Expected: " + language_str + " Found: " + str(obj.language))
Ejemplo n.º 4
0
    def test_case_12(self):
        '''
        Valid Case: Assert GermanCelex used by factory when language is 1 and version is 2

        Also checks that the built object exposes the requested version and
        the "german" language name through its ``version`` and ``language``
        attributes.
        '''
        language = 1
        language_str = "german"
        version = 2

        obj = build_celex(TEST_CELEX_PATH, language, version)
        cls = GermanCelex

        self.assertIsInstance(obj, cls, "Language code " + str(language) + " should produce " + language_str + " celex")
        # assertEqual is used instead of the deprecated assertEquals alias,
        # which no longer exists in unittest as of Python 3.12.
        self.assertEqual(version, obj.version, "Expected: " + str(version) + " Found: " + str(obj.version))
        self.assertEqual(language_str, obj.language, "Expected: " + language_str + " Found: " + str(obj.language))
Ejemplo n.º 5
0
    def test_case_8(self):
        '''
        Valid Case: Assert EnglishCelex used by factory when language is 0 and version is 1

        Beyond the type check, the object's ``version`` and ``language``
        attributes must match the requested version and "english".
        '''
        language = 0
        language_str = "english"
        version = 1

        obj = build_celex(TEST_CELEX_PATH, language, version)
        cls = EnglishCelex

        self.assertIsInstance(
            obj, cls, "Language code " + str(language) + " should produce " +
            language_str + " celex")
        # The assertEquals alias is deprecated and absent from unittest in
        # Python 3.12+, so the canonical assertEqual is called instead.
        self.assertEqual(
            version, obj.version,
            "Expected: " + str(version) + " Found: " + str(obj.version))
        self.assertEqual(
            language_str, obj.language,
            "Expected: " + language_str + " Found: " + str(obj.language))
Ejemplo n.º 6
0
    def test_case_12(self):
        '''
        Valid Case: Assert GermanCelex used by factory when language is 1 and version is 2

        Beyond the type check, the object's ``version`` and ``language``
        attributes must match the requested version and "german".
        '''
        language = 1
        language_str = "german"
        version = 2

        obj = build_celex(TEST_CELEX_PATH, language, version)
        cls = GermanCelex

        self.assertIsInstance(
            obj, cls, "Language code " + str(language) + " should produce " +
            language_str + " celex")
        # The assertEquals alias is deprecated and absent from unittest in
        # Python 3.12+, so the canonical assertEqual is called instead.
        self.assertEqual(
            version, obj.version,
            "Expected: " + str(version) + " Found: " + str(obj.version))
        self.assertEqual(
            language_str, obj.language,
            "Expected: " + language_str + " Found: " + str(obj.language))
                for word in bigram:
                    word = word.lower()
                    if word in english_words:
                        text += (word + ' ') * int(line[1])
                    else:
                        current_non_english_words.add(word)
                
        return (text, current_non_english_words)
    
    except IOError:
        print >> sys.stderr, "IOError while reading file: " + fname
        return None


# get the dictionary
# Built through the celex factory with language code 0 and version 0; the
# resulting object is handed to expand_file for word lookups.
english_words = build_celex(DROPBOX_FOLDER + "celex2/", 0, 0)

# Union of the rejected-word sets returned by expand_file for every file.
non_english_words = set()

# expand corpus files
for filee in sorted(os.listdir(ARTICLES_FOLDER + "bigrams/")):
    # Only names ending in upper-case ".CSV" are processed.
    if not filee.endswith(".CSV"):
        continue

    fname = ARTICLES_FOLDER + "bigrams/" + filee

    print (fname)

    # expand_file returns None when it hits an IOError (after reporting it on
    # stderr). Unpacking None would raise TypeError and abort the whole run,
    # so an unreadable file is skipped instead.
    expanded = expand_file(fname, english_words)
    if expanded is None:
        continue

    (text, current_non_eng_words) = expanded
    non_english_words |= current_non_eng_words