def test_get_files_in_train_and_test_sets():
    """A corpus rebuilt from a file dict reports that same file dict."""
    expected = corpus.get_files_in_train_and_test_sets()

    # Construct a second corpus purely from the file dict of the first one.
    rebuilt = CPTCorpus(file_dict=expected,
                        topicLines=[0],
                        opinionLines=[1])

    assert_equal(expected, rebuilt.get_files_in_train_and_test_sets())
def test_loop_over_testSet():
    """Looping over testSet() ends on the expected final index values."""
    split_corpus = CPTCorpus(persp_dirs,
                             testSplit=20,
                             topicLines=[0],
                             opinionLines=[1])

    # Exhaust the iterator; only the values bound by the last iteration
    # are inspected below.
    for d, persp, d_p, _doc in split_corpus.testSet():
        pass

    checks = (
        (d, 1),
        (persp, len(corpus.perspectives) - 1),
        (d_p, 0),
    )
    for actual, expected in checks:
        yield assert_equal, actual, expected
def get_corpus(params):
    """Return the CPTCorpus for an experiment, building it only if needed.

    When ``corpus.json`` already exists at the output location, the corpus
    is loaded from that file together with the saved ``topicDict.dict`` and
    ``opinionDict.dict``. Otherwise a new corpus is created from the files
    matching the input glob pattern, its dictionaries are optionally
    filtered, and both the dictionaries and ``corpus.json`` are saved.

    Args:
        params: Mapping-like object (only ``.get`` is used) providing:
            ``outDir`` -- string on which ``.format(<file name>)`` is
            called to obtain output paths (presumably it contains a single
            ``{}`` placeholder; confirm against the experiment files);
            ``inputData`` -- glob pattern for the input files, only read
            when the corpus has to be built;
            ``testSplit``, ``topicLines``, ``opinionLines`` -- passed on
            to ``CPTCorpus``;
            ``minFreq``, ``removeTopTF``, ``removeTopDF`` -- optional
            arguments for ``filter_dictionaries``; filtering is skipped
            when all three are ``None``.

    Returns:
        The newly created or loaded ``CPTCorpus`` instance.
    """
    out_dir = params.get('outDir')
    corpus_file = out_dir.format('corpus.json')
    topic_lines = params.get('topicLines')
    opinion_lines = params.get('opinionLines')

    if os.path.isfile(corpus_file):
        # A previously saved corpus exists: reuse it and its dictionaries.
        return CPTCorpus.load(file_name=corpus_file,
                              topicLines=topic_lines,
                              opinionLines=opinion_lines,
                              topicDict=out_dir.format('topicDict.dict'),
                              opinionDict=out_dir.format('opinionDict.dict'))

    # The input files are only needed when building, so glob lazily here.
    files = glob(params.get('inputData'))
    corpus = CPTCorpus(files,
                       testSplit=params.get('testSplit'),
                       topicLines=topic_lines,
                       opinionLines=opinion_lines)

    min_freq = params.get('minFreq')
    remove_top_tf = params.get('removeTopTF')
    remove_top_df = params.get('removeTopDF')
    filter_values = (min_freq, remove_top_tf, remove_top_df)
    if any(value is not None for value in filter_values):
        corpus.filter_dictionaries(minFreq=min_freq,
                                   removeTopTF=remove_top_tf,
                                   removeTopDF=remove_top_df)

    corpus.save_dictionaries(directory=out_dir.format(''))
    corpus.save(corpus_file)
    return corpus
def test_get_files_in_train_and_test_sets_testSplit():
    """File dicts stay equal after a rebuild when testSplit is used."""
    split_corpus = CPTCorpus(persp_dirs,
                             testSplit=40,
                             topicLines=[0],
                             opinionLines=[1])
    expected = split_corpus.get_files_in_train_and_test_sets()

    # Rebuild from the file dict alone; no testSplit is passed this time.
    rebuilt = CPTCorpus(file_dict=expected,
                        topicLines=[0],
                        opinionLines=[1])

    assert_equal(expected, rebuilt.get_files_in_train_and_test_sets())
def test_loop_over_testSet():
    """Looping over testSet() ends on the expected final index values."""
    split_corpus = CPTCorpus(persp_dirs,
                             testSplit=20,
                             topicLines=[0],
                             opinionLines=[1])

    # Run the iterator to completion; the assertions below look only at
    # the values left over from the final iteration.
    for d, persp, d_p, _doc in split_corpus.testSet():
        pass

    checks = (
        (d, 1),
        (persp, len(corpus.perspectives) - 1),
        (d_p, 0),
    )
    for actual, expected in checks:
        yield assert_equal, actual, expected
def test_loading_of_dictionaries():
    """Dictionaries passed to the constructor end up on the new corpus."""
    corpus2 = CPTCorpus(persp_dirs,
                        topicDict=corpus.topicDictionary,
                        opinionDict=corpus.opinionDictionary,
                        topicLines=[0],
                        opinionLines=[1])

    # Same comparison for both corpus-wide dictionaries, topic first.
    for attr in ('topicDictionary', 'opinionDictionary'):
        yield assert_equal, getattr(corpus, attr), getattr(corpus2, attr)
def test_load_corpus_from_file():
    """A corpus saved to json and loaded again has the same file dict."""
    expected = corpus.get_files_in_train_and_test_sets()

    json_path = '{}/corpus.json'.format(data_dir)
    corpus.save(json_path)
    loaded = CPTCorpus.load(json_path, topicLines=[0], opinionLines=[1])

    assert_equal(expected, loaded.get_files_in_train_and_test_sets())
def test_illigal_values_for_testSplit():
    """Illegal testSplit values leave every perspective without test data."""
    for bad_split in (-1, 0, 1000):
        corpus2 = CPTCorpus(persp_dirs,
                            testSplit=bad_split,
                            topicLines=[0],
                            opinionLines=[1])
        for perspective in corpus2.perspectives:
            # Both the list of test files and the test set input are empty.
            yield assert_equal, perspective.testFiles, []
            yield assert_equal, perspective.testSet.input, []
def test_loading_of_dictionaries_from_file():
    """Dictionaries saved to disk can be passed to CPTCorpus by file name."""
    corpus.save_dictionaries(directory=dict_dir)

    topic_file = corpus.topic_dict_file_name(dict_dir)
    opinion_file = corpus.opinion_dict_file_name(dict_dir)
    corpus2 = CPTCorpus(persp_dirs,
                        topicDict=topic_file,
                        opinionDict=opinion_file,
                        topicLines=[0],
                        opinionLines=[1])

    # Same comparison for both corpus-wide dictionaries, topic first.
    for attr in ('topicDictionary', 'opinionDictionary'):
        yield assert_equal, getattr(corpus, attr), getattr(corpus2, attr)
def test_testSet():
    """With testSplit=20 each perspective has 4 train and 1 test document."""
    split_corpus = CPTCorpus(persp_dirs,
                             testSplit=20,
                             topicLines=[0],
                             opinionLines=[1])

    yield assert_equal, len(split_corpus), 8

    for perspective in split_corpus.perspectives:
        # Check that the attribute exists before its length is asserted.
        has_test_set = hasattr(perspective, 'testSet')
        yield assert_equal, has_test_set, True
        yield assert_equal, len(perspective.trainSet), 4
        yield assert_equal, len(perspective.testSet), 1
def setup():
    """Create the module-level fixtures shared by the corpus tests.

    Sets the globals ``data_dir``, ``persp_dirs``, ``dict_dir``,
    ``documents`` and ``corpus``.
    """
    global data_dir, persp_dirs, dict_dir, documents, corpus

    data_dir = 'test_data/'
    dict_dir = 'test_dict'

    # One directory per perspective, located directly below data_dir.
    persp_dirs = []
    for name in ('p0', 'p1'):
        persp_dirs.append('{}{}'.format(data_dir, name))

    # Generate the documents first; the corpus is then read from persp_dirs.
    documents = generateCPTCorpus.generate_cpt_corpus(data_dir)
    corpus = CPTCorpus(persp_dirs, topicLines=[0], opinionLines=[1])
def test_perplexity():
    """Minimal check: both perplexity values are above 0 and below inf."""
    split_corpus = CPTCorpus(persp_dirs,
                             testSplit=20,
                             topicLines=[0],
                             opinionLines=[1])

    sampler2 = GibbsSampler(split_corpus, nTopics=3, nIter=5)
    sampler2._initialize()
    sampler2.run()

    # perplexity() returns a pair; each value gets the same two bounds.
    for perp in sampler2.perplexity():
        yield assert_true, perp > 0.0
        yield assert_true, perp < inf
def setup():
    """Create the module-level fixtures shared by the sampler tests.

    Sets the globals ``data_dir``, ``out_dir``, ``persp_dirs``,
    ``documents``, ``corpus`` and ``sampler``; the sampler has already been
    initialized and run when this function returns.
    """
    global data_dir, out_dir, persp_dirs, documents, corpus, sampler

    data_dir = 'test_data/'
    out_dir = 'test_output/'

    # One directory per perspective, located directly below data_dir.
    persp_dirs = []
    for name in ('p0', 'p1'):
        persp_dirs.append('{}{}'.format(data_dir, name))

    # Generate the documents first; the corpus is then read from persp_dirs.
    documents = generateCPTCorpus.generate_cpt_corpus(data_dir)
    corpus = CPTCorpus(persp_dirs, topicLines=[0], opinionLines=[1])

    sampler = GibbsSampler(corpus,
                           nTopics=3,
                           nIter=5,
                           out_dir=out_dir,
                           sample_interval=1)
    sampler._initialize()
    sampler.run()