Esempio n. 1
0
def test_get_files_in_train_and_test_sets():
    """Test equality of file dicts after rebuilding a corpus from one.

    The file dict reported by the module-level ``corpus`` is passed as
    ``file_dict`` to a second ``CPTCorpus``; that second corpus must report
    an equal file dict.
    """
    expected = corpus.get_files_in_train_and_test_sets()
    rebuilt = CPTCorpus(topicLines=[0], opinionLines=[1], file_dict=expected)

    assert_equal(expected, rebuilt.get_files_in_train_and_test_sets())
Esempio n. 2
0
def test_get_files_in_train_and_test_sets():
    """Round-trip the train/test file dict through a new CPTCorpus."""
    line_settings = {'topicLines': [0], 'opinionLines': [1]}

    # File dict as reported by the shared module-level corpus.
    files_before = corpus.get_files_in_train_and_test_sets()

    # A corpus created from that dict has to report the very same dict.
    from_dict = CPTCorpus(file_dict=files_before, **line_settings)
    files_after = from_dict.get_files_in_train_and_test_sets()

    assert_equal(files_before, files_after)
Esempio n. 3
0
def test_loop_over_testSet():
    """Test looping over the documents in the test set.

    Iterates ``testSet()`` of a corpus created with ``testSplit=20`` to the
    end and yields one equality check for each of the first three values of
    the last item produced.
    """
    split_corpus = CPTCorpus(persp_dirs, testSplit=20,
                             topicLines=[0], opinionLines=[1])

    # Exhaust the iterator; only the values bound by the final iteration
    # are inspected below.
    for last_d, last_persp, last_d_p, _doc in split_corpus.testSet():
        pass

    yield assert_equal, last_d, 1
    yield assert_equal, last_persp, len(corpus.perspectives) - 1
    yield assert_equal, last_d_p, 0
Esempio n. 4
0
def get_corpus(params):
    """Build a CPTCorpus from the input files, or load a previously saved one.

    ``params`` is read with ``.get()`` for the keys ``outDir``,
    ``inputData``, ``testSplit``, ``topicLines``, ``opinionLines``,
    ``minFreq``, ``removeTopTF`` and ``removeTopDF``.  The ``outDir`` value
    is used as a format template: it is called as ``outDir.format(name)`` to
    obtain the path for ``name`` (and as ``outDir.format('')`` for the
    directory the dictionaries are saved to).

    When the ``corpus.json`` path is not an existing file, a new corpus is
    created from the files matching the ``inputData`` glob pattern, its
    dictionaries are filtered if at least one of ``minFreq``,
    ``removeTopTF`` or ``removeTopDF`` is not ``None``, and the dictionaries
    and the corpus are saved.  Otherwise the saved corpus is loaded together
    with the ``topicDict.dict`` and ``opinionDict.dict`` paths.

    Returns the created or loaded corpus.
    """
    out_dir = params.get('outDir')
    files = glob(params.get('inputData'))
    corpus_file = out_dir.format('corpus.json')

    # A saved corpus already exists: load it instead of rebuilding.
    if os.path.isfile(corpus_file):
        return CPTCorpus.load(file_name=corpus_file,
                              topicLines=params.get('topicLines'),
                              opinionLines=params.get('opinionLines'),
                              topicDict=out_dir.format('topicDict.dict'),
                              opinionDict=out_dir.format('opinionDict.dict'))

    corpus = CPTCorpus(files,
                       testSplit=params.get('testSplit'),
                       topicLines=params.get('topicLines'),
                       opinionLines=params.get('opinionLines'))

    minFreq = params.get('minFreq')
    removeTopTF = params.get('removeTopTF')
    removeTopDF = params.get('removeTopDF')
    # Only filter when at least one filter setting was supplied.
    if (minFreq is not None or removeTopTF is not None
            or removeTopDF is not None):
        corpus.filter_dictionaries(minFreq=minFreq,
                                   removeTopTF=removeTopTF,
                                   removeTopDF=removeTopDF)

    corpus.save_dictionaries(directory=out_dir.format(''))
    corpus.save(corpus_file)
    return corpus
Esempio n. 5
0
def test_get_files_in_train_and_test_sets_testSplit():
    """Test equality of file dicts when the corpus has a test split.

    A corpus created with ``testSplit=40`` reports its train/test file dict;
    a corpus built from that dict must report an equal one.
    """
    split_corpus = CPTCorpus(persp_dirs, testSplit=40,
                             topicLines=[0], opinionLines=[1])
    expected = split_corpus.get_files_in_train_and_test_sets()

    rebuilt = CPTCorpus(topicLines=[0], opinionLines=[1], file_dict=expected)

    assert_equal(expected, rebuilt.get_files_in_train_and_test_sets())
Esempio n. 6
0
def test_loop_over_testSet():
    """Check the values left over after iterating the whole test set."""
    settings = {'testSplit': 20, 'topicLines': [0], 'opinionLines': [1]}
    with_test_set = CPTCorpus(persp_dirs, **settings)

    # Run through every test document; the loop variables keep the values
    # of the final item once the iterator is exhausted.
    for final_d, final_persp, final_d_p, _unused_doc in with_test_set.testSet():
        pass

    yield assert_equal, final_d, 1
    yield assert_equal, final_persp, len(corpus.perspectives) - 1
    yield assert_equal, final_d_p, 0
Esempio n. 7
0
def get_corpus(params):
    """Return a CPTCorpus, creating and saving it on first use.

    ``params`` is accessed through ``.get()`` with the keys ``outDir``,
    ``inputData``, ``testSplit``, ``topicLines``, ``opinionLines``,
    ``minFreq``, ``removeTopTF`` and ``removeTopDF``.  ``outDir`` acts as a
    format template: ``outDir.format(name)`` yields the path used for
    ``name``.

    If the path for ``corpus.json`` is not an existing file, the corpus is
    built from the files matching the ``inputData`` glob pattern, optionally
    filtered, and then saved together with its dictionaries.  If the file
    exists, the corpus is loaded from it along with the ``topicDict.dict``
    and ``opinionDict.dict`` paths.

    Returns the created or loaded corpus.
    """
    out_dir = params.get('outDir')
    files = glob(params.get('inputData'))
    corpus_file = out_dir.format('corpus.json')

    if not os.path.isfile(corpus_file):
        corpus = CPTCorpus(files,
                           testSplit=params.get('testSplit'),
                           topicLines=params.get('topicLines'),
                           opinionLines=params.get('opinionLines'))

        filter_settings = {
            'minFreq': params.get('minFreq'),
            'removeTopTF': params.get('removeTopTF'),
            'removeTopDF': params.get('removeTopDF'),
        }
        # Filtering is skipped entirely when no filter setting is given.
        if any(value is not None for value in filter_settings.values()):
            corpus.filter_dictionaries(**filter_settings)

        corpus.save_dictionaries(directory=out_dir.format(''))
        corpus.save(corpus_file)
    else:
        corpus = CPTCorpus.load(file_name=corpus_file,
                                topicLines=params.get('topicLines'),
                                opinionLines=params.get('opinionLines'),
                                topicDict=out_dir.format('topicDict.dict'),
                                opinionDict=out_dir.format('opinionDict.dict'))
    return corpus
Esempio n. 8
0
def test_loading_of_dictionaries():
    """Test handing the corpus-wide dictionaries to a new corpus.

    A second corpus receives the dictionaries of the module-level ``corpus``
    through ``topicDict``/``opinionDict`` and must expose equal dictionaries.
    """
    topic_dict = corpus.topicDictionary
    opinion_dict = corpus.opinionDictionary
    with_dicts = CPTCorpus(persp_dirs,
                           topicDict=topic_dict,
                           opinionDict=opinion_dict,
                           topicLines=[0],
                           opinionLines=[1])

    yield assert_equal, corpus.topicDictionary, with_dicts.topicDictionary
    yield assert_equal, corpus.opinionDictionary, with_dicts.opinionDictionary
Esempio n. 9
0
def test_load_corpus_from_file():
    """Test that a corpus saved to a json file loads with equal file dicts."""
    expected = corpus.get_files_in_train_and_test_sets()

    # Save the shared corpus, then load it back from the same path.
    json_path = '{}/corpus.json'.format(data_dir)
    corpus.save(json_path)
    loaded = CPTCorpus.load(json_path, topicLines=[0], opinionLines=[1])

    assert_equal(expected, loaded.get_files_in_train_and_test_sets())
Esempio n. 10
0
def test_illigal_values_for_testSplit():
    """No test set is created when the testSplit value is illegal.

    For each of the values -1, 0 and 1000, every perspective of the
    resulting corpus must have empty ``testFiles`` and ``testSet.input``.
    """
    for bad_split in (-1, 0, 1000):
        no_test_corpus = CPTCorpus(persp_dirs, testSplit=bad_split,
                                   topicLines=[0], opinionLines=[1])
        for perspective in no_test_corpus.perspectives:
            yield assert_equal, perspective.testFiles, []
            yield assert_equal, perspective.testSet.input, []
Esempio n. 11
0
def test_loading_of_dictionaries_from_file():
    """Test loading the corpus-wide dictionaries from saved files.

    The dictionaries of the module-level ``corpus`` are saved to
    ``dict_dir``; a second corpus is given the dictionary file names and
    must expose equal dictionaries.
    """
    corpus.save_dictionaries(directory=dict_dir)

    topic_file = corpus.topic_dict_file_name(dict_dir)
    opinion_file = corpus.opinion_dict_file_name(dict_dir)
    from_files = CPTCorpus(persp_dirs, topicDict=topic_file,
                           opinionDict=opinion_file,
                           topicLines=[0], opinionLines=[1])

    yield assert_equal, corpus.topicDictionary, from_files.topicDictionary
    yield assert_equal, corpus.opinionDictionary, from_files.opinionDictionary
Esempio n. 12
0
def test_load_corpus_from_file():
    """Save the corpus as json, load it again and compare the file dicts."""
    saved_to = '{}/corpus.json'.format(data_dir)

    # Capture the file dict before saving so it reflects the shared corpus.
    files_before = corpus.get_files_in_train_and_test_sets()
    corpus.save(saved_to)

    restored = CPTCorpus.load(saved_to, topicLines=[0], opinionLines=[1])
    files_after = restored.get_files_in_train_and_test_sets()

    assert_equal(files_before, files_after)
Esempio n. 13
0
def test_testSet():
    """CPTCorpus with testSplit has train and test sets of particular length.

    With ``testSplit=20`` the corpus length must be 8 and every perspective
    must have a ``testSet`` attribute, a train set of length 4 and a test
    set of length 1.
    """
    split_corpus = CPTCorpus(persp_dirs, testSplit=20,
                             topicLines=[0], opinionLines=[1])

    yield assert_equal, len(split_corpus), 8

    for perspective in split_corpus.perspectives:
        yield assert_equal, hasattr(perspective, 'testSet'), True
        yield assert_equal, len(perspective.trainSet), 4
        yield assert_equal, len(perspective.testSet), 1
Esempio n. 14
0
def setup():
    """Create the module-level fixtures shared by the tests.

    Sets the globals ``data_dir``, ``persp_dirs``, ``dict_dir``,
    ``documents`` and ``corpus``.
    """
    global data_dir, persp_dirs, dict_dir, documents, corpus

    data_dir = 'test_data/'
    dict_dir = 'test_dict'

    # One directory per perspective, directly below the data directory.
    persp_dirs = [data_dir + name for name in ('p0', 'p1')]

    documents = generateCPTCorpus.generate_cpt_corpus(data_dir)
    corpus = CPTCorpus(persp_dirs, topicLines=[0], opinionLines=[1])
Esempio n. 15
0
def test_perplexity():
    """Minimal test of the perplexity calculation.

    Runs a short GibbsSampler on a corpus created with ``testSplit=20`` and
    yields checks that both values returned by ``perplexity()`` are greater
    than 0.0 and smaller than ``inf``.
    """
    split_corpus = CPTCorpus(persp_dirs, testSplit=20,
                             topicLines=[0], opinionLines=[1])

    gibbs = GibbsSampler(split_corpus, nTopics=3, nIter=5)
    gibbs._initialize()
    gibbs.run()

    # perplexity() is unpacked into exactly two values, as in the checks.
    first_perp, second_perp = gibbs.perplexity()

    for perp in (first_perp, second_perp):
        yield assert_true, perp > 0.0
        yield assert_true, perp < inf
Esempio n. 16
0
def test_get_files_in_train_and_test_sets_testSplit():
    """Round-trip the file dict of a corpus created with testSplit=40."""
    line_settings = {'topicLines': [0], 'opinionLines': [1]}

    # Source corpus with a test split; its file dict is the reference.
    with_split = CPTCorpus(persp_dirs, testSplit=40, **line_settings)
    files_before = with_split.get_files_in_train_and_test_sets()

    # Rebuilding from the reference dict must give back an equal dict.
    from_dict = CPTCorpus(file_dict=files_before, **line_settings)
    files_after = from_dict.get_files_in_train_and_test_sets()

    assert_equal(files_before, files_after)
Esempio n. 17
0
def setup():
    """Create the module-level fixtures and run a short sampler.

    Sets the globals ``data_dir``, ``out_dir``, ``persp_dirs``,
    ``documents``, ``corpus`` and ``sampler``; the sampler is initialized
    and run before this function returns.
    """
    global data_dir, out_dir, persp_dirs, documents, corpus, sampler

    data_dir = 'test_data/'
    out_dir = 'test_output/'

    # One directory per perspective, directly below the data directory.
    persp_dirs = [data_dir + name for name in ('p0', 'p1')]

    documents = generateCPTCorpus.generate_cpt_corpus(data_dir)
    corpus = CPTCorpus(persp_dirs, topicLines=[0], opinionLines=[1])

    sampler = GibbsSampler(corpus, nTopics=3, nIter=5,
                           out_dir=out_dir, sample_interval=1)
    sampler._initialize()
    sampler.run()