class TestBigramWorker(unittest.TestCase):
    def _prepare_store(self):
        self.db_conf = db_conf = default_config['store']
        self.connection = Connection(host=db_conf['host'],
                                     port=db_conf['port'])
        self.connection.drop_database(db_conf['database'])
        self.db = self.connection[db_conf['database']]
        self.monitoring = self.db[db_conf['monitoring_collection']]
        self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection'])
        self.db[db_conf['gridfs_collection'] + '.files'].drop()
        self.db[db_conf['gridfs_collection'] + '.chunks'].drop()
        self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'],
                                   database=db_conf['database'],
                                   collection=db_conf['analysis_collection'])
        self.store = MongoDBStore(**db_conf)

    def test_bigrams_should_return_correct_score(self):
        tokens = nltk.corpus.genesis.words('english-web.txt')
        bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
        expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u',', u'which')
        bigram_rank = Bigrams().process({'tokens':tokens})['bigram_rank']
        result = bigram_rank[0][1][0]
        self.assertEqual(result, expected)

    def test_worker_output_should_be_pickleable(self):
        """The workers run under multiprocessing, so their result is
        pickled. This is a regression test."""
        tokens = nltk.corpus.genesis.words('english-web.txt')
        result = Bigrams().process({'tokens':tokens})
        # This should not raise an exception.
        cPickle.dumps(result)

    def test_saving_worker_output_should_work(self):
        """Saving the worker output should work. This is a regression test."""
        self._prepare_store()
        tokens = nltk.corpus.genesis.words('english-web.txt')[:100]
        result = Bigrams().process({'tokens': tokens})
        info = {'data': {'id': 789, '_id': 'eggs'}, 'worker': 'Bigrams',
                'worker_requires': ['tokens'], 'worker_result': result}
        self.store.save(info)
        self.connection.drop_database(self.db)
class TestMongoStore(unittest.TestCase):
    def setUp(self):
        self.db_conf = db_conf = default_config['store']
        self.connection = Connection(host=db_conf['host'],
                                     port=db_conf['port'])
        self.connection.drop_database(db_conf['database'])
        self.db = self.connection[db_conf['database']]
        self.monitoring = self.db[db_conf['monitoring_collection']]
        self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection'])
        self.db[db_conf['gridfs_collection'] + '.files'].drop()
        self.db[db_conf['gridfs_collection'] + '.chunks'].drop()
        self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'],
                                   database=db_conf['database'],
                                   collection=db_conf['analysis_collection'])
        self.store = MongoDBStore(**db_conf)

    def tearDown(self):
        self.connection.drop_database(self.db)

    def test_retrieve_should_raise_ValueError_if_missing_id(self):
        object_id = ObjectId("5089d72b7af8d6fc1a5a7b91")
        # Extractor worker needs '_id'
        info = {'data': {'id': 123}, 'worker': 'Extractor',
                'worker_requires': []}
        with self.assertRaises(ValueError):
            self.store.retrieve(info)
        info['data']['_id'] = object_id # fix it
        with self.assertRaises(gridfs.NoFile):
            # no problem with ['data']['id'] but with gridfs file
            self.store.retrieve(info)

        # other workers must have 'id'
        info = {'data': {'_id': object_id}, 'worker': 'other',
                'worker_requires': []}
        with self.assertRaises(ValueError):
            self.store.retrieve(info)
        info['data']['id'] = 123 # fix it
        self.store.retrieve(info) # no problem

    def test_retrieve_from_Extractor_should_return_file_on_gridfs(self):
        start_datetime = datetime.datetime.utcnow()
        data = 'This is just a test.\nPython rules. Álvaro Justen.\n'
        my_file = self.gridfs.new_file(filename='spam.txt')
        my_file.write(data)
        my_file.close()
        after_datetime = datetime.datetime.utcnow()

        info = {'data': {'id': 456, '_id': my_file._id}, 'worker': 'Extractor',
                'worker_requires': []}
        result = self.store.retrieve(info)
        self.assertIn('upload_date', result)
        self.assertIn('length', result)
        self.assertIn('filename', result)
        self.assertIn('contents', result)
        self.assertIn('md5', result)

        self.assertTrue(result['upload_date'] > start_datetime)
        self.assertTrue(result['upload_date'] < after_datetime)
        self.assertEqual(result['length'], len(data))
        self.assertIn(result['filename'], 'spam.txt')
        self.assertIn(result['contents'], data)
        self.assertIn(result['md5'], md5.md5(data).hexdigest())

    def test_retrieve_from_other_workers_should_return_info_from_mongodict(self):
        data = random_string()
        self.mongodict['id:789:property_1'] = data

        info = {'data': {'id': 789, '_id': 'eggs'}, 'worker': 'NotExtractor',
                'worker_requires': ['property_1']}
        result = self.store.retrieve(info)
        expected = {'property_1': data, '_missing': []}
        self.assertEquals(result, expected)

    def test_retrieve_from_other_workers_should_not_return_keys_that_dont_exist(self):
        data_1, data_2 = random_string(), random_string()
        self.mongodict['id:789:property_1'] = data_1
        self.mongodict['id:789:property_4'] = data_2

        info = {'data': {'id': 789, '_id': 'eggs'}, 'worker': 'NotExtractor',
                'worker_requires': ['property_2', 'property_3', 'property_4']}
        result = self.store.retrieve(info)
        expected = {'property_4': data_2,
                    '_missing': ['property_2', 'property_3']}
        self.assertEquals(result, expected)

    def test_save_expect_data_id(self):
        info = {'data': {}, 'worker': 'SomeWorker', 'worker_requires': [],
                'worker_result': {}}
        with self.assertRaises(ValueError):
            self.store.save(info)

    def test_save_must_save_every_key_on_worker_result_in_mongodict(self):
        info = {'data': {'id': 42}, 'worker': 'SomeWorker',
                'worker_requires': [],
                'worker_result': {'this': 'is', 'a': 'test'}}
        self.store.save(info)
        self.assertIn('id:42:this', self.mongodict)
        self.assertIn('id:42:a', self.mongodict)
        self.assertIn('id:42:_properties', self.mongodict)
        self.assertEqual(self.mongodict['id:42:this'], 'is')
        self.assertEqual(self.mongodict['id:42:a'], 'test')
        self.assertEqual(set(self.mongodict['id:42:_properties']),
                         set(['this', 'a']))

        # if there are properties, should add to _properties list
        more_info = {'data': {'id': 42}, 'worker': 'OtherWorker',
                     'worker_requires': [],
                     'worker_result': {'spam': 123, 'eggs': 3.14, 'this': 'a'}}
        self.store.save(more_info)
        self.assertIn('id:42:this', self.mongodict)
        self.assertIn('id:42:a', self.mongodict)
        self.assertIn('id:42:spam', self.mongodict)
        self.assertIn('id:42:eggs', self.mongodict)
        self.assertIn('id:42:_properties', self.mongodict)
        self.assertEqual(self.mongodict['id:42:this'], 'a')
        self.assertEqual(self.mongodict['id:42:a'], 'test')
        self.assertEqual(self.mongodict['id:42:spam'], 123)
        self.assertEqual(self.mongodict['id:42:eggs'], 3.14)
        self.assertEqual(set(self.mongodict['id:42:_properties']),
                         set(['this', 'a', 'spam', 'eggs']))
        self.assertEqual(len(self.mongodict['id:42:_properties']), 4)

    def test_save_monitoring_information_should_just_add_info_to_a_collection(self):
        self.assertEqual(self.monitoring.count(), 0)
        self.store.save_monitoring(monitoring_sample)
        self.assertEqual(self.monitoring.count(), 1)
        sample = self.monitoring.find_one()

    def test_save_must_append_exceptions_to_a_list(self):
        info = {'data': {'id': 42}, 'worker': 'SomeWorker',
                'worker_requires': [],
                'worker_result': {'_exception': 'ERROR'}}
        self.store.save(info)
        self.assertIn('id:42:_exception', self.mongodict)
        self.assertEqual(self.mongodict['id:42:_exception'],
                [{'worker': 'SomeWorker', 'traceback': 'ERROR'}])

        info_2 = {'data': {'id': 42}, 'worker': 'SomeOtherWorker',
                  'worker_requires': [],
                  'worker_result': {'_exception': 'ERROR 2'}}

        self.store.save(info_2)
        self.assertIn('id:42:_exception', self.mongodict)
        self.assertEqual(self.mongodict['id:42:_exception'],
                [{'worker': 'SomeWorker', 'traceback': 'ERROR'}, {'worker':
                    'SomeOtherWorker', 'traceback': 'ERROR 2'}])

    def test_exceptions_should_be_turned_into_a_list_if_it_already_exists(self):
        self.mongodict['id:42:_exception'] = 'PREVIOUS ERROR'

        info = {'data': {'id': 42}, 'worker': 'SomeWorker',
                'worker_requires': [],
                'worker_result': {'_exception': 'ERROR'}}
        self.store.save(info)

        self.assertIn('id:42:_exception', self.mongodict)
        self.assertEqual(self.mongodict['id:42:_exception'],
                [
                    'PREVIOUS ERROR',
                    {'worker': 'SomeWorker', 'traceback': 'ERROR'}
                ])