def test_pipeline_extract_content_extracts_meta_text_correctly(self):
    """
    Tests the extract_content method. Checks that the meta.json file on disk
    contains the content that we expect to be there.

    Runs writer.extract_content on a one-item payload whose file format is
    'txt', then loads the JSON file found at self.dict_item['meta_path'] and
    compares a fixed set of fields against the input item.

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :return: no return
    """
    self.dict_item['file_format'] = 'txt'
    pipeline_payload = [self.dict_item]

    return_payload = writer.extract_content(pipeline_payload)
    # The second argument of assertTrue is the failure message, not an
    # expected value, so this checks truthiness of the payload only.
    self.assertTrue(
        return_payload,
        'extract_content returned an empty payload: {0!r}'.format(
            return_payload
        )
    )

    with open(self.dict_item['meta_path'], 'r') as meta_file:
        meta_dict = json.load(meta_file)

    # Only these fields are compared between the input item and meta.json.
    for key in ('ft_source', 'bibcode', 'provider', 'UPDATE'):
        self.assertEqual(
            self.dict_item[key],
            meta_dict[key],
            'meta.json field {0!r}: expected {1!r}, found {2!r}'.format(
                key, self.dict_item[key], meta_dict[key]
            )
        )
def pipeline_extract(self, format_):
    """
    Helper function that runs the writer for a given file format and checks
    that the meta.json content on disk matches what we expect to be there.

    Sets self.dict_item['file_format'] to format_, passes the item to
    writer.extract_content, checks the returned payload, then loads the JSON
    file found at self.dict_item['meta_path'] and compares a fixed set of
    fields against the input item.

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :param format_: file format to be in the meta.json
    :return: no return
    """
    self.dict_item['file_format'] = format_
    pipeline_payload = [self.dict_item]

    return_payload = writer.extract_content(pipeline_payload)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(return_payload, '["MNRAS2014"]')

    with open(self.dict_item['meta_path'], 'r') as meta_file:
        meta_dict = json.load(meta_file)

    # Only these fields are compared between the input item and meta.json.
    for key in ('ft_source', 'bibcode', 'provider', 'UPDATE'):
        self.assertEqual(
            self.dict_item[key],
            meta_dict[key],
            'meta.json field {0!r}: expected {1!r}, found {2!r}'.format(
                key, self.dict_item[key], meta_dict[key]
            )
        )
def test_pipeline_extract_content_extracts_fulltext_correctly(self):
    """
    Tests the extract_content method. Checks that the full text on disk
    matches the full text that we expect to be written to disk.

    Runs writer.extract_content on a one-item payload whose file format is
    'txt', then reads fulltext.txt (located by swapping the file name in
    self.dict_item['meta_path']) and compares it with the input full text.

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :return: no return
    """
    self.dict_item['file_format'] = 'txt'
    pipeline_payload = [self.dict_item]

    return_payload = writer.extract_content(pipeline_payload)
    # The second argument of assertTrue is the failure message, not an
    # expected value, so this checks truthiness of the payload only.
    self.assertTrue(
        return_payload,
        'extract_content returned an empty payload: {0!r}'.format(
            return_payload
        )
    )

    # The full text file name is derived from the meta.json path.
    full_text_path = self.dict_item['meta_path'].replace(
        'meta.json', 'fulltext.txt'
    )
    with open(full_text_path, 'r') as full_text_file:
        full_text = full_text_file.read()

    self.assertEqual(self.dict_item['fulltext'], full_text)
def test_pipeline_extract_content_extracts_fulltext_correctly(self):
    """
    Tests the extract_content method. Checks that the full text on disk
    matches the full text that we expect to be written to disk.

    The item is given the 'txt' file format and passed to
    writer.extract_content. The test then reads fulltext.txt, whose path is
    obtained by replacing 'meta.json' in self.dict_item['meta_path'], and
    compares its content with self.dict_item['fulltext'].

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :return: no return
    """
    self.dict_item['file_format'] = 'txt'
    pipeline_payload = [self.dict_item]

    return_payload = writer.extract_content(pipeline_payload)
    # assertTrue takes the failure message as its second argument, so only
    # the truthiness of the payload is verified here.
    self.assertTrue(
        return_payload,
        'extract_content returned an empty payload: {0!r}'.format(
            return_payload
        )
    )

    full_text_path = self.dict_item['meta_path'].replace(
        'meta.json', 'fulltext.txt'
    )
    with open(full_text_path, 'r') as full_text_file:
        full_text = full_text_file.read()

    self.assertEqual(self.dict_item['fulltext'], full_text)
def test_temporary_file_is_made_and_moved(self):
    """
    Tests the two-step write used by the worker: a temporary output file is
    generated first, and that file is then moved to the expected output
    name.

    extract_content is run once and the resulting meta file is removed, so
    that the temporary-file helpers can be exercised on their own.

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :return: no return
    """
    writer.extract_content([self.dict_item])
    os.remove(self.meta_file)

    # Directory part of the meta file path, obtained by dropping the name.
    output_dir = self.meta_file.replace('meta.json', '')
    staged_file = writer.write_to_temp_file(self.dict_item, output_dir)

    # Step one: the temporary file must exist after it has been written.
    staged_exists = os.path.exists(staged_file)
    self.assertTrue(staged_exists)

    writer.move_temp_file_to_file(staged_file, self.meta_file)

    # Step two: the temporary file is gone and the final file is in place.
    staged_still_exists = os.path.exists(staged_file)
    self.assertFalse(staged_still_exists)
    final_exists = os.path.exists(self.meta_file)
    self.assertTrue(final_exists)
def test_write_worker_returns_content(self):
    """
    Tests the extract_content method. Checks that the payload that the
    worker returns, that will go on to another RabbitMQ queue, is in the
    format that we expect.

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :return: no return
    """
    payload = writer.extract_content([self.dict_item])

    # The content of the payload is compared, not its length, so the
    # failure message says so and assertEqual is used for the comparison.
    self.assertEqual(
        payload,
        '["MNRAS2014"]',
        'Payload does not match: {0}'.format(payload)
    )
def test_write_worker_returns_content(self):
    """
    Tests the extract_content method. Checks that the payload returned by
    the worker, which will go on to another RabbitMQ queue, is in the format
    that we expect.

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :return: no return
    """
    expected_payload = '["MNRAS2014"]'
    payload = writer.extract_content([self.dict_item])

    # assertEqual states the comparison directly; the message describes a
    # content mismatch because the content, not the length, is compared.
    self.assertEqual(
        payload,
        expected_payload,
        'Payload does not match: {0}'.format(payload)
    )
def pipeline_extract(self, format_):
    """
    Helper function that runs the writer for a given file format and checks
    that the meta.json content on disk matches what we expect to be there.

    The item's 'file_format' is set to format_ before it is handed to
    writer.extract_content. The returned payload is checked, and the JSON
    file found at self.dict_item['meta_path'] is loaded and compared with
    the input item on a fixed set of fields.

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :param format_: file format to be in the meta.json
    :return: no return
    """
    self.dict_item['file_format'] = format_
    pipeline_payload = [self.dict_item]

    return_payload = writer.extract_content(pipeline_payload)
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(return_payload, '["MNRAS2014"]')

    with open(self.dict_item['meta_path'], 'r') as meta_file:
        meta_dict = json.load(meta_file)

    # Only these fields are compared between the input item and meta.json.
    compared_fields = ('ft_source', 'bibcode', 'provider', 'UPDATE')
    for key in compared_fields:
        self.assertEqual(
            self.dict_item[key],
            meta_dict[key],
            'meta.json field {0!r}: expected {1!r}, found {2!r}'.format(
                key, self.dict_item[key], meta_dict[key]
            )
        )
def test_acknowledgements_file_is_created(self):
    """
    Tests the extract_content method. Checks that both a fulltext.txt and an
    acknowledgements.txt file exist after the worker has run on an item that
    has actual content for the acknowledgements.

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :return: no return
    """
    self.dict_item['acknowledgements'] = "Thank you"

    # The returned payload is not inspected here; only the files matter.
    writer.extract_content([self.dict_item])

    self.assertTrue(
        os.path.exists(self.full_text_file),
        msg='Full text file does not exist: {0}'.format(self.full_text_file)
    )
    self.assertTrue(
        os.path.exists(self.acknowledgement_file),
        msg='Acknowledgements file does not exist: {0}'.format(
            self.acknowledgement_file
        )
    )
def test_pipeline_extract_content_extracts_meta_text_correctly(self):
    """
    Tests the extract_content method. Checks that the meta.json file on disk
    contains the content that we expect to be there.

    The item is given the 'txt' file format and passed to
    writer.extract_content. The JSON file found at
    self.dict_item['meta_path'] is then loaded and compared with the input
    item on a fixed set of fields.

    N. B. The name extract_content carries no special meaning; it only keeps
    the same naming convention as the other workers. extract_content is the
    main method the worker will run.

    :return: no return
    """
    self.dict_item['file_format'] = 'txt'
    pipeline_payload = [self.dict_item]

    return_payload = writer.extract_content(pipeline_payload)
    # assertTrue takes the failure message as its second argument, so only
    # the truthiness of the payload is verified here.
    self.assertTrue(
        return_payload,
        'extract_content returned an empty payload: {0!r}'.format(
            return_payload
        )
    )

    with open(self.dict_item['meta_path'], 'r') as meta_file:
        meta_dict = json.load(meta_file)

    # Only these fields are compared between the input item and meta.json.
    compared_fields = ('ft_source', 'bibcode', 'provider', 'UPDATE')
    for key in compared_fields:
        self.assertEqual(
            self.dict_item[key],
            meta_dict[key],
            'meta.json field {0!r}: expected {1!r}, found {2!r}'.format(
                key, self.dict_item[key], meta_dict[key]
            )
        )