예제 #1
0
    def test_pipeline_extract_content_extracts_meta_text_correctly(self):
        """
        Tests the extract_content method. Checks that the meta.json file written
        to disk contains the content that we expect to be there.

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :return: no return
        """

        self.dict_item['file_format'] = 'txt'

        return_payload = writer.extract_content([self.dict_item])

        # The second positional argument of assertTrue is the failure
        # message, not an expected value, so supply a readable message.
        self.assertTrue(
            return_payload,
            'extract_content returned a falsy payload: {0!r}'
            .format(return_payload)
        )

        with open(self.dict_item['meta_path'], 'r') as meta_file:
            meta_dict = json.load(meta_file)

        # Each of these keys must read back from meta.json unchanged.
        for key in ('ft_source', 'bibcode', 'provider', 'UPDATE'):
            self.assertEqual(
                self.dict_item[key],
                meta_dict[key],
                'meta.json mismatch for key {0}'.format(key)
            )
예제 #2
0
    def pipeline_extract(self, format_):
        """
        Helper function that writes a meta.json and checks that the content on
        disk matches what we expect to be there.

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :param format_: file format to be in the meta.json
        :return: no return
        """

        self.dict_item['file_format'] = format_

        return_payload = writer.extract_content([self.dict_item])

        # assertEqual reports both values on failure, unlike
        # assertTrue(a == b), which only reports "False is not true".
        self.assertEqual(return_payload, '["MNRAS2014"]')

        with open(self.dict_item['meta_path'], 'r') as meta_file:
            meta_dict = json.load(meta_file)

        # Each of these keys must read back from meta.json unchanged.
        for key in ('ft_source', 'bibcode', 'provider', 'UPDATE'):
            self.assertEqual(
                self.dict_item[key],
                meta_dict[key],
                'meta.json mismatch for key {0}'.format(key)
            )
예제 #3
0
    def test_pipeline_extract_content_extracts_fulltext_correctly(self):
        """
        Tests the extract_content method. Checks that the full text written to
        disk matches the full text that we expect to be written to disk.

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :return: no return
        """

        self.dict_item['file_format'] = 'txt'

        return_payload = writer.extract_content([self.dict_item])

        # The second positional argument of assertTrue is the failure
        # message, not an expected value, so supply a readable message.
        self.assertTrue(
            return_payload,
            'extract_content returned a falsy payload: {0!r}'
            .format(return_payload)
        )

        # The full text file is read from the meta.json path with the file
        # name swapped for fulltext.txt.
        full_text_path = self.dict_item['meta_path'].replace(
            'meta.json', 'fulltext.txt'
        )
        with open(full_text_path, 'r') as full_text_file:
            full_text = full_text_file.read()

        self.assertEqual(self.dict_item['fulltext'], full_text)
예제 #4
0
    def test_pipeline_extract_content_extracts_fulltext_correctly(self):
        """
        Tests the extract_content method. Checks that the full text written to
        disk matches the full text that we expect to be written to disk.

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :return: no return
        """

        self.dict_item['file_format'] = 'txt'

        return_payload = writer.extract_content([self.dict_item])

        # assertTrue takes (expr, msg); the original passed 1 as the message,
        # which looks like an expected value but is never compared.
        self.assertTrue(
            return_payload,
            'extract_content returned a falsy payload: {0!r}'
            .format(return_payload)
        )

        # The path opened here is meta_path with 'meta.json' replaced by
        # 'fulltext.txt'.
        full_text_path = self.dict_item['meta_path'].replace(
            'meta.json', 'fulltext.txt'
        )
        with open(full_text_path, 'r') as full_text_file:
            full_text = full_text_file.read()

        self.assertEqual(self.dict_item['fulltext'], full_text)
예제 #5
0
    def test_temporary_file_is_made_and_moved(self):
        """
        Exercises the two-step write used by the worker: content is first
        written to a temporary file, which is then moved to the final
        meta.json location.

        N. B.
        extract_content is the worker's main method; its name only follows
        the naming convention shared with the other workers.

        :return: no return
        """

        # Run the worker once, then delete the meta file it produced so the
        # move step below can be checked in isolation.
        writer.extract_content([self.dict_item])
        os.remove(self.meta_file)

        # Step 1: a temporary file must exist after writing.
        output_dir = self.meta_file.replace('meta.json', '')
        staged_file = writer.write_to_temp_file(self.dict_item, output_dir)
        staged_exists = os.path.exists(staged_file)
        self.assertTrue(staged_exists)

        # Step 2: after the move the temporary file is gone and the final
        # file is present.
        writer.move_temp_file_to_file(staged_file, self.meta_file)
        staged_exists = os.path.exists(staged_file)
        self.assertFalse(staged_exists)
        final_exists = os.path.exists(self.meta_file)
        self.assertTrue(final_exists)
예제 #6
0
    def test_temporary_file_is_made_and_moved(self):
        """
        Checks the temporary-file workflow: write_to_temp_file must leave a
        file on disk, and move_temp_file_to_file must replace it with the
        final meta.json file.

        N. B.
        extract_content is the worker's main method; its name only follows
        the naming convention shared with the other workers.

        :return: no return
        """

        item = self.dict_item
        target = self.meta_file

        # Run the worker, then remove the resulting meta file so that its
        # re-creation by the move step can be asserted.
        writer.extract_content([item])
        os.remove(target)

        tmp_name = writer.write_to_temp_file(
            item, target.replace('meta.json', '')
        )
        self.assertTrue(os.path.exists(tmp_name))

        writer.move_temp_file_to_file(tmp_name, target)

        # The temporary file has been moved, not copied.
        self.assertFalse(os.path.exists(tmp_name))
        self.assertTrue(os.path.exists(target))
예제 #7
0
    def test_write_worker_returns_content(self):
        """
        Tests the extract_content method. Checks that the payload that the
        worker returns, that will go on to another RabbitMQ queue, is in the
        format that we expect.

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :return: no return
        """

        payload = writer.extract_content([self.dict_item])

        # The whole payload is compared, not its length, so the failure
        # message says so; assertEqual also reports both values.
        self.assertEqual(
            payload,
            '["MNRAS2014"]',
            'Payload does not match: {0}'.format(payload)
        )
예제 #8
0
    def test_write_worker_returns_content(self):
        """
        Tests the extract_content method. Checks that the payload that the
        worker returns, that will go on to another RabbitMQ queue, is in the
        format that we expect.

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :return: no return
        """

        expected_payload = '["MNRAS2014"]'
        payload = writer.extract_content([self.dict_item])

        # assertEqual shows expected and actual values on failure. The
        # message describes a payload mismatch because the full string is
        # compared, not its length.
        self.assertEqual(
            payload,
            expected_payload,
            'Payload does not match: {0}'.format(payload)
        )
예제 #9
0
    def pipeline_extract(self, format_):
        """
        Helper function that writes a meta.json and checks that the content on
        disk matches what we expect to be there.

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :param format_: file format to be in the meta.json
        :return: no return
        """

        self.dict_item['file_format'] = format_

        return_payload = writer.extract_content([self.dict_item])

        # assertEqual reports both values on failure, unlike
        # assertTrue(a == b), which only reports "False is not true".
        self.assertEqual(return_payload, '["MNRAS2014"]')

        with open(self.dict_item['meta_path'], 'r') as meta_file:
            meta_dict = json.load(meta_file)

        # Compare the same four keys, in the same order, against the values
        # loaded from disk.
        checked_keys = ('ft_source', 'bibcode', 'provider', 'UPDATE')
        for key in checked_keys:
            self.assertEqual(
                self.dict_item[key],
                meta_dict[key],
                'meta.json mismatch for key {0}'.format(key)
            )
예제 #10
0
    def test_acknowledgements_file_is_created(self):
        """
        Tests the extract_content method. Checks that both a fulltext.txt and
        an acknowledgements.txt file are created (if there is actual content
        for the acknowledgements).

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :return: no return
        """

        self.dict_item['acknowledgements'] = "Thank you"

        # Only the files written to disk matter here, so the returned
        # payload is not kept.
        writer.extract_content([self.dict_item])

        for expected_file in (self.full_text_file, self.acknowledgement_file):
            # Name the missing path on failure rather than printing "False".
            self.assertTrue(
                os.path.exists(expected_file),
                msg='Expected file was not written: {0}'.format(expected_file)
            )
예제 #11
0
    def test_acknowledgements_file_is_created(self):
        """
        Tests the extract_content method. Checks that both a fulltext.txt and
        an acknowledgements.txt file are created (if there is actual content
        for the acknowledgements).

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :return: no return
        """

        self.dict_item['acknowledgements'] = "Thank you"

        # The return value is not inspected by this test, so it is not bound
        # to a local.
        writer.extract_content([self.dict_item])

        # A failure message that names the path is more useful than the
        # boolean result of os.path.exists.
        self.assertTrue(
            os.path.exists(self.full_text_file),
            msg='Full text file was not written: {0}'
            .format(self.full_text_file)
        )
        self.assertTrue(
            os.path.exists(self.acknowledgement_file),
            msg='Acknowledgements file was not written: {0}'
            .format(self.acknowledgement_file)
        )
예제 #12
0
    def test_pipeline_extract_content_extracts_meta_text_correctly(self):
        """
        Tests the extract_content method. Checks that the meta.json file written
        to disk contains the content that we expect to be there.

        N. B.
        The name extract_content is only kept for consistency with the naming
        convention of the other workers; it is the main method the worker
        runs.

        :return: no return
        """

        self.dict_item['file_format'] = 'txt'

        return_payload = writer.extract_content([self.dict_item])

        # assertTrue takes (expr, msg); the original passed 1 as the message,
        # which looks like an expected value but is never compared.
        self.assertTrue(
            return_payload,
            'extract_content returned a falsy payload: {0!r}'
            .format(return_payload)
        )

        with open(self.dict_item['meta_path'], 'r') as meta_file:
            meta_dict = json.load(meta_file)

        # Compare the same four keys, in the same order, against the values
        # loaded from disk.
        checked_keys = ('ft_source', 'bibcode', 'provider', 'UPDATE')
        for key in checked_keys:
            self.assertEqual(
                self.dict_item[key],
                meta_dict[key],
                'meta.json mismatch for key {0}'.format(key)
            )