Beispiel #1
0
    def test_task_check_if_extract(self):
        """
        Checks that task_check_if_extract hands a payload to
        task_extract.delay for both a txt and a pdf full text source.

        For each source the delayed payload must contain the expected
        bibcode, provider, file format, update reason and paths (keys AND
        values), plus an 'index_date' entry.

        :return: no return
        """
        for file_format in ('txt', 'pdf'):
            ft_source = '{}/tests/test_integration/stub_data/full_test.{}'.format(
                self.proj_home, file_format)
            message = {'bibcode': 'fta', 'provider': 'MNRAS',
                       'ft_source': ft_source}

            # Only the hand-off is under test, so the real extraction task is
            # replaced by a mock that records its arguments.
            with patch.object(tasks.task_extract, 'delay',
                              return_value=None) as extract_delay:
                tasks.task_check_if_extract(message)
                self.assertTrue(extract_delay.called)
                actual = extract_delay.call_args[0][0]

            # 'index_date' is deliberately left out of the expected values:
            # only its presence is checked below.
            expected = {'bibcode': 'fta', 'file_format': file_format,
                        'UPDATE': 'NOT_EXTRACTED_BEFORE',
                        'meta_path': u'{}/ft/a/meta.json'.format(
                            self.app.conf['FULLTEXT_EXTRACT_PATH']),
                        'ft_source': ft_source,
                        'provider': 'MNRAS'}

            # set(expected).issubset(actual) would only compare the keys;
            # restrict the payload to the expected keys and compare the dicts
            # so that wrong values are caught and reported as a diff.
            self.assertEqual(
                {key: actual[key] for key in expected if key in actual},
                expected)
            self.assertIn('index_date', actual)
    def test_task_check_if_extract(self):
        """
        Checks that task_check_if_extract hands a payload to
        task_extract.delay for both a txt and a pdf full text source.

        The payload passed to the mocked task_extract.delay must contain all
        expected key/value pairs and additionally an 'index_date' entry.

        :return: no return
        """
        for file_format in ('txt', 'pdf'):
            ft_source = '{}/tests/test_integration/stub_data/full_test.{}'.format(
                self.proj_home, file_format)
            message = {'bibcode': 'fta', 'provider': 'MNRAS',
                       'ft_source': ft_source}

            # Mock the extraction task so that only the decision step runs.
            with patch.object(tasks.task_extract, 'delay',
                              return_value=None) as extract_delay:
                tasks.task_check_if_extract(message)
                self.assertTrue(extract_delay.called)
                actual = extract_delay.call_args[0][0]

            # 'index_date' is not pinned to a value; only its presence is
            # checked below.
            expected = {'bibcode': 'fta', 'file_format': file_format,
                        'UPDATE': 'NOT_EXTRACTED_BEFORE',
                        'meta_path': u'{}/ft/a/meta.json'.format(
                            self.app.conf['FULLTEXT_EXTRACT_PATH']),
                        'ft_source': ft_source,
                        'provider': 'MNRAS'}

            # assertDictContainsSubset is deprecated (removed in Python 3.12);
            # comparing the payload restricted to the expected keys is the
            # equivalent check and still yields a readable diff.
            self.assertEqual(
                {key: actual[key] for key in expected if key in actual},
                expected)
            self.assertIn('index_date', actual)
Beispiel #3
0
    def test_forced_extraction_and_forced_send(self):
        """
        Tests that a record loaded with both 'force_extract' and 'force_send'
        set is handed to the extraction task flagged as 'FORCE_TO_EXTRACT'.

        The extraction task itself is mocked; only the payload passed to
        task_extract.delay is inspected.

        :return: no return
        """
        proj_home = self.app.conf['PROJ_HOME']
        # 'run' lives in the project root, which must be importable first.
        sys.path.append(proj_home)
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=True,
                                       force_send=True)

        # NOTE(review): presumably refreshes self.nor for self.test_publish
        # -- confirm against helper_get_details.
        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as extract_delay:
            tasks.task_check_if_extract(records.payload[0])
            self.assertTrue(extract_delay.called)
            actual = extract_delay.call_args[0][0]

        # 'index_date' is not pinned to a value; only its presence is checked.
        expected = {
            'UPDATE': 'FORCE_TO_EXTRACT',
            'bibcode': 'test4',
            'file_format': 'txt',
            'ft_source':
                '{}/tests/test_unit/stub_data/test.txt'.format(proj_home),
            'meta_path':
                '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                    proj_home),
            'provider': 'TEST',
        }
        # set(expected).issubset(actual) would only compare keys; compare the
        # payload restricted to the expected keys so values are verified too.
        self.assertEqual(
            {key: actual[key] for key in expected if key in actual},
            expected)
        self.assertIn('index_date', actual)
    def test_forced_extraction_and_forced_send(self):
        """
        Tests that a record loaded with both 'force_extract' and 'force_send'
        set reaches the extraction task with the 'FORCE_TO_EXTRACT' reason.

        Only the payload given to the mocked task_extract.delay is checked;
        no extraction is actually performed.

        :return: no return
        """
        proj_home = self.app.conf['PROJ_HOME']
        # 'run' lives in the project root, which must be importable first.
        sys.path.append(proj_home)
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(
            self.test_publish, force_extract=True, force_send=True)

        # NOTE(review): presumably refreshes self.nor for self.test_publish
        # -- confirm against helper_get_details.
        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as extract_delay:
            tasks.task_check_if_extract(records.payload[0])
            self.assertTrue(extract_delay.called)
            actual = extract_delay.call_args[0][0]

        # 'index_date' is not pinned to a value; only its presence is checked.
        expected = {
            'UPDATE': 'FORCE_TO_EXTRACT',
            'bibcode': 'test4',
            'file_format': 'txt',
            'ft_source':
                '{}/tests/test_unit/stub_data/test.txt'.format(proj_home),
            'meta_path':
                '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                    proj_home),
            'provider': 'TEST',
        }
        # assertDictContainsSubset is deprecated (removed in Python 3.12);
        # this comparison is the equivalent subset check.
        self.assertEqual(
            {key: actual[key] for key in expected if key in actual},
            expected)
        self.assertIn('index_date', actual)
Beispiel #5
0
    def test_extra_acknowledment(self):
        """
        Runs the check and extraction tasks for a record that should produce
        an acknowledgements file in addition to the meta and full text files.

        The check task's payload is verified against a mocked
        task_extract.delay; task_extract is then called directly with the
        output and facility tasks mocked, and the files written under
        self.expected_paths are inspected.

        :return: no return
        """
        proj_home = self.app.conf['PROJ_HOME']
        # 'run' lives in the project root, which must be importable first.
        sys.path.append(proj_home)
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(
            self.test_publish, force_extract=False, force_send=False)

        # NOTE(review): presumably refreshes self.nor for self.test_publish
        # -- confirm against helper_get_details.
        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as extract_delay:
            tasks.task_check_if_extract(records.payload[0])
            self.assertTrue(extract_delay.called)
            actual = extract_delay.call_args[0][0]

        # 'index_date' is not pinned to a value; only its presence is checked.
        expected = {
            'UPDATE': 'NOT_EXTRACTED_BEFORE',
            'bibcode': 'test1',
            'file_format': 'xml',
            'ft_source':
                '{}/tests/test_integration/stub_data/full_test_elsevier.xml'
                .format(proj_home),
            'meta_path':
                '{}/tests/test_unit/stub_data/te/st/1/meta.json'.format(
                    proj_home),
            'provider': 'Elsevier',
        }
        # assertDictContainsSubset is deprecated (removed in Python 3.12);
        # this comparison is the equivalent subset check.
        self.assertEqual(
            {key: actual[key] for key in expected if key in actual},
            expected)
        self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results, 'delay',
                          return_value=None) as output_delay:
            with patch.object(tasks.task_identify_facilities, 'delay',
                              return_value=None):
                # Now we do call the extraction task with the proper arguments
                tasks.task_extract(actual)
                self.assertTrue(output_delay.called)

        # After the extraction task ran, the meta, full text and
        # acknowledgements files are expected on disk in every folder.
        # Each assertTrue aborts the test when its file is missing, so the
        # reads that follow need no additional existence guard.
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(
                os.path.exists(meta_path),
                'Meta file not created: {0}'.format(path))
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'NOT_EXTRACTED_BEFORE', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt')
            # The message used '%s' with str.format, so the path was never
            # interpolated; '{0}' fixes that.
            self.assertTrue(
                os.path.exists(fulltext_path),
                'Full text file not created: {0}'.format(path))
            # unless changed, tests/test_integration/stub_data/full_test_elsevier.xml
            with open(fulltext_path, 'r') as fulltext_file:
                fulltext_content = fulltext_file.read()
            self.assertEqual(
                fulltext_content,
                '1 Introduction JOURNAL CONTENT Acknowledgments THANK YOU Appendix A APPENDIX TITLE APPENDIX')

            acknowledgments_path = os.path.join(path, 'acknowledgements.txt')
            self.assertTrue(
                os.path.exists(acknowledgments_path),
                'Acknowledgements file not created: {0}'.format(path))
            with open(acknowledgments_path, 'r') as acknowledgments_file:
                acknowledgements_content = acknowledgments_file.read()
            self.assertEqual(acknowledgements_content,
                             "Acknowledgments THANK YOU")
Beispiel #6
0
    def test_extraction_when_there_is_no_previous_fulltext_file(self):
        """
        Tests the scenario that the bibcode received has a meta file on disk,
        but has not had any full text extracted.

        A meta file is written up front; the check task must then forward the
        record with the 'MISSING_FULL_TEXT' reason, and running task_extract
        on that payload must leave a meta file and a full text file in every
        folder of self.expected_paths.

        :return: no return
        """
        proj_home = self.app.conf['PROJ_HOME']
        # 'run' lives in the project root, which must be importable first.
        sys.path.append(proj_home)
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=False,
                                       force_send=False)

        # NOTE(review): presumably refreshes self.nor for self.test_publish
        # -- confirm against helper_get_details.
        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Make the fake data to use: a meta file only, no full text file.
        # NOTE(review): self.test_expected presumably points at the meta file
        # inside self.meta_path -- confirm against the test setup.
        if not os.path.exists(self.meta_path):
            os.makedirs(self.meta_path)

        test_meta_content = {
            'index_date': datetime.utcnow().isoformat() + 'Z',
            'bibcode': 'test4',
            'provider': 'mnras'
        }

        with open(self.test_expected, 'w') as test_meta_file:
            json.dump(test_meta_content, test_meta_file)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as extract_delay:
            tasks.task_check_if_extract(records.payload[0])
            self.assertTrue(extract_delay.called)
            actual = extract_delay.call_args[0][0]

        # 'index_date' is not pinned to a value; only its presence is checked.
        expected = {
            'UPDATE': 'MISSING_FULL_TEXT',
            'bibcode': 'test4',
            'file_format': 'txt',
            'ft_source':
                '{}/tests/test_unit/stub_data/test.txt'.format(proj_home),
            'meta_path':
                '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                    proj_home),
            'provider': 'TEST',
        }
        # assertDictContainsSubset is deprecated (removed in Python 3.12);
        # this comparison is the equivalent subset check.
        self.assertEqual(
            {key: actual[key] for key in expected if key in actual},
            expected)
        self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results,
                          'delay',
                          return_value=None) as output_delay:
            # Now we do call the extraction task with the proper arguments
            tasks.task_extract(actual)
            self.assertTrue(output_delay.called)

        # After the extraction task ran, the meta and full text files are
        # expected on disk in every folder. Each assertTrue aborts the test
        # when its file is missing, so the reads need no existence guard.
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(os.path.exists(meta_path),
                            'Meta file not created: {0}'.format(path))
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'MISSING_FULL_TEXT', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt')
            # The message used '%s' with str.format, so the path was never
            # interpolated; '{0}' fixes that.
            self.assertTrue(os.path.exists(fulltext_path),
                            'Full text file not created: {0}'.format(path))
            with open(fulltext_path, 'r') as fulltext_file:
                fulltext_content = fulltext_file.read()
            self.assertEqual(fulltext_content,
                             "Introduction THIS IS AN INTERESTING TITLE")
Beispiel #7
0
    def test_extraction_of_non_extracted(self):
        """
        Tests that a record loaded without any force flag is forwarded to the
        extraction task with the 'NOT_EXTRACTED_BEFORE' reason, and that
        running task_extract on that payload leaves a meta file and a full
        text file in every folder of self.expected_paths.

        :return: no return
        """
        proj_home = self.app.conf['PROJ_HOME']
        # 'run' lives in the project root, which must be importable first.
        sys.path.append(proj_home)
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=False,
                                       force_send=False)

        # NOTE(review): presumably refreshes self.nor for self.test_publish
        # -- confirm against helper_get_details.
        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as extract_delay:
            tasks.task_check_if_extract(records.payload[0])
            self.assertTrue(extract_delay.called)
            actual = extract_delay.call_args[0][0]

        # 'index_date' is not pinned to a value; only its presence is checked.
        expected = {
            'UPDATE': 'NOT_EXTRACTED_BEFORE',
            'bibcode': 'test4',
            'file_format': 'txt',
            'ft_source':
                '{}/tests/test_unit/stub_data/test.txt'.format(proj_home),
            'meta_path':
                '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                    proj_home),
            'provider': 'TEST',
        }
        # assertDictContainsSubset is deprecated (removed in Python 3.12);
        # this comparison is the equivalent subset check.
        self.assertEqual(
            {key: actual[key] for key in expected if key in actual},
            expected)
        self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results,
                          'delay',
                          return_value=None) as output_delay:
            # Now we do call the extraction task with the proper arguments
            tasks.task_extract(actual)
            self.assertTrue(output_delay.called)

        # After the extraction task ran, the meta and full text files are
        # expected on disk in every folder. Each assertTrue aborts the test
        # when its file is missing, so the reads need no existence guard.
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(os.path.exists(meta_path),
                            'Meta file not created: {0}'.format(path))
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'NOT_EXTRACTED_BEFORE', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt')
            # The message used '%s' with str.format, so the path was never
            # interpolated; '{0}' fixes that.
            self.assertTrue(os.path.exists(fulltext_path),
                            'Full text file not created: {0}'.format(path))
            with open(fulltext_path, 'r') as fulltext_file:
                fulltext_content = fulltext_file.read()
            self.assertEqual(fulltext_content,
                             "Introduction THIS IS AN INTERESTING TITLE")
    def test_extra_acknowledment(self):
        """
        Runs the check and extraction tasks for a record that should produce
        an acknowledgements file in addition to the meta and full text files.

        The check task's payload is verified against a mocked
        task_extract.delay; task_extract is then called directly with the
        output task mocked, and the files written under self.expected_paths
        are compared against the expected raw (newline separated) content.

        :return: no return
        """
        proj_home = self.app.conf['PROJ_HOME']
        # 'run' lives in the project root, which must be importable first.
        sys.path.append(proj_home)
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(
            self.test_publish, force_extract=False, force_send=False)

        # NOTE(review): presumably refreshes self.nor for self.test_publish
        # -- confirm against helper_get_details.
        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as extract_delay:
            tasks.task_check_if_extract(records.payload[0])
            self.assertTrue(extract_delay.called)
            actual = extract_delay.call_args[0][0]

        # 'index_date' is not pinned to a value; only its presence is checked.
        expected = {
            'UPDATE': 'NOT_EXTRACTED_BEFORE',
            'bibcode': 'test1',
            'file_format': 'xml',
            'ft_source':
                '{}/tests/test_integration/stub_data/full_test_elsevier.xml'
                .format(proj_home),
            'meta_path':
                '{}/tests/test_unit/stub_data/te/st/1/meta.json'.format(
                    proj_home),
            'provider': 'Elsevier',
        }
        # assertDictContainsSubset is deprecated (removed in Python 3.12);
        # this comparison is the equivalent subset check.
        self.assertEqual(
            {key: actual[key] for key in expected if key in actual},
            expected)
        self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results, 'delay',
                          return_value=None) as output_delay:
            # Now we do call the extraction task with the proper arguments
            tasks.task_extract(actual)
            self.assertTrue(output_delay.called)

        # After the extraction task ran, the meta, full text and
        # acknowledgements files are expected on disk in every folder.
        # Each assertTrue aborts the test when its file is missing, so the
        # reads that follow need no additional existence guard.
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(
                os.path.exists(meta_path),
                'Meta file not created: {0}'.format(path))
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'NOT_EXTRACTED_BEFORE', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt')
            # The message used '%s' with str.format, so the path was never
            # interpolated; '{0}' fixes that.
            self.assertTrue(
                os.path.exists(fulltext_path),
                'Full text file not created: {0}'.format(path))
            # unless changed, tests/test_integration/stub_data/full_test_elsevier.xml
            with open(fulltext_path, 'r') as fulltext_file:
                fulltext_content = fulltext_file.read()
            self.assertEqual(fulltext_content,
                             '\n\napplication/xml\nJOURNAL TITLE\nCREATOR\n\n\nSUBJECT\n\nDESCRIPTION\nJOURNAL\nNAME\nCOPYRIGHT\nPUBLISHER\n9999-9999\nVOLUME\nDAY MONTH YEAR\n1999-99-99\n999-999\n999\n999\n99.9999/9.99999.9999.99.999\nhttp://dx.doi.org/99.9999/9.99999.9999.99.999\ndoi:99.9999/9.99999.9999.99.999\n\n\nJournals\nS300.1\n\n\n\nJOURNAL\n999999\n99999-9999(99)99999-9\n99.9999/9.99999.9999.99.999\nCOPYRIGHT\n\n\n\nFig.1\n\n\n CONTENT\n \n\n\n\n\n\nTITLE\n\n\nGIVEN NAME\nSURNAME\n\na\n\n\n#x204e;\n\[email protected]\n\na\nAFFILIATION\n#x204e;\nAUTHOR\n\n\n\n\n\n\nAbstract\nABSTRACT\n\n\n\nHighlights\n\nHIGHLIGHTS\n\n\nKeywords\n\nKEYWORD\n\n\n\n\n1\nIntroduction\nJOURNAL CONTENT\n\n\n\nAcknowledgments\nTHANK YOU\n\n\nAppendix A\nAPPENDIX TITLE\nAPPENDIX\n\n\n\n\n\nReferences\n\nAUTHOR et al., 1999\n\n\n\n\nGIVEN NAME\nSURNAME\n\n\n\nTITLE\n\n\n\n\n\n\nTITLE\n \n\nVOLUME\n\nYEAR\n\n\n99\n99\n\n\n\n\n\n\n\n\n\n')

            acknowledgments_path = os.path.join(path, 'acknowledgements.txt')
            self.assertTrue(
                os.path.exists(acknowledgments_path),
                'Acknowledgements file not created: {0}'.format(path))
            with open(acknowledgments_path, 'r') as acknowledgments_file:
                acknowledgements_content = acknowledgments_file.read()
            self.assertEqual(acknowledgements_content,
                             "\nAcknowledgments\nTHANK YOU\n")
    def test_forced_extraction(self):
        """
        Tests that when a user specifies 'force_extract' the record is
        forwarded with the 'FORCE_TO_EXTRACT' reason, and that running
        task_extract on that payload leaves a meta file and a full text file
        in every folder of self.expected_paths.

        :return: no return
        """
        proj_home = self.app.conf['PROJ_HOME']
        # 'run' lives in the project root, which must be importable first.
        sys.path.append(proj_home)
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(
            self.test_publish, force_extract=True, force_send=False)

        # NOTE(review): presumably refreshes self.nor for self.test_publish
        # -- confirm against helper_get_details.
        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as extract_delay:
            tasks.task_check_if_extract(records.payload[0])
            self.assertTrue(extract_delay.called)
            actual = extract_delay.call_args[0][0]

        # 'index_date' is not pinned to a value; only its presence is checked.
        expected = {
            'UPDATE': 'FORCE_TO_EXTRACT',
            'bibcode': 'test4',
            'file_format': 'txt',
            'ft_source':
                '{}/tests/test_unit/stub_data/test.txt'.format(proj_home),
            'meta_path':
                '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                    proj_home),
            'provider': 'TEST',
        }
        # assertDictContainsSubset is deprecated (removed in Python 3.12);
        # this comparison is the equivalent subset check.
        self.assertEqual(
            {key: actual[key] for key in expected if key in actual},
            expected)
        self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results, 'delay',
                          return_value=None) as output_delay:
            # Now we do call the extraction task with the proper arguments
            tasks.task_extract(actual)
            self.assertTrue(output_delay.called)

        # After the extraction task ran, the meta and full text files are
        # expected on disk in every folder. Each assertTrue aborts the test
        # when its file is missing, so the reads need no existence guard.
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(
                os.path.exists(meta_path),
                'Meta file not created: {0}'.format(path))
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'FORCE_TO_EXTRACT', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt')
            # The message used '%s' with str.format, so the path was never
            # interpolated; '{0}' fixes that.
            self.assertTrue(
                os.path.exists(fulltext_path),
                'Full text file not created: {0}'.format(path))
            with open(fulltext_path, 'r') as fulltext_file:
                fulltext_content = fulltext_file.read()
            self.assertEqual(fulltext_content,
                             "Introduction THIS IS AN INTERESTING TITLE")
Beispiel #10
0
    def test_forced_extraction(self):
        """
        Tests that when a user specifies 'force_extract' the record is
        forwarded with the 'FORCE_TO_EXTRACT' reason, and that running
        task_extract on that payload leaves a meta file and a gzipped full
        text file ('fulltext.txt.gz') in every folder of self.expected_paths.

        :return: no return
        """
        proj_home = self.app.conf['PROJ_HOME']
        # 'run' lives in the project root, which must be importable first.
        sys.path.append(proj_home)
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=True,
                                       force_send=False)

        # NOTE(review): presumably refreshes self.nor for self.test_publish
        # -- confirm against helper_get_details.
        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Call the task to check if it should be extracted but mock the
        # extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as extract_delay:
            tasks.task_check_if_extract(records.payload[0])
            self.assertTrue(extract_delay.called)
            actual = extract_delay.call_args[0][0]

        # 'index_date' is not pinned to a value; only its presence is checked.
        expected = {
            'UPDATE': 'FORCE_TO_EXTRACT',
            'bibcode': 'test4',
            'file_format': 'txt',
            'ft_source':
                '{}/tests/test_unit/stub_data/test.txt'.format(proj_home),
            'meta_path':
                '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                    proj_home),
            'provider': 'TEST',
        }
        # set(expected).issubset(actual) would only compare keys; compare the
        # payload restricted to the expected keys so values are verified too.
        self.assertEqual(
            {key: actual[key] for key in expected if key in actual},
            expected)
        self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results,
                          'delay',
                          return_value=None) as output_delay:
            with patch.object(tasks.task_identify_facilities,
                              'delay',
                              return_value=None):
                # Now we do call the extraction task with the proper arguments
                tasks.task_extract(actual)
                self.assertTrue(output_delay.called)

        # After the extraction task ran, the meta and full text files are
        # expected on disk in every folder. Each assertTrue aborts the test
        # when its file is missing, so the reads need no existence guard.
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(os.path.exists(meta_path),
                            'Meta file not created: {0}'.format(path))
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'FORCE_TO_EXTRACT', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt.gz')
            # The message used '%s' with str.format, so the path was never
            # interpolated; '{0}' fixes that.
            self.assertTrue(os.path.exists(fulltext_path),
                            'Full text file not created: {0}'.format(path))
            # The full text is read back through the project's reader helper
            # rather than a plain open().
            fulltext_content = reader.read_file(fulltext_path,
                                                json_format=False)
            self.assertEqual(fulltext_content,
                             "Introduction THIS IS AN INTERESTING TITLE")
    def test_extraction_when_there_is_no_previous_fulltext_file(self):
        """
        Tests the scenario that the bibcode received has content on disk, but
        has not had any full text extracted.

        Only a meta.json is written by this test before the check task runs.
        The check task is expected to request an extraction flagged as
        'MISSING_FULL_TEXT'; the extraction task is then run directly (with
        the output task's ``delay`` patched) and the files left on disk are
        verified.

        :return: no return
        """
        sys.path.append(self.app.conf['PROJ_HOME'])
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish, force_extract=False, force_send=False)

        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Make the fake data to use
        if not os.path.exists(self.meta_path):
            os.makedirs(self.meta_path)

        test_meta_content = {
            'index_date': datetime.utcnow().isoformat()+'Z',
            'bibcode': 'test4',
            'provider': 'mnras'
        }

        with open(self.test_expected, 'w') as test_meta_file:
            json.dump(test_meta_content, test_meta_file)

        # Call the task to check if it should be extracted but mock the extraction task
        with patch.object(tasks.task_extract, 'delay', return_value=None) as task_extract:
            message = records.payload[0]
            tasks.task_check_if_extract(message)
            self.assertTrue(task_extract.called)
            # 'index_date' is deliberately left out of the comparison; only
            # its presence is checked below.
            expected = {'UPDATE': 'MISSING_FULL_TEXT',
                        'bibcode': 'test4',
                        'file_format': 'txt',
                        'ft_source': '{}/tests/test_unit/stub_data/test.txt'.format(self.app.conf['PROJ_HOME']),
                        'meta_path': '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(self.app.conf['PROJ_HOME']),
                        'provider': 'TEST'}
            actual = task_extract.call_args[0][0]
            # Subset check: overlaying ``expected`` on ``actual`` must change
            # nothing. Equivalent to the deprecated assertDictContainsSubset
            # (removed in Python 3.12).
            self.assertEqual(dict(actual, **expected), actual)
            self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results, 'delay', return_value=None) as task_output_results:
            # Now we do call the extraction task with the proper arguments
            tasks.task_extract(actual)
            self.assertTrue(task_output_results.called)

        # After the extractor, the meta writer should write all the payloads to
        # disk in the correct folders
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(
                os.path.exists(meta_path),
                'Meta file not created: {0}'.format(path)
            )

            # The assertion above already guarantees the file exists
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'MISSING_FULL_TEXT', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content)
            )

            fulltext_path = os.path.join(path, 'fulltext.txt')
            self.assertTrue(
                os.path.exists(fulltext_path),
                'Full text file not created: {0}'.format(path)
            )

            with open(fulltext_path, 'r') as fulltext_file:
                fulltext_content = fulltext_file.read()
            self.assertEqual(fulltext_content, "Introduction THIS IS AN INTERESTING TITLE")
    def test_full_range_of_file_format_extraction(self):
        """
        Submits a file containing all the relevant document types, runs the
        check and extraction tasks for every record, and then checks that
        content was extracted for each expected output path.

        The ``delay`` methods of the extraction and output tasks are patched,
        so the tasks are invoked directly by this test. When a grobid service
        is configured, its endpoint is stubbed with httpretty.

        :return: no return
        """
        sys.path.append(self.app.conf['PROJ_HOME'])
        from run import read_links_from_file

        # Bound unconditionally: it is read at the end of the test whenever a
        # grobid output file exists, regardless of whether the stub was set up.
        expected_grobid_fulltext = "<hello/>"
        if self.grobid_service is not None:
            httpretty.enable()
            # Do not leave the socket patching active for later tests
            self.addCleanup(httpretty.disable)
            httpretty.register_uri(httpretty.POST, self.grobid_service,
                                   body=expected_grobid_fulltext,
                                   status=200)

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish, force_extract=False, force_send=False)

        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 6)

        # Make the fake data to use
        if not os.path.exists(self.meta_path):
            os.makedirs(self.meta_path)

        # Call the task to check if it should be extracted but mock the extraction task
        with patch.object(tasks.task_extract, 'delay', return_value=None) as task_extract:
            extraction_arguments_set = []
            expected_update = 'NOT_EXTRACTED_BEFORE'
            for message in records.payload:
                # Reset so that ``called`` / ``call_args`` reflect this
                # message only and not an earlier iteration.
                task_extract.reset_mock()
                tasks.task_check_if_extract(message)
                self.assertTrue(task_extract.called)
                actual = task_extract.call_args[0][0]
                self.assertEqual(actual['UPDATE'], expected_update,
                                 'This should be {0}, but is in fact: {1}'
                                 .format(expected_update, actual['UPDATE']))
                extraction_arguments_set.append(actual)

        with patch.object(tasks.task_output_results, 'delay', return_value=None) as task_output_results:
            # Now we do call the extraction task with the proper arguments
            for arguments in extraction_arguments_set:
                tasks.task_extract(arguments)
                self.assertTrue(task_output_results.called)

        # Expected full text per entry of self.expected_paths, in order.
        # NOTE(review): '[email protected]' in the fourth entry looks like an
        # obfuscated e-mail address — confirm against the stub data.
        expected_fulltext_content = (
            u"Introduction THIS IS AN INTERESTING TITLE",
            u"Introduction THIS IS AN INTERESTING TITLE",
            u"I.INTRODUCTION INTRODUCTION GOES HERE Manual Entry\nAPPENDIX: APPENDIX TITLE GOES HERE APPENDIX CONTENT",
            'application/xml JOURNAL TITLE CREATOR SUBJECT DESCRIPTION JOURNAL NAME COPYRIGHT PUBLISHER 9999-9999 VOLUME DAY MONTH YEAR 1999-99-99 999-999 999 999 99.9999/9.99999.9999.99.999 http://dx.doi.org/99.9999/9.99999.9999.99.999 doi:99.9999/9.99999.9999.99.999 Journals S300.1 JOURNAL 999999 99999-9999(99)99999-9 99.9999/9.99999.9999.99.999 COPYRIGHT Fig.1 CONTENT TITLE GIVEN NAME SURNAME a #x204e; [email protected] a AFFILIATION #x204e; AUTHOR Abstract ABSTRACT Highlights HIGHLIGHTS Keywords KEYWORD 1 Introduction JOURNAL CONTENT Acknowledgments THANK YOU Appendix A APPENDIX TITLE APPENDIX References AUTHOR et al., 1999 GIVEN NAME SURNAME TITLE TITLE VOLUME YEAR 99 99',
            u"No Title AA 999, 999-999 (1999) DOI: 99.9999/9999-9999:99999999 TITLE AUTHOR AFFILIATION Received 99 MONTH 1999 / Accepted 99 MONTH 1999 Abstract ABSTRACT Key words: KEYWORD INTRODUCTION SECTION Table 1: TABLE TABLE (1) COPYRIGHT",
            #u"Introduction\nTHIS IS AN INTERESTING TITLE\n", # PDFBox
            u"Introduction THIS IS AN INTERESTING TITLE", # pdftotext
        )

        # After the extractor, the meta writer should write all the payloads to
        # disk in the correct folders
        for i, path in enumerate(self.expected_paths):
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(
                os.path.exists(meta_path),
                'Meta file not created: {0}'.format(path)
            )

            # The assertion above already guarantees the file exists
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'NOT_EXTRACTED_BEFORE', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content)
            )

            fulltext_path = os.path.join(path, 'fulltext.txt')
            self.assertTrue(
                os.path.exists(fulltext_path),
                'Full text file not created: {0}'.format(path)
            )

            with open(fulltext_path, 'r') as fulltext_file:
                fulltext_content = fulltext_file.read()
            self.assertEqual(fulltext_content, expected_fulltext_content[i])

            # The grobid output is optional, so it is only compared if present
            grobid_fulltext_path = os.path.join(path, 'grobid_fulltext.xml')
            if os.path.exists(grobid_fulltext_path):
                with open(grobid_fulltext_path, 'r') as grobid_fulltext_file:
                    grobid_fulltext_content = grobid_fulltext_file.read()
                self.assertEqual(grobid_fulltext_content, expected_grobid_fulltext)
Beispiel #13
0
    def test_stale_content(self):
        """
        Tests the scenario that the file on disk has stale content, and so it
        extracts the new full text and writes it to disk.

        A placeholder full text file and, two seconds later, a meta.json are
        written before the check task runs. The check task is expected to
        request an extraction flagged as 'STALE_CONTENT'. The ``delay``
        methods of the downstream tasks are patched, so the tasks are invoked
        directly by this test.

        :return: no return
        """
        sys.path.append(self.app.conf['PROJ_HOME'])
        from run import read_links_from_file

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=False,
                                       force_send=False)

        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 1)

        # Make the fake data to use
        if not os.path.exists(self.meta_path):
            os.makedirs(self.meta_path)

        test_meta_content = {
            'index_date': datetime.utcnow().isoformat() + 'Z',
            'bibcode': self.bibcode,
            'provider': self.provider,
            'ft_source': self.ft_source
        }

        # NOTE(review): this writes plain text under a .gz name; presumably
        # only the file's existence/timestamp matters here — confirm.
        with open(self.test_expected.replace('meta.json', 'fulltext.txt.gz'), 'w')\
                as test_full_text_file:
            test_full_text_file.write('Full text content')

        # NOTE(review): presumably spaces out the timestamps of the two files
        # so the staleness check can tell them apart — confirm.
        time.sleep(2)
        with open(self.test_expected, 'w') as test_meta_file:
            json.dump(test_meta_content, test_meta_file)

        # Call the task to check if it should be extracted but mock the extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as task_extract:
            message = records.payload[0]
            tasks.task_check_if_extract(message)
            self.assertTrue(task_extract.called)
            # 'index_date' is deliberately left out of the comparison; only
            # its presence is checked below.
            expected = {
                'UPDATE':
                'STALE_CONTENT',
                'bibcode':
                'test4',
                'file_format':
                'txt',
                'ft_source':
                '{}/tests/test_unit/stub_data/test.txt'.format(
                    self.app.conf['PROJ_HOME']),
                'meta_path':
                '{}/tests/test_unit/stub_data/te/st/4/meta.json'.format(
                    self.app.conf['PROJ_HOME']),
                'provider':
                'TEST'
            }
            actual = task_extract.call_args[0][0]
            # Compare keys AND values: overlaying ``expected`` on ``actual``
            # must change nothing. (A set-of-keys subset test would ignore
            # the expected values entirely.)
            self.assertEqual(dict(actual, **expected), actual)
            self.assertIn('index_date', actual)

        with patch.object(tasks.task_output_results,
                          'delay',
                          return_value=None) as task_output_results:
            # Patched only to keep it from being dispatched; not inspected
            with patch.object(tasks.task_identify_facilities,
                              'delay',
                              return_value=None):
                # Now we do call the extraction task with the proper arguments
                tasks.task_extract(actual)
                self.assertTrue(task_output_results.called)

        # After the extractor, the meta writer should write all the payloads to
        # disk in the correct folders
        for path in self.expected_paths:
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(os.path.exists(meta_path),
                            'Meta file not created: {0}'.format(path))

            # The assertion above already guarantees the file exists
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'STALE_CONTENT', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt.gz')
            self.assertTrue(os.path.exists(fulltext_path),
                            'Full text file not created: {0}'.format(path))

            fulltext_content = reader.read_file(fulltext_path,
                                                json_format=False)
            self.assertEqual(fulltext_content,
                             "Introduction THIS IS AN INTERESTING TITLE")
Beispiel #14
0
    def test_full_range_of_file_format_extraction(self):
        """
        Submits a file containing all the relevant document types, runs the
        check and extraction tasks for every record, and then checks that
        content was extracted for each expected output path.

        The ``delay`` methods of the extraction and output tasks are patched,
        so the tasks are invoked directly by this test. When a grobid service
        is configured, its endpoint is stubbed with httpretty.

        :return: no return
        """
        sys.path.append(self.app.conf['PROJ_HOME'])
        from run import read_links_from_file

        # Bound unconditionally: it is read at the end of the test whenever a
        # grobid output file exists, regardless of whether the stub was set up.
        expected_grobid_fulltext = "<hello/>"
        if self.grobid_service is not None:
            httpretty.enable()
            # Do not leave the socket patching active for later tests
            self.addCleanup(httpretty.disable)
            httpretty.register_uri(httpretty.POST,
                                   self.grobid_service,
                                   body=expected_grobid_fulltext,
                                   status=200)

        # User loads the list of full text files and publishes them to the
        # first queue
        records = read_links_from_file(self.test_publish,
                                       force_extract=False,
                                       force_send=False)

        self.helper_get_details(self.test_publish)
        self.assertEqual(
            len(records.bibcode), self.nor,
            'The number of records should match'
            ' the number of lines. It does not: '
            '{0} [{1}]'.format(len(records.bibcode), self.nor))

        self.assertEqual(len(records.payload), 6)

        # Make the fake data to use
        if not os.path.exists(self.meta_path):
            os.makedirs(self.meta_path)

        # Call the task to check if it should be extracted but mock the extraction task
        with patch.object(tasks.task_extract, 'delay',
                          return_value=None) as task_extract:
            extraction_arguments_set = []
            expected_update = 'NOT_EXTRACTED_BEFORE'
            for message in records.payload:
                # Reset so that ``called`` / ``call_args`` reflect this
                # message only and not an earlier iteration.
                task_extract.reset_mock()
                tasks.task_check_if_extract(message)
                self.assertTrue(task_extract.called)
                actual = task_extract.call_args[0][0]
                self.assertEqual(
                    actual['UPDATE'], expected_update,
                    'This should be {0}, but is in fact: {1}'.format(
                        expected_update, actual['UPDATE']))
                extraction_arguments_set.append(actual)

        with patch.object(tasks.task_output_results,
                          'delay',
                          return_value=None) as task_output_results:
            # Now we do call the extraction task with the proper arguments
            for arguments in extraction_arguments_set:
                tasks.task_extract(arguments)
                self.assertTrue(task_output_results.called)

        # Expected full text per entry of self.expected_paths, in order
        expected_fulltext_content = (
            u"Introduction THIS IS AN INTERESTING TITLE",
            u"Introduction THIS IS AN INTERESTING TITLE",
            u"I. INTRODUCTION INTRODUCTION GOES HERE Manual Entry TABLE I. TEXT a NOTES a TEXT\nAPPENDIX: APPENDIX TITLE GOES HERE APPENDIX CONTENT",
            u'1 Introduction JOURNAL CONTENT Acknowledgments THANK YOU Appendix A APPENDIX TITLE APPENDIX',
            u"No Title AA 999, 999-999 (1999) DOI: 99.9999/9999-9999:99999999 TITLE AUTHOR AFFILIATION Received 99 MONTH 1999 / Accepted 99 MONTH 1999 Abstract ABSTRACT Key words: KEYWORD INTRODUCTION SECTION Table 1: TABLE TABLE (1) COPYRIGHT",
            #u"Introduction\nTHIS IS AN INTERESTING TITLE\n", # PDFBox
            u"Introduction THIS IS AN INTERESTING TITLE",  # pdftotext
        )

        # After the extractor, the meta writer should write all the payloads to
        # disk in the correct folders
        for i, path in enumerate(self.expected_paths):
            meta_path = os.path.join(path, 'meta.json')
            self.assertTrue(os.path.exists(meta_path),
                            'Meta file not created: {0}'.format(path))

            # The assertion above already guarantees the file exists
            with open(meta_path, 'r') as meta_file:
                meta_content = meta_file.read()
            self.assertIn(
                'NOT_EXTRACTED_BEFORE', meta_content,
                'meta file does not contain the right extract keyword: {0}'
                .format(meta_content))

            fulltext_path = os.path.join(path, 'fulltext.txt')
            self.assertTrue(os.path.exists(fulltext_path),
                            'Full text file not created: {0}'.format(path))

            with open(fulltext_path, 'r') as fulltext_file:
                fulltext_content = fulltext_file.read()
            self.assertEqual(fulltext_content,
                             expected_fulltext_content[i])

            # The grobid output is optional, so it is only compared if present
            grobid_fulltext_path = os.path.join(path, 'grobid_fulltext.xml')
            if os.path.exists(grobid_fulltext_path):
                with open(grobid_fulltext_path, 'r') as grobid_fulltext_file:
                    grobid_fulltext_content = grobid_fulltext_file.read()
                self.assertEqual(grobid_fulltext_content,
                                 expected_grobid_fulltext)