Example 1
    def test_unibitri_reduction_output_termcounts(
            self, mock_path_isfile, mock_of_makedirs, mock_of_bz2file,
            mock_of_dump, mock_open, mock_utils_bz2file, mock_utils_dump,
            mock_read_pickle):
        fake_df_data = {
            'abstract': ['abstract 1, of the patent with extra stuff']
        }

        mock_of_bz2file.side_effect = bz2file_fake

        self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open,
                            mock_utils_bz2file, mock_path_isfile)
        args = [
            '-ts', '-tc', '-ds', self.data_source_name, '--id_header',
            'patent_id', '--date_header', 'publication_date',
            '--max_document_frequency', '1.0'
        ]

        pygrams.main(args)

        def assert_outputs(term_counts_per_week, feature_names,
                           number_of_documents_per_week, week_iso_dates):
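            # Terms expected from the single fake abstract after tokenising
            # and uni/bi/tri-gram reduction (the behaviour under test)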
            self.assertListEqual(feature_names,
                                 ['abstract', 'extra stuff', 'patent', 'with'])
            term_counts_as_lists = term_counts_per_week.todense().tolist()
            self.assertListEqual(term_counts_as_lists[0], [1, 1, 1, 1])
            self.assertListEqual(number_of_documents_per_week, [1])
            self.assertListEqual(week_iso_dates, [200052])

        self.assertTimeSeriesOutputs(assert_outputs, mock_of_dump,
                                     mock_of_makedirs)
Example 2
    def test_json_configuration_encoding_maximal(self, mock_open,
                                                 mock_json_dump):
        patent_pickle_file_name = 'USPTO-random-100.pkl.bz2'
        patent_pickle_absolute_file_name = os.path.abspath(
            os.path.join('data', patent_pickle_file_name))
        output_file_name = 'test'
        report_file_name = os.path.join('outputs', 'reports',
                                        output_file_name + '.txt')
        json_file_name = os.path.join('outputs', 'reports',
                                      output_file_name + '.json')
        pygrams.main([
            f'--outputs_name={output_file_name}', '-p=max', '-cpc=Y12',
            '--date_from=1998/01/01', '--date_to=2001/12/31', '-dh',
            'publication_date', '-ds', patent_pickle_file_name
        ])

        mock_open.assert_any_call(json_file_name, 'w')

        actual_json = mock_json_dump.call_args[0][0]
        expected_json = {
            'paths': {
                'data': patent_pickle_absolute_file_name,
                'tech_report': report_file_name
            },
            'month_year': {
                'from': 199801,
                'to': 200201
            },
            'parameters': {
                'pick': 'max'
            }
        }
        self.assertEqual(expected_json, actual_json)
Example 3
    def test_json_configuration_encoding_sum_no_time_weighting(
            self, mock_open, mock_json_dump):
        patent_pickle_file_name = 'USPTO-random-100.pkl.bz2'
        patent_pickle_absolute_file_name = os.path.abspath(
            os.path.join('data', patent_pickle_file_name))
        output_file_name = 'test'
        report_file_name = os.path.join('outputs', 'reports',
                                        output_file_name + '.txt')
        json_file_name = os.path.join('outputs', 'reports',
                                      output_file_name + '.json')
        pygrams.main([
            f'--outputs_name={output_file_name}', '-f=set', '-p=sum',
            '-cpc=Y12', '--date_from=1999/03/12', '--date_to=2000/11/30',
            '-dh', 'publication_date', '-ds', patent_pickle_file_name
        ])

        mock_open.assert_any_call(json_file_name, 'w')

        actual_json = mock_json_dump.call_args[0][0]
        expected_json = {
            'paths': {
                'data': patent_pickle_absolute_file_name,
                'tech_report': report_file_name
            },
            'month_year': {
                'from': '1999-03-12',
                'to': '2000-11-30'
            },
            'parameters': {
                'pick': 'sum',
                'time': False
            }
        }
        self.assertEqual(expected_json, actual_json)
Example 4
    def test_no_arguments_and_use_cache(self):
        # clear cached result
        import shutil
        shutil.rmtree(os.path.join('cached', 'out-mdf-0.05'), ignore_errors=True)

        # should make cache
        pygrams.main([])

        # load cache
        pygrams.main(['-uc', 'out-mdf-0.05'])
Example 5
    def test_simple_two_patents_unigrams_only_output_tfidf(
            self, mock_path_isfile, mock_makedirs, mock_bz2file, mock_open,
            mock_pickle_dump, mock_read_pickle):
        fake_df_data = {'abstract': ['abstract one', 'abstract two']}
        max_df = 1.0

        self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open,
                            mock_bz2file, mock_path_isfile)
        args = [
            '-ds', self.data_source_name, '--date_header', 'publication_date',
            '--max_document_frequency',
            str(max_df), '--max_ngrams', '1'
        ]

        pygrams.main(args)

        # tf(t) = num of occurrences / number of words in doc
        #
        # smoothing is false, so no modification to log numerator or denominator:
        # idf(d, t) = log [ n / df(d, t) ] + 1
        #
        # n = total number of docs
        #
        # norm='l2' by default
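        #
        # Worked values for the two documents 'abstract one' / 'abstract two':
        #   idf('abstract') = ln(2/2) + 1 = 1.0
        #   idf('one') = idf('two') = ln(2/1) + 1 ≈ 1.693
        #   tf = 1/2 for every term, so after l2 normalisation each row holds
        #   ≈ 0.509 for 'abstract' and ≈ 0.861 for 'one'/'two'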

        tfidf_abstract = (1 / 2) * (np.log(2 / 2) + 1)
        tfidf_one = (1 / 2) * (np.log(2 / 1) + 1)
        l2norm = np.sqrt(tfidf_abstract * tfidf_abstract +
                         tfidf_one * tfidf_one)
        l2norm_tfidf_abstract = tfidf_abstract / l2norm
        l2norm_tfidf_one = tfidf_one / l2norm

        # Note that 'one' and 'two' receive the same weight: each appears once, in exactly one of the two documents

        def assert_tfidf_outputs(tfidf_matrix, feature_names):
            self.assertListEqual(feature_names, ['abstract', 'one', 'two'])
            tfidf_as_lists = tfidf_matrix.todense().tolist()
            self.assertListAlmostEqual(
                tfidf_as_lists[0],
                [l2norm_tfidf_abstract, l2norm_tfidf_one, 0],
                places=4)
            self.assertListAlmostEqual(
                tfidf_as_lists[1],
                [l2norm_tfidf_abstract, 0, l2norm_tfidf_one],
                places=4)

        self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump,
                                mock_makedirs, max_df, 200051, 200052)
Example 6
    def test_reports_unsupported_df_format(self, mock_path_isfile):

        unknown_filename = 'unknown.format'

        def isfile_fake(file_name):
            if file_name == os.path.join('data', unknown_filename):
                return True
            else:
                return False

        mock_path_isfile.side_effect = isfile_fake
        test_args = ['--doc_source', unknown_filename]
        try:
            pygrams.main(test_args)
            self.fail("should raise exception")
        except PygramsException as err:
            self.assertEqual(
                'Unsupported file: ' + os.path.join('data', unknown_filename),
                err.message)
Example 7
    def test_graph_creation(self, mock_open, mock_json_dump):
        fname = 'other'
        js_file_name = os.path.join('outputs', 'visuals', 'key-terms.js')
        json_file_name = os.path.join('outputs', 'reports', 'key-terms.json')
        graph_report_name = os.path.join('outputs', 'reports',
                                         fname + '_graph.txt')

        test_args = [
            '--doc_source', 'USPTO-random-100.pkl.bz2', '--date_header',
            'publication_date', '-o', 'graph', '--outputs_name', fname
        ]
        pygrams.main(test_args)

        mock_open.assert_any_call(json_file_name, 'w')
        mock_open.assert_any_call(js_file_name, 'w')
        mock_open.assert_any_call(graph_report_name, 'w')

        actual_json = mock_json_dump.call_args_list[0][0][0]
        self.assertIn('nodes', actual_json)
        self.assertIn('links', actual_json)
Example 8
    def test_simple_output_tfidf(self, mock_path_isfile, mock_makedirs,
                                 mock_bz2file, mock_open, mock_pickle_dump,
                                 mock_read_pickle):
        fake_df_data = {'abstract': ['abstract']}
        max_df = 1.0
        self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open,
                            mock_bz2file, mock_path_isfile)
        args = [
            '-ds', self.data_source_name, '--date_header', 'publication_date',
            '--max_document_frequency',
            str(max_df)
        ]

        pygrams.main(args)

        def assert_tfidf_outputs(tfidf_matrix, feature_names):
            self.assertEqual(tfidf_matrix.todense(), np.ones(shape=(1, 1)),
                             'TFIDF should be 1x1 matrix of 1')
            self.assertListEqual(feature_names, ['abstract'])

        self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump,
                                mock_makedirs, max_df)
Example 9
    def test_cpc(self):
        pygrams.main(['-cpc', 'Y02', '-ds', 'USPTO-random-10000.pkl.bz2'])
Example 10
    def test_date_from_and_to(self):
        pygrams.main(['-dh', 'publication_date', '-df', '2000/03/01', '-dt', '2016/07/31'])
Example 11
    def test_date_from(self):
        pygrams.main(['-dh', 'publication_date', '-df', '2000/02/20'])
Example 12
    def test_prefilter_terms_10000(self):
        pygrams.main(['--prefilter_terms', '10000'])
Example 13
    def test_pt(self):
        pygrams.main(['-pt', '0'])
Example 14
    def test_mdf(self):
        pygrams.main(['-mdf', '0.05'])
Example 15
    def test_graph(self):
        pygrams.main(['-o', 'graph'])
Example 16
    def test_search_terms(self):
        pygrams.main(['-st', 'pharmacy', 'medicine', 'chemist'])
Example 17
    def test_simple_output_tfidf_pickle_and_unpickle_and_write_to_timeseries(
            self, mock_path_isfile, mock_output_makedirs, mock_output_bz2file,
            mock_output_pickle_dump, mock_utils_makedirs, mock_utils_bz2file,
            mock_utils_pickle_dump, mock_utils_read_pickle, mock_open,
            mock_factory_read_pickle):
        fake_df_data = {'abstract': ['abstract']}

        # Make a note of the dumped TFIDF object for later
        self.preparePyGrams(fake_df_data, mock_factory_read_pickle, mock_open,
                            mock_utils_bz2file, mock_path_isfile)
        args = [
            '-ds', self.data_source_name, '--date_header', 'publication_date',
            '--max_document_frequency', '1.0'
        ]
        pygrams.main(args)

        # reset static object
        WordAnalyzer.tokenizer = None
        WordAnalyzer.preprocess = None
        WordAnalyzer.ngram_range = None
        WordAnalyzer.stemmed_stop_word_set_n = None
        WordAnalyzer.stemmed_stop_word_set_uni = None

        # Fail if original data frame is requested from disc
        def factory_read_pickle_fake(pickle_file_name):
            self.fail(
                f'Should not be reading {pickle_file_name} via a factory if TFIDF was requested from pickle'
            )

        def find_matching_pickle(mock_pickle_dump, pickle_file_name):
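            # Scan the mocked pickle.dump calls and return the object that was
            # dumped to pickle_file_name (None if it was never dumped)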
            for args in mock_pickle_dump.call_args_list:
                if args[0][1] == pickle_file_name:
                    return args[0][0]
            return None

        dumped_tfidf_file_name = os.path.join(
            'cached', self.out_name + '-mdf-1.0-200052-200052',
            'tfidf.pkl.bz2')
        self.dumped_tfidf = find_matching_pickle(mock_utils_pickle_dump,
                                                 dumped_tfidf_file_name)

        dumped_dates_file_name = os.path.join(
            'cached', self.out_name + '-mdf-1.0-200052-200052',
            'dates.pkl.bz2')
        self.dumped_dates = find_matching_pickle(mock_utils_pickle_dump,
                                                 dumped_dates_file_name)

        dumped_cpc_dict_file_name = os.path.join(
            'cached', self.out_name + '-mdf-1.0-200052-200052',
            'cpc_dict.pkl.bz2')
        self.dumped_cpc_dict = find_matching_pickle(mock_utils_pickle_dump,
                                                    dumped_cpc_dict_file_name)

        mock_factory_read_pickle.side_effect = factory_read_pickle_fake
        mock_utils_pickle_dump.reset_mock(return_value=True, side_effect=True)

        # Instead support TFIDF pickle read - and return the TFIDF object previously saved to disc
        def pipeline_read_pickle_fake(pickle_file_name):
            if pickle_file_name == dumped_tfidf_file_name:
                return self.dumped_tfidf
            elif pickle_file_name == dumped_dates_file_name:
                return self.dumped_dates
            elif pickle_file_name == dumped_cpc_dict_file_name:
                return self.dumped_cpc_dict
            else:
                self.fail(
                    f'Unexpected cached pickle read: {pickle_file_name}'
                )

        mock_output_bz2file.side_effect = bz2file_fake
        mock_utils_read_pickle.side_effect = pipeline_read_pickle_fake
        mock_utils_read_pickle.return_value = self.dumped_tfidf
        args = [
            '-ds', self.data_source_name, '-ts', '-tc', '--date_header',
            'publication_date', '--max_document_frequency', '1.0',
            '--use_cache', self.out_name + '-mdf-1.0-200052-200052'
        ]
        pygrams.main(args)

        def assert_timeseries_outputs(term_counts_per_week, feature_names,
                                      number_of_documents_per_week,
                                      week_iso_dates):
            self.assertEqual(term_counts_per_week.todense(),
                             np.ones(shape=(1, 1)),
                             'term counts should be 1x1 matrix of 1')
            self.assertListEqual(feature_names, ['abstract'])
            self.assertListEqual(number_of_documents_per_week, [1])
            self.assertListEqual(week_iso_dates, [200052])

        self.assertTimeSeriesOutputs(assert_timeseries_outputs,
                                     mock_output_pickle_dump,
                                     mock_output_makedirs)
Example 18
    def test_wordcloud(self):
        pygrams.main(['-o', 'wordcloud'])
Example 19
    def test_10000_patents(self):
        pygrams.main(['-ds', 'USPTO-random-10000.pkl.bz2'])
Example 20
    def test_multiplot(self):
        pygrams.main(['-o', 'multiplot', '-ts', '-dh', 'publication_date'])
Example 21
    def test_mn_mx_unigrams(self):
        pygrams.main(['-mn', '1', '-mx', '1'])