def test_unibitri_reduction_output_termcounts(
        self, mock_path_isfile, mock_of_makedirs, mock_of_bz2file,
        mock_of_dump, mock_open, mock_utils_bz2file, mock_utils_dump,
        mock_read_pickle):
    """End-to-end check of timeseries term counts ('-ts -tc') for one document.

    Runs the full pygrams pipeline over a single faked abstract and checks
    the emitted term counts, feature names, per-week document counts and
    ISO week dates via the assertTimeSeriesOutputs helper.
    """
    # Single fake abstract; after tokenising/stop-word reduction exactly
    # four terms are expected to survive (asserted below).
    fake_df_data = {
        'abstract': ['abstract 1, of the patent with extra stuff']
    }
    # Route bz2 output through the in-memory fake so nothing hits disk.
    mock_of_bz2file.side_effect = bz2file_fake
    self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open,
                        mock_utils_bz2file, mock_path_isfile)
    # '-ts' requests timeseries output, '-tc' requests term counts.
    args = ['-ts', '-tc', '-ds', self.data_source_name,
            '--id_header', 'patent_id',
            '--date_header', 'publication_date',
            '--max_document_frequency', '1.0']
    pygrams.main(args)

    # Callback invoked by assertTimeSeriesOutputs with the unpickled outputs.
    def assert_outputs(term_counts_per_week, feature_names,
                       number_of_documents_per_week, week_iso_dates):
        # Terms expected to remain after uni/bi/tri-gram reduction.
        self.assertListEqual(feature_names,
                             ['abstract', 'extra stuff', 'patent', 'with'])
        term_counts_as_lists = term_counts_per_week.todense().tolist()
        # One occurrence of each term, all in the single week row.
        self.assertListEqual(term_counts_as_lists[0], [1, 1, 1, 1])
        self.assertListEqual(number_of_documents_per_week, [1])
        # 200052 = yyyyww ISO week for the faked publication date
        # (set up by preparePyGrams — confirm against that fixture).
        self.assertListEqual(week_iso_dates, [200052])

    self.assertTimeSeriesOutputs(assert_outputs, mock_of_dump,
                                 mock_of_makedirs)
def test_json_configuration_encoding_maximal(self, mock_open, mock_json_dump):
    """Verify the JSON configuration dumped for a '-p=max' run.

    Checks the report/data paths, the month-year encoding of the date
    window, and the 'pick' parameter in the object handed to json.dump.
    """
    source_pickle = 'USPTO-random-100.pkl.bz2'
    source_pickle_path = os.path.abspath(os.path.join('data', source_pickle))
    outputs_name = 'test'
    reports_dir = os.path.join('outputs', 'reports')
    expected_report_path = os.path.join(reports_dir, outputs_name + '.txt')
    expected_json_path = os.path.join(reports_dir, outputs_name + '.json')

    pygrams.main([
        f'--outputs_name={outputs_name}',
        '-p=max',
        '-cpc=Y12',
        '--date_from=1998/01/01',
        '--date_to=2001/12/31',
        '-dh', 'publication_date',
        '-ds', source_pickle
    ])

    # The JSON report must have been opened for writing...
    mock_open.assert_any_call(expected_json_path, 'w')

    # ...and the object serialised must match the expected configuration.
    dumped_config = mock_json_dump.call_args[0][0]
    self.assertEqual(
        {
            'paths': {
                'data': source_pickle_path,
                'tech_report': expected_report_path
            },
            'month_year': {
                'from': 199801,
                'to': 200201
            },
            'parameters': {
                'pick': 'max'
            }
        },
        dumped_config)
def test_json_configuration_encoding_sum_no_time_weighting(
        self, mock_open, mock_json_dump):
    """Verify the JSON configuration dumped for '-p=sum' without time weighting.

    Unlike the maximal case, dates are encoded as ISO date strings and a
    'time': False parameter is included in the dumped configuration.
    """
    source_pickle = 'USPTO-random-100.pkl.bz2'
    source_pickle_path = os.path.abspath(os.path.join('data', source_pickle))
    outputs_name = 'test'
    reports_dir = os.path.join('outputs', 'reports')
    expected_report_path = os.path.join(reports_dir, outputs_name + '.txt')
    expected_json_path = os.path.join(reports_dir, outputs_name + '.json')

    pygrams.main([
        f'--outputs_name={outputs_name}',
        '-f=set',
        '-p=sum',
        '-cpc=Y12',
        '--date_from=1999/03/12',
        '--date_to=2000/11/30',
        '-dh', 'publication_date',
        '-ds', source_pickle
    ])

    # The JSON report must have been opened for writing...
    mock_open.assert_any_call(expected_json_path, 'w')

    # ...and the serialised configuration must match exactly.
    dumped_config = mock_json_dump.call_args[0][0]
    self.assertEqual(
        {
            'paths': {
                'data': source_pickle_path,
                'tech_report': expected_report_path
            },
            'month_year': {
                'from': '1999-03-12',
                'to': '2000-11-30'
            },
            'parameters': {
                'pick': 'sum',
                'time': False
            }
        },
        dumped_config)
def test_no_arguments_and_use_cache(self):
    """Run pygrams with defaults (building the cache), then re-run from it."""
    import shutil

    cache_dir = os.path.join('cached', 'out-mdf-0.05')
    # Clear any stale cached result so the first run builds it afresh.
    shutil.rmtree(cache_dir, ignore_errors=True)

    pygrams.main([])                       # default run populates the cache
    pygrams.main(['-uc', 'out-mdf-0.05'])  # second run must load that cache
def test_simple_two_patents_unigrams_only_output_tfidf(
        self, mock_path_isfile, mock_makedirs, mock_bz2file, mock_open,
        mock_pickle_dump, mock_read_pickle):
    """Verify l2-normalised TFIDF values for two near-identical abstracts.

    Runs the pipeline with unigrams only ('--max_ngrams 1') over two
    two-word abstracts and compares the dumped TFIDF matrix against
    hand-computed tf-idf values (no idf smoothing, l2 normalisation).
    """
    fake_df_data = {'abstract': ['abstract one', 'abstract two']}
    max_df = 1.0
    self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open,
                        mock_bz2file, mock_path_isfile)
    args = ['-ds', self.data_source_name,
            '--date_header', 'publication_date',
            '--max_document_frequency', str(max_df),
            '--max_ngrams', '1']
    pygrams.main(args)

    # tf(t) = num of occurrences / number of words in doc
    #
    # smoothing is false, so no modification to log numerator or denominator:
    # idf(d, t) = log [ n / df(d, t) ] + 1
    #
    # n = total number of docs
    #
    # norm='l2' by default
    tfidf_abstract = (1 / 2) * (np.log(2 / 2) + 1)  # 'abstract' occurs in both docs
    tfidf_one = (1 / 2) * (np.log(2 / 1) + 1)       # 'one' occurs in one doc only
    l2norm = np.sqrt(tfidf_abstract * tfidf_abstract + tfidf_one * tfidf_one)
    l2norm_tfidf_abstract = tfidf_abstract / l2norm
    l2norm_tfidf_one = tfidf_one / l2norm

    # Note that 'one' will have same weight as 'two' given where it appears
    def assert_tfidf_outputs(tfidf_matrix, feature_names):
        self.assertListEqual(feature_names, ['abstract', 'one', 'two'])
        tfidf_as_lists = tfidf_matrix.todense().tolist()
        # Row 0 = 'abstract one': no weight in the 'two' column.
        self.assertListAlmostEqual(
            tfidf_as_lists[0],
            [l2norm_tfidf_abstract, l2norm_tfidf_one, 0],
            places=4)
        # Row 1 = 'abstract two': no weight in the 'one' column.
        self.assertListAlmostEqual(
            tfidf_as_lists[1],
            [l2norm_tfidf_abstract, 0, l2norm_tfidf_one],
            places=4)

    # 200051/200052 presumably bracket the faked publication weeks —
    # confirm against the preparePyGrams fixture.
    self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump,
                            mock_makedirs, max_df, 200051, 200052)
def test_reports_unsupported_df_format(self, mock_path_isfile):
    """An unrecognised document-source extension must raise PygramsException.

    The exception message must name the offending path relative to the
    'data' directory.
    """
    unknown_filename = 'unknown.format'
    unknown_path = os.path.join('data', unknown_filename)

    # Only the unknown file "exists"; every other path is reported missing.
    mock_path_isfile.side_effect = lambda file_name: file_name == unknown_path

    # assertRaises is the idiomatic unittest replacement for the manual
    # try / self.fail / except pattern and gives a clearer failure message
    # if no exception is raised.
    with self.assertRaises(PygramsException) as context:
        pygrams.main(['--doc_source', unknown_filename])

    self.assertEqual('Unsupported file: ' + unknown_path,
                     context.exception.message)
def test_graph_creation(self, mock_open, mock_json_dump):
    """Graph output ('-o graph') must write the js, json and report files
    and dump a JSON object containing 'nodes' and 'links'."""
    outputs_name = 'other'
    expected_js_path = os.path.join('outputs', 'visuals', 'key-terms.js')
    expected_json_path = os.path.join('outputs', 'reports', 'key-terms.json')
    expected_report_path = os.path.join('outputs', 'reports',
                                        outputs_name + '_graph.txt')

    pygrams.main([
        '--doc_source', 'USPTO-random-100.pkl.bz2',
        '--date_header', 'publication_date',
        '-o', 'graph',
        '--outputs_name', outputs_name
    ])

    # All three output files must have been opened for writing.
    for expected_path in (expected_json_path, expected_js_path,
                          expected_report_path):
        mock_open.assert_any_call(expected_path, 'w')

    # The first json.dump call carries the graph structure itself.
    graph_json = mock_json_dump.call_args_list[0][0][0]
    self.assertIn('nodes', graph_json)
    self.assertIn('links', graph_json)
def test_simple_output_tfidf(self, mock_path_isfile, mock_makedirs,
                             mock_bz2file, mock_open, mock_pickle_dump,
                             mock_read_pickle):
    """A single one-word abstract must produce a 1x1 TFIDF matrix of 1."""
    max_df = 1.0
    self.preparePyGrams({'abstract': ['abstract']}, mock_read_pickle,
                        mock_open, mock_bz2file, mock_path_isfile)

    pygrams.main([
        '-ds', self.data_source_name,
        '--date_header', 'publication_date',
        '--max_document_frequency', str(max_df)
    ])

    def check_tfidf(tfidf_matrix, feature_names):
        # Only one document with one term, so the matrix is trivially [[1]].
        self.assertEqual(tfidf_matrix.todense(), np.ones(shape=(1, 1)),
                         'TFIDF should be 1x1 matrix of 1')
        self.assertListEqual(feature_names, ['abstract'])

    self.assertTfidfOutputs(check_tfidf, mock_pickle_dump, mock_makedirs,
                            max_df)
def test_cpc(self):
    """Smoke test: filter the 10k-patent data set by CPC class Y02."""
    args = ['-cpc', 'Y02', '-ds', 'USPTO-random-10000.pkl.bz2']
    pygrams.main(args)
def test_date_from_and_to(self):
    """Smoke test: run with both ends of the publication-date window set."""
    args = ['-dh', 'publication_date',
            '-df', '2000/03/01',
            '-dt', '2016/07/31']
    pygrams.main(args)
def test_date_from(self):
    """Smoke test: run with only the lower publication-date bound set."""
    args = ['-dh', 'publication_date', '-df', '2000/02/20']
    pygrams.main(args)
def test_prefilter_terms_10000(self):
    """Smoke test: cap the vocabulary at 10000 pre-filtered terms."""
    args = ['--prefilter_terms', '10000']
    pygrams.main(args)
def test_pt(self):
    """Smoke test: run with '-pt 0' (prefilter terms disabled)."""
    args = ['-pt', '0']
    pygrams.main(args)
def test_mdf(self):
    """Smoke test: run with a max document frequency of 0.05."""
    args = ['-mdf', '0.05']
    pygrams.main(args)
def test_graph(self):
    """Smoke test: request graph output."""
    args = ['-o', 'graph']
    pygrams.main(args)
def test_search_terms(self):
    """Smoke test: filter documents by several search terms."""
    args = ['-st', 'pharmacy', 'medicine', 'chemist']
    pygrams.main(args)
def test_simple_output_tfidf_pickle_and_unpickle_and_write_to_timeseries(
        self, mock_path_isfile, mock_output_makedirs, mock_output_bz2file,
        mock_output_pickle_dump, mock_utils_makedirs, mock_utils_bz2file,
        mock_utils_pickle_dump, mock_utils_read_pickle, mock_open,
        mock_factory_read_pickle):
    """Round-trip cache test.

    First run pygrams over a single faked abstract so it dumps the TFIDF,
    dates and cpc_dict pickles into the cache; then re-run with
    '--use_cache' and verify the timeseries output is reconstructed purely
    from those cached objects — never by re-reading the source data frame.
    """
    fake_df_data = {'abstract': ['abstract']}

    # Make a note of the dumped TFIDF object for later
    self.preparePyGrams(fake_df_data, mock_factory_read_pickle, mock_open,
                        mock_utils_bz2file, mock_path_isfile)
    args = ['-ds', self.data_source_name,
            '--date_header', 'publication_date',
            '--max_document_frequency', '1.0']
    pygrams.main(args)

    # reset static object (WordAnalyzer caches state at class level, so it
    # must be cleared between the two pipeline runs)
    WordAnalyzer.tokenizer = None
    WordAnalyzer.preprocess = None
    WordAnalyzer.ngram_range = None
    WordAnalyzer.stemmed_stop_word_set_n = None
    WordAnalyzer.stemmed_stop_word_set_uni = None

    # Fail if original data frame is requested from disc
    def factory_read_pickle_fake(pickle_file_name):
        self.fail(f'Should not be reading {pickle_file_name} via a factory if TFIDF was requested from pickle')

    # Scan the mock's recorded dump calls for the object written to the
    # given file name; returns None if nothing was dumped there.
    def find_matching_pickle(mock_pickle_dump, pickle_file_name):
        for args in mock_pickle_dump.call_args_list:
            if args[0][1] == pickle_file_name:
                return args[0][0]
        return None

    # Capture the three artefacts the first run cached to disc (via mocks).
    dumped_tfidf_file_name = os.path.join(
        'cached', self.out_name + '-mdf-1.0-200052-200052', 'tfidf.pkl.bz2')
    self.dumped_tfidf = find_matching_pickle(mock_utils_pickle_dump,
                                             dumped_tfidf_file_name)
    dumped_dates_file_name = os.path.join(
        'cached', self.out_name + '-mdf-1.0-200052-200052', 'dates.pkl.bz2')
    self.dumped_dates = find_matching_pickle(mock_utils_pickle_dump,
                                             dumped_dates_file_name)
    dumped_cpc_dict_file_name = os.path.join(
        'cached', self.out_name + '-mdf-1.0-200052-200052', 'cpc_dict.pkl.bz2')
    self.dumped_cpc_dict = find_matching_pickle(mock_utils_pickle_dump,
                                                dumped_cpc_dict_file_name)

    mock_factory_read_pickle.side_effect = factory_read_pickle_fake
    # Clear the dump mock so the second run's behaviour is observed cleanly.
    mock_utils_pickle_dump.reset_mock(return_value=True, side_effect=True)

    # Instead support TFIDF pickle read - and return the objects previously
    # saved to disc; any other read is a test failure.
    def pipeline_read_pickle_fake(pickle_file_name):
        if pickle_file_name == dumped_tfidf_file_name:
            return self.dumped_tfidf
        elif pickle_file_name == dumped_dates_file_name:
            return self.dumped_dates
        elif pickle_file_name == dumped_cpc_dict_file_name:
            return self.dumped_cpc_dict
        else:
            self.fail(f'Should not be reading {pickle_file_name} via a factory if TFIDF was requested from pickle')

    mock_output_bz2file.side_effect = bz2file_fake
    mock_utils_read_pickle.side_effect = pipeline_read_pickle_fake
    # NOTE(review): side_effect takes precedence over return_value on a Mock,
    # so this assignment appears redundant — confirm before removing.
    mock_utils_read_pickle.return_value = self.dumped_tfidf
    args = ['-ds', self.data_source_name, '-ts', '-tc',
            '--date_header', 'publication_date',
            '--max_document_frequency', '1.0',
            '--use_cache', self.out_name + '-mdf-1.0-200052-200052']
    pygrams.main(args)

    # Callback invoked by assertTimeSeriesOutputs with the second run's outputs.
    def assert_timeseries_outputs(term_counts_per_week, feature_names,
                                  number_of_documents_per_week,
                                  week_iso_dates):
        self.assertEqual(term_counts_per_week.todense(),
                         np.ones(shape=(1, 1)),
                         'term counts should be 1x1 matrix of 1')
        self.assertListEqual(feature_names, ['abstract'])
        self.assertListEqual(number_of_documents_per_week, [1])
        self.assertListEqual(week_iso_dates, [200052])

    self.assertTimeSeriesOutputs(assert_timeseries_outputs,
                                 mock_output_pickle_dump,
                                 mock_output_makedirs)
def test_wordcloud(self):
    """Smoke test: request wordcloud output."""
    args = ['-o', 'wordcloud']
    pygrams.main(args)
def test_10000_patents(self):
    """Smoke test: run against the 10000-patent sample data set."""
    args = ['-ds', 'USPTO-random-10000.pkl.bz2']
    pygrams.main(args)
def test_multiplot(self):
    """Smoke test: request multiplot output with timeseries enabled."""
    args = ['-o', 'multiplot', '-ts', '-dh', 'publication_date']
    pygrams.main(args)
def test_mn_mx_unigrams(self):
    """Smoke test: restrict n-gram range to unigrams only (min=max=1)."""
    args = ['-mn', '1', '-mx', '1']
    pygrams.main(args)