def setUpClass(cls):
    cls.num_ngrams = 5
    cold_tfidf = tfidf_from_text(ReferenceData.cold_df, tokenizer=LemmaTokenizer(), ngram_range=(2, 3))
    random_tfidf = tfidf_from_text(ReferenceData.random_df, tokenizer=LemmaTokenizer(), ngram_range=(2, 3))
    cls.tfocus = TermFocus(cold_tfidf, random_tfidf)
def setUpClass(cls):
    num_ngrams = 50
    min_n = 2
    max_n = 3
    max_df = 0.3
    ngram_range = (min_n, max_n)

    df = pd.read_pickle(FilePaths.us_patents_random_1000_pickle_name)
    tfidf_obj = tfidf_from_text(df['abstract'], ngram_range=ngram_range, max_document_frequency=max_df,
                                tokenizer=StemTokenizer())
    doc_weights = list(np.ones(len(df)))

    # term weights - embeddings
    filter_output_obj = FilterTerms(tfidf_obj.feature_names, None, None)
    term_weights = filter_output_obj.ngram_weights_vec

    tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range, unbias=True)
    tfidf_mask_obj.update_mask(doc_weights, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # mask the tfidf matrix
    tfidf_matrix = tfidf_obj.tfidf_matrix
    tfidf_masked = tfidf_mask.multiply(tfidf_matrix)
    tfidf_masked = utils.remove_all_null_rows(tfidf_masked)

    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

    cls.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, tfidf_obj.feature_names)
    term_score_tuples = cls.__tfidf_reduce_obj.extract_ngrams_from_docset('sum')
    graph_obj = TermsGraph(term_score_tuples[:num_ngrams], cls.__tfidf_reduce_obj)
    graph = graph_obj.graph
    cls.__links = graph['links']
    cls.__nodes = graph['nodes']
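# -----------------------------------------------------------------------------
# Illustrative sketch (not part of the test above): the masking step relies on
# element-wise multiplication of two scipy sparse matrices, so zeroed mask
# entries drop the corresponding TF-IDF weights before null rows are removed.
# The function name and toy values below are hypothetical.
import numpy as np
from scipy.sparse import csr_matrix


def masked_tfidf_example():
    tfidf = csr_matrix(np.array([[0.5, 0.0, 0.2],
                                 [0.0, 0.7, 0.1]]))
    mask = csr_matrix(np.array([[1.0, 0.0, 0.0],
                                [0.0, 1.0, 1.0]]))
    # element-wise product: entries where the mask is zero are discarded
    return mask.multiply(tfidf).toarray()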
def setUpClass(cls):
    df = pd.read_pickle(FilePaths.us_patents_random_100_pickle_name)
    tfidf_obj = tfidf_from_text(df['abstract'], ngram_range=(1, 3), max_document_frequency=0.1,
                                tokenizer=LemmaTokenizer())
    nmf_topics = 5
    cls.__nmf = nmf_topic_modelling(nmf_topics, tfidf_obj.tfidf_matrix)
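# -----------------------------------------------------------------------------
# Illustrative sketch (an assumption, not this repo's implementation): NMF topic
# modelling over a TF-IDF matrix, as nmf_topic_modelling above is presumed to
# perform, factorises the matrix into document-topic (W) and topic-term (H)
# components. The helper below is hypothetical and uses only scikit-learn.
from sklearn.decomposition import NMF


def nmf_topics_sketch(tfidf_matrix, n_topics=5):
    nmf = NMF(n_components=n_topics, random_state=0)
    doc_topic = nmf.fit_transform(tfidf_matrix)  # W: documents x topics
    topic_term = nmf.components_                 # H: topics x terms
    return doc_topic, topic_term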
def init_mask(self, cpc, min_n, uni_factor=0.8):
    docs_mask_dict = {
        'filter_by': 'union',
        'cpc': cpc,
        'time': None,
        'cite': [],
        'columns': None,
        'date': None,
        'date_header': None
    }

    self.__tfidf_obj = tfidf_from_text(self.__df['abstract'], ngram_range=(min_n, self.__max_n),
                                       max_document_frequency=self.__max_df, tokenizer=StemTokenizer())
    cpc_dict = utils.cpc_dict(self.__df)
    self.__dates = generate_year_week_dates(self.__df, docs_mask_dict['date_header'])

    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, cpc_dict, self.__df.shape[0]).doc_filters

    # term weights - embeddings
    filter_output_obj = FilterTerms(self.__tfidf_obj.feature_names, None)
    term_weights = filter_output_obj.ngram_weights_vec

    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=(min_n, self.__max_n), uni_factor=uni_factor,
                               unbias=True)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    self.__tfidf_mask = tfidf_mask_obj.tfidf_mask
def setUp(self):
    df = pd.read_pickle(FilePaths.us_patents_random_100_pickle_name)
    tfidf_obj = tfidf_from_text(df['abstract'], ngram_range=(1, 3), max_document_frequency=0.1,
                                tokenizer=LemmaTokenizer())
    self.feature_names = tfidf_obj.feature_names
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             cached_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0, terms_threshold=None,
             output_name=None, calculate_timeseries=None, m_steps_ahead=5, emergence_index='porter',
             exponential=False, nterms=50, patents_per_quarter_threshold=20, sma=None):
    self.__emergence_index = emergence_index

    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_date_dict = docs_mask_dict['timeseries_date']
    self.__timeseries_data = []

    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf matrix
    if cached_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))
            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)

        if docs_mask_dict['date_header'] is None:
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}')
            self.__dates = None
        else:
            self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe,
                                                                             docs_mask_dict['date_header'])
            min_date = min(self.__dates)
            max_date = max(self.__dates)
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')

        utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
        utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
        utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)
    else:
        print(f'Reading document and TFIDF from pickle {cached_folder_name}')
        self.__cached_folder_name = path.join('cached', cached_folder_name)
        self.__tfidf_obj = utils.unpickle_object('tfidf', self.__cached_folder_name)
        self.__dates = utils.unpickle_object('dates', self.__cached_folder_name)
        self.__cpc_dict = utils.unpickle_object('cpc_dict', self.__cached_folder_name)

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters; all filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.
    #  (A sketch of this combiner idea follows this constructor.)

    print(f'Applying documents filter...')
    # doc weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc. via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print(f'Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.
    print(f'Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove-null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)
    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.
    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    # TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix.
    print(f'Creating timeseries matrix...')
    if cached_folder_name is None or not (
            path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
        self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
        [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
         self.__weekly_iso_dates] = self.__timeseries_data

        utils.pickle_object('weekly_series_terms', self.__term_counts_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_series_global', self.__number_of_patents_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_isodates', self.__weekly_iso_dates, self.__cached_folder_name)
    else:
        self.__term_counts_per_week = utils.unpickle_object('weekly_series_terms', self.__cached_folder_name)
        self.__number_of_patents_per_week = utils.unpickle_object('weekly_series_global',
                                                                  self.__cached_folder_name)
        self.__weekly_iso_dates = utils.unpickle_object('weekly_isodates', self.__cached_folder_name)
        self.__term_ngrams = self.__tfidf_obj.feature_names

    self.__M = m_steps_ahead

    # TODO: define period from command line, then cascade through the code

    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()
    self.__timeseries_quarterly = []
    self.__timeseries_intercept = []
    self.__timeseries_derivatives = []
    self.__timeseries_quarterly_smoothed = []
    self.__term_nonzero_dates = []

    all_quarters, all_quarterly_values = self.__x = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
        self.__weekly_iso_dates, self.__number_of_patents_per_week)

    # find indexes for date-range
    min_date = max_date = None
    if self.__timeseries_date_dict is not None:
        min_date = self.__timeseries_date_dict['from']
        max_date = self.__timeseries_date_dict['to']

    min_i = 0
    max_i = len(all_quarters)

    for i, quarter in enumerate(all_quarters):
        if min_date is not None and min_date < quarter:
            break
        min_i = i

    for i, quarter in enumerate(all_quarters):
        if max_date is not None and max_date < quarter:
            break
        max_i = i

    self.__lims = [min_i, max_i]
    self.__timeseries_quarterly_smoothed = None if sma is None else []

    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
                           desc='Calculating quarterly timeseries', leave=False, unit_scale=True):
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)
        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]

        non_zero_dates, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
            weekly_iso_dates, row_values)
        non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters)
        self.__timeseries_quarterly.append(quarterly_values)

    if emergence_index == 'gradients' or sma == 'kalman':
        if cached_folder_name is None or not (
                path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
                and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
            for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
                                                     desc='smoothing quarterly timeseries with kalman filter',
                                                     leave=False, unit_scale=True,
                                                     total=len(self.__timeseries_quarterly)):
                _, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()

                smooth_series = smooth_series_s[0].tolist()[0]
                smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
                self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

                derivatives = smooth_series_s[1].tolist()[0]
                self.__timeseries_derivatives.append(derivatives)

            utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
            utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)
        else:
            self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s',
                                                                         self.__cached_folder_name)
            self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

    if sma == 'savgol':
        for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
                                     desc='savgol smoothing quarterly timeseries', leave=False, unit_scale=True):
            smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
            smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
            self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

    em = Emergence(all_quarterly_values[min_i:max_i])
    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        if term_weights[term_index] == 0.0:
            continue
        term_ngram = self.__term_ngrams[term_index]

        if self.__timeseries_quarterly_smoothed is not None:
            quarterly_values = list(self.__timeseries_quarterly_smoothed[term_index])[min_i:max_i]
        else:
            quarterly_values = list(self.__timeseries_quarterly[term_index])[min_i:max_i]

        if len(quarterly_values) == 0 or max(
                list(self.__timeseries_quarterly[term_index][min_i:max_i])) < float(patents_per_quarter_threshold):
            continue

        if emergence_index == 'quadratic':
            escore = em.escore2(quarterly_values)
        elif emergence_index == 'porter':
            if not em.is_emergence_candidate(quarterly_values):
                continue
            escore = em.calculate_escore(quarterly_values)
        elif emergence_index == 'gradients':
            derivatives = self.__timeseries_derivatives[term_index][min_i:max_i]
            escore = em.net_growth(quarterly_values, derivatives)
        else:
            weekly_values = term_counts_per_week_csc.getcol(term_index).todense().ravel().tolist()[0]
            escore = em.escore_exponential(weekly_values)

        self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__declining.reverse()
    self.__stationary = [x[0] for x in utils.stationary_terms(self.__emergence_list, nterms2)]
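# -----------------------------------------------------------------------------
# Sketch of the document-filter combiner described in the todo comments of the
# constructor above (a possible refactoring, not existing code): each filter
# returns an array of bool (or 0/1) per document, and a union or intersection
# combiner reduces the list to a single weights vector. All names below are
# hypothetical.
import numpy as np


def combine_filters(filter_funcs, documents, filter_by='union'):
    weights_list = [np.asarray(func(documents), dtype=float) for func in filter_funcs]
    if len(weights_list) == 0:
        return np.ones(len(documents))  # no filters: keep every document
    combined = weights_list[0]
    for weights in weights_list[1:]:
        if filter_by == 'union':
            combined = np.maximum(combined, weights)  # logical OR
        else:
            combined = combined * weights             # logical AND / intersection
    return combined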
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             pickled_tfidf_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0,
             terms_threshold=None, output_name=None, calculate_timeseries=None, m_steps_ahead=5,
             curves=True, nterms=50, minimum_patents_per_quarter=20):

    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_data = []

    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf matrix
    if pickled_tfidf_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))
            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)
        self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe, docs_mask_dict['date_header'])

        base_pickle_path = path.join('outputs', 'tfidf')
        makedirs(base_pickle_path, exist_ok=True)

        def pickle_object(short_name, obj):
            folder_name = path.join(base_pickle_path, output_name + f'-mdf-{max_df}')
            makedirs(folder_name, exist_ok=True)
            file_name = path.join(folder_name, output_name + f'-mdf-{max_df}-{short_name}.pkl.bz2')
            with bz2.BZ2File(file_name, 'wb') as pickle_file:
                pickle.dump(obj, pickle_file, protocol=4, fix_imports=False)

        pickle_object('tfidf', self.__tfidf_obj)
        pickle_object('dates', self.__dates)
        pickle_object('cpc_dict', self.__cpc_dict)
    else:
        print(f'Reading document and TFIDF from pickle {pickled_tfidf_folder_name}')
        base_folder = path.basename(pickled_tfidf_folder_name)
        pickled_base_file_name = path.join(pickled_tfidf_folder_name, base_folder)

        self.__tfidf_obj = read_pickle(pickled_base_file_name + '-tfidf.pkl.bz2')
        self.__dates = read_pickle(pickled_base_file_name + '-dates.pkl.bz2')
        self.__cpc_dict = read_pickle(pickled_base_file_name + '-cpc_dict.pkl.bz2')

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters; all filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.

    print(f'Applying documents filter...')
    # doc weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc. via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print(f'Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.
    print(f'Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove-null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)
    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.
    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    print(f'Creating timeseries matrix...')
    self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
    [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
     self.__weekly_iso_dates] = self.__timeseries_data

    self.__M = m_steps_ahead

    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    em = Emergence(self.__number_of_patents_per_week)
    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        term_ngram = self.__term_ngrams[term_index]
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)

        if len(row_values) == 0:
            continue

        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]
        _, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(weekly_iso_dates, row_values)

        if max(quarterly_values) < minimum_patents_per_quarter:
            continue

        if em.init_vars(row_indices, row_values, porter=not curves):
            escore = em.calculate_escore() if not curves else em.escore2()
            self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__stationary = utils.stationary_terms(self.__emergence_list, nterms2)
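# -----------------------------------------------------------------------------
# Illustrative sketch (hypothetical, not the repo's timeseries_weekly_to_quarterly):
# both constructors above aggregate weekly term counts into quarterly totals
# before emergence scoring. One simple approach, assuming dates are encoded as
# ISO year-week integers (e.g. 201543 = 2015, week 43), is shown below.
from collections import OrderedDict


def weekly_to_quarterly_sketch(year_week_dates, weekly_values):
    quarterly_totals = OrderedDict()
    for year_week, value in zip(year_week_dates, weekly_values):
        year, week = divmod(year_week, 100)
        quarter = min((week - 1) // 13, 3) + 1  # 13-week blocks mapped to quarters 1..4
        key = f'{year}-Q{quarter}'
        quarterly_totals[key] = quarterly_totals.get(key, 0) + value
    return list(quarterly_totals.keys()), list(quarterly_totals.values())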
def test_table(self):
    max_n = 3
    min_n = 2
    ngram_multiplier = 4
    num_ngrams_report = 25
    num_ngrams_wordcloud = 25
    num_ngrams = max(num_ngrams_report, num_ngrams_wordcloud)

    tfidf_cold = tfidf_from_text(ReferenceData.cold_df, tokenizer=LemmaTokenizer(), ngram_range=(min_n, max_n))
    tfidf_random = tfidf_from_text(ReferenceData.random_df, tokenizer=LemmaTokenizer(), ngram_range=(min_n, max_n))

    citation_count_dict = {1: 10, 2: 3, 101: 2, 102: 0, 103: 5, 104: 4, 105: 10}

    args = FakeArgs()
    args.pick = 'sum'
    args.time = False
    args.focus = 'chi2'

    register_writer(TestTableOutput.FakeWriter)
    fake_writer = TestTableOutput.FakeWriter('spreadsheet.fake')

    table_output(tfidf_cold, tfidf_random, num_ngrams, args, ngram_multiplier, fake_writer, citation_count_dict)

    # Check sheet headings...
    self.assertListEqual([None, 'Term', 'Score', 'Rank',
                          'Focus chi2 Score', 'Focus chi2 Rank', 'Diff Base to Focus Rank',
                          'Time Score', 'Time Rank', 'Diff Base to Time Rank',
                          'Citation Score', 'Citation Rank', 'Diff Base to Citation Rank'],
                         fake_writer.sheets['Summary'][0])
    self.assertListEqual([None, 'Term', 'Score', 'Rank'], fake_writer.sheets['Base'][0])
    self.assertListEqual([None, 'Term', 'Focus chi2 Score', 'Focus chi2 Rank'], fake_writer.sheets['Focus'][0])
    self.assertListEqual([None, 'Term', 'Time Score', 'Time Rank'], fake_writer.sheets['Time'][0])
    self.assertListEqual([None, 'Term', 'Citation Score', 'Citation Rank'], fake_writer.sheets['Cite'][0])

    # Base sheet should match summary sheet
    for y in range(25):
        for x in range(4):
            self.assertEqual(fake_writer.sheets['Summary'][y + 1][x], fake_writer.sheets['Base'][y + 1][x])