def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             cached_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0, terms_threshold=None,
             output_name=None, calculate_timeseries=None, m_steps_ahead=5, emergence_index='porter',
             exponential=False, nterms=50, patents_per_quarter_threshold=20, sma=None):
    self.__emergence_index = emergence_index

    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_date_dict = docs_mask_dict['timeseries_date']
    self.__timeseries_data = []

    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf matrix
    if cached_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))
            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)

        if docs_mask_dict['date_header'] is None:
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}')
            self.__dates = None
        else:
            self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe,
                                                                             docs_mask_dict['date_header'])
            min_date = min(self.__dates)
            max_date = max(self.__dates)
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')

        utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
        utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
        utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)
    else:
        print(f'Reading document and TFIDF from pickle {cached_folder_name}')
        self.__cached_folder_name = path.join('cached', cached_folder_name)
        self.__tfidf_obj = utils.unpickle_object('tfidf', self.__cached_folder_name)
        self.__dates = utils.unpickle_object('dates', self.__cached_folder_name)
        self.__cpc_dict = utils.unpickle_object('cpc_dict', self.__cached_folder_name)

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters; all filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.
    #  (A hedged sketch of such a combiner follows this constructor.)

    print('Applying documents filter...')
    # docs weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc. via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print('Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.
    print('Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove-null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)
    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.
    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    # TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix.
    print('Creating timeseries matrix...')
    if cached_folder_name is None or not (
            path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
        self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
        [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
         self.__weekly_iso_dates] = self.__timeseries_data

        utils.pickle_object('weekly_series_terms', self.__term_counts_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_series_global', self.__number_of_patents_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_isodates', self.__weekly_iso_dates, self.__cached_folder_name)
    else:
        self.__term_counts_per_week = utils.unpickle_object('weekly_series_terms', self.__cached_folder_name)
        self.__number_of_patents_per_week = utils.unpickle_object('weekly_series_global', self.__cached_folder_name)
        self.__weekly_iso_dates = utils.unpickle_object('weekly_isodates', self.__cached_folder_name)
        self.__term_ngrams = self.__tfidf_obj.feature_names

    self.__M = m_steps_ahead

    # TODO: define period from command line, then cascade through the code
    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    self.__timeseries_quarterly = []
    self.__timeseries_intercept = []
    self.__timeseries_derivatives = []
    self.__timeseries_quarterly_smoothed = []
    self.__term_nonzero_dates = []

    all_quarters, all_quarterly_values = self.__x = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
        self.__weekly_iso_dates, self.__number_of_patents_per_week)

    # find indexes for date-range
    min_date = max_date = None
    if self.__timeseries_date_dict is not None:
        min_date = self.__timeseries_date_dict['from']
        max_date = self.__timeseries_date_dict['to']

    min_i = 0
    max_i = len(all_quarters)

    for i, quarter in enumerate(all_quarters):
        if min_date is not None and min_date < quarter:
            break
        min_i = i

    for i, quarter in enumerate(all_quarters):
        if max_date is not None and max_date < quarter:
            break
        max_i = i

    self.__lims = [min_i, max_i]
    self.__timeseries_quarterly_smoothed = None if sma is None else []

    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
                           desc='Calculating quarterly timeseries', leave=False, unit_scale=True):
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)
        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]
        non_zero_dates, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
            weekly_iso_dates, row_values)
        non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters)
        self.__timeseries_quarterly.append(quarterly_values)

    if emergence_index == 'gradients' or sma == 'kalman':
        if cached_folder_name is None or not (
                path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
                and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
            for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
                                                     desc='smoothing quarterly timeseries with kalman filter',
                                                     leave=False, unit_scale=True,
                                                     total=len(self.__timeseries_quarterly)):
                _, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()

                smooth_series = smooth_series_s[0].tolist()[0]
                smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
                self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

                derivatives = smooth_series_s[1].tolist()[0]
                self.__timeseries_derivatives.append(derivatives)

            utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
            utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)
        else:
            self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s',
                                                                         self.__cached_folder_name)
            self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

    if sma == 'savgol':
        for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
                                     desc='savgol smoothing quarterly timeseries', leave=False, unit_scale=True):
            smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
            smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
            self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

    em = Emergence(all_quarterly_values[min_i:max_i])
    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        if term_weights[term_index] == 0.0:
            continue
        term_ngram = self.__term_ngrams[term_index]

        if self.__timeseries_quarterly_smoothed is not None:
            quarterly_values = list(self.__timeseries_quarterly_smoothed[term_index])[min_i:max_i]
        else:
            quarterly_values = list(self.__timeseries_quarterly[term_index])[min_i:max_i]

        if len(quarterly_values) == 0 or max(list(self.__timeseries_quarterly[term_index][min_i:max_i])) < float(
                patents_per_quarter_threshold):
            continue

        if emergence_index == 'quadratic':
            escore = em.escore2(quarterly_values)
        elif emergence_index == 'porter':
            if not em.is_emergence_candidate(quarterly_values):
                continue
            escore = em.calculate_escore(quarterly_values)
        elif emergence_index == 'gradients':
            derivatives = self.__timeseries_derivatives[term_index][min_i:max_i]
            escore = em.net_growth(quarterly_values, derivatives)
        else:
            weekly_values = term_counts_per_week_csc.getcol(term_index).todense().ravel().tolist()[0]
            escore = em.escore_exponential(weekly_values)

        self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__declining.reverse()
    self.__stationary = [x[0] for x in utils.stationary_terms(self.__emergence_list, nterms2)]
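
# --- Hedged sketch (added for illustration, not part of the constructor above) ---
# The "combiner" design described in the todo comments: each document filter yields
# an array of bool (or 0/1) per document, and a union or intersection combiner
# reduces the list to a single weight vector (all-ones when the list is empty).
# The function name and signature below are assumptions, not the repository's API.
import numpy as np


def combine_doc_filters(filter_outputs, n_docs, union=False):
    """Reduce a list of per-document 0/1 arrays to one mask.

    Empty list -> keep every document; union -> logical OR across filters;
    otherwise intersection -> logical AND, as the todo comments describe.
    """
    if not filter_outputs:
        return np.ones(n_docs, dtype=bool)
    combined = np.asarray(filter_outputs[0], dtype=bool)
    for weights in filter_outputs[1:]:
        weights = np.asarray(weights, dtype=bool)
        combined = combined | weights if union else combined & weights
    return combined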
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             pickled_tfidf_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0,
             terms_threshold=None, output_name=None, calculate_timeseries=None, m_steps_ahead=5, curves=True,
             nterms=50, minimum_patents_per_quarter=20):
    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_data = []

    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf matrix
    if pickled_tfidf_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))
            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)
        self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe, docs_mask_dict['date_header'])

        base_pickle_path = path.join('outputs', 'tfidf')
        makedirs(base_pickle_path, exist_ok=True)

        def pickle_object(short_name, obj):
            folder_name = path.join(base_pickle_path, output_name + f'-mdf-{max_df}')
            makedirs(folder_name, exist_ok=True)
            file_name = path.join(folder_name, output_name + f'-mdf-{max_df}-{short_name}.pkl.bz2')
            with bz2.BZ2File(file_name, 'wb') as pickle_file:
                pickle.dump(obj, pickle_file, protocol=4, fix_imports=False)

        pickle_object('tfidf', self.__tfidf_obj)
        pickle_object('dates', self.__dates)
        pickle_object('cpc_dict', self.__cpc_dict)
    else:
        print(f'Reading document and TFIDF from pickle {pickled_tfidf_folder_name}')
        base_folder = path.basename(pickled_tfidf_folder_name)
        pickled_base_file_name = path.join(pickled_tfidf_folder_name, base_folder)
        self.__tfidf_obj = read_pickle(pickled_base_file_name + '-tfidf.pkl.bz2')
        self.__dates = read_pickle(pickled_base_file_name + '-dates.pkl.bz2')
        self.__cpc_dict = read_pickle(pickled_base_file_name + '-cpc_dict.pkl.bz2')

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters; all filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.

    print('Applying documents filter...')
    # docs weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc. via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print('Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.
    print('Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove-null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)
    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.
    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    print('Creating timeseries matrix...')
    self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
    [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
     self.__weekly_iso_dates] = self.__timeseries_data

    self.__M = m_steps_ahead

    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    em = Emergence(self.__number_of_patents_per_week)
    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        term_ngram = self.__term_ngrams[term_index]
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)

        if len(row_values) == 0:
            continue

        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]

        _, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(weekly_iso_dates, row_values)
        if max(quarterly_values) < minimum_patents_per_quarter:
            continue

        if em.init_vars(row_indices, row_values, porter=not curves):
            escore = em.calculate_escore() if not curves else em.escore2()
            self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__stationary = utils.stationary_terms(self.__emergence_list, nterms2)
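
# --- Hedged sketch (illustration only) ---
# Both constructors above lean on scripts.utils.date_utils.timeseries_weekly_to_quarterly
# to roll per-week counts up into quarterly totals. The helper below is an assumed,
# simplified stand-in for that conversion, not the project's implementation: it treats
# dates as ISO year-week integers (e.g. 201732 = 2017, week 32) and maps each 13-week
# block to a quarter.
from collections import defaultdict


def weekly_to_quarterly_sketch(year_week_dates, weekly_values):
    quarterly_totals = defaultdict(int)
    for year_week, value in zip(year_week_dates, weekly_values):
        year, week = divmod(year_week, 100)
        quarter = min((week - 1) // 13, 3) + 1  # weeks 1-13 -> Q1, ..., week 40+ -> Q4
        quarterly_totals[(year, quarter)] += value
    quarters = sorted(quarterly_totals)
    return quarters, [quarterly_totals[q] for q in quarters]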
def test_reads_pickles(self):
    df = factory.get('data/USPTO-random-100.pkl.bz2')
    self.assertEqual(len(df['abstract']), 100)
def test_reads_csv(self):
    df = factory.get('data/USPTO-random-100.csv')
    self.assertListEqual(list(self.__df['abstract']), list(df['abstract']))
def test_reads_xlsx(self):
    df = factory.get('tests/data/USPTO-random-100.xlsx')
    self.assertListEqual(list(self.__df['abstract']), list(df['abstract']))
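
# --- Hedged sketch (assumed fixture, not shown in the original tests) ---
# The csv and xlsx tests compare against self.__df, which implies a baseline dataframe
# loaded once per test class; a setUpClass along these lines would supply it. The pickle
# path mirrors test_reads_pickles and is an assumption.
@classmethod
def setUpClass(cls):
    cls.__df = factory.get('data/USPTO-random-100.pkl.bz2')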
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), normalize_rows=False,
             text_header='abstract', term_counts=False, pickled_tf_idf_file_name=None, max_df=0.1,
             user_ngrams=None, output_name=None, emerging_technology=None):
    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__time = docs_mask_dict['time']

    self.__pick_method = pick_method

    # calculate or fetch tf-idf matrix
    if pickled_tf_idf_file_name is None:
        self.__dataframe = datafactory.get(data_filename)
        checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
        remove_empty_documents(self.__dataframe, text_header)

        self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header], ngram_range=ngram_range,
                                 max_document_frequency=max_df, tokenizer=LemmaTokenizer())

        self.__text_lengths = self.__dataframe[text_header].map(len).tolist()
        self.__dataframe.drop(columns=[text_header], inplace=True)

        tfidf_filename = path.join('outputs', 'tfidf', output_name + f'-tfidf-mdf-{max_df}.pkl.bz2')
        makedirs(path.dirname(tfidf_filename), exist_ok=True)
        with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file:
            pickle.dump((self.__tfidf_obj, self.__dataframe, self.__text_lengths), pickle_file, protocol=4)
    else:
        print(f'Reading document and TFIDF from pickle {pickled_tf_idf_file_name}')
        self.__tfidf_obj, self.__dataframe, self.__text_lengths = read_pickle(pickled_tf_idf_file_name)

    if docs_mask_dict['date_header'] is None:
        print('Document dates not specified')
    else:
        min_date = min(self.__dataframe[docs_mask_dict['date_header']])
        max_date = max(self.__dataframe[docs_mask_dict['date_header']])
        print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters; all filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.

    # docs weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dataframe, docs_mask_dict).doc_weights

    # todo: build up list of weight functions (left with single remaining arg etc. via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)
    doc_weights = DocumentsWeights(self.__dataframe, docs_mask_dict['time'], docs_mask_dict['cite'],
                                   docs_mask_dict['date_header'], self.__text_lengths,
                                   norm_rows=normalize_rows).weights
    doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]

    # todo: this is another weight function...

    # term weights - embeddings
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=0.75)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.

    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_weights, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove-null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_matrix = self.__tfidf_obj.tfidf_matrix
    tfidf_masked = tfidf_mask.multiply(tfidf_matrix)
    tfidf_masked = utils.remove_all_null_rows(tfidf_masked)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__term_counts_data = None
    if term_counts or emerging_technology:
        self.__term_counts_data = self.__tfidf_reduce_obj.create_terms_count(self.__dataframe,
                                                                             docs_mask_dict['date_header'])
    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)
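
# --- Hedged usage sketch (names assumed, not taken from the repository) ---
# How a caller might drive the constructor above. The enclosing class is assumed to be
# called Pipeline, the 'publication_date' header is illustrative, and real runs would
# populate docs_mask_dict with every key the document filters expect; only the keys read
# directly in the constructor are shown here.
docs_mask_dict = {
    'date': None,                          # no date filtering
    'time': False,                         # no time-based weighting
    'cite': False,                         # no citation weighting
    'date_header': 'publication_date',     # assumed column name
}
pipeline = Pipeline('data/USPTO-random-100.pkl.bz2', docs_mask_dict,
                    pick_method='sum', ngram_range=(1, 3), text_header='abstract',
                    max_df=0.1, output_name='USPTO-random-100')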