@classmethod
def setUpClass(cls):
    [cls.term_counts_per_week, cls.term_ngrams, cls.num_patents_per_week,
     cls.week_iso_dates] = pd.read_pickle(
        os.path.join('data', 'USPTO-random-500000-term_counts.pkl.bz2'))
    cls.term_counts_per_week_csc = cls.term_counts_per_week.tocsc()
    cls.em = Emergence(cls.num_patents_per_week)
def test_emergent_neg_7_15(self):
    # Arrange (add an extra leading 0 for the first 53 week year)
    weekly_values = [10] * 10 + [0] + [5] * self.weeks + [4] * self.weeks + [3] * self.weeks + [2] * self.weeks \
                    + [1] * self.weeks + [0] * self.weeks
    escore_expected = -7 / 15

    # Act
    escore_actual = Emergence.escore_exponential(weekly_values)

    # Assert
    self.assertAlmostEqual(escore_expected, escore_actual, places=self.places)
@classmethod
def setUpClass(cls):
    [cls.term_counts_per_week, cls.term_ngrams, cls.num_patents_per_week,
     cls.week_iso_dates] = pd.read_pickle(
        os.path.join('..', 'data', 'USPTO-random-500000-term_counts.pkl.bz2'))
    cls.all_yearly_dates, cls.all_yearly_values = timeseries_weekly_to_yearly(
        cls.week_iso_dates, cls.num_patents_per_week)
    cls.em = Emergence(cls.all_yearly_values)
    cls.term_counts_per_week_csc_common = cls.term_counts_per_week.tocsc()
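# The fixture above rolls the weekly patent counts up into yearly totals via
# timeseries_weekly_to_yearly() from scripts.utils.date_utils. The helper below is a
# minimal, self-contained sketch of that kind of aggregation, not the project's
# implementation: it assumes the dates are ISO year-week integers such as 201632
# (mirroring how the pipeline prints dates as year // 100 and week % 100), and the
# real function's signature and return types may differ.
from collections import OrderedDict


def weekly_to_yearly_sketch(week_iso_dates, weekly_values):
    """Illustrative only: sum weekly values into per-year buckets, preserving order."""
    yearly_totals = OrderedDict()
    for iso_date, value in zip(week_iso_dates, weekly_values):
        year = iso_date // 100  # e.g. 201632 -> 2016 (assumed date encoding)
        yearly_totals[year] = yearly_totals.get(year, 0) + value
    return list(yearly_totals.keys()), list(yearly_totals.values())


# Hypothetical usage: two years of weekly counts collapse to two yearly totals.
# weekly_to_yearly_sketch([201601, 201602, 201701], [3, 2, 5]) -> ([2016, 2017], [5, 5])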
def test_emergent_neg_1(self):
    # Arrange
    weekly_values = [10] * 10 + [3] * self.weeks + [0] * self.weeks + [0] * self.weeks
    escore_expected = -1

    # Act
    escore_actual = Emergence.escore_exponential(weekly_values)

    # Assert
    self.assertAlmostEqual(escore_expected, escore_actual, places=self.places)
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             cached_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0, terms_threshold=None,
             output_name=None, calculate_timeseries=None, m_steps_ahead=5, emergence_index='porter',
             exponential=False, nterms=50, patents_per_quarter_threshold=20, sma=None):
    self.__emergence_index = emergence_index

    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_date_dict = docs_mask_dict['timeseries_date']
    self.__timeseries_data = []
    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf mat
    if cached_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))

            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)

        if docs_mask_dict['date_header'] is None:
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}')
            self.__dates = None
        else:
            self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe,
                                                                             docs_mask_dict['date_header'])
            min_date = min(self.__dates)
            max_date = max(self.__dates)
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')

        utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
        utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
        utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)
    else:
        print(f'Reading document and TFIDF from pickle {cached_folder_name}')

        self.__cached_folder_name = path.join('cached', cached_folder_name)
        self.__tfidf_obj = utils.unpickle_object('tfidf', self.__cached_folder_name)
        self.__dates = utils.unpickle_object('dates', self.__cached_folder_name)
        self.__cpc_dict = utils.unpickle_object('cpc_dict', self.__cached_folder_name)

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(),
                      preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters. All filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence... from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place
    #  (an illustrative sketch of this combiner follows this constructor)

    print(f'Applying documents filter...')
    # docs weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print(f'Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.
    print(f'Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.
    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    # TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix.
    print(f'Creating timeseries matrix...')
    if cached_folder_name is None or not (
            path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
        self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
        [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
         self.__weekly_iso_dates] = self.__timeseries_data

        utils.pickle_object('weekly_series_terms', self.__term_counts_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_series_global', self.__number_of_patents_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_isodates', self.__weekly_iso_dates, self.__cached_folder_name)
    else:
        self.__term_counts_per_week = utils.unpickle_object('weekly_series_terms', self.__cached_folder_name)
        self.__number_of_patents_per_week = utils.unpickle_object('weekly_series_global', self.__cached_folder_name)
        self.__weekly_iso_dates = utils.unpickle_object('weekly_isodates', self.__cached_folder_name)
        self.__term_ngrams = self.__tfidf_obj.feature_names

    self.__M = m_steps_ahead

    # TODO: define period from command line, then cascade through the code
    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    self.__timeseries_quarterly = []
    self.__timeseries_intercept = []
    self.__timeseries_derivatives = []
    self.__timeseries_quarterly_smoothed = []
    self.__term_nonzero_dates = []

    all_quarters, all_quarterly_values = self.__x = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
        self.__weekly_iso_dates, self.__number_of_patents_per_week)

    # find indexes for date-range
    min_date = max_date = None
    if self.__timeseries_date_dict is not None:
        min_date = self.__timeseries_date_dict['from']
        max_date = self.__timeseries_date_dict['to']

    min_i = 0
    max_i = len(all_quarters)

    for i, quarter in enumerate(all_quarters):
        if min_date is not None and min_date < quarter:
            break
        min_i = i

    for i, quarter in enumerate(all_quarters):
        if max_date is not None and max_date < quarter:
            break
        max_i = i

    self.__lims = [min_i, max_i]
    self.__timeseries_quarterly_smoothed = None if sma is None else []

    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
                           desc='Calculating quarterly timeseries', leave=False, unit_scale=True):
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)

        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]

        non_zero_dates, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
            weekly_iso_dates, row_values)
        non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters)
        self.__timeseries_quarterly.append(quarterly_values)

    if emergence_index == 'gradients' or sma == 'kalman':
        if cached_folder_name is None or not (
                path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
                and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
            for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
                                                     desc='smoothing quarterly timeseries with kalman filter',
                                                     leave=False, unit_scale=True,
                                                     total=len(self.__timeseries_quarterly)):
                _, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()

                smooth_series = smooth_series_s[0].tolist()[0]
                smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
                self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

                derivatives = smooth_series_s[1].tolist()[0]
                self.__timeseries_derivatives.append(derivatives)

            utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
            utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)
        else:
            self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s',
                                                                         self.__cached_folder_name)
            self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

    if sma == 'savgol':
        for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
                                     desc='savgol smoothing quarterly timeseries', leave=False, unit_scale=True):
            smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
            smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
            self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

    em = Emergence(all_quarterly_values[min_i:max_i])

    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        if term_weights[term_index] == 0.0:
            continue
        term_ngram = self.__term_ngrams[term_index]

        if self.__timeseries_quarterly_smoothed is not None:
            quarterly_values = list(self.__timeseries_quarterly_smoothed[term_index])[min_i:max_i]
        else:
            quarterly_values = list(self.__timeseries_quarterly[term_index])[min_i:max_i]

        if len(quarterly_values) == 0 or max(list(self.__timeseries_quarterly[term_index][min_i:max_i])) < float(
                patents_per_quarter_threshold):
            continue

        if emergence_index == 'quadratic':
            escore = em.escore2(quarterly_values)
        elif emergence_index == 'porter':
            if not em.is_emergence_candidate(quarterly_values):
                continue
            escore = em.calculate_escore(quarterly_values)
        elif emergence_index == 'gradients':
            derivatives = self.__timeseries_derivatives[term_index][min_i:max_i]
            escore = em.net_growth(quarterly_values, derivatives)
        else:
            weekly_values = term_counts_per_week_csc.getcol(term_index).todense().ravel().tolist()[0]
            escore = em.escore_exponential(weekly_values)

        self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__declining.reverse()
    self.__stationary = [x[0] for x in utils.stationary_terms(self.__emergence_list, nterms2)]
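# The todo notes inside the constructor above sketch a future refactor in which each
# document filter becomes a plain function returning a 0/1 (or bool) array, and a
# combiner folds those arrays together by union (add / or) or intersection
# (multiply / and). The helper below is a hypothetical sketch of that idea, not
# existing pipeline code; the function name and the shape of the filter callables
# (each taking the document count and returning an array) are assumptions.
import numpy as np


def combine_document_filters(filter_funcs, num_docs, mode='intersection'):
    """Sketch of the proposed combiner: empty list -> all-ones weights,
    single entry -> its own array, otherwise or/and the arrays together."""
    if not filter_funcs:
        return np.ones(num_docs, dtype=np.int8)

    weights = filter_funcs[0](num_docs).astype(np.int8)
    for func in filter_funcs[1:]:
        mask = func(num_docs).astype(np.int8)
        if mode == 'union':
            weights = np.maximum(weights, mask)  # add / or
        else:
            weights = weights * mask             # multiply / and
    return weights


# Hypothetical usage: keep only documents that pass both a date filter and a CPC filter.
# doc_weights = combine_document_filters([date_filter, cpc_filter], tfidf_matrix.shape[0])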
def set_up_emergent_term(self):
    # Aim:
    # escore = 2 * active period trend + recent trend + mid-year to last year slope
    # active period trend = (term counts 5+6+7) / (sqrt(total 5) + sqrt(total 6) + sqrt(total 7))
    #                       - (term counts 1+2+3) / (sqrt(total 1) + sqrt(total 2) + sqrt(total 3))
    # recent trend = 10 * (term counts 6+7) / (sqrt(total 6) + sqrt(total 7))
    #                - (term counts 4+5) / (sqrt(total 4) + sqrt(total 5))
    # mid-year to last year slope = 10 * ((term counts 7 / sqrt(total 7)) - (term counts 4 / sqrt(total 4))) / (7 - 4)
    #
    # Also: emergent if:
    #   term present for >3 years
    #   >7 docs with term
    #   # term records in active / # term records in base
    #   # base term records / # base all records < 15%
    #   single author set...
    weeks_per_period = 52
    self.term_counts_matrix = np.zeros(shape=(weeks_per_period * 10, 9), dtype=np.int)
    # [0: emergent term, 1: non-emergent due to base, 2: non-emergent constant count,
    #  3: non-emergent decreasing count, 4: not 10 years data, 5: background, 6: background,
    #  7: two occurrences over 10 years, 8: all but one term occurs in base]

    # period 1 - base
    self.term_counts_matrix[0, :] = [1, 0, 1, 1, 0, 0, 0, 1, 1]
    self.term_counts_matrix[2, :] = [0, 1, 0, 1, 0, 1, 0, 0, 1]

    # period 2 - base
    self.term_counts_matrix[0 + (1 * weeks_per_period), :] = [0, 0, 0, 1, 0, 0, 0, 0, 1]
    self.term_counts_matrix[3 + (1 * weeks_per_period), :] = [0, 0, 1, 1, 0, 1, 1, 0, 1]
    self.term_counts_matrix[7 + (1 * weeks_per_period), :] = [0, 1, 0, 1, 1, 0, 0, 0, 1]

    # period 3 - base
    self.term_counts_matrix[12 + (2 * weeks_per_period), :] = [0, 0, 0, 1, 0, 0, 0, 0, 1]
    self.term_counts_matrix[20 + (2 * weeks_per_period), :] = [0, 0, 1, 1, 0, 1, 1, 0, 1]
    self.term_counts_matrix[30 + (2 * weeks_per_period), :] = [0, 1, 0, 0, 1, 0, 0, 0, 1]

    # period 4 - active
    self.term_counts_matrix[5 + (3 * weeks_per_period), :] = [1, 0, 0, 1, 0, 0, 0, 0, 0]
    self.term_counts_matrix[8 + (3 * weeks_per_period), :] = [0, 0, 1, 0, 0, 1, 1, 0, 0]
    self.term_counts_matrix[9 + (3 * weeks_per_period), :] = [0, 1, 0, 1, 1, 0, 0, 0, 0]

    # period 5 - active
    self.term_counts_matrix[20 + (4 * weeks_per_period), :] = [1, 0, 0, 1, 0, 0, 0, 0, 0]
    self.term_counts_matrix[40 + (4 * weeks_per_period), :] = [0, 0, 1, 1, 0, 1, 1, 0, 0]
    self.term_counts_matrix[51 + (4 * weeks_per_period), :] = [1, 1, 0, 0, 1, 1, 0, 0, 0]

    # period 6 - active
    self.term_counts_matrix[10 + (5 * weeks_per_period), :] = [1, 0, 0, 1, 0, 0, 0, 0, 0]
    self.term_counts_matrix[11 + (5 * weeks_per_period), :] = [1, 1, 1, 0, 0, 1, 1, 0, 0]
    self.term_counts_matrix[12 + (5 * weeks_per_period), :] = [0, 1, 0, 0, 1, 0, 0, 0, 0]

    # period 7 - active
    self.term_counts_matrix[21 + (6 * weeks_per_period), :] = [1, 0, 0, 1, 0, 0, 0, 0, 0]
    self.term_counts_matrix[32 + (6 * weeks_per_period), :] = [0, 1, 1, 0, 0, 1, 1, 0, 0]
    self.term_counts_matrix[43 + (6 * weeks_per_period), :] = [1, 1, 0, 0, 1, 0, 0, 0, 0]

    # period 8 - active
    self.term_counts_matrix[12 + (7 * weeks_per_period), :] = [1, 1, 0, 0, 0, 0, 0, 0, 0]
    self.term_counts_matrix[13 + (7 * weeks_per_period), :] = [1, 0, 1, 1, 0, 1, 1, 0, 0]
    self.term_counts_matrix[14 + (7 * weeks_per_period), :] = [1, 1, 0, 0, 1, 0, 0, 0, 0]

    # period 9 - active
    self.term_counts_matrix[28 + (8 * weeks_per_period), :] = [1, 1, 0, 0, 0, 0, 0, 0, 0]
    self.term_counts_matrix[29 + (8 * weeks_per_period), :] = [1, 1, 1, 0, 0, 1, 1, 0, 0]
    self.term_counts_matrix[51 + (8 * weeks_per_period), :] = [1, 1, 0, 0, 1, 0, 0, 0, 0]

    # period 10 - active
    self.term_counts_matrix[49 + (9 * weeks_per_period), :] = [1, 1, 0, 1, 0, 0, 0, 0, 0]
    self.term_counts_matrix[50 + (9 * weeks_per_period), :] = [1, 1, 1, 0, 0, 1, 1, 0, 0]
    self.term_counts_matrix[51 + (9 * weeks_per_period), :] = [1, 1, 0, 0, 1, 0, 0, 1, 0]

    self.term_counts_per_week_csc = csc_matrix(self.term_counts_matrix)

    self.num_patents_per_week = self.term_counts_matrix.sum(axis=1) > 0
    self.num_patents_per_week = self.num_patents_per_week.astype(dtype=np.int32)

    self.em = Emergence(self.num_patents_per_week)
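# The "Aim" comment above spells out the Porter-style escore these fixtures exercise.
# The function below is a direct, self-contained transcription of that formula for a
# 7-year active period (term_counts[i] and totals[i] are the term count and the total
# record count in active year i+1). It is illustrative only and is not the
# Emergence.calculate_escore implementation itself; the 10x factor in the recent trend
# is read here as applying to the whole difference, which the comment leaves ambiguous.
from math import sqrt


def porter_escore_sketch(term_counts, totals):
    """escore = 2 * active-period trend + recent trend + mid-year-to-last-year slope."""
    assert len(term_counts) == 7 and len(totals) == 7

    def normalised_sum(year_indices):
        # term counts summed over the given years, scaled by the sum of sqrt(total records)
        return sum(term_counts[i] for i in year_indices) / sum(sqrt(totals[i]) for i in year_indices)

    active_period_trend = normalised_sum([4, 5, 6]) - normalised_sum([0, 1, 2])
    recent_trend = 10 * (normalised_sum([5, 6]) - normalised_sum([3, 4]))
    mid_to_last_slope = 10 * (term_counts[6] / sqrt(totals[6]) - term_counts[3] / sqrt(totals[3])) / (7 - 4)

    return 2 * active_period_trend + recent_trend + mid_to_last_slope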
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             pickled_tfidf_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0,
             terms_threshold=None, output_name=None, calculate_timeseries=None, m_steps_ahead=5,
             curves=True, nterms=50, minimum_patents_per_quarter=20):

    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_data = []
    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf mat
    if pickled_tfidf_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))

            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)
        self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe, docs_mask_dict['date_header'])

        base_pickle_path = path.join('outputs', 'tfidf')
        makedirs(base_pickle_path, exist_ok=True)

        def pickle_object(short_name, obj):
            folder_name = path.join(base_pickle_path, output_name + f'-mdf-{max_df}')
            makedirs(folder_name, exist_ok=True)
            file_name = path.join(folder_name, output_name + f'-mdf-{max_df}-{short_name}.pkl.bz2')
            with bz2.BZ2File(file_name, 'wb') as pickle_file:
                pickle.dump(obj, pickle_file, protocol=4, fix_imports=False)

        pickle_object('tfidf', self.__tfidf_obj)
        pickle_object('dates', self.__dates)
        pickle_object('cpc_dict', self.__cpc_dict)
    else:
        print(f'Reading document and TFIDF from pickle {pickled_tfidf_folder_name}')

        base_folder = path.basename(pickled_tfidf_folder_name)
        pickled_base_file_name = path.join(pickled_tfidf_folder_name, base_folder)

        self.__tfidf_obj = read_pickle(pickled_base_file_name + '-tfidf.pkl.bz2')
        self.__dates = read_pickle(pickled_base_file_name + '-dates.pkl.bz2')
        self.__cpc_dict = read_pickle(pickled_base_file_name + '-cpc_dict.pkl.bz2')

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(),
                      preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters. All filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence... from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place

    print(f'Applying documents filter...')
    # docs weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print(f'Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.
    print(f'Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.
    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    print(f'Creating timeseries matrix...')
    self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
    [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
     self.__weekly_iso_dates] = self.__timeseries_data

    self.__M = m_steps_ahead

    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    em = Emergence(self.__number_of_patents_per_week)
    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        term_ngram = self.__term_ngrams[term_index]
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)

        if len(row_values) == 0:
            continue

        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]

        _, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(weekly_iso_dates, row_values)

        if max(quarterly_values) < minimum_patents_per_quarter:
            continue

        if em.init_vars(row_indices, row_values, porter=not curves):
            escore = em.calculate_escore() if not curves else em.escore2()
            self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__stationary = utils.stationary_terms(self.__emergence_list, nterms2)
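# For orientation, this is roughly how the constructor above might be driven (the
# surrounding class is referred to as Pipeline in its own todo comments). Only the
# docs_mask_dict keys this __init__ reads directly ('date', 'date_header') are shown;
# DocumentsFilter and utils.checkdf may expect further keys, and the file name, column
# names and values below are purely hypothetical examples.
docs_mask_dict = {
    'date': None,                      # optional document date filter
    'date_header': 'publication_date'  # column used to derive year-week dates
}

pipeline = Pipeline('data/USPTO-random-1000.pkl.bz2',  # hypothetical input pickle
                    docs_mask_dict,
                    pick_method='sum',
                    ngram_range=(1, 3),
                    text_header='abstract',
                    output_name='USPTO-random-1000',
                    calculate_timeseries=True,
                    curves=True,
                    nterms=50)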
def __init__(self, term_counts_data, m_steps_ahead=5, curves=True, nterms=50, minimum_patents_per_quarter=20,
             outname=None):
    self.__M = m_steps_ahead

    [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
     self.__weekly_iso_dates] = term_counts_data

    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    em = Emergence(self.__number_of_patents_per_week)

    self.__emergence_list = []
    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        term_ngram = self.__term_ngrams[term_index]
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)

        if len(row_values) == 0:
            continue

        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]

        _, quarterly_values = utils.timeseries_weekly_to_quarterly(weekly_iso_dates, row_values)

        if max(quarterly_values) < minimum_patents_per_quarter:
            continue

        if em.init_vars(row_indices, row_values):
            escore = em.calculate_escore() if not curves else em.escore2()
            self.__emergence_list.append((term_ngram, escore))

    if len(self.__emergence_list) == 0:
        self.__emergent = []
        self.__declining = []
        self.__stationary = []
        return

    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    # for tup in self.__emergence_list:
    #     print(tup[0] + ": " + str(tup[1]))

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms:]]

    zero_pivot_emergence = None
    last_emergence = self.__emergence_list[0][1]
    for index, value in enumerate(self.__emergence_list[1:]):
        if value[1] <= 0.0 < last_emergence:
            zero_pivot_emergence = index
            break
        last_emergence = value[1]

    stationary_start_index = zero_pivot_emergence - nterms // 2
    stationary_end_index = zero_pivot_emergence + nterms // 2
    self.__stationary = [x[0] for x in self.__emergence_list[stationary_start_index:stationary_end_index]]

    filename_and_path = path.join('outputs', 'reports', outname + '_emergence.txt')
    with open(filename_and_path, 'w') as file:
        print()
        print('Emergent')
        file.write('Emergent\n')
        for tup in self.__emergence_list[:nterms]:
            print(tup[0] + ": " + str(tup[1]))
            file.write(tup[0] + ": " + str(tup[1]) + '\n')
        print()
        file.write('\n')

        print('Stationary')
        file.write('Stationary\n')
        for tup in self.__emergence_list[stationary_start_index:stationary_end_index]:
            print(tup[0] + ": " + str(tup[1]))
            file.write(tup[0] + ": " + str(tup[1]) + '\n')
        print()
        file.write('\n')

        print('Declining')
        file.write('Declining' + '\n')
        for tup in self.__emergence_list[-nterms:]:
            print(tup[0] + ": " + str(tup[1]))
            file.write(tup[0] + ": " + str(tup[1]) + '\n')
        print()
        file.write('\n')
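# The report written above (outputs/reports/<outname>_emergence.txt) has three sections
# separated by blank lines, each listing "term: score" pairs sorted by descending escore.
# An illustrative excerpt follows; the terms and scores are made up:
#
#   Emergent
#   mobile device: 38.2
#   machine learning: 31.7
#
#   Stationary
#   control unit: 0.02
#   power supply: -0.01
#
#   Declining
#   cathode ray tube: -24.5
#   floppy disk: -29.8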
class EmergenceTests(unittest.TestCase):

    def find_term_index(self, term):
        for term_index in range(0, len(self.term_ngrams)):
            if self.term_ngrams[term_index] == term:
                return term_index

        self.fail(f'Failed to find term {term}')

    @classmethod
    def setUpClass(cls):
        [cls.term_counts_per_week, cls.term_ngrams, cls.num_patents_per_week,
         cls.week_iso_dates] = pd.read_pickle(
            os.path.join('..', 'data', 'USPTO-random-500000-term_counts.pkl.bz2'))
        cls.all_yearly_dates, cls.all_yearly_values = timeseries_weekly_to_yearly(
            cls.week_iso_dates, cls.num_patents_per_week)
        cls.em = Emergence(cls.all_yearly_values)
        cls.term_counts_per_week_csc_common = cls.term_counts_per_week.tocsc()

    def extract_yearly_values(self, term_index, all_yearly_dates):
        row_indices, row_values = get_row_indices_and_values(self.term_counts_per_week_csc, term_index)
        weekly_iso_dates = [self.week_iso_dates[x] for x in row_indices]
        non_zero_dates, yearly_values = timeseries_weekly_to_yearly(weekly_iso_dates, row_values)
        non_zero_dates, yearly_values = fill_missing_zeros(yearly_values, non_zero_dates, all_yearly_dates)
        return yearly_values

    def set_up_emergent_term(self):
        # Aim:
        # escore = 2 * active period trend + recent trend + mid-year to last year slope
        # active period trend = (term counts 5+6+7) / (sqrt(total 5) + sqrt(total 6) + sqrt(total 7))
        #                       - (term counts 1+2+3) / (sqrt(total 1) + sqrt(total 2) + sqrt(total 3))
        # recent trend = 10 * (term counts 6+7) / (sqrt(total 6) + sqrt(total 7))
        #                - (term counts 4+5) / (sqrt(total 4) + sqrt(total 5))
        # mid-year to last year slope = 10 * ((term counts 7 / sqrt(total 7)) - (term counts 4 / sqrt(total 4))) / (7 - 4)
        #
        # Also: emergent if:
        #   term present for >3 years
        #   >7 docs with term
        #   # term records in active / # term records in base
        #   # base term records / # base all records < 15%
        #   single author set...
        weeks_per_period = 52
        self.term_counts_matrix = np.zeros(shape=(weeks_per_period * 10, 9), dtype=np.int)
        # [0: emergent term, 1: non-emergent due to base, 2: non-emergent constant count,
        #  3: non-emergent decreasing count, 4: not 10 years data, 5: background, 6: background,
        #  7: two occurrences over 10 years, 8: all but one term occurs in base]

        # period 1 - base
        self.term_counts_matrix[0, :] = [1, 0, 1, 1, 0, 0, 0, 1, 1]
        self.term_counts_matrix[2, :] = [0, 1, 0, 1, 0, 1, 0, 0, 1]

        # period 2 - base
        self.term_counts_matrix[0 + (1 * weeks_per_period), :] = [0, 0, 0, 1, 0, 0, 0, 0, 1]
        self.term_counts_matrix[3 + (1 * weeks_per_period), :] = [0, 0, 1, 1, 0, 1, 1, 0, 1]
        self.term_counts_matrix[7 + (1 * weeks_per_period), :] = [0, 1, 0, 1, 1, 0, 0, 0, 1]

        # period 3 - base
        self.term_counts_matrix[12 + (2 * weeks_per_period), :] = [0, 0, 0, 1, 0, 0, 0, 0, 1]
        self.term_counts_matrix[20 + (2 * weeks_per_period), :] = [0, 0, 1, 1, 0, 1, 1, 0, 1]
        self.term_counts_matrix[30 + (2 * weeks_per_period), :] = [0, 1, 0, 0, 1, 0, 0, 0, 1]

        # period 4 - active
        self.term_counts_matrix[5 + (3 * weeks_per_period), :] = [1, 0, 0, 1, 0, 0, 0, 0, 0]
        self.term_counts_matrix[8 + (3 * weeks_per_period), :] = [0, 0, 1, 0, 0, 1, 1, 0, 0]
        self.term_counts_matrix[9 + (3 * weeks_per_period), :] = [0, 1, 0, 1, 1, 0, 0, 0, 0]

        # period 5 - active
        self.term_counts_matrix[20 + (4 * weeks_per_period), :] = [1, 0, 0, 1, 0, 0, 0, 0, 0]
        self.term_counts_matrix[40 + (4 * weeks_per_period), :] = [0, 0, 1, 1, 0, 1, 1, 0, 0]
        self.term_counts_matrix[51 + (4 * weeks_per_period), :] = [1, 1, 0, 0, 1, 1, 0, 0, 0]

        # period 6 - active
        self.term_counts_matrix[10 + (5 * weeks_per_period), :] = [1, 0, 0, 1, 0, 0, 0, 0, 0]
        self.term_counts_matrix[11 + (5 * weeks_per_period), :] = [1, 1, 1, 0, 0, 1, 1, 0, 0]
        self.term_counts_matrix[12 + (5 * weeks_per_period), :] = [0, 1, 0, 0, 1, 0, 0, 0, 0]

        # period 7 - active
        self.term_counts_matrix[21 + (6 * weeks_per_period), :] = [1, 0, 0, 1, 0, 0, 0, 0, 0]
        self.term_counts_matrix[32 + (6 * weeks_per_period), :] = [0, 1, 1, 0, 0, 1, 1, 0, 0]
        self.term_counts_matrix[43 + (6 * weeks_per_period), :] = [1, 1, 0, 0, 1, 0, 0, 0, 0]

        # period 8 - active
        self.term_counts_matrix[12 + (7 * weeks_per_period), :] = [1, 1, 0, 0, 0, 0, 0, 0, 0]
        self.term_counts_matrix[13 + (7 * weeks_per_period), :] = [1, 0, 1, 1, 0, 1, 1, 0, 0]
        self.term_counts_matrix[14 + (7 * weeks_per_period), :] = [1, 1, 0, 0, 1, 0, 0, 0, 0]

        # period 9 - active
        self.term_counts_matrix[28 + (8 * weeks_per_period), :] = [1, 1, 0, 0, 0, 0, 0, 0, 0]
        self.term_counts_matrix[29 + (8 * weeks_per_period), :] = [1, 1, 1, 0, 0, 1, 1, 0, 0]
        self.term_counts_matrix[51 + (8 * weeks_per_period), :] = [1, 1, 0, 0, 1, 0, 0, 0, 0]

        # period 10 - active
        self.term_counts_matrix[49 + (9 * weeks_per_period), :] = [1, 1, 0, 1, 0, 0, 0, 0, 0]
        self.term_counts_matrix[50 + (9 * weeks_per_period), :] = [1, 1, 1, 0, 0, 1, 1, 0, 0]
        self.term_counts_matrix[51 + (9 * weeks_per_period), :] = [1, 1, 0, 0, 1, 0, 0, 1, 0]

        self.term_counts_per_week_csc = csc_matrix(self.term_counts_matrix)

        self.num_patents_per_week = self.term_counts_matrix.sum(axis=1) > 0
        self.num_patents_per_week = self.num_patents_per_week.astype(dtype=np.int32)

        yearly_dates, yearly_values = timeseries_weekly_to_yearly(self.week_iso_dates, self.num_patents_per_week)
        self.em = Emergence(yearly_values)
        return yearly_dates

    def assert_term_escore(self, em_expected, escore_expected, term):
        term_index = self.find_term_index(term)
        self.term_counts_per_week_csc = self.term_counts_per_week_csc_common
        yearly_values = self.extract_yearly_values(term_index, self.all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(yearly_values)
        escore_actual = self.em.calculate_escore(yearly_values)
        self.assertEqual(em_expected, potentially_emergent_actual, term + ": em failed")
        self.assertEqual(escore_expected, escore_actual, term + ": escore failed")

    def test_term_with_less_than_10_years_data(self):
        all_yearly_dates = self.set_up_emergent_term()
        yearly_values = self.extract_yearly_values(4, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(yearly_values)
        self.assertFalse(potentially_emergent_actual)

    def test_term_with_less_than_7_occurrences(self):
        all_yearly_dates = self.set_up_emergent_term()
        yearly_values = self.extract_yearly_values(7, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(yearly_values)
        self.assertFalse(potentially_emergent_actual)

    def test_term_counts_base2all_over_threshold_and_emergent(self):
        escore_expected = 6.35
        all_yearly_dates = self.set_up_emergent_term()
        self.em.TERM_BASE_RECS_THRESHOLD = 1
        yearly_values = self.extract_yearly_values(0, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(yearly_values)
        escore_actual = self.em.calculate_escore(yearly_values)
        self.assertTrue(potentially_emergent_actual)
        self.assertAlmostEqual(escore_expected, escore_actual, places=2)

    def test_term_counts_base2all_over_threshold_but_not_emergent(self):
        all_yearly_dates = self.set_up_emergent_term()
        self.em.TERM_BASE_RECS_THRESHOLD = 1
        yearly_values = self.extract_yearly_values(1, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(yearly_values)
        self.assertFalse(potentially_emergent_actual)

    def test_term_with_base_but_no_emergent_instances(self):
        all_yearly_dates = self.set_up_emergent_term()
        yearly_values = self.extract_yearly_values(8, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(yearly_values)
        self.assertFalse(potentially_emergent_actual)

    def test_non_emergent_with_constant_usage_term(self):
        escore_expected = 0
        all_yearly_dates = self.set_up_emergent_term()
        yearly_values = self.extract_yearly_values(2, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(yearly_values)
        escore_actual = self.em.calculate_escore(yearly_values)
        self.assertEqual(potentially_emergent_actual.dtype, np.dtype('bool'))
        self.assertAlmostEqual(escore_expected, escore_actual, places=2)

    def test_non_emergent_with_decreasing_usage_term(self):
        escore_expected = -4.04
        all_yearly_dates = self.set_up_emergent_term()
        self.em.BASE_TERM2ALL_RATIO_THRESHOLD = 1
        self.em.ACTIVE2BASE_RATIO_THRESHOLD = 0
        yearly_values = self.extract_yearly_values(3, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(yearly_values)
        escore_actual = self.em.calculate_escore(yearly_values)
        self.assertTrue(potentially_emergent_actual)
        self.assertAlmostEqual(escore_expected, escore_actual, places=2)

    def test_3d_image(self):
        term = '3d image'
        escore_expected = -1.3383140739474317
        em_expected = True
        self.assert_term_escore(em_expected, escore_expected, term)

    def test_3d_display(self):
        term = '3d display'
        escore_expected = -0.17674809905755776
        em_expected = True
        self.assert_term_escore(em_expected, escore_expected, term)

    def test_ac_power_supply(self):
        term = 'ac power supply'
        escore_expected = -0.19543452810736667
        em_expected = True
        self.assert_term_escore(em_expected, escore_expected, term)

    def test_acid_molecule(self):
        term = 'acid molecule'
        escore_expected = -0.633235199024181
        em_expected = False
        self.assert_term_escore(em_expected, escore_expected, term)

    def test_acid_molecule_encoding(self):
        term = 'acid molecule encoding'
        escore_expected = -0.3469298902297481
        em_expected = False
        self.assert_term_escore(em_expected, escore_expected, term)