def test_WordAnalyser_unigrams_with_punctuation(self):
    ngram_range = (1, 1)
    WordAnalyzer.init(tokenizer=self.word_tokenizer, preprocess=self.preprocess, ngram_range=ngram_range)

    doc = "Some test words, to ignore except-hyphens but including someone's ownership"
    expected_ngrams = ['test', 'words', 'ignore', 'except-hyphens', 'ownership']

    actual_ngrams = WordAnalyzer.analyzer(doc)
    self.assertListEqual(expected_ngrams, actual_ngrams)
def test_WordAnalyser_unigrams_with_stopwords(self):
    ngram_range = (1, 1)
    WordAnalyzer.init(tokenizer=self.word_tokenizer, preprocess=self.preprocess, ngram_range=ngram_range)

    doc = 'Some test words to ignore safely'
    expected_ngrams = ['test', 'words', 'ignore', 'safely']

    actual_ngrams = WordAnalyzer.analyzer(doc)
    self.assertListEqual(expected_ngrams, actual_ngrams)
def test_WordAnalyser_ngrams_dont_cross_punctuation_or_stop_words(self):
    ngram_range = (1, 3)
    WordAnalyzer.init(tokenizer=self.word_tokenizer, preprocess=self.preprocess, ngram_range=ngram_range)

    doc = "Some test words, except-hyphens metal but someone's metal fish bucket"
    expected_ngrams = ['test', 'words', 'except-hyphens', 'metal', 'metal', 'fish', 'bucket',
                       'test words', 'except-hyphens metal', 'metal fish', 'fish bucket',
                       'metal fish bucket']

    actual_ngrams = WordAnalyzer.analyzer(doc)
    self.assertListEqual(expected_ngrams, actual_ngrams)
def test_WordAnalyser_ngrams(self):
    ngram_range = (1, 3)
    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=self.preprocess, ngram_range=ngram_range)

    doc = "Conductive structures in features of an insulator layer on a substrate are fabricated by a particular " \
          "process. In this process, a layer of conductive material is applied over the insulator layer so that " \
          "the layer of conductive material covers field regions adjacent the features and fills in the features " \
          "themselves. A grain size differential between the conductive material which covers the field regions " \
          "and the conductive material which fills in the feature is then established by annealing the layer of " \
          "conductive material. Excess conductive material is then removed to uncover the field regions and leave " \
          "the conductive structures. The layer of conductive material is applied so as to define a first layer " \
          "thickness over the field regions and a second layer thickness in and over the features. These " \
          "thicknesses are dimensioned such that d 1 ≦0.5d 2 , with d 1 being the first layer thickness and d 2 " \
          "being the second layer thickness. Preferably, the first and second layer thicknesses are dimensioned " \
          "such that d 1 ≦0.3d 2 . "
    expected_ngrams = ['conductive', 'structure', 'feature', 'insulator', 'layer', 'substrate', 'fabricate',
                       'particular', 'process', 'process', 'layer', 'conductive', 'material', 'apply',
                       'insulator', 'layer', 'layer', 'conductive', 'material', 'field', 'region', 'feature',
                       'fill', 'feature', 'themselves', 'grain', 'differential', 'conductive', 'material',
                       'field', 'region', 'conductive', 'material', 'fill', 'feature', 'establish', 'anneal',
                       'layer', 'conductive', 'material', 'conductive', 'material', 'remove', 'uncover',
                       'field', 'region', 'leave', 'conductive', 'structure', 'layer', 'conductive',
                       'material', 'apply', 'define', 'first', 'layer', 'thickness', 'field', 'region',
                       'second', 'layer', 'thickness', 'feature', 'thickness', 'dimension', '0.5d', 'first',
                       'layer', 'thickness', 'second', 'layer', 'thickness', 'preferably', 'first', 'second',
                       'layer', 'thickness', 'dimension', '0.3d',
                       'conductive structure', 'insulator layer', 'particular process', 'conductive material',
                       'insulator layer', 'conductive material', 'material cover', 'cover field',
                       'field region', 'region adjacent', 'feature themselves', 'conductive material',
                       'field region', 'conductive material', 'conductive material', 'conductive material',
                       'field region', 'conductive structure', 'conductive material', 'layer thickness',
                       'field region', 'layer thickness', 'layer thickness', 'layer thickness',
                       'layer thickness',
                       'conductive material cover', 'material cover field', 'cover field region',
                       'field region adjacent']

    actual_ngrams = WordAnalyzer.analyzer(doc)
    self.assertListEqual(expected_ngrams, actual_ngrams)
def tfidf_from_text(text_series, ngram_range=(1, 3), max_document_frequency=0.3, tokenizer=StemTokenizer()):
    WordAnalyzer.init(tokenizer=tokenizer, preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # TODO: add an option to increase the count dtype from uint8 to uint16 or uint32 at the user's request
    vectorizer = CountVectorizer(max_df=max_document_frequency, min_df=1, ngram_range=ngram_range,
                                 analyzer=WordAnalyzer.analyzer, dtype=np.uint8)

    count_matrix = vectorizer.fit_transform(text_series)
    feature_names = vectorizer.get_feature_names()

    return _TFIDF(count_matrix, vectorizer.vocabulary_, feature_names)
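
# Illustrative sketch only, not part of the original source: a minimal example of how
# tfidf_from_text might be called on a small pandas Series of abstracts. The sample texts,
# the helper name and the relaxed max_document_frequency are hypothetical choices so a
# two-document corpus is not filtered away; feature_names is accessed in the same way the
# pipeline code below uses it.
def _example_tfidf_from_text_usage():
    import pandas as pd

    abstracts = pd.Series([
        'A conductive layer is applied over the insulator layer.',
        'The conductive material fills the features on the substrate.',
    ])
    tfidf_obj = tfidf_from_text(abstracts, ngram_range=(1, 2), max_document_frequency=1.0)
    print(tfidf_obj.feature_names[:10])
    return tfidf_obj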
def __init__(self, text_series, ngram_range=(1, 3), max_document_frequency=0.3, tokenizer=StemTokenizer()):
    WordAnalyzer.init(tokenizer=tokenizer, preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    self.__vectorizer = CountVectorizer(max_df=max_document_frequency, min_df=1, ngram_range=ngram_range,
                                        analyzer=WordAnalyzer.analyzer)

    self.__ngram_counts = self.__vectorizer.fit_transform(text_series)
    self.__feature_names = self.__vectorizer.get_feature_names()

    self.__tfidf_transformer = TfidfTransformer(smooth_idf=False)
    self.__tfidf_matrix = self.__tfidf_transformer.fit_transform(self.__ngram_counts)
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             cached_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0, terms_threshold=None,
             output_name=None, calculate_timeseries=None, m_steps_ahead=5, emergence_index='porter',
             exponential=False, nterms=50, patents_per_quarter_threshold=20, sma=None):

    self.__emergence_index = emergence_index

    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_date_dict = docs_mask_dict['timeseries_date']
    self.__timeseries_data = []

    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf mat
    if cached_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))

            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)

        if docs_mask_dict['date_header'] is None:
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}')
            self.__dates = None
        else:
            self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe,
                                                                              docs_mask_dict['date_header'])
            min_date = min(self.__dates)
            max_date = max(self.__dates)
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')

        utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
        utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
        utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)
    else:
        print(f'Reading document and TFIDF from pickle {cached_folder_name}')

        self.__cached_folder_name = path.join('cached', cached_folder_name)
        self.__tfidf_obj = utils.unpickle_object('tfidf', self.__cached_folder_name)
        self.__dates = utils.unpickle_object('dates', self.__cached_folder_name)
        self.__cpc_dict = utils.unpickle_object('cpc_dict', self.__cached_folder_name)

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters. All filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner. combiner = union if... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.

    print(f'Applying documents filter...')
    # doc weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print(f'Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.

    print(f'Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions

    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.

    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    # TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix.

    print(f'Creating timeseries matrix...')
    if cached_folder_name is None or not (
            path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
        self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
        [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
         self.__weekly_iso_dates] = self.__timeseries_data

        utils.pickle_object('weekly_series_terms', self.__term_counts_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_series_global', self.__number_of_patents_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_isodates', self.__weekly_iso_dates, self.__cached_folder_name)
    else:
        self.__term_counts_per_week = utils.unpickle_object('weekly_series_terms', self.__cached_folder_name)
        self.__number_of_patents_per_week = utils.unpickle_object('weekly_series_global', self.__cached_folder_name)
        self.__weekly_iso_dates = utils.unpickle_object('weekly_isodates', self.__cached_folder_name)
        self.__term_ngrams = self.__tfidf_obj.feature_names

    self.__M = m_steps_ahead

    # TODO: define period from command line, then cascade through the code

    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    self.__timeseries_quarterly = []
    self.__timeseries_intercept = []
    self.__timeseries_derivatives = []
    self.__timeseries_quarterly_smoothed = []
    self.__term_nonzero_dates = []

    all_quarters, all_quarterly_values = self.__x = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
        self.__weekly_iso_dates, self.__number_of_patents_per_week)

    # find indexes for date-range
    min_date = max_date = None
    if self.__timeseries_date_dict is not None:
        min_date = self.__timeseries_date_dict['from']
        max_date = self.__timeseries_date_dict['to']

    min_i = 0
    max_i = len(all_quarters)

    for i, quarter in enumerate(all_quarters):
        if min_date is not None and min_date < quarter:
            break
        min_i = i

    for i, quarter in enumerate(all_quarters):
        if max_date is not None and max_date < quarter:
            break
        max_i = i

    self.__lims = [min_i, max_i]
    self.__timeseries_quarterly_smoothed = None if sma is None else []

    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
                           desc='Calculating quarterly timeseries', leave=False, unit_scale=True):
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)
        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]
        non_zero_dates, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(weekly_iso_dates,
                                                                                                   row_values)
        non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters)
        self.__timeseries_quarterly.append(quarterly_values)

    if emergence_index == 'gradients' or sma == 'kalman':
        if cached_folder_name is None or not (
                path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
                and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
            for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
                                                     desc='smoothing quarterly timeseries with kalman filter',
                                                     leave=False, unit_scale=True,
                                                     total=len(self.__timeseries_quarterly)):
                _, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()

                smooth_series = smooth_series_s[0].tolist()[0]
                smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
                self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

                derivatives = smooth_series_s[1].tolist()[0]
                self.__timeseries_derivatives.append(derivatives)

            utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
            utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)
        else:
            self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s',
                                                                         self.__cached_folder_name)
            self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

    if sma == 'savgol':
        for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
                                     desc='savgol smoothing quarterly timeseries', leave=False, unit_scale=True):
            smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
            smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
            self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

    em = Emergence(all_quarterly_values[min_i:max_i])

    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        if term_weights[term_index] == 0.0:
            continue
        term_ngram = self.__term_ngrams[term_index]

        if self.__timeseries_quarterly_smoothed is not None:
            quarterly_values = list(self.__timeseries_quarterly_smoothed[term_index])[min_i:max_i]
        else:
            quarterly_values = list(self.__timeseries_quarterly[term_index])[min_i:max_i]

        if len(quarterly_values) == 0 or max(list(self.__timeseries_quarterly[term_index][min_i:max_i])) < float(
                patents_per_quarter_threshold):
            continue

        if emergence_index == 'quadratic':
            escore = em.escore2(quarterly_values)
        elif emergence_index == 'porter':
            if not em.is_emergence_candidate(quarterly_values):
                continue
            escore = em.calculate_escore(quarterly_values)
        elif emergence_index == 'gradients':
            derivatives = self.__timeseries_derivatives[term_index][min_i:max_i]
            escore = em.net_growth(quarterly_values, derivatives)
        else:
            weekly_values = term_counts_per_week_csc.getcol(term_index).todense().ravel().tolist()[0]
            escore = em.escore_exponential(weekly_values)

        self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__declining.reverse()

    self.__stationary = [x[0] for x in utils.stationary_terms(self.__emergence_list, nterms2)]
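
# Illustrative sketch only, not part of the original source: the document-filter "combiner"
# described in the todo comments inside __init__ above. Each filter function is assumed to
# return a boolean (or 0/1) array with one entry per document; an empty list keeps every
# document, a union combiner ORs the arrays together and an intersection combiner ANDs them.
# The function and argument names are hypothetical.
def _combine_doc_filters(filter_funcs, dataset, size, union=True):
    import numpy as np

    if len(filter_funcs) == 0:
        return np.ones(size, dtype=bool)

    combined = np.asarray(filter_funcs[0](dataset), dtype=bool)
    for func in filter_funcs[1:]:
        weights = np.asarray(func(dataset), dtype=bool)
        combined = (combined | weights) if union else (combined & weights)
    return combined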
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             pickled_tfidf_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0, terms_threshold=None,
             output_name=None, calculate_timeseries=None, m_steps_ahead=5, curves=True, nterms=50,
             minimum_patents_per_quarter=20):

    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_data = []

    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf mat
    if pickled_tfidf_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))

            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)
        self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe, docs_mask_dict['date_header'])

        base_pickle_path = path.join('outputs', 'tfidf')
        makedirs(base_pickle_path, exist_ok=True)

        def pickle_object(short_name, obj):
            folder_name = path.join(base_pickle_path, output_name + f'-mdf-{max_df}')
            makedirs(folder_name, exist_ok=True)
            file_name = path.join(folder_name, output_name + f'-mdf-{max_df}-{short_name}.pkl.bz2')
            with bz2.BZ2File(file_name, 'wb') as pickle_file:
                pickle.dump(obj, pickle_file, protocol=4, fix_imports=False)

        pickle_object('tfidf', self.__tfidf_obj)
        pickle_object('dates', self.__dates)
        pickle_object('cpc_dict', self.__cpc_dict)
    else:
        print(f'Reading document and TFIDF from pickle {pickled_tfidf_folder_name}')
        base_folder = path.basename(pickled_tfidf_folder_name)
        pickled_base_file_name = path.join(pickled_tfidf_folder_name, base_folder)

        self.__tfidf_obj = read_pickle(pickled_base_file_name + '-tfidf.pkl.bz2')
        self.__dates = read_pickle(pickled_base_file_name + '-dates.pkl.bz2')
        self.__cpc_dict = read_pickle(pickled_base_file_name + '-cpc_dict.pkl.bz2')

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters. All filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner. combiner = union if... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.

    print(f'Applying documents filter...')
    # doc weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print(f'Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.

    print(f'Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions

    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.

    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    print(f'Creating timeseries matrix...')
    self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
    [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
     self.__weekly_iso_dates] = self.__timeseries_data

    self.__M = m_steps_ahead

    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    em = Emergence(self.__number_of_patents_per_week)
    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        term_ngram = self.__term_ngrams[term_index]
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)

        if len(row_values) == 0:
            continue

        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]

        _, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(weekly_iso_dates, row_values)
        if max(quarterly_values) < minimum_patents_per_quarter:
            continue

        if em.init_vars(row_indices, row_values, porter=not curves):
            escore = em.calculate_escore() if not curves else em.escore2()
            self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__stationary = utils.stationary_terms(self.__emergence_list, nterms2)
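
# Illustrative sketch only, not part of the original source: read_pickle is called in __init__
# above but is not defined in this excerpt. A minimal equivalent for the bz2-compressed pickles
# written by the local pickle_object helper might look like this; the function name is
# hypothetical.
def _read_bz2_pickle(file_name):
    import bz2
    import pickle

    with bz2.BZ2File(file_name, 'rb') as pickle_file:
        return pickle.load(pickle_file)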
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), normalize_rows=False,
             text_header='abstract', term_counts=False, pickled_tf_idf_file_name=None, max_df=0.1,
             user_ngrams=None, output_name=None, emerging_technology=None):

    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__time = docs_mask_dict['time']

    self.__pick_method = pick_method

    # calculate or fetch tf-idf mat
    if pickled_tf_idf_file_name is None:
        self.__dataframe = datafactory.get(data_filename)
        checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
        remove_empty_documents(self.__dataframe, text_header)

        self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header], ngram_range=ngram_range,
                                 max_document_frequency=max_df, tokenizer=LemmaTokenizer())

        self.__text_lengths = self.__dataframe[text_header].map(len).tolist()
        self.__dataframe.drop(columns=[text_header], inplace=True)

        tfidf_filename = path.join('outputs', 'tfidf', output_name + f'-tfidf-mdf-{max_df}.pkl.bz2')
        makedirs(path.dirname(tfidf_filename), exist_ok=True)
        with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file:
            pickle.dump((self.__tfidf_obj, self.__dataframe, self.__text_lengths), pickle_file, protocol=4)
    else:
        print(f'Reading document and TFIDF from pickle {pickled_tf_idf_file_name}')
        self.__tfidf_obj, self.__dataframe, self.__text_lengths = read_pickle(pickled_tf_idf_file_name)

    if docs_mask_dict['date_header'] is None:
        print('Document dates not specified')
    else:
        min_date = min(self.__dataframe[docs_mask_dict['date_header']])
        max_date = max(self.__dataframe[docs_mask_dict['date_header']])
        print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters. All filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner. combiner = union if... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.

    # doc weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dataframe, docs_mask_dict).doc_weights

    # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    doc_weights = DocumentsWeights(self.__dataframe, docs_mask_dict['time'], docs_mask_dict['cite'],
                                   docs_mask_dict['date_header'], self.__text_lengths,
                                   norm_rows=normalize_rows).weights
    doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]

    # todo: this is another weight function...

    # term weights - embeddings
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=0.75)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.

    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_weights, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_matrix = self.__tfidf_obj.tfidf_matrix
    tfidf_masked = tfidf_mask.multiply(tfidf_matrix)

    tfidf_masked = utils.remove_all_null_rows(tfidf_masked)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions

    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)

    self.__term_counts_data = None
    if term_counts or emerging_technology:
        self.__term_counts_data = self.__tfidf_reduce_obj.create_terms_count(self.__dataframe,
                                                                             docs_mask_dict['date_header'])
    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)
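
# Illustrative sketch only, not part of the original source: a small helper showing how the
# term/score tuples produced above might be reported. The (score, term) ordering is inferred
# from the prefilter code in the pipeline versions above, which reads the term from x[1]; the
# helper name and the default count are hypothetical.
def _print_top_terms(term_score_tuples, n=10):
    for score, term in term_score_tuples[:n]:
        print(f'{term}: {score:.4f}')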