Example no. 1
 @classmethod
 def setUpClass(cls):
     [
         cls.term_counts_per_week, cls.term_ngrams,
         cls.num_patents_per_week, cls.week_iso_dates
     ] = pd.read_pickle(
         os.path.join('data', 'USPTO-random-500000-term_counts.pkl.bz2'))
     cls.term_counts_per_week_csc = cls.term_counts_per_week.tocsc()
     cls.em = Emergence(cls.num_patents_per_week)
 def test_emergent_neg_7_15(self):
     # Arrange (add an extra leading 0 for the first 53 week year)
     weekly_values = [10] * 10 + [0] + [5] * self.weeks + [4] * self.weeks + [3] * self.weeks + [2] * self.weeks \
                     + [1] * self.weeks + [0] * self.weeks
     escore_expected = -7 / 15
     # Act
     escore_actual = Emergence.escore_exponential(weekly_values)
     # Assert
     self.assertAlmostEqual(escore_expected,
                            escore_actual,
                            places=self.places)
 @classmethod
 def setUpClass(cls):
     [
         cls.term_counts_per_week, cls.term_ngrams,
         cls.num_patents_per_week, cls.week_iso_dates
     ] = pd.read_pickle(
         os.path.join('..', 'data',
                      'USPTO-random-500000-term_counts.pkl.bz2'))
     cls.all_yearly_dates, cls.all_yearly_values = timeseries_weekly_to_yearly(
         cls.week_iso_dates, cls.num_patents_per_week)
     cls.em = Emergence(cls.all_yearly_values)
     cls.term_counts_per_week_csc_common = cls.term_counts_per_week.tocsc()
 def test_emergent_neg_1(self):
     # Arrange
     weekly_values = [10] * 10 + [3] * self.weeks + [0] * self.weeks + [
         0
     ] * self.weeks
     escore_expected = -1
     # Act
     escore_actual = Emergence.escore_exponential(weekly_values)
     # Assert
     self.assertAlmostEqual(escore_expected,
                            escore_actual,
                            places=self.places)
    def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
                 cached_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0,
                 terms_threshold=None, output_name=None, calculate_timeseries=None, m_steps_ahead=5,
                 emergence_index='porter', exponential=False, nterms=50, patents_per_quarter_threshold=20, sma=None):

        self.__emergence_index = emergence_index

        # load data
        self.__data_filename = data_filename
        self.__date_dict = docs_mask_dict['date']
        self.__timeseries_date_dict = docs_mask_dict['timeseries_date']
        self.__timeseries_data = []

        self.__emergence_list = []
        self.__pick_method = pick_method
        # calculate or fetch tf-idf mat
        if cached_folder_name is None:
            dataframe = data_factory.get(data_filename)
            utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
            utils.remove_empty_documents(dataframe, text_header)

            self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header],
                                               ngram_range=ngram_range,
                                               max_document_frequency=max_df,
                                               tokenizer=LemmaTokenizer())
            tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
            self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

            if prefilter_terms != 0:
                tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
                term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
                num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))

                feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

                number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
                self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
                number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
                print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                      f'to {number_of_ngrams_after:,}')

            self.__cpc_dict = utils.cpc_dict(dataframe)
            if docs_mask_dict['date_header'] is None:
                self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}')
                self.__dates = None
            else:
                self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe,
                                                                                 docs_mask_dict['date_header'])
                min_date = min(self.__dates)
                max_date = max(self.__dates)
                self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')

            utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
            utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
            utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)

        else:
            print(f'Reading document and TFIDF from pickle {cached_folder_name}')

            self.__cached_folder_name = path.join('cached', cached_folder_name)
            self.__tfidf_obj = utils.unpickle_object('tfidf', self.__cached_folder_name)
            self.__dates = utils.unpickle_object('dates', self.__cached_folder_name)
            self.__cpc_dict = utils.unpickle_object('cpc_dict', self.__cached_folder_name)

            if self.__dates is not None:
                min_date = min(self.__dates)
                max_date = max(self.__dates)
                print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
                      f'to {max_date // 100}-{(max_date % 100):02d}')

            WordAnalyzer.init(
                tokenizer=LemmaTokenizer(),
                preprocess=lowercase_strip_accents_and_ownership,
                ngram_range=ngram_range)

        # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
        #  the original. We're really just filtering down.

        # todo: build up a list of functions to apply as document filters. all filters to have common args (c/o
        #  partialfunc if required) so we can then call them in sequence...
        #  from a combiner.
        #  each func just returns an array of bool (or 0/1)
        #  if union - create union combiner, else create intersection combiner. combiner = union if... else intersection
        #  weights = combiner(list of funcs, data set)
        #  combiner: if list is empty, return [1] * size; if single entry, return its array
        #  union: if more entries after single, add / or
        #  intersection: if more entries after single, multiply / and
        #  then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place
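        # A minimal hedged sketch of the combiner described in the todo above; the helper
        # name `_combine_document_filters` is hypothetical and not part of this pipeline.
        # Each filter function returns a 0/1 (or bool) array per document; an empty list
        # means "keep everything"; union merges by logical or, intersection by logical and.
        def _combine_document_filters(filter_funcs, dataset, union=False):
            if not filter_funcs:
                return [1] * len(dataset)
            combined = list(filter_funcs[0](dataset))
            for func in filter_funcs[1:]:
                weights = func(dataset)
                combined = [int(a or b) if union else int(a and b)
                            for a, b in zip(combined, weights)]
            return combined
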
        print(f'Applying documents filter...')
        # docs weights( column, dates subset + time, citations etc.)
        doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                      self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

        # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
        #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)
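        #  A hedged sketch of that combine() idea (hypothetical helper, not part of this
        #  pipeline; assumes numpy as np and scipy are available as used elsewhere here):
        #  multiply the per-term weight vectors together, then scale the tfidf columns by
        #  the combined weights; an empty list leaves the matrix untouched.
        def _apply_term_weights(weight_vectors, tfidf_matrix):
            from scipy.sparse import diags
            if not weight_vectors:
                return tfidf_matrix
            combined = np.ones(tfidf_matrix.shape[1])
            for weights in weight_vectors:
                combined = combined * np.asarray(weights)
            return tfidf_matrix @ diags(combined)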

        # todo: this is another weight function...

        # term weights - embeddings
        print(f'Applying terms filter...')
        filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
        term_weights = filter_terms_obj.ngram_weights_vec

        # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
        #  these operate directly on tfidf
        #  Hence return nothing - operate in place on tfidf.
        print(f'Creating a masked tfidf matrix from filters...')
        # tfidf mask ( doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
        tfidf_mask_obj.update_mask(doc_filters, term_weights)
        tfidf_mask = tfidf_mask_obj.tfidf_mask

        # todo: this multiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights
        # mask the tfidf matrix

        tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

        tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
              f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

        # todo: no advantage in classes - just create term_count and extract_ngrams as functions

        self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
        self.__timeseries_data = None

        # if other outputs
        self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
        self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                                  WordAnalyzer.stemmed_stop_word_set_n)

        # todo: no output method; just if statements to call output functions...?
        #  Only supply what they each directly require

        # todo: hence Pipeline then becomes a single function
        if not calculate_timeseries:
            return

        # TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix
        print(f'Creating timeseries matrix...')
        if cached_folder_name is None or not (
                path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
                and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
                and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
            self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
            [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
             self.__weekly_iso_dates] = self.__timeseries_data

            utils.pickle_object('weekly_series_terms', self.__term_counts_per_week, self.__cached_folder_name)
            utils.pickle_object('weekly_series_global', self.__number_of_patents_per_week, self.__cached_folder_name)
            utils.pickle_object('weekly_isodates', self.__weekly_iso_dates, self.__cached_folder_name)
        else:
            self.__term_counts_per_week = utils.unpickle_object('weekly_series_terms', self.__cached_folder_name)
            self.__number_of_patents_per_week = utils.unpickle_object('weekly_series_global', self.__cached_folder_name)
            self.__weekly_iso_dates = utils.unpickle_object('weekly_isodates', self.__cached_folder_name)
            self.__term_ngrams = self.__tfidf_obj.feature_names

        self.__M = m_steps_ahead

        # TODO: define period from command line, then cascade through the code

        term_counts_per_week_csc = self.__term_counts_per_week.tocsc()
        self.__timeseries_quarterly = []
        self.__timeseries_intercept = []
        self.__timeseries_derivatives = []
        self.__timeseries_quarterly_smoothed = []
        self.__term_nonzero_dates = []

        all_quarters, all_quarterly_values = self.__x = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
            self.__weekly_iso_dates, self.__number_of_patents_per_week)

        # find indexes for date-range
        min_date = max_date = None
        if self.__timeseries_date_dict is not None:
            min_date = self.__timeseries_date_dict['from']
            max_date = self.__timeseries_date_dict['to']

        min_i = 0
        max_i = len(all_quarters)

        for i, quarter in enumerate(all_quarters):
            if min_date is not None and min_date < quarter:
                break
            min_i = i

        for i, quarter in enumerate(all_quarters):
            if max_date is not None and max_date < quarter:
                break
            max_i = i
        self.__lims = [min_i, max_i]
        self.__timeseries_quarterly_smoothed = None if sma is None else []

        for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
                               desc='Calculating quarterly timeseries',
                               leave=False, unit_scale=True):
            row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)
            weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]
            non_zero_dates, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(weekly_iso_dates,
                                                                                                       row_values)
            non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters)
            self.__timeseries_quarterly.append(quarterly_values)

        if emergence_index == 'gradients' or sma == 'kalman':
            if cached_folder_name is None or not (
                    path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
                    and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
                for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
                                                         desc='smoothing quarterly timeseries with kalman filter',
                                                         leave=False, unit_scale=True,
                                                         total=len(self.__timeseries_quarterly)):
                    _, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()

                    smooth_series = smooth_series_s[0].tolist()[0]
                    smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
                    self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

                    derivatives = smooth_series_s[1].tolist()[0]
                    self.__timeseries_derivatives.append(derivatives)

                utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
                utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)

            else:
                self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s',
                                                                             self.__cached_folder_name)
                self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

        if sma == 'savgol':
            for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
                                         desc='savgol smoothing quarterly timeseries',
                                         leave=False, unit_scale=True):
                smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
                smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
                self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

        em = Emergence(all_quarterly_values[min_i:max_i])
        for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                               leave=False, unit_scale=True):
            if term_weights[term_index] == 0.0:
                continue
            term_ngram = self.__term_ngrams[term_index]

            if self.__timeseries_quarterly_smoothed is not None:
                quarterly_values = list(self.__timeseries_quarterly_smoothed[term_index])[min_i:max_i]
            else:
                quarterly_values = list(self.__timeseries_quarterly[term_index])[min_i:max_i]

            if len(quarterly_values) == 0 or max(list(self.__timeseries_quarterly[term_index][min_i:max_i])) < float(
                    patents_per_quarter_threshold):
                continue

            if emergence_index == 'quadratic':
                escore = em.escore2(quarterly_values)
            elif emergence_index == 'porter':
                if not em.is_emergence_candidate(quarterly_values):
                    continue
                escore = em.calculate_escore(quarterly_values)
            elif emergence_index == 'gradients':
                derivatives = self.__timeseries_derivatives[term_index][min_i:max_i]
                escore = em.net_growth(quarterly_values, derivatives)
            else:
                weekly_values = term_counts_per_week_csc.getcol(term_index).todense().ravel().tolist()[0]
                escore = em.escore_exponential(weekly_values)

            self.__emergence_list.append((term_ngram, escore))

        nterms2 = min(nterms, len(self.__emergence_list))
        self.__emergence_list.sort(key=lambda emergence: -emergence[1])

        self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
        self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
        self.__declining.reverse()
        self.__stationary = [x[0] for x in utils.stationary_terms(self.__emergence_list, nterms2)]
Example no. 6
    def set_up_emergent_term(self):
        # Aim:
        # escore = 2 * active period trend + recent trend + mid-year to last year slope
        # active period trend = (term counts 5+6+7)/(sqrt(total 5) + sqrt(total 6) + sqrt(total 7))
        #   - (term counts 1+2+3)/(sqrt(total 1) + sqrt(total 2) + sqrt(total 3))
        # recent trend = 10 * (term count 6+7)/(sqrt(total 6) + sqrt(total 7))
        #   - (term counts 4 + 5)/(sqrt(total 4) + sqrt(total 5))
        # mid-year to last year slope = 10 * ((term counts 7 / sqrt(total 7)) - (term counts 4/sqrt(total 4))) / (7-4)
        #
        # Also: emergent if:
        # term present for >3 years
        # >7 docs with term
        # # term records in active / # term records in base
        # # base term records / # base all records < 15%
        # single author set...
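        # Hedged sketch of the escore formula in the comment above; `_escore_from_comment`
        # is an illustrative helper, not Emergence.calculate_escore. `counts` / `totals`
        # are yearly term counts and yearly document totals for the seven active years,
        # indexed 0..6 here (1..7 in the comment); the "10 *" factors are read as applying
        # to the whole bracketed differences, and non-zero totals are assumed.
        def _escore_from_comment(counts, totals):
            import math
            sqrt_totals = [math.sqrt(t) for t in totals]
            active_trend = (sum(counts[4:7]) / sum(sqrt_totals[4:7])
                            - sum(counts[0:3]) / sum(sqrt_totals[0:3]))
            recent_trend = 10 * (sum(counts[5:7]) / sum(sqrt_totals[5:7])
                                 - sum(counts[3:5]) / sum(sqrt_totals[3:5]))
            mid_to_last_slope = 10 * (counts[6] / sqrt_totals[6]
                                      - counts[3] / sqrt_totals[3]) / (7 - 4)
            return 2 * active_trend + recent_trend + mid_to_last_slope
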
        weeks_per_period = 52
        self.term_counts_matrix = np.zeros(shape=(weeks_per_period * 10, 9),
                                           dtype=int)

        # [0: emergent term, 1: non-emergent due to base, 2: non-emergent constant count,
        #  3: non-emergent decreasing count, 4: not 10 years data, 5: background, 6: background,
        #  7: two occurrences over 10 years, 8: all but one term occurs in base]

        # period 1 - base
        self.term_counts_matrix[0, :] = [1, 0, 1, 1, 0, 0, 0, 1, 1]
        self.term_counts_matrix[2, :] = [0, 1, 0, 1, 0, 1, 0, 0, 1]

        # period 2 - base
        self.term_counts_matrix[0 + (1 * weeks_per_period), :] = [
            0, 0, 0, 1, 0, 0, 0, 0, 1
        ]
        self.term_counts_matrix[3 + (1 * weeks_per_period), :] = [
            0, 0, 1, 1, 0, 1, 1, 0, 1
        ]
        self.term_counts_matrix[7 + (1 * weeks_per_period), :] = [
            0, 1, 0, 1, 1, 0, 0, 0, 1
        ]

        # period 3 - base
        self.term_counts_matrix[12 + (2 * weeks_per_period), :] = [
            0, 0, 0, 1, 0, 0, 0, 0, 1
        ]
        self.term_counts_matrix[20 + (2 * weeks_per_period), :] = [
            0, 0, 1, 1, 0, 1, 1, 0, 1
        ]
        self.term_counts_matrix[30 + (2 * weeks_per_period), :] = [
            0, 1, 0, 0, 1, 0, 0, 0, 1
        ]

        # period 4 - active
        self.term_counts_matrix[5 + (3 * weeks_per_period), :] = [
            1, 0, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[8 + (3 * weeks_per_period), :] = [
            0, 0, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[9 + (3 * weeks_per_period), :] = [
            0, 1, 0, 1, 1, 0, 0, 0, 0
        ]

        # period 5 - active
        self.term_counts_matrix[20 + (4 * weeks_per_period), :] = [
            1, 0, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[40 + (4 * weeks_per_period), :] = [
            0, 0, 1, 1, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[51 + (4 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 1, 0, 0, 0
        ]

        # period 6 - active
        self.term_counts_matrix[10 + (5 * weeks_per_period), :] = [
            1, 0, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[11 + (5 * weeks_per_period), :] = [
            1, 1, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[12 + (5 * weeks_per_period), :] = [
            0, 1, 0, 0, 1, 0, 0, 0, 0
        ]

        # period 7 - active
        self.term_counts_matrix[21 + (6 * weeks_per_period), :] = [
            1, 0, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[32 + (6 * weeks_per_period), :] = [
            0, 1, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[43 + (6 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 0, 0, 0, 0
        ]

        # period 8 - active
        self.term_counts_matrix[12 + (7 * weeks_per_period), :] = [
            1, 1, 0, 0, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[13 + (7 * weeks_per_period), :] = [
            1, 0, 1, 1, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[14 + (7 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 0, 0, 0, 0
        ]

        # period 9 - active
        self.term_counts_matrix[28 + (8 * weeks_per_period), :] = [
            1, 1, 0, 0, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[29 + (8 * weeks_per_period), :] = [
            1, 1, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[51 + (8 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 0, 0, 0, 0
        ]

        # period 10 - active
        self.term_counts_matrix[49 + (9 * weeks_per_period), :] = [
            1, 1, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[50 + (9 * weeks_per_period), :] = [
            1, 1, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[51 + (9 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 0, 0, 1, 0
        ]

        self.term_counts_per_week_csc = csc_matrix(self.term_counts_matrix)

        # 0/1 indicator per week: 1 if any synthetic document contains any term that week
        self.num_patents_per_week = self.term_counts_matrix.sum(axis=1) > 0
        self.num_patents_per_week = self.num_patents_per_week.astype(
            dtype=np.int32)

        self.em = Emergence(self.num_patents_per_week)
Example no. 7
    def __init__(
        self,
        data_filename,
        docs_mask_dict,
        pick_method='sum',
        ngram_range=(1, 3),
        text_header='abstract',
        pickled_tfidf_folder_name=None,
        max_df=0.1,
        user_ngrams=None,
        prefilter_terms=0,
        terms_threshold=None,
        output_name=None,
        calculate_timeseries=None,
        m_steps_ahead=5,
        curves=True,
        nterms=50,
        minimum_patents_per_quarter=20,
    ):

        # load data
        self.__data_filename = data_filename
        self.__date_dict = docs_mask_dict['date']
        self.__timeseries_data = []

        self.__emergence_list = []
        self.__pick_method = pick_method
        # calculate or fetch tf-idf mat
        if pickled_tfidf_folder_name is None:

            dataframe = data_factory.get(data_filename)
            utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict,
                          text_header)
            utils.remove_empty_documents(dataframe, text_header)

            self.__tfidf_obj = tfidf_from_text(
                text_series=dataframe[text_header],
                ngram_range=ngram_range,
                max_document_frequency=max_df,
                tokenizer=LemmaTokenizer())
            tfidf_mask_obj = TfidfMask(self.__tfidf_obj,
                                       ngram_range=ngram_range,
                                       uni_factor=0.8,
                                       unbias=True)
            self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

            if prefilter_terms != 0:
                tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix,
                                               self.__tfidf_obj.feature_names)
                term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(
                    pick_method)
                num_tuples_to_retain = min(prefilter_terms,
                                           len(term_score_tuples))

                feature_subset = sorted(
                    [x[1] for x in term_score_tuples[:num_tuples_to_retain]])

                number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
                self.__tfidf_obj = tfidf_subset_from_features(
                    self.__tfidf_obj, feature_subset)
                number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
                print(
                    f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                    f'to {number_of_ngrams_after:,}')

            self.__cpc_dict = utils.cpc_dict(dataframe)
            self.__dates = scripts.utils.date_utils.generate_year_week_dates(
                dataframe, docs_mask_dict['date_header'])

            base_pickle_path = path.join('outputs', 'tfidf')
            makedirs(base_pickle_path, exist_ok=True)

            def pickle_object(short_name, obj):
                folder_name = path.join(base_pickle_path,
                                        output_name + f'-mdf-{max_df}')
                makedirs(folder_name, exist_ok=True)
                file_name = path.join(
                    folder_name,
                    output_name + f'-mdf-{max_df}-{short_name}.pkl.bz2')
                with bz2.BZ2File(file_name, 'wb') as pickle_file:
                    pickle.dump(obj,
                                pickle_file,
                                protocol=4,
                                fix_imports=False)

            pickle_object('tfidf', self.__tfidf_obj)
            pickle_object('dates', self.__dates)
            pickle_object('cpc_dict', self.__cpc_dict)

        else:
            print(
                f'Reading document and TFIDF from pickle {pickled_tfidf_folder_name}'
            )

            base_folder = path.basename(pickled_tfidf_folder_name)
            pickled_base_file_name = path.join(pickled_tfidf_folder_name,
                                               base_folder)

            self.__tfidf_obj = read_pickle(pickled_base_file_name +
                                           '-tfidf.pkl.bz2')
            self.__dates = read_pickle(pickled_base_file_name +
                                       '-dates.pkl.bz2')
            self.__cpc_dict = read_pickle(pickled_base_file_name +
                                          '-cpc_dict.pkl.bz2')

            if self.__dates is not None:
                min_date = min(self.__dates)
                max_date = max(self.__dates)
                print(
                    f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
                    f'to {max_date // 100}-{(max_date % 100):02d}')

            WordAnalyzer.init(tokenizer=LemmaTokenizer(),
                              preprocess=lowercase_strip_accents_and_ownership,
                              ngram_range=ngram_range)

        # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
        #  the original. We're really just filtering down.

        # todo: build up a list of functions to apply as document filters. all filters to have common args (c/o
        #  partialfunc if required) so we can then call them in sequence...
        #  from a combiner.
        #  each func just returns an array of bool (or 0/1)
        #  if union - create union combiner, else create intersection combiner. combiner = union if... else intersection
        #  weights = combiner(list of funcs, data set)
        #  combiner: if list is empty, return [1] * size; if single entry, return its array
        #  union: if more entries after single, add / or
        #  intersection: if more entries after single, multiply / and
        #  then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place
        print(f'Applying documents filter...')
        # docs weights( column, dates subset + time, citations etc.)
        doc_filters = DocumentsFilter(
            self.__dates, docs_mask_dict, self.__cpc_dict,
            self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

        # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
        #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

        # todo: this is another weight function...

        # term weights - embeddings
        print(f'Applying terms filter...')
        filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names,
                                       user_ngrams,
                                       threshold=terms_threshold)
        term_weights = filter_terms_obj.ngram_weights_vec

        # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
        #  these operate directly on tfidf
        #  Hence return nothing - operate in place on tfidf.
        print(f'Creating a masked tfidf matrix from filters...')
        # tfidf mask ( doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj,
                                   ngram_range=ngram_range,
                                   uni_factor=0.8)
        tfidf_mask_obj.update_mask(doc_filters, term_weights)
        tfidf_mask = tfidf_mask_obj.tfidf_mask

        # todo: this multiply and remove null will disappear - maybe put weight combiner last so it can remove 0 weights
        # mask the tfidf matrix

        tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)

        tfidf_masked, self.__dates = utils.remove_all_null_rows_global(
            tfidf_masked, self.__dates)
        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
              f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

        # todo: no advantage in classes - just create term_count and extract_ngrams as functions

        self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked,
                                              self.__tfidf_obj.feature_names)
        self.__timeseries_data = None

        # if other outputs
        self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(
            pick_method)
        self.__term_score_tuples = utils.stop_tup(
            self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
            WordAnalyzer.stemmed_stop_word_set_n)

        # todo: no output method; just if statements to call output functions...?
        #  Only supply what they each directly require

        # todo: hence Pipeline then becomes a single function
        if not calculate_timeseries:
            return

        print(f'Creating timeseries matrix...')
        self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(
            self.__dates)
        [
            self.__term_counts_per_week, self.__term_ngrams,
            self.__number_of_patents_per_week, self.__weekly_iso_dates
        ] = self.__timeseries_data

        self.__M = m_steps_ahead

        term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

        em = Emergence(self.__number_of_patents_per_week)
        for term_index in tqdm(range(self.__term_counts_per_week.shape[1]),
                               unit='term',
                               desc='Calculating eScore',
                               leave=False,
                               unit_scale=True):
            term_ngram = self.__term_ngrams[term_index]
            row_indices, row_values = utils.get_row_indices_and_values(
                term_counts_per_week_csc, term_index)

            if len(row_values) == 0:
                continue

            weekly_iso_dates = [
                self.__weekly_iso_dates[x] for x in row_indices
            ]

            _, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
                weekly_iso_dates, row_values)
            if max(quarterly_values) < minimum_patents_per_quarter:
                continue

            if em.init_vars(row_indices, row_values, porter=not curves):
                escore = em.calculate_escore() if not curves else em.escore2()
                self.__emergence_list.append((term_ngram, escore))

        nterms2 = min(nterms, len(self.__emergence_list))
        self.__emergence_list.sort(key=lambda emergence: -emergence[1])

        self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
        self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
        self.__stationary = utils.stationary_terms(self.__emergence_list,
                                                   nterms2)
Example no. 8
    def __init__(self,
                 term_counts_data,
                 m_steps_ahead=5,
                 curves=True,
                 nterms=50,
                 minimum_patents_per_quarter=20,
                 outname=None):
        self.__M = m_steps_ahead

        [
            self.__term_counts_per_week, self.__term_ngrams,
            self.__number_of_patents_per_week, self.__weekly_iso_dates
        ] = term_counts_data

        term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

        em = Emergence(self.__number_of_patents_per_week)
        self.__emergence_list = []
        for term_index in tqdm(range(self.__term_counts_per_week.shape[1]),
                               unit='term',
                               desc='Calculating eScore',
                               leave=False,
                               unit_scale=True):
            term_ngram = self.__term_ngrams[term_index]
            row_indices, row_values = utils.get_row_indices_and_values(
                term_counts_per_week_csc, term_index)

            if len(row_values) == 0:
                continue

            weekly_iso_dates = [
                self.__weekly_iso_dates[x] for x in row_indices
            ]

            _, quarterly_values = utils.timeseries_weekly_to_quarterly(
                weekly_iso_dates, row_values)
            if max(quarterly_values) < minimum_patents_per_quarter:
                continue

            if em.init_vars(row_indices, row_values):
                escore = em.calculate_escore() if not curves else em.escore2()
                self.__emergence_list.append((term_ngram, escore))

        if len(self.__emergence_list) == 0:
            self.__emergent = []
            self.__declining = []
            self.__stationary = []
            return

        self.__emergence_list.sort(key=lambda emergence: -emergence[1])

        # for tup in self.__emergence_list:
        #     print(tup[0] + ": " + str(tup[1]))

        self.__emergent = [x[0] for x in self.__emergence_list[:nterms]]
        self.__declining = [x[0] for x in self.__emergence_list[-nterms:]]

        # find the index where the (descending) escores first cross zero; stationary terms
        # are taken either side of this pivot
        zero_pivot_emergence = None
        last_emergence = self.__emergence_list[0][1]

        for index, value in enumerate(self.__emergence_list[1:]):
            if value[1] <= 0.0 < last_emergence:
                zero_pivot_emergence = index
                break
            last_emergence = value[1]

        stationary_start_index = zero_pivot_emergence - nterms // 2
        stationary_end_index = zero_pivot_emergence + nterms // 2
        self.__stationary = [
            x[0] for x in
            self.__emergence_list[stationary_start_index:stationary_end_index]
        ]
        filename_and_path = path.join('outputs', 'reports',
                                      outname + '_emergence.txt')
        with open(filename_and_path, 'w') as file:
            print()
            print('Emergent')
            file.write('Emergent\n')
            for tup in self.__emergence_list[:nterms]:
                print(tup[0] + ": " + str(tup[1]))
                file.write(tup[0] + ": " + str(tup[1]) + '\n')
            print()
            file.write('\n')
            print('Stationary')
            file.write('Stationary\n')
            for tup in self.__emergence_list[
                    stationary_start_index:stationary_end_index]:
                print(tup[0] + ": " + str(tup[1]))
                file.write(tup[0] + ": " + str(tup[1]) + '\n')
            print()
            file.write('\n')

            print('Declining')
            file.write('Declining' + '\n')
            for tup in self.__emergence_list[-nterms:]:
                print(tup[0] + ": " + str(tup[1]))
                file.write(tup[0] + ": " + str(tup[1]) + '\n')
            print()
            file.write('\n')
class EmergenceTests(unittest.TestCase):
    def find_term_index(self, term):
        for term_index in range(0, len(self.term_ngrams)):
            if self.term_ngrams[term_index] == term:
                return term_index
        self.fail(f'Failed to find term {term}')

    @classmethod
    def setUpClass(cls):
        [
            cls.term_counts_per_week, cls.term_ngrams,
            cls.num_patents_per_week, cls.week_iso_dates
        ] = pd.read_pickle(
            os.path.join('..', 'data',
                         'USPTO-random-500000-term_counts.pkl.bz2'))
        cls.all_yearly_dates, cls.all_yearly_values = timeseries_weekly_to_yearly(
            cls.week_iso_dates, cls.num_patents_per_week)
        cls.em = Emergence(cls.all_yearly_values)
        cls.term_counts_per_week_csc_common = cls.term_counts_per_week.tocsc()

    def extract_yearly_values(self, term_index, all_yearly_dates):
        row_indices, row_values = get_row_indices_and_values(
            self.term_counts_per_week_csc, term_index)
        weekly_iso_dates = [self.week_iso_dates[x] for x in row_indices]
        non_zero_dates, yearly_values = timeseries_weekly_to_yearly(
            weekly_iso_dates, row_values)
        non_zero_dates, yearly_values = fill_missing_zeros(
            yearly_values, non_zero_dates, all_yearly_dates)
        return yearly_values

    def set_up_emergent_term(self):
        # Aim:
        # escore = 2 * active period trend + recent trend + mid-year to last year slope
        # active period trend = (term counts 5+6+7)/(sqrt(total 5) + sqrt(total 6) + sqrt(total 7))
        #   - (term counts 1+2+3)/(sqrt(total 1) + sqrt(total 2) + sqrt(total 3))
        # recent trend = 10 * (term count 6+7)/(sqrt(total 6) + sqrt(total 7))
        #   - (term counts 4 + 5)/(sqrt(total 4) + sqrt(total 5))
        # mid-year to last year slope = 10 * ((term counts 7 / sqrt(total 7)) - (term counts 4/sqrt(total 4))) / (7-4)
        #
        # Also: emergent if:
        # term present for >3 years
        # >7 docs with term
        # # term records in active / # term records in base
        # # base term records / # base all records < 15%
        # single author set...
        weeks_per_period = 52
        self.term_counts_matrix = np.zeros(shape=(weeks_per_period * 10, 9),
                                           dtype=int)

        # [0: emergent term, 1: non-emergent due to base, 2: non-emergent constant count,
        #  3: non-emergent decreasing count, 4: not 10 years data, 5: background, 6: background,
        #  7: two occurrences over 10 years, 8: all but one term occurs in base]

        # period 1 - base
        self.term_counts_matrix[0, :] = [1, 0, 1, 1, 0, 0, 0, 1, 1]
        self.term_counts_matrix[2, :] = [0, 1, 0, 1, 0, 1, 0, 0, 1]

        # period 2 - base
        self.term_counts_matrix[0 + (1 * weeks_per_period), :] = [
            0, 0, 0, 1, 0, 0, 0, 0, 1
        ]
        self.term_counts_matrix[3 + (1 * weeks_per_period), :] = [
            0, 0, 1, 1, 0, 1, 1, 0, 1
        ]
        self.term_counts_matrix[7 + (1 * weeks_per_period), :] = [
            0, 1, 0, 1, 1, 0, 0, 0, 1
        ]

        # period 3 - base
        self.term_counts_matrix[12 + (2 * weeks_per_period), :] = [
            0, 0, 0, 1, 0, 0, 0, 0, 1
        ]
        self.term_counts_matrix[20 + (2 * weeks_per_period), :] = [
            0, 0, 1, 1, 0, 1, 1, 0, 1
        ]
        self.term_counts_matrix[30 + (2 * weeks_per_period), :] = [
            0, 1, 0, 0, 1, 0, 0, 0, 1
        ]

        # period 4 - active
        self.term_counts_matrix[5 + (3 * weeks_per_period), :] = [
            1, 0, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[8 + (3 * weeks_per_period), :] = [
            0, 0, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[9 + (3 * weeks_per_period), :] = [
            0, 1, 0, 1, 1, 0, 0, 0, 0
        ]

        # period 5 - active
        self.term_counts_matrix[20 + (4 * weeks_per_period), :] = [
            1, 0, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[40 + (4 * weeks_per_period), :] = [
            0, 0, 1, 1, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[51 + (4 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 1, 0, 0, 0
        ]

        # period 6 - active
        self.term_counts_matrix[10 + (5 * weeks_per_period), :] = [
            1, 0, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[11 + (5 * weeks_per_period), :] = [
            1, 1, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[12 + (5 * weeks_per_period), :] = [
            0, 1, 0, 0, 1, 0, 0, 0, 0
        ]

        # period 7 - active
        self.term_counts_matrix[21 + (6 * weeks_per_period), :] = [
            1, 0, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[32 + (6 * weeks_per_period), :] = [
            0, 1, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[43 + (6 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 0, 0, 0, 0
        ]

        # period 8 - active
        self.term_counts_matrix[12 + (7 * weeks_per_period), :] = [
            1, 1, 0, 0, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[13 + (7 * weeks_per_period), :] = [
            1, 0, 1, 1, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[14 + (7 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 0, 0, 0, 0
        ]

        # period 9 - active
        self.term_counts_matrix[28 + (8 * weeks_per_period), :] = [
            1, 1, 0, 0, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[29 + (8 * weeks_per_period), :] = [
            1, 1, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[51 + (8 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 0, 0, 0, 0
        ]

        # period 10 - active
        self.term_counts_matrix[49 + (9 * weeks_per_period), :] = [
            1, 1, 0, 1, 0, 0, 0, 0, 0
        ]
        self.term_counts_matrix[50 + (9 * weeks_per_period), :] = [
            1, 1, 1, 0, 0, 1, 1, 0, 0
        ]
        self.term_counts_matrix[51 + (9 * weeks_per_period), :] = [
            1, 1, 0, 0, 1, 0, 0, 1, 0
        ]

        self.term_counts_per_week_csc = csc_matrix(self.term_counts_matrix)

        self.num_patents_per_week = self.term_counts_matrix.sum(axis=1) > 0
        self.num_patents_per_week = self.num_patents_per_week.astype(
            dtype=np.int32)

        yearly_dates, yearly_values = timeseries_weekly_to_yearly(
            self.week_iso_dates, self.num_patents_per_week)

        self.em = Emergence(yearly_values)

        return yearly_dates

    def assert_term_escore(self, em_expected, escore_expected, term):
        term_index = self.find_term_index(term)
        self.term_counts_per_week_csc = self.term_counts_per_week_csc_common

        yearly_values = self.extract_yearly_values(term_index,
                                                   self.all_yearly_dates)

        potentially_emergent_actual = self.em.is_emergence_candidate(
            yearly_values)
        escore_actual = self.em.calculate_escore(yearly_values)

        self.assertEqual(em_expected, potentially_emergent_actual,
                         term + ": em failed")
        self.assertEqual(escore_expected, escore_actual,
                         term + ": escore failed")

    def test_term_with_less_than_10_years_data(self):
        all_yearly_dates = self.set_up_emergent_term()
        yearly_values = self.extract_yearly_values(4, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(
            yearly_values)
        self.assertFalse(potentially_emergent_actual)

    def test_term_with_less_than_7_occurrences(self):
        all_yearly_dates = self.set_up_emergent_term()
        yearly_values = self.extract_yearly_values(7, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(
            yearly_values)
        self.assertFalse(potentially_emergent_actual)

    def test_term_counts_base2all_over_threshold_and_emergent(self):
        escore_expected = 6.35
        all_yearly_dates = self.set_up_emergent_term()
        self.em.TERM_BASE_RECS_THRESHOLD = 1
        yearly_values = self.extract_yearly_values(0, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(
            yearly_values)
        escore_actual = self.em.calculate_escore(yearly_values)
        self.assertTrue(potentially_emergent_actual)
        self.assertAlmostEqual(escore_expected, escore_actual, places=2)

    def test_term_counts_base2all_over_threshold_but_not_emergent(self):
        all_yearly_dates = self.set_up_emergent_term()
        self.em.TERM_BASE_RECS_THRESHOLD = 1
        yearly_values = self.extract_yearly_values(1, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(
            yearly_values)
        self.assertFalse(potentially_emergent_actual)

    def test_term_with_base_but_no_emergent_instances(self):
        all_yearly_dates = self.set_up_emergent_term()
        yearly_values = self.extract_yearly_values(8, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(
            yearly_values)
        self.assertFalse(potentially_emergent_actual)

    def test_non_emergent_with_constant_usage_term(self):
        escore_expected = 0
        all_yearly_dates = self.set_up_emergent_term()
        yearly_values = self.extract_yearly_values(2, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(
            yearly_values)
        escore_actual = self.em.calculate_escore(yearly_values)
        self.assertEqual(potentially_emergent_actual.dtype, np.dtype('bool'))
        self.assertAlmostEqual(escore_expected, escore_actual, places=2)

    def test_non_emergent_with_decreasing_usage_term(self):
        escore_expected = -4.04
        all_yearly_dates = self.set_up_emergent_term()
        self.em.BASE_TERM2ALL_RATIO_THRESHOLD = 1
        self.em.ACTIVE2BASE_RATIO_THRESHOLD = 0
        yearly_values = self.extract_yearly_values(3, all_yearly_dates)
        potentially_emergent_actual = self.em.is_emergence_candidate(
            yearly_values)
        escore_actual = self.em.calculate_escore(yearly_values)
        self.assertTrue(potentially_emergent_actual)
        self.assertAlmostEqual(escore_expected, escore_actual, places=2)

    def test_3d_image(self):
        term = '3d image'
        escore_expected = -1.3383140739474317
        em_expected = True
        self.assert_term_escore(em_expected, escore_expected, term)

    def test_3d_display(self):
        term = '3d display'
        escore_expected = -0.17674809905755776
        em_expected = True
        self.assert_term_escore(em_expected, escore_expected, term)

    def test_ac_power_supply(self):
        term = 'ac power supply'
        escore_expected = -0.19543452810736667
        em_expected = True
        self.assert_term_escore(em_expected, escore_expected, term)

    def test_acid_molecule(self):
        term = 'acid molecule'
        escore_expected = -0.633235199024181
        em_expected = False
        self.assert_term_escore(em_expected, escore_expected, term)

    def test_acid_molecule_encoding(self):
        term = 'acid molecule encoding'
        escore_expected = -0.3469298902297481
        em_expected = False
        self.assert_term_escore(em_expected, escore_expected, term)