Example #1
    @classmethod
    def setUpClass(cls):
        num_ngrams = 50
        min_n = 2
        max_n = 3
        max_df = 0.3
        ngram_range = (min_n, max_n)

        df = pd.read_pickle(FilePaths.us_patents_random_1000_pickle_name)
        tfidf_obj = TFIDF(df['abstract'], ngram_range=ngram_range, max_document_frequency=max_df,
                          tokenizer=StemTokenizer())

        doc_weights = list(np.ones(len(df)))

        # term weights - embeddings
        filter_output_obj = FilterTerms(tfidf_obj.feature_names, None, None)
        term_weights = filter_output_obj.ngram_weights_vec

        tfidf_mask_obj = TfidfMask(tfidf_obj, ngram_range=ngram_range)
        tfidf_mask_obj.update_mask(doc_weights, term_weights)
        tfidf_mask = tfidf_mask_obj.tfidf_mask

        # mask the tfidf matrix
        tfidf_matrix = tfidf_obj.tfidf_matrix
        tfidf_masked = tfidf_mask.multiply(tfidf_matrix)
        tfidf_masked = utils.remove_all_null_rows(tfidf_masked)

        print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

        cls.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, tfidf_obj.feature_names)
        term_score_tuples = cls.__tfidf_reduce_obj.extract_ngrams_from_docset('sum')
        graph_obj = TermsGraph(term_score_tuples[:num_ngrams], cls.__tfidf_reduce_obj)
        graph = graph_obj.graph
        cls.__links = graph['links']
        cls.__nodes = graph['nodes']

    def test_terms(self):
        self.init_mask('Y02', 2)
        expected_terms = ['variabl turbocharg',
                          'acceler drive region',
                          'compress air',
                          'compress extern air',
                          'compressor rotat',
                          'control divid',
                          'cylind gener power',
                          'deceler drive region',
                          'engin control system',
                          'exhaust ga exhaust',
                          'exhaust ga suppli',
                          'fuel amount suppli',
                          'fuel inject',
                          'inject time',
                          'oper region',
                          'steady-spe drive region',
                          'turbin rotat',
                          'vane adjust flow',
                          'drive region',
                          'exhaust ga']

        tfidf_matrix = self.__tfidf_obj.tfidf_matrix
        tfidf_masked = self.__tfidf_mask.multiply(tfidf_matrix)
        tfidf_masked = utils.remove_all_null_rows(tfidf_masked)

        tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
        term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_row(0)
        term_score_tuples.sort(key=lambda tup: (-tup[0], tup[1]))
        actual_terms = [x for _, x in term_score_tuples]

        self.assertEqual(expected_terms, actual_terms)

    def test_non_zeros_clean_rows(self):
        self.init_mask('Y02', 2)
        tfidf_mask_nozero_rows = utils.remove_all_null_rows(self.__tfidf_mask)
        vocabulary = self.__tfidf_obj.vocabulary
        expected_term1_val = 0.25
        expected_term2_val = 0.2962962962962961

        term1 = 'exhaust ga'  # 0.25
        term2 = 'drive region'  # 0.2962962962962961
        idx_term1 = vocabulary.get(term1)
        idx_term2 = vocabulary.get(term2)

        indexof_idx_term1 = tfidf_mask_nozero_rows.indices.tolist().index(idx_term1)
        indexof_idx_term2 = tfidf_mask_nozero_rows.indices.tolist().index(idx_term2)

        actual_values = list(tfidf_mask_nozero_rows.data)

        self.assertEqual(expected_term1_val, actual_values[indexof_idx_term1])
        self.assertAlmostEqual(expected_term2_val, actual_values[indexof_idx_term2])
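Both tests above lean on utils.remove_all_null_rows to drop documents whose masked TF-IDF row is entirely zero before reducing. A minimal sketch of what that helper likely does, assuming a SciPy sparse matrix; the function name remove_all_null_rows_sketch is hypothetical, not the library's own:

import numpy as np
from scipy.sparse import csr_matrix

def remove_all_null_rows_sketch(sparse_mat):
    # Convert to CSR so rows can be sliced, then keep only the rows
    # that contain at least one non-zero entry.
    csr = sparse_mat.tocsr()
    row_indices, _ = csr.nonzero()
    return csr[np.unique(row_indices)]

# Example: a 3x2 matrix whose middle row is all zeros shrinks to 2x2.
mat = csr_matrix(np.array([[0.5, 0.0], [0.0, 0.0], [0.0, 0.3]]))
print(remove_all_null_rows_sketch(mat).shape)  # (2, 2)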
Example #4
    def __init__(self,
                 data_filename,
                 docs_mask_dict,
                 pick_method='sum',
                 ngram_range=(1, 3),
                 normalize_rows=False,
                 text_header='abstract',
                 term_counts=False,
                 pickled_tf_idf_file_name=None,
                 max_df=0.1,
                 user_ngrams=None,
                 output_name=None,
                 emerging_technology=None):

        # load data
        self.__data_filename = data_filename
        self.__date_dict = docs_mask_dict['date']
        self.__time = docs_mask_dict['time']

        self.__pick_method = pick_method
        # calculate or fetch tf-idf mat
        if pickled_tf_idf_file_name is None:

            self.__dataframe = datafactory.get(data_filename)
            checkdf(self.__dataframe, emerging_technology, docs_mask_dict,
                    text_header, term_counts)

            remove_empty_documents(self.__dataframe, text_header)
            self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header],
                                     ngram_range=ngram_range,
                                     max_document_frequency=max_df,
                                     tokenizer=LemmaTokenizer())

            self.__text_lengths = self.__dataframe[text_header].map(
                len).tolist()
            self.__dataframe.drop(columns=[text_header], inplace=True)

            tfidf_filename = path.join(
                'outputs', 'tfidf',
                output_name + f'-tfidf-mdf-{max_df}.pkl.bz2')
            makedirs(path.dirname(tfidf_filename), exist_ok=True)
            with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file:
                pickle.dump(
                    (self.__tfidf_obj, self.__dataframe, self.__text_lengths),
                    pickle_file,
                    protocol=4)

        else:
            print(
                f'Reading document and TFIDF from pickle {pickled_tf_idf_file_name}'
            )
            self.__tfidf_obj, self.__dataframe, self.__text_lengths = read_pickle(
                pickled_tf_idf_file_name)
            if docs_mask_dict['date_header'] is None:
                print('Document dates not specified')
            else:
                min_date = min(self.__dataframe[docs_mask_dict['date_header']])
                max_date = max(self.__dataframe[docs_mask_dict['date_header']])
                print(
                    f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}'
                )

            WordAnalyzer.init(tokenizer=LemmaTokenizer(),
                              preprocess=lowercase_strip_accents_and_ownership,
                              ngram_range=ngram_range)

        # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
        #  the original. We're really just filtering down.

        # todo: build up a list of functions to apply as document filters. all filters to have common args (c/o
        #  partialfunc if required) so we can then call them in sequence...
        #  from a combiner.
        #  each func just returns an array of bool (or 0/1)
        #  if union - create union combiner, else create intersection combiner. combiner = union if... else intersection
        #  weights = combiner(list of funcs, data set)
        #  combiner: if list is empty, return [1] * size; if single entry, return its array
        #  union: if more entries after single, add / or
        #  intersection: if more entries after single, multiply / and
        #  then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place
        #  (a standalone sketch of this combiner follows the example below)

        # docs weights( column, dates subset + time, citations etc.)
        doc_filters = DocumentsFilter(self.__dataframe,
                                      docs_mask_dict).doc_weights

        # todo: build up list of weight functions (left with single remaining arg etc via partialfunc)
        #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

        doc_weights = DocumentsWeights(self.__dataframe,
                                       docs_mask_dict['time'],
                                       docs_mask_dict['cite'],
                                       docs_mask_dict['date_header'],
                                       self.__text_lengths,
                                       norm_rows=normalize_rows).weights
        doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]

        # todo: this is another weight function...

        # term weights - embeddings
        filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names,
                                       user_ngrams,
                                       threshold=0.75)
        term_weights = filter_terms_obj.ngram_weights_vec

        # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
        #  these operate directly on tfidf
        #  Hence return nothing - operate in place on tfidf.

        # tfidf mask ( doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj,
                                   ngram_range=ngram_range,
                                   uni_factor=0.8)
        tfidf_mask_obj.update_mask(doc_weights, term_weights)
        tfidf_mask = tfidf_mask_obj.tfidf_mask

        # todo: this multiply and remove-null step will disappear - maybe put weight combiner last so it can remove 0 weights
        # mask the tfidf matrix
        tfidf_matrix = self.__tfidf_obj.tfidf_matrix
        tfidf_masked = tfidf_mask.multiply(tfidf_matrix)

        tfidf_masked = utils.remove_all_null_rows(tfidf_masked)

        print(
            f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents'
        )

        # todo: no advantage in classes - just create term_count and extract_ngrams as functions

        self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked,
                                              self.__tfidf_obj.feature_names)
        self.__term_counts_data = None
        if term_counts or emerging_technology:
            self.__term_counts_data = self.__tfidf_reduce_obj.create_terms_count(
                self.__dataframe, docs_mask_dict['date_header'])
        # if other outputs
        self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(
            pick_method)
        self.__term_score_tuples = utils.stop_tup(
            self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
            WordAnalyzer.stemmed_stop_word_set_n)

    def test_num_non_zeros_clean_rows_clean_unigrams(self):
        self.init_mask('Y02', 1, uni_factor=0.4)
        tfidf_mask_nozero_rows = utils.remove_all_null_rows(self.__tfidf_mask)
        self.assertEqual(26, len(tfidf_mask_nozero_rows.data))

    def test_num_non_zeros_clean_rows(self):
        self.init_mask('Y02', 2)
        tfidf_mask_nozero_rows = utils.remove_all_null_rows(self.__tfidf_mask)
        self.assertEqual(20, len(tfidf_mask_nozero_rows.data))
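The todo notes inside Example #4 describe a planned combiner that turns a list of document-filter functions into a single weight vector (union = OR, intersection = AND), which would then be multiplied into the per-document weights. A minimal sketch of that idea, assuming each filter returns a 0/1 array over the documents; combine_doc_filters is a hypothetical name, not part of the original codebase:

import numpy as np

def combine_doc_filters(filter_funcs, dataframe, size, union=True):
    # Combiner from the todo notes: an empty list keeps every document;
    # a single entry returns its own array; with more entries, OR the
    # boolean arrays for a union or AND them for an intersection.
    if not filter_funcs:
        return np.ones(size)
    arrays = [np.asarray(f(dataframe), dtype=bool) for f in filter_funcs]
    combined = arrays[0]
    for arr in arrays[1:]:
        combined = (combined | arr) if union else (combined & arr)
    return combined.astype(float)

# The 0/1 result would then fold into the document weights, just as the
# pipeline already combines doc_filters with doc_weights:
# doc_weights = [a * b for a, b in zip(combined, doc_weights)]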