def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             cached_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0, terms_threshold=None,
             output_name=None, calculate_timeseries=None, m_steps_ahead=5, emergence_index='porter',
             exponential=False, nterms=50, patents_per_quarter_threshold=20, sma=None):
    self.__emergence_index = emergence_index

    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_date_dict = docs_mask_dict['timeseries_date']
    self.__timeseries_data = []

    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf matrix
    if cached_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))
            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)

        if docs_mask_dict['date_header'] is None:
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}')
            self.__dates = None
        else:
            self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe,
                                                                             docs_mask_dict['date_header'])
            min_date = min(self.__dates)
            max_date = max(self.__dates)
            self.__cached_folder_name = path.join('cached', output_name + f'-mdf-{max_df}-{min_date}-{max_date}')

        utils.pickle_object('tfidf', self.__tfidf_obj, self.__cached_folder_name)
        utils.pickle_object('dates', self.__dates, self.__cached_folder_name)
        utils.pickle_object('cpc_dict', self.__cpc_dict, self.__cached_folder_name)
    else:
        print(f'Reading document and TFIDF from pickle {cached_folder_name}')
        self.__cached_folder_name = path.join('cached', cached_folder_name)
        self.__tfidf_obj = utils.unpickle_object('tfidf', self.__cached_folder_name)
        self.__dates = utils.unpickle_object('dates', self.__cached_folder_name)
        self.__cpc_dict = utils.unpickle_object('cpc_dict', self.__cached_folder_name)

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters; all filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.
    #  (A hedged sketch of such a combiner follows this constructor.)

    print('Applying documents filter...')
    # docs weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc. via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print('Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.
    print('Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove-null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)
    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.
    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    # TODO: offer timeseries cache as an option. Then filter dates and terms after reading the cached matrix.
    print('Creating timeseries matrix...')
    if cached_folder_name is None or not (
            path.isfile(utils.pickle_name('weekly_series_terms', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_series_global', self.__cached_folder_name))
            and path.isfile(utils.pickle_name('weekly_isodates', self.__cached_folder_name))):
        self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
        [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
         self.__weekly_iso_dates] = self.__timeseries_data

        utils.pickle_object('weekly_series_terms', self.__term_counts_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_series_global', self.__number_of_patents_per_week, self.__cached_folder_name)
        utils.pickle_object('weekly_isodates', self.__weekly_iso_dates, self.__cached_folder_name)
    else:
        self.__term_counts_per_week = utils.unpickle_object('weekly_series_terms', self.__cached_folder_name)
        self.__number_of_patents_per_week = utils.unpickle_object('weekly_series_global', self.__cached_folder_name)
        self.__weekly_iso_dates = utils.unpickle_object('weekly_isodates', self.__cached_folder_name)
        self.__term_ngrams = self.__tfidf_obj.feature_names

    self.__M = m_steps_ahead

    # TODO: define period from command line, then cascade through the code
    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    self.__timeseries_quarterly = []
    self.__timeseries_intercept = []
    self.__timeseries_derivatives = []
    self.__timeseries_quarterly_smoothed = []
    self.__term_nonzero_dates = []

    all_quarters, all_quarterly_values = self.__x = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
        self.__weekly_iso_dates, self.__number_of_patents_per_week)

    # find indexes for date-range
    min_date = max_date = None
    if self.__timeseries_date_dict is not None:
        min_date = self.__timeseries_date_dict['from']
        max_date = self.__timeseries_date_dict['to']

    min_i = 0
    max_i = len(all_quarters)

    for i, quarter in enumerate(all_quarters):
        if min_date is not None and min_date < quarter:
            break
        min_i = i

    for i, quarter in enumerate(all_quarters):
        if max_date is not None and max_date < quarter:
            break
        max_i = i

    self.__lims = [min_i, max_i]
    self.__timeseries_quarterly_smoothed = None if sma is None else []

    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term',
                           desc='Calculating quarterly timeseries', leave=False, unit_scale=True):
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)
        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]
        non_zero_dates, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(
            weekly_iso_dates, row_values)
        non_zero_dates, quarterly_values = utils.fill_missing_zeros(quarterly_values, non_zero_dates, all_quarters)
        self.__timeseries_quarterly.append(quarterly_values)

    if emergence_index == 'gradients' or sma == 'kalman':
        if cached_folder_name is None or not (
                path.isfile(utils.pickle_name('smooth_series_s', self.__cached_folder_name))
                and path.isfile(utils.pickle_name('derivatives', self.__cached_folder_name))):
            for term_index, quarterly_values in tqdm(enumerate(self.__timeseries_quarterly), unit='term',
                                                     desc='smoothing quarterly timeseries with kalman filter',
                                                     leave=False, unit_scale=True,
                                                     total=len(self.__timeseries_quarterly)):
                _, _1, smooth_series_s, _intercept = StateSpaceModel(quarterly_values).run_smoothing()

                smooth_series = smooth_series_s[0].tolist()[0]
                smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
                self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

                derivatives = smooth_series_s[1].tolist()[0]
                self.__timeseries_derivatives.append(derivatives)

            utils.pickle_object('smooth_series_s', self.__timeseries_quarterly_smoothed, self.__cached_folder_name)
            utils.pickle_object('derivatives', self.__timeseries_derivatives, self.__cached_folder_name)
        else:
            self.__timeseries_quarterly_smoothed = utils.unpickle_object('smooth_series_s',
                                                                         self.__cached_folder_name)
            self.__timeseries_derivatives = utils.unpickle_object('derivatives', self.__cached_folder_name)

    if sma == 'savgol':
        for quarterly_values in tqdm(self.__timeseries_quarterly, unit='term',
                                     desc='savgol smoothing quarterly timeseries', leave=False, unit_scale=True):
            smooth_series = savgol_filter(quarterly_values, 9, 2, mode='nearest')
            smooth_series_no_negatives = np.clip(smooth_series, a_min=0, a_max=None)
            self.__timeseries_quarterly_smoothed.append(smooth_series_no_negatives.tolist())

    em = Emergence(all_quarterly_values[min_i:max_i])
    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        if term_weights[term_index] == 0.0:
            continue
        term_ngram = self.__term_ngrams[term_index]

        if self.__timeseries_quarterly_smoothed is not None:
            quarterly_values = list(self.__timeseries_quarterly_smoothed[term_index])[min_i:max_i]
        else:
            quarterly_values = list(self.__timeseries_quarterly[term_index])[min_i:max_i]

        if len(quarterly_values) == 0 or max(list(self.__timeseries_quarterly[term_index][min_i:max_i])) < float(
                patents_per_quarter_threshold):
            continue

        if emergence_index == 'quadratic':
            escore = em.escore2(quarterly_values)
        elif emergence_index == 'porter':
            if not em.is_emergence_candidate(quarterly_values):
                continue
            escore = em.calculate_escore(quarterly_values)
        elif emergence_index == 'gradients':
            derivatives = self.__timeseries_derivatives[term_index][min_i:max_i]
            escore = em.net_growth(quarterly_values, derivatives)
        else:
            weekly_values = term_counts_per_week_csc.getcol(term_index).todense().ravel().tolist()[0]
            escore = em.escore_exponential(weekly_values)

        self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__declining.reverse()
    self.__stationary = [x[0] for x in utils.stationary_terms(self.__emergence_list, nterms2)]
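
# --- Hedged sketch (added for illustration, not part of the constructor above) ---
# The "combiner" design described in the todo comments: each document filter yields
# an array of bool (or 0/1) per document, and a union or intersection combiner
# reduces the list to a single weight vector (all-ones when the list is empty).
# The function name and signature below are assumptions, not the repository's API.
import numpy as np


def combine_doc_filters(filter_outputs, n_docs, union=False):
    """Reduce a list of per-document 0/1 arrays to one mask.

    Empty list -> keep every document; union -> logical OR across filters;
    otherwise intersection -> logical AND, as the todo comments describe.
    """
    if not filter_outputs:
        return np.ones(n_docs, dtype=bool)
    combined = np.asarray(filter_outputs[0], dtype=bool)
    for weights in filter_outputs[1:]:
        weights = np.asarray(weights, dtype=bool)
        combined = combined | weights if union else combined & weights
    return combined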
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), text_header='abstract',
             pickled_tfidf_folder_name=None, max_df=0.1, user_ngrams=None, prefilter_terms=0,
             terms_threshold=None, output_name=None, calculate_timeseries=None, m_steps_ahead=5, curves=True,
             nterms=50, minimum_patents_per_quarter=20):
    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__timeseries_data = []

    self.__emergence_list = []
    self.__pick_method = pick_method

    # calculate or fetch tf-idf matrix
    if pickled_tfidf_folder_name is None:
        dataframe = data_factory.get(data_filename)
        utils.checkdf(dataframe, calculate_timeseries, docs_mask_dict, text_header)
        utils.remove_empty_documents(dataframe, text_header)

        self.__tfidf_obj = tfidf_from_text(text_series=dataframe[text_header], ngram_range=ngram_range,
                                           max_document_frequency=max_df, tokenizer=LemmaTokenizer())
        tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8, unbias=True)
        self.__tfidf_obj.apply_weights(tfidf_mask_obj.tfidf_mask)

        if prefilter_terms != 0:
            tfidf_reduce_obj = TfidfReduce(self.__tfidf_obj.tfidf_matrix, self.__tfidf_obj.feature_names)
            term_score_tuples = tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
            num_tuples_to_retain = min(prefilter_terms, len(term_score_tuples))
            feature_subset = sorted([x[1] for x in term_score_tuples[:num_tuples_to_retain]])

            number_of_ngrams_before = len(self.__tfidf_obj.feature_names)
            self.__tfidf_obj = tfidf_subset_from_features(self.__tfidf_obj, feature_subset)
            number_of_ngrams_after = len(self.__tfidf_obj.feature_names)
            print(f'Reduced number of terms by pre-filtering from {number_of_ngrams_before:,} '
                  f'to {number_of_ngrams_after:,}')

        self.__cpc_dict = utils.cpc_dict(dataframe)
        self.__dates = scripts.utils.date_utils.generate_year_week_dates(dataframe, docs_mask_dict['date_header'])

        base_pickle_path = path.join('outputs', 'tfidf')
        makedirs(base_pickle_path, exist_ok=True)

        def pickle_object(short_name, obj):
            folder_name = path.join(base_pickle_path, output_name + f'-mdf-{max_df}')
            makedirs(folder_name, exist_ok=True)
            file_name = path.join(folder_name, output_name + f'-mdf-{max_df}-{short_name}.pkl.bz2')
            with bz2.BZ2File(file_name, 'wb') as pickle_file:
                pickle.dump(obj, pickle_file, protocol=4, fix_imports=False)

        pickle_object('tfidf', self.__tfidf_obj)
        pickle_object('dates', self.__dates)
        pickle_object('cpc_dict', self.__cpc_dict)
    else:
        print(f'Reading document and TFIDF from pickle {pickled_tfidf_folder_name}')
        base_folder = path.basename(pickled_tfidf_folder_name)
        pickled_base_file_name = path.join(pickled_tfidf_folder_name, base_folder)
        self.__tfidf_obj = read_pickle(pickled_base_file_name + '-tfidf.pkl.bz2')
        self.__dates = read_pickle(pickled_base_file_name + '-dates.pkl.bz2')
        self.__cpc_dict = read_pickle(pickled_base_file_name + '-cpc_dict.pkl.bz2')

    if self.__dates is not None:
        min_date = min(self.__dates)
        max_date = max(self.__dates)
        print(f'Document year-week dates range from {min_date // 100}-{(min_date % 100):02d} '
              f'to {max_date // 100}-{(max_date % 100):02d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters; all filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.

    print('Applying documents filter...')
    # docs weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dates, docs_mask_dict, self.__cpc_dict,
                                  self.__tfidf_obj.tfidf_matrix.shape[0]).doc_filters

    # todo: build up list of weight functions (left with single remaining arg etc. via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)

    # todo: this is another weight function...

    # term weights - embeddings
    print('Applying terms filter...')
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=terms_threshold)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.
    print('Creating a masked tfidf matrix from filters...')
    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_filters, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove-null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_masked = tfidf_mask.multiply(self.__tfidf_obj.tfidf_matrix)
    tfidf_masked, self.__dates = utils.remove_all_null_rows_global(tfidf_masked, self.__dates)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,}'
          f' / {self.__tfidf_obj.tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__timeseries_data = None

    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)

    # todo: no output method; just if statements to call output functions...?
    #  Only supply what they each directly require.
    # todo: hence Pipeline then becomes a single function

    if not calculate_timeseries:
        return

    print('Creating timeseries matrix...')
    self.__timeseries_data = self.__tfidf_reduce_obj.create_timeseries_data(self.__dates)
    [self.__term_counts_per_week, self.__term_ngrams, self.__number_of_patents_per_week,
     self.__weekly_iso_dates] = self.__timeseries_data

    self.__M = m_steps_ahead

    term_counts_per_week_csc = self.__term_counts_per_week.tocsc()

    em = Emergence(self.__number_of_patents_per_week)
    for term_index in tqdm(range(self.__term_counts_per_week.shape[1]), unit='term', desc='Calculating eScore',
                           leave=False, unit_scale=True):
        term_ngram = self.__term_ngrams[term_index]
        row_indices, row_values = utils.get_row_indices_and_values(term_counts_per_week_csc, term_index)

        if len(row_values) == 0:
            continue

        weekly_iso_dates = [self.__weekly_iso_dates[x] for x in row_indices]

        _, quarterly_values = scripts.utils.date_utils.timeseries_weekly_to_quarterly(weekly_iso_dates, row_values)
        if max(quarterly_values) < minimum_patents_per_quarter:
            continue

        if em.init_vars(row_indices, row_values, porter=not curves):
            escore = em.calculate_escore() if not curves else em.escore2()
            self.__emergence_list.append((term_ngram, escore))

    nterms2 = min(nterms, len(self.__emergence_list))
    self.__emergence_list.sort(key=lambda emergence: -emergence[1])

    self.__emergent = [x[0] for x in self.__emergence_list[:nterms2]]
    self.__declining = [x[0] for x in self.__emergence_list[-nterms2:]]
    self.__stationary = utils.stationary_terms(self.__emergence_list, nterms2)
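
# --- Hedged sketch (illustration only) ---
# Both constructors above lean on scripts.utils.date_utils.timeseries_weekly_to_quarterly
# to roll per-week counts up into quarterly totals. The helper below is an assumed,
# simplified stand-in for that conversion, not the project's implementation: it treats
# dates as ISO year-week integers (e.g. 201732 = 2017, week 32) and maps each 13-week
# block to a quarter.
from collections import defaultdict


def weekly_to_quarterly_sketch(year_week_dates, weekly_values):
    quarterly_totals = defaultdict(int)
    for year_week, value in zip(year_week_dates, weekly_values):
        year, week = divmod(year_week, 100)
        quarter = min((week - 1) // 13, 3) + 1  # weeks 1-13 -> Q1, ..., week 40+ -> Q4
        quarterly_totals[(year, quarter)] += value
    quarters = sorted(quarterly_totals)
    return quarters, [quarterly_totals[q] for q in quarters]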
def test_reads_pickles(self):
    df = factory.get('data/USPTO-random-100.pkl.bz2')
    self.assertEqual(len(df['abstract']), 100)
def test_reads_csv(self):
    df = factory.get('data/USPTO-random-100.csv')
    self.assertListEqual(list(self.__df['abstract']), list(df['abstract']))
def test_reads_xlsx(self):
    df = factory.get('tests/data/USPTO-random-100.xlsx')
    self.assertListEqual(list(self.__df['abstract']), list(df['abstract']))
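
# --- Hedged sketch (assumed fixture, not shown in the original tests) ---
# The csv and xlsx tests compare against self.__df, which implies a baseline dataframe
# loaded once per test class; a setUpClass along these lines would supply it. The pickle
# path mirrors test_reads_pickles and is an assumption.
@classmethod
def setUpClass(cls):
    cls.__df = factory.get('data/USPTO-random-100.pkl.bz2')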
def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range=(1, 3), normalize_rows=False,
             text_header='abstract', term_counts=False, pickled_tf_idf_file_name=None, max_df=0.1,
             user_ngrams=None, output_name=None, emerging_technology=None):
    # load data
    self.__data_filename = data_filename
    self.__date_dict = docs_mask_dict['date']
    self.__time = docs_mask_dict['time']

    self.__pick_method = pick_method

    # calculate or fetch tf-idf matrix
    if pickled_tf_idf_file_name is None:
        self.__dataframe = datafactory.get(data_filename)
        checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts)
        remove_empty_documents(self.__dataframe, text_header)

        self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header], ngram_range=ngram_range,
                                 max_document_frequency=max_df, tokenizer=LemmaTokenizer())

        self.__text_lengths = self.__dataframe[text_header].map(len).tolist()
        self.__dataframe.drop(columns=[text_header], inplace=True)

        tfidf_filename = path.join('outputs', 'tfidf', output_name + f'-tfidf-mdf-{max_df}.pkl.bz2')
        makedirs(path.dirname(tfidf_filename), exist_ok=True)
        with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file:
            pickle.dump((self.__tfidf_obj, self.__dataframe, self.__text_lengths), pickle_file, protocol=4)
    else:
        print(f'Reading document and TFIDF from pickle {pickled_tf_idf_file_name}')
        self.__tfidf_obj, self.__dataframe, self.__text_lengths = read_pickle(pickled_tf_idf_file_name)

    if docs_mask_dict['date_header'] is None:
        print('Document dates not specified')
    else:
        min_date = min(self.__dataframe[docs_mask_dict['date_header']])
        max_date = max(self.__dataframe[docs_mask_dict['date_header']])
        print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}')

    WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=lowercase_strip_accents_and_ownership,
                      ngram_range=ngram_range)

    # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep
    #  the original. We're really just filtering down.

    # todo: build up a list of functions to apply as document filters; all filters to have common args (c/o
    #  partialfunc if required) so we can then call them in sequence from a combiner.
    #  Each func just returns an array of bool (or 0/1).
    #  If union - create union combiner, else create intersection combiner: combiner = union if ... else intersection
    #  weights = combiner(list of funcs, data set)
    #  combiner: if list is empty, return [1] * size; if single entry, return its array
    #  union: if more entries after single, add / or
    #  intersection: if more entries after single, multiply / and
    #  Then apply mask to tfidf object and df (i.e. remove rows with false or 0); do this in place.

    # docs weights (column, dates subset + time, citations etc.)
    doc_filters = DocumentsFilter(self.__dataframe, docs_mask_dict).doc_weights

    # todo: build up list of weight functions (left with single remaining arg etc. via partialfunc)
    #  combine(list, tfidf) => multiplies weights together, then multiplies across tfidf (if empty, no side effect)
    doc_weights = DocumentsWeights(self.__dataframe, docs_mask_dict['time'], docs_mask_dict['cite'],
                                   docs_mask_dict['date_header'], self.__text_lengths,
                                   norm_rows=normalize_rows).weights
    doc_weights = [a * b for a, b in zip(doc_filters, doc_weights)]

    # todo: this is another weight function...

    # term weights - embeddings
    filter_terms_obj = FilterTerms(self.__tfidf_obj.feature_names, user_ngrams, threshold=0.75)
    term_weights = filter_terms_obj.ngram_weights_vec

    # todo: replace tfidf_mask with isolated functions: clean_unigrams, unbias_ngrams;
    #  these operate directly on tfidf, hence return nothing - operate in place on tfidf.

    # tfidf mask (doc_ids, doc_weights, embeddings_filter will all merge to a single mask in the future)
    tfidf_mask_obj = TfidfMask(self.__tfidf_obj, ngram_range=ngram_range, uni_factor=0.8)
    tfidf_mask_obj.update_mask(doc_weights, term_weights)
    tfidf_mask = tfidf_mask_obj.tfidf_mask

    # todo: this multiply and remove-null will disappear - maybe put weight combiner last so it can remove 0 weights

    # mask the tfidf matrix
    tfidf_matrix = self.__tfidf_obj.tfidf_matrix
    tfidf_masked = tfidf_mask.multiply(tfidf_matrix)
    tfidf_masked = utils.remove_all_null_rows(tfidf_masked)
    print(f'Processing TFIDF matrix of {tfidf_masked.shape[0]:,} / {tfidf_matrix.shape[0]:,} documents')

    # todo: no advantage in classes - just create term_count and extract_ngrams as functions
    self.__tfidf_reduce_obj = TfidfReduce(tfidf_masked, self.__tfidf_obj.feature_names)
    self.__term_counts_data = None
    if term_counts or emerging_technology:
        self.__term_counts_data = self.__tfidf_reduce_obj.create_terms_count(self.__dataframe,
                                                                             docs_mask_dict['date_header'])
    # if other outputs
    self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method)
    self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni,
                                              WordAnalyzer.stemmed_stop_word_set_n)
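
# --- Hedged usage sketch (names assumed, not taken from the repository) ---
# How a caller might drive the constructor above. The enclosing class is assumed to be
# called Pipeline, the 'publication_date' header is illustrative, and real runs would
# populate docs_mask_dict with every key the document filters expect; only the keys read
# directly in the constructor are shown here.
docs_mask_dict = {
    'date': None,                          # no date filtering
    'time': False,                         # no time-based weighting
    'cite': False,                         # no citation weighting
    'date_header': 'publication_date',     # assumed column name
}
pipeline = Pipeline('data/USPTO-random-100.pkl.bz2', docs_mask_dict,
                    pick_method='sum', ngram_range=(1, 3), text_header='abstract',
                    max_df=0.1, output_name='USPTO-random-100')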