コード例 #1
0
	def test_get_scores_zero_all_same(self):
		'''Compare ``CornerScore.get_scores`` with fixed expected values
		for inputs in which most category counts are zero.
		'''
		category_counts = np.array([0, 0, 0, 0, 0, 0, 1, 2])
		other_counts = np.array([1, 1, 2, 1, 1, 1, 1, 2])
		expected = np.array([
			0.5, 0.5, 0.15625, 0.5, 0.5,
			0.5, 0.8391308, 0.6685437,
		])
		actual = CornerScore.get_scores(category_counts, other_counts)
		np.testing.assert_almost_equal(actual, expected)
コード例 #2
0
	def get_scores_for_category(self):
		'''Compare ``CornerScore.get_scores_for_category`` with fixed
		expected values, using the counts returned by ``self._get_counts()``.

		NOTE(review): the method name does not start with ``test``, so the
		default unittest/pytest discovery rules will not collect it --
		confirm whether that is intentional.
		'''
		expected = np.array([
			0.9300538, 1.0198039,
			0.9300538, 0.9055385, 0.2,
			0.7433034, 0.585235, 0.9861541,
			0.9861541, 0.3605551,
		])
		counts = self._get_counts()
		category_counts, other_counts = counts
		actual = CornerScore.get_scores_for_category(category_counts,
		                                             other_counts)
		np.testing.assert_almost_equal(actual, expected)
コード例 #3
0
    def get_corner_scores(self, category):
        ''' Return the corner scores of ``category``.

        The category and non-category word counts are fetched for
        ``category`` and handed to ``CornerScore.get_scores``.

        Parameters
        ----------
        category : str
            category name to score

        Returns
        -------
            np.array
        '''
        word_counts = self._get_catetgory_and_non_category_word_counts(category)
        return CornerScore.get_scores(*word_counts)
コード例 #4
0
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                transform=percentile_alphabetical,
                title_case_names=False,
                not_categories=None,
                neutral_categories=None,
                extra_categories=None,
                background_scorer=None):
        '''
        Build the dictionary that describes the scatter chart for ``category``.

        Parameters
        ----------
        category : str
            Category to annotate.  Exact value of category.
        category_name : str, optional
            Name of category which will appear on web site. Default None is same as category.
        not_category_name : str, optional
            Name of ~category which will appear on web site. Default None is same as
            "Not " + category_name.
        scores : np.array, optional
            Scores to use for coloring.  Default None uses the result of
            self._get_default_scores(category, not_categories, df).
        transform : function, optional
            Function for ranking terms.  Defaults to percentile_alphabetical.
        title_case_names : bool, default False
          Title case category name and no-category name?
        not_categories : list, optional
            List of categories to use as "not category".  Defaults to all others.
            When left as None, neutral_categories and extra_categories are both
            reset to [] regardless of what was passed.
        neutral_categories : list, optional
            List of categories to use as neutral.  When not_categories is given and
            this is None, defaults to every category that is neither `category` nor
            in not_categories (and extra_categories is reset to []).
        extra_categories : list, optional
            List of categories to use as extra.  When not_categories and
            neutral_categories are given and this is None, defaults to every
            remaining category.
        background_scorer : CharacteristicScorer, optional
            Used for bg scores

        Returns
        -------
        Dictionary that encodes the scatter chart
        information. The dictionary can be dumped as a json document, and
        used in scattertext.html
         {info: {category_name: ..., not_category_name: ...,
                 category_terms: ten terms with the highest `s`,
                 not_category_terms: ten terms with the lowest `s`, ...},
          data: [{term:,
                  x:frequency [0-1],
                  y:frequency [0-1],
                  ox: score,
                  oy: score,
                  s: score,
                  os: original score,
                  p: p-val,
                  cat25k: freq per 25k in category,
                  cat: count in category,
                  ncat: count in non-category,
                  catdocs: [docnum, ...],
                  ncatdocs: [docnum, ...]
                  ncat25k: freq per 25k in non-category}, ...]}}

        Raises
        ------
        Exception
            If self.used is set.
        AssertionError
            If category is not one of the term-document matrix's categories.
        NoWordMeetsTermFrequencyRequirementsError
            If no term is left after filtering.
        '''
        if self.used:
            raise Exception("Cannot reuse a ScatterChart constructor")

        all_categories = self.term_doc_matrix.get_categories()
        assert category in all_categories

        # Fill in whichever category groups were not supplied.  Each branch
        # only fires when the earlier groups were given explicitly.
        if not_categories is None:
            not_categories = [c for c in all_categories if c != category]
            neutral_categories = []
            extra_categories = []
        elif neutral_categories is None:
            neutral_categories = [
                c for c in all_categories
                if c not in [category] + not_categories
            ]
            extra_categories = []
        elif extra_categories is None:
            extra_categories = [
                c for c in all_categories
                if c not in [category] + not_categories + neutral_categories
            ]
        all_categories = [
            category
        ] + not_categories + neutral_categories + extra_categories

        df = self._get_term_category_frequencies()

        self._add_x_and_y_coords_to_term_df_if_injected(df)

        if scores is None:
            scores = self._get_default_scores(category, not_categories, df)
        category_column_name = category + ' freq'
        # Column names of the "not category" frequencies; reused below.
        not_category_freq_columns = [c + ' freq' for c in not_categories]
        df['category score'] = CornerScore.get_scores_for_category(
            df[category_column_name],
            df[not_category_freq_columns].sum(axis=1))
        if self.scatterchartdata.term_significance is not None:
            df['p'] = get_p_vals(df, category_column_name,
                                 self.scatterchartdata.term_significance)
        df['not category score'] = CornerScore.get_scores_for_category(
            df[not_category_freq_columns].sum(axis=1),
            df[category_column_name])
        df['color_scores'] = scores
        # Frequency/PMI filtering is skipped when an explicit term list is
        # configured; that list is applied to json_df further down.
        if self.scatterchartdata.terms_to_include is None:
            df = self._filter_bigrams_by_minimum_not_category_term_freq(
                category_column_name, not_categories, df)
            df = filter_bigrams_by_pmis(
                self._filter_by_minimum_term_frequency(all_categories, df),
                threshold_coef=self.scatterchartdata.pmi_threshold_coefficient)

        if self.scatterchartdata.filter_unigrams:
            df = filter_out_unigrams_that_only_occur_in_one_bigram(df)
        if len(df) == 0:
            raise NoWordMeetsTermFrequencyRequirementsError()
        df['category score rank'] = rankdata(df['category score'],
                                             method='ordinal')
        df['not category score rank'] = rankdata(df['not category score'],
                                                 method='ordinal')
        if self.scatterchartdata.max_terms and self.scatterchartdata.max_terms < len(
                df):
            assert self.scatterchartdata.max_terms > 0
            df = self._limit_max_terms(category, df)
        df = df.reset_index()

        # Coordinates are only derived from the frequencies when none are
        # already stored on the instance.
        if self.x_coords is None:
            self.x_coords, self.y_coords = self._get_coordinates_from_transform_and_jitter_frequencies \
                (category, df, not_categories, transform)
            df['x'], df['y'] = self.x_coords, self.y_coords
            df['ox'], df['oy'] = self.x_coords, self.y_coords

        # Recomputed here because rows may have been filtered since the
        # corner scores were calculated.
        df['not cat freq'] = df[not_category_freq_columns].sum(axis=1)
        if neutral_categories != []:
            df['neut cat freq'] = df[[x + ' freq' for x in neutral_categories
                                      ]].sum(axis=1).fillna(0)
        if extra_categories != []:
            df['extra cat freq'] = df[[x + ' freq' for x in extra_categories
                                       ]].sum(axis=1).fillna(0)

        # Explicit copy: columns are assigned to json_df below, and writing
        # into a bare column selection triggers pandas' chained-assignment
        # warning.
        json_df = df[['x', 'y', 'ox', 'oy', 'term']].copy()

        # Same `is not None` test as the one that created df['p'] above, so
        # the column is copied exactly when it exists.
        if self.scatterchartdata.term_significance is not None:
            json_df['p'] = df['p']
        self._add_term_freq_to_json_df(json_df, df, category)
        json_df['s'] = percentile_min(df['color_scores'])
        json_df['os'] = df['color_scores']
        if background_scorer:
            bg_scores = background_scorer.get_scores(self.term_doc_matrix)
            json_df['bg'] = bg_scores[1].loc[json_df.term].values
        elif not self.scatterchartdata.use_non_text_features:
            json_df['bg'] = self._get_corpus_characteristic_scores(json_df)

        self._preform_axis_rescale(json_df, self._rescale_x, 'x')
        self._preform_axis_rescale(json_df, self._rescale_y, 'y')

        if self.scatterchartdata.terms_to_include is not None:
            json_df = self._use_only_selected_terms(json_df)

        # Highest-scoring terms characterize the category, lowest-scoring
        # terms the not-category side; the two lists must use opposite
        # sort directions.
        category_terms = list(
            json_df.sort_values('s', ascending=False)['term'][:10])
        not_category_terms = list(
            json_df.sort_values('s', ascending=True)['term'][:10])
        if category_name is None:
            category_name = category
        if not_category_name is None:
            not_category_name = 'Not ' + category_name

        def better_title(x):
            # Title-case each whitespace-separated token when requested.
            if title_case_names:
                return ' '.join(
                    [t[0].upper() + t[1:].lower() for t in x.split()])
            else:
                return x

        j = {
            'info': {
                'category_name': better_title(category_name),
                'not_category_name': better_title(not_category_name),
                'category_terms': category_terms,
                'not_category_terms': not_category_terms,
                'category_internal_name': category,
                'not_category_internal_names': not_categories,
                'categories': self.term_doc_matrix.get_categories(),
                'neutral_category_internal_names': neutral_categories,
                'extra_category_internal_names': extra_categories
            }
        }
        if self.metadata_term_lists is not None:
            j['metalists'] = self.metadata_term_lists
        if self.metadata_descriptions is not None:
            j['metadescriptions'] = self.metadata_descriptions
        if self.term_colors is not None:
            j['info']['term_colors'] = self.term_colors
        j['data'] = json_df.to_dict(orient='records')

        return j
コード例 #5
0
	def test_get_scores(self):
		'''Compare ``CornerScore.get_scores`` with fixed expected values,
		using the counts returned by ``self._get_counts()``.
		'''
		expected = np.array([
			0.1820027, 0.2828427, 0.1820027, 0.5, 0.9292893,
			0.2378287, 0.7930882, 0.1845603, 0.1845603, 0.8725245,
		])
		counts = self._get_counts()
		category_counts, other_counts = counts
		actual = CornerScore.get_scores(category_counts, other_counts)
		np.testing.assert_almost_equal(actual, expected)
コード例 #6
0
	def test_get_scores_zero_median(self):
		'''Smoke test: ``CornerScore.get_scores`` is called on inputs in
		which most category counts are zero; the result is not inspected.
		'''
		category_counts = np.array([0, 0, 0, 0, 0, 0, 1, 2])
		other_counts = np.array([1, 1, 2, 1, 1, 1, 1, 3])
		CornerScore.get_scores(category_counts, other_counts)