Example #1
0
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                transform=percentile_alphabetical,
                title_case_names=False,
                not_categories=None,
                neutral_categories=None,
                extra_categories=None,
                background_scorer=None):
        '''
        Build the dictionary that encodes the scatter chart.

        Parameters
        ----------
        category : str
            Category to annotate.  Exact value of category; must be one of
            ``self.term_doc_matrix.get_categories()``.
        category_name : str, optional
            Name of category which will appear on web site. Default None is
            same as category.
        not_category_name : str, optional
            Name of ~category which will appear on web site. Default None is
            same as "Not " + category_name.
        scores : np.array, optional
            Scores to use for coloring.  Default None uses
            ``self._get_default_scores(category, not_categories, df)``
            (described as RankDifference scores by the earlier docs --
            TODO confirm against that helper).
        transform : function, optional
            Function for ranking terms.  Only used when no x/y coordinates
            are already set on the chart.  Defaults to
            percentile_alphabetical.
        title_case_names : bool, default False
            Title case category name and not-category name?
        not_categories : list, optional
            List of categories to use as "not category".  Defaults to all
            others; in that case neutral_categories and extra_categories
            are both reset to [] regardless of what was passed.
        neutral_categories : list, optional
            List of categories to use as neutral.  When not_categories is
            given and this is None, defaults to every remaining category
            and extra_categories is reset to [].
        extra_categories : list, optional
            List of categories to use as extra.  When not_categories and
            neutral_categories are given and this is None, defaults to
            every remaining category.
        background_scorer : CharacteristicScorer, optional
            Used for bg scores.  Its ``get_scores(term_doc_matrix)`` result
            is indexed with ``[1].loc[terms]`` to fill the 'bg' column.

        Returns
        -------
        Dictionary that encodes the scatter chart
        information. The dictionary can be dumped as a json document, and
        used in scattertext.html
         {info: {category_name: ..., not_category_name: ..., ...},
          data: [{term:,
                  x:frequency [0-1],
                  y:frequency [0-1],
                  ox: score,
                  oy: score,
                  s: score,
                  os: original score,
                  p: p-val,
                  cat25k: freq per 25k in category,
                  cat: count in category,
                  ncat: count in non-category,
                  catdocs: [docnum, ...],
                  ncatdocs: [docnum, ...]
                  ncat25k: freq per 25k in non-category}, ...]}
        'metalists' and 'metadescriptions' are added at the top level when
        the corresponding attributes are set on the chart.

        Raises
        ------
        Exception
            If ``self.used`` is set.
        AssertionError
            If category is not a known category, or max_terms is negative.
        NoWordMeetsTermFrequencyRequirementsError
            If no term is left after filtering.
        '''
        # NOTE(review): this method only reads self.used; nothing here sets it.
        if self.used:
            raise Exception("Cannot reuse a ScatterChart constructor")

        all_categories = self.term_doc_matrix.get_categories()
        assert category in all_categories

        # Fill in the category groups that were not supplied.  The first
        # missing group absorbs every category not yet assigned, and the
        # groups after it are emptied.
        if not_categories is None:
            not_categories = [c for c in all_categories if c != category]
            neutral_categories = []
            extra_categories = []
        elif neutral_categories is None:
            neutral_categories = [
                c for c in all_categories
                if c not in [category] + not_categories
            ]
            extra_categories = []
        elif extra_categories is None:
            extra_categories = [
                c for c in all_categories
                if c not in [category] + not_categories + neutral_categories
            ]
        all_categories = [
            category
        ] + not_categories + neutral_categories + extra_categories

        df = self._get_term_category_frequencies()

        self._add_x_and_y_coords_to_term_df_if_injected(df)

        if scores is None:
            scores = self._get_default_scores(category, not_categories, df)
        category_column_name = category + ' freq'
        not_category_columns = [c + ' freq' for c in not_categories]
        has_term_significance = (
            self.scatterchartdata.term_significance is not None)

        df['category score'] = CornerScore.get_scores_for_category(
            df[category_column_name],
            df[not_category_columns].sum(axis=1))
        if has_term_significance:
            df['p'] = get_p_vals(df, category_column_name,
                                 self.scatterchartdata.term_significance)
        df['not category score'] = CornerScore.get_scores_for_category(
            df[not_category_columns].sum(axis=1),
            df[category_column_name])
        df['color_scores'] = scores

        # Frequency/PMI filtering is skipped when the caller pinned an
        # explicit list of terms to include.
        if self.scatterchartdata.terms_to_include is None:
            df = self._filter_bigrams_by_minimum_not_category_term_freq(
                category_column_name, not_categories, df)
            df = filter_bigrams_by_pmis(
                self._filter_by_minimum_term_frequency(all_categories, df),
                threshold_coef=self.scatterchartdata.pmi_threshold_coefficient)

        if self.scatterchartdata.filter_unigrams:
            df = filter_out_unigrams_that_only_occur_in_one_bigram(df)
        if len(df) == 0:
            raise NoWordMeetsTermFrequencyRequirementsError()
        df['category score rank'] = rankdata(df['category score'],
                                             method='ordinal')
        df['not category score rank'] = rankdata(df['not category score'],
                                                 method='ordinal')
        max_terms = self.scatterchartdata.max_terms
        if max_terms and max_terms < len(df):
            assert max_terms > 0
            df = self._limit_max_terms(category, df)
        df = df.reset_index()

        # Coordinates are only computed when none are set yet; otherwise the
        # x/y/ox/oy columns are presumably supplied by the injection step
        # above -- TODO confirm.
        if self.x_coords is None:
            self.x_coords, self.y_coords = (
                self._get_coordinates_from_transform_and_jitter_frequencies(
                    category, df, not_categories, transform))
            df['x'], df['y'] = self.x_coords, self.y_coords
            df['ox'], df['oy'] = self.x_coords, self.y_coords

        df['not cat freq'] = df[not_category_columns].sum(axis=1)
        if neutral_categories != []:
            df['neut cat freq'] = df[[x + ' freq' for x in neutral_categories
                                      ]].sum(axis=1).fillna(0)
        if extra_categories != []:
            df['extra cat freq'] = df[[x + ' freq' for x in extra_categories
                                       ]].sum(axis=1).fillna(0)

        # Explicit copy: columns are assigned into json_df below, which on a
        # bare column slice triggers pandas' SettingWithCopyWarning.
        json_df = df[['x', 'y', 'ox', 'oy', 'term']].copy()

        # Same condition as the one that created df['p'] above.
        if has_term_significance:
            json_df['p'] = df['p']
        self._add_term_freq_to_json_df(json_df, df, category)
        json_df['s'] = percentile_min(df['color_scores'])
        json_df['os'] = df['color_scores']
        if background_scorer:
            bg_scores = background_scorer.get_scores(self.term_doc_matrix)
            json_df['bg'] = bg_scores[1].loc[json_df.term].values
        elif not self.scatterchartdata.use_non_text_features:
            json_df['bg'] = self._get_corpus_characteristic_scores(json_df)

        self._preform_axis_rescale(json_df, self._rescale_x, 'x')
        self._preform_axis_rescale(json_df, self._rescale_y, 'y')

        if self.scatterchartdata.terms_to_include is not None:
            json_df = self._use_only_selected_terms(json_df)

        # Highest scores characterize the category, lowest the not-category.
        # (Both lists previously used the same ascending sort and were
        # therefore identical.)
        category_terms = list(
            json_df.sort_values('s', ascending=False)['term'][:10])
        not_category_terms = list(
            json_df.sort_values('s', ascending=True)['term'][:10])
        if category_name is None:
            category_name = category
        if not_category_name is None:
            not_category_name = 'Not ' + category_name

        def better_title(x):
            # Title-case each whitespace-separated token when requested.
            if title_case_names:
                return ' '.join(
                    [t[0].upper() + t[1:].lower() for t in x.split()])
            else:
                return x

        j = {
            'info': {
                'category_name': better_title(category_name),
                'not_category_name': better_title(not_category_name),
                'category_terms': category_terms,
                'not_category_terms': not_category_terms,
                'category_internal_name': category,
                'not_category_internal_names': not_categories,
                'categories': self.term_doc_matrix.get_categories(),
                'neutral_category_internal_names': neutral_categories,
                'extra_category_internal_names': extra_categories
            }
        }
        if self.metadata_term_lists is not None:
            j['metalists'] = self.metadata_term_lists
        if self.metadata_descriptions is not None:
            j['metadescriptions'] = self.metadata_descriptions
        if self.term_colors is not None:
            j['info']['term_colors'] = self.term_colors
        # Records are emitted in json_df's current row order (no final sort).
        j['data'] = json_df.to_dict(orient='records')

        return j
Example #2
0
	def to_dict(self,
	            category,
	            category_name=None,
	            not_category_name=None,
	            scores=None,
	            transform=percentile_alphabetical):
		'''
		Build the dictionary that encodes the scatter chart.

		Parameters
		----------
		category : str
			Category to annotate.  Exact value of category.
		category_name : str, optional
			Name of category which will appear on web site. Default None is same as category.
		not_category_name : str, optional
			Name of ~category which will appear on web site. Default None is same as "Not " + category_name.
		scores : np.array, optional
			Scores to use for coloring.  Passed through to
			self._term_rank_score_and_frequency_df, which picks the default when None
			(scaled f-scores according to the earlier docs -- TODO confirm).
		transform : function, optional
			Function for ranking terms.  Only used when no x/y coordinates are
			already set on the chart.  Defaults to percentile_alphabetical.

		Returns
		-------
		Dictionary that encodes the scatter chart
		information. The dictionary can be dumped as a json document, and
		used in scattertext.html
		 {info: {category_name: ..., not_category_name: ..., ...},
		  data: [{term:,
		          x:frequency [0-1],
		          y:frequency [0-1],
		          s: score,
		          os: original score,
		          p: p-val,
		          cat25k: freq per 25k in category,
		          cat: count in category,
		          ncat: count in non-category,
		          catdocs: [docnum, ...],
		          ncatdocs: [docnum, ...]
		          ncat25k: freq per 25k in non-category}, ...]}
		The data records are sorted by x, then y, then term.  Both display
		names are always title-cased.
		'''
		all_categories, other_categories = self._get_category_names(category)
		df = self._term_rank_score_and_frequency_df(all_categories, category, scores)
		# Coordinates are computed only when none are set on the chart yet.
		if self.x_coords is None:
			self.x_coords, self.y_coords = (
				self._get_coordinates_from_transform_and_jitter_frequencies(
					category, df, other_categories, transform))
			df['x'], df['y'] = self.x_coords, self.y_coords
		# other_categories is used directly as a list of df column names.
		df['not cat freq'] = df[list(other_categories)].sum(axis=1)
		# Explicit copy: columns are assigned into json_df below, which on a
		# bare column slice triggers pandas' SettingWithCopyWarning.
		json_df = df[['x', 'y', 'term']].copy()
		if self.scatterchartdata.term_significance:
			json_df['p'] = df['p']
		self._add_term_freq_to_json_df(json_df, df, category)
		json_df['s'] = percentile_min(df['color_scores'])
		json_df['os'] = df['color_scores']
		if not self.scatterchartdata.use_non_text_features:
			json_df['bg'] = self._get_corpus_characteristic_scores(json_df)

		# Highest scores characterize the category, lowest the not-category.
		# (Both lists previously used the same ascending sort and were
		# therefore identical.)
		category_terms = list(
			json_df.sort_values('s', ascending=False)['term'][:10])
		not_category_terms = list(
			json_df.sort_values('s', ascending=True)['term'][:10])
		if category_name is None:
			category_name = category
		if not_category_name is None:
			not_category_name = 'Not ' + category_name

		def better_title(x):
			# Title-case each whitespace-separated token.
			return ' '.join([t[0].upper() + t[1:].lower() for t in x.split()])

		j = {'info': {'category_name': better_title(category_name),
		              'not_category_name': better_title(not_category_name),
		              'category_terms': category_terms,
		              'not_category_terms': not_category_terms,
		              'category_internal_name': category}}
		j['data'] = json_df.sort_values(by=['x', 'y', 'term']).to_dict(orient='records')
		return j
Example #3
0
    def to_dict(self,
                category,
                category_name=None,
                not_category_name=None,
                scores=None,
                transform=percentile_alphabetical,
                title_case_names=False,
                not_categories=None):
        '''
        Build the dictionary that encodes the scatter chart.

        Marks the chart as used, so a second call raises.

        Parameters
        ----------
        category : str
            Category to annotate.  Exact value of category.
        category_name : str, optional
            Name of category which will appear on web site. Default None is
            same as category.
        not_category_name : str, optional
            Name of ~category which will appear on web site. Default None is
            same as "Not " + category_name.
        scores : np.array, optional
            Scores to use for coloring.  Passed through to
            self._term_rank_score_and_frequency_df, which picks the default
            when None (scaled f-scores according to the earlier docs --
            TODO confirm).
        transform : function, optional
            Function for ranking terms.  Only used when no x/y coordinates
            are already set on the chart.  Defaults to
            percentile_alphabetical.
        title_case_names : bool, default False
            Title case category name and not-category name?
        not_categories : list, optional
            List of categories to use as "not category".  Defaults to all
            others.  When given, every category that is neither `category`
            nor listed here is treated as neutral.

        Returns
        -------
        Dictionary that encodes the scatter chart
        information. The dictionary can be dumped as a json document, and
        used in scattertext.html
         {info: {category_name: ..., not_category_name: ..., ...},
          data: [{term:,
                  x:frequency [0-1],
                  y:frequency [0-1],
                  ox: score,
                  oy: score,
                  s: score,
                  os: original score,
                  p: p-val,
                  cat25k: freq per 25k in category,
                  cat: count in category,
                  ncat: count in non-category,
                  catdocs: [docnum, ...],
                  ncatdocs: [docnum, ...]
                  ncat25k: freq per 25k in non-category}, ...]}
        The data records are sorted by x, then y, then term.

        Raises
        ------
        Exception
            If this chart has already been used.
        AssertionError
            If not_categories names something outside the other categories.
        '''
        if self.used:
            raise Exception("Cannot reuse a ScatterChart constructor")
        self.used = True

        # The names returned by _get_category_names are handled below as
        # frequency column names ending in ' freq'; slicing the suffix off
        # recovers the bare category name.
        freq_suffix = ' freq'
        suffix_len = len(freq_suffix)

        all_categories, other_categories = self._get_category_names(category)
        neutral_categories = []
        if not_categories is not None:
            assert set(not_categories) - set(
                c[:-suffix_len] for c in other_categories) == set()
            other_categories = [c + freq_suffix for c in not_categories]
            neutral_categories = [
                c[:-suffix_len] for c in all_categories
                if c != category + freq_suffix and c not in other_categories
            ]
        df = self._term_rank_score_and_frequency_df(all_categories, category,
                                                    other_categories, scores)

        # Coordinates are computed only when none are set on the chart yet.
        if self.x_coords is None:
            self.x_coords, self.y_coords = (
                self._get_coordinates_from_transform_and_jitter_frequencies(
                    category, df, other_categories, transform))
            df['x'], df['y'] = self.x_coords, self.y_coords
            df['ox'], df['oy'] = self.x_coords, self.y_coords

        df['not cat freq'] = df[list(other_categories)].sum(axis=1)
        if neutral_categories != []:
            df['neut cat freq'] = df[[
                x + freq_suffix for x in neutral_categories
            ]].sum(axis=1).fillna(0)

        # Explicit copy: columns are assigned into json_df below, which on a
        # bare column slice triggers pandas' SettingWithCopyWarning.
        json_df = df[['x', 'y', 'ox', 'oy', 'term']].copy()

        if self.scatterchartdata.term_significance:
            json_df['p'] = df['p']
        self._add_term_freq_to_json_df(json_df, df, category)
        json_df['s'] = percentile_min(df['color_scores'])
        json_df['os'] = df['color_scores']
        if not self.scatterchartdata.use_non_text_features:
            json_df['bg'] = self._get_corpus_characteristic_scores(json_df)

        self._preform_axis_rescale(json_df, self._rescale_x, 'x')
        self._preform_axis_rescale(json_df, self._rescale_y, 'y')

        if self.scatterchartdata.terms_to_include is not None:
            json_df = self._use_only_selected_terms(json_df)

        # Highest scores characterize the category, lowest the not-category.
        # (Both lists previously used the same ascending sort and were
        # therefore identical.)
        category_terms = list(
            json_df.sort_values('s', ascending=False)['term'][:10])
        not_category_terms = list(
            json_df.sort_values('s', ascending=True)['term'][:10])
        if category_name is None:
            category_name = category
        if not_category_name is None:
            not_category_name = 'Not ' + category_name

        def better_title(x):
            # Title-case each whitespace-separated token when requested.
            if title_case_names:
                return ' '.join(
                    [t[0].upper() + t[1:].lower() for t in x.split()])
            else:
                return x

        j = {
            'info': {
                'category_name': better_title(category_name),
                'not_category_name': better_title(not_category_name),
                'category_terms': category_terms,
                'not_category_terms': not_category_terms,
                'category_internal_name': category,
                'not_category_internal_names':
                [c[:-suffix_len] for c in other_categories],
                'categories': self.term_doc_matrix.get_categories()
            }
        }
        j['data'] = json_df.sort_values(by=['x', 'y', 'term']).to_dict(
            orient='records')
        return j