def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            transform=percentile_alphabetical,
            title_case_names=False,
            not_categories=None,
            neutral_categories=None,
            extra_categories=None,
            background_scorer=None):
    '''
    Encode the scatter chart as a JSON-serializable dictionary.

    Parameters
    ----------
    category : str
        Category to annotate.  Exact value of category; must be one of
        self.term_doc_matrix.get_categories().
    category_name : str, optional
        Name of category which will appear on web site.
        Default None is same as category.
    not_category_name : str, optional
        Name of ~category which will appear on web site.
        Default None is same as "Not " + category_name.
    scores : np.array, optional
        Scores to use for coloring.  When None, they are obtained from
        self._get_default_scores (presumably RankDifference scores --
        that helper is not visible here).
    transform : function, optional
        Function for ranking terms.  Defaults to percentile_alphabetical.
    title_case_names : bool, default False
        Title case category name and no-category name?
    not_categories : list, optional
        List of categories to use as "not category".  Defaults to all
        categories other than `category`.  When this is None, any values
        passed for neutral_categories and extra_categories are discarded
        and both are treated as [].
    neutral_categories : list, optional
        List of categories to use as neutral.  When None (and
        not_categories is given), defaults to every category that is
        neither `category` nor in not_categories; extra_categories is
        then forced to [].
    extra_categories : list, optional
        List of categories to use as extra.  When None (and both
        not_categories and neutral_categories are given), defaults to
        every remaining category.
    background_scorer : CharacteristicScorer, optional
        Used for bg scores.

    Returns
    -------
    dict
        Dictionary that encodes the scatter chart information.  The
        dictionary can be dumped as a json document, and used in
        scattertext.html

        {info: {category_name: ..., not_category_name: ...,
                category_terms: [...], not_category_terms: [...], ...},
         data: [{term:, x:, y:, ox:, oy:, s: score, os: original score,
                 p: p-val (only if term significance is configured),
                 bg: background score (when available),
                 ...term-frequency fields added by
                 self._add_term_freq_to_json_df...}, ...]}

    Raises
    ------
    Exception
        If self.used is truthy.
    AssertionError
        If `category` is not a known category, or max_terms is not
        positive when it is applied.
    NoWordMeetsTermFrequencyRequirementsError
        If no term survives filtering.
    '''
    if self.used:
        raise Exception("Cannot reuse a ScatterChart constructor")
    # NOTE(review): self.used is checked above but never set in this
    # method -- confirm whether it is set elsewhere or whether
    # `self.used = True` was dropped by accident.

    all_categories = self.term_doc_matrix.get_categories()
    assert category in all_categories

    # Resolve the category partition.  Each branch fills in the first
    # unspecified group with "everything not yet assigned".
    if not_categories is None:
        not_categories = [c for c in all_categories if c != category]
        neutral_categories = []
        extra_categories = []
    elif neutral_categories is None:
        assigned = [category] + not_categories
        neutral_categories = [c for c in all_categories if c not in assigned]
        extra_categories = []
    elif extra_categories is None:
        assigned = [category] + not_categories + neutral_categories
        extra_categories = [c for c in all_categories if c not in assigned]
    all_categories = ([category] + not_categories
                      + neutral_categories + extra_categories)

    df = self._get_term_category_frequencies()
    self._add_x_and_y_coords_to_term_df_if_injected(df)

    if scores is None:
        scores = self._get_default_scores(category, not_categories, df)

    category_column_name = category + ' freq'
    not_category_columns = [c + ' freq' for c in not_categories]
    df['category score'] = CornerScore.get_scores_for_category(
        df[category_column_name],
        df[not_category_columns].sum(axis=1))
    if self.scatterchartdata.term_significance is not None:
        df['p'] = get_p_vals(df, category_column_name,
                             self.scatterchartdata.term_significance)
    df['not category score'] = CornerScore.get_scores_for_category(
        df[not_category_columns].sum(axis=1),
        df[category_column_name])
    df['color_scores'] = scores

    # Frequency / PMI filtering is skipped when the caller supplied an
    # explicit list of terms to include.
    if self.scatterchartdata.terms_to_include is None:
        df = self._filter_bigrams_by_minimum_not_category_term_freq(
            category_column_name, not_categories, df)
        df = filter_bigrams_by_pmis(
            self._filter_by_minimum_term_frequency(all_categories, df),
            threshold_coef=self.scatterchartdata.pmi_threshold_coefficient)

    if self.scatterchartdata.filter_unigrams:
        df = filter_out_unigrams_that_only_occur_in_one_bigram(df)
    if len(df) == 0:
        raise NoWordMeetsTermFrequencyRequirementsError()

    df['category score rank'] = rankdata(df['category score'],
                                         method='ordinal')
    df['not category score rank'] = rankdata(df['not category score'],
                                             method='ordinal')
    if (self.scatterchartdata.max_terms
            and self.scatterchartdata.max_terms < len(df)):
        assert self.scatterchartdata.max_terms > 0
        df = self._limit_max_terms(category, df)
    df = df.reset_index()

    # Coordinates are computed once and cached on the instance.
    if self.x_coords is None:
        self.x_coords, self.y_coords = (
            self._get_coordinates_from_transform_and_jitter_frequencies(
                category, df, not_categories, transform))
    df['x'], df['y'] = self.x_coords, self.y_coords
    # 'ox'/'oy' keep the coordinates as they were before axis rescaling.
    df['ox'], df['oy'] = self.x_coords, self.y_coords

    df['not cat freq'] = df[not_category_columns].sum(axis=1)
    if neutral_categories != []:
        df['neut cat freq'] = (
            df[[x + ' freq' for x in neutral_categories]]
            .sum(axis=1).fillna(0))
    if extra_categories != []:
        df['extra cat freq'] = (
            df[[x + ' freq' for x in extra_categories]]
            .sum(axis=1).fillna(0))

    # Explicit copy: columns are assigned into json_df below, and
    # writing into a bare slice of df triggers SettingWithCopyWarning.
    json_df = df[['x', 'y', 'ox', 'oy', 'term']].copy()
    if self.scatterchartdata.term_significance:
        json_df['p'] = df['p']
    self._add_term_freq_to_json_df(json_df, df, category)
    json_df['s'] = percentile_min(df['color_scores'])
    json_df['os'] = df['color_scores']

    if background_scorer:
        bg_scores = background_scorer.get_scores(self.term_doc_matrix)
        json_df['bg'] = bg_scores[1].loc[json_df.term].values
    elif not self.scatterchartdata.use_non_text_features:
        json_df['bg'] = self._get_corpus_characteristic_scores(json_df)

    self._preform_axis_rescale(json_df, self._rescale_x, 'x')
    self._preform_axis_rescale(json_df, self._rescale_y, 'y')

    if self.scatterchartdata.terms_to_include is not None:
        json_df = self._use_only_selected_terms(json_df)

    # Highest-scoring terms characterise the category, lowest-scoring
    # terms the not-category.  (Previously both lists were built with the
    # same ascending sort and were therefore identical.)
    category_terms = list(
        json_df.sort_values('s', ascending=False)['term'][:10])
    not_category_terms = list(
        json_df.sort_values('s', ascending=True)['term'][:10])

    if category_name is None:
        category_name = category
    if not_category_name is None:
        not_category_name = 'Not ' + category_name

    def better_title(x):
        # Title-case each whitespace-separated token, only when requested.
        if title_case_names:
            return ' '.join(
                [t[0].upper() + t[1:].lower() for t in x.split()])
        else:
            return x

    j = {
        'info': {
            'category_name': better_title(category_name),
            'not_category_name': better_title(not_category_name),
            'category_terms': category_terms,
            'not_category_terms': not_category_terms,
            'category_internal_name': category,
            'not_category_internal_names': not_categories,
            'categories': self.term_doc_matrix.get_categories(),
            'neutral_category_internal_names': neutral_categories,
            'extra_category_internal_names': extra_categories
        }
    }
    if self.metadata_term_lists is not None:
        j['metalists'] = self.metadata_term_lists
    if self.metadata_descriptions is not None:
        j['metadescriptions'] = self.metadata_descriptions
    if self.term_colors is not None:
        j['info']['term_colors'] = self.term_colors
    # Records are emitted in data-frame order (not sorted by x/y/term).
    j['data'] = json_df.to_dict(orient='records')
    return j
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            transform=percentile_alphabetical):
    '''
    Encode the scatter chart as a JSON-serializable dictionary.

    Parameters
    ----------
    category : str
        Category to annotate.  Exact value of category.
    category_name : str, optional
        Name of category which will appear on web site.
        Default None is same as category.
    not_category_name : str, optional
        Name of ~category which will appear on web site.
        Default None is same as "Not " + category_name.
    scores : np.array, optional
        Scores to use for coloring.  Passed through to
        self._term_rank_score_and_frequency_df, which decides the default
        when None (presumably scaled F-scores -- that helper is not
        visible here).
    transform : function, optional
        Function for ranking terms.  Defaults to percentile_alphabetical.

    Returns
    -------
    dict
        Dictionary that encodes the scatter chart information.  The
        dictionary can be dumped as a json document, and used in
        scattertext.html

        {info: {category_name: ..., not_category_name: ...,
                category_terms: [...], not_category_terms: [...],
                category_internal_name: ...},
         data: [{term:, x:, y:, s: score, os: original score,
                 p: p-val (only if term significance is configured),
                 bg: background score (text features only),
                 ...term-frequency fields added by
                 self._add_term_freq_to_json_df...}, ...]}

        The data records are sorted by x, then y, then term.  The display
        names in `info` are always title-cased.
    '''
    all_categories, other_categories = self._get_category_names(category)
    df = self._term_rank_score_and_frequency_df(all_categories,
                                                category, scores)

    # Coordinates are computed once and cached on the instance.
    if self.x_coords is None:
        self.x_coords, self.y_coords = (
            self._get_coordinates_from_transform_and_jitter_frequencies(
                category, df, other_categories, transform))
    df['x'], df['y'] = self.x_coords, self.y_coords
    df['not cat freq'] = df[list(other_categories)].sum(axis=1)

    # Explicit copy: columns are assigned into json_df below, and
    # writing into a bare slice of df triggers SettingWithCopyWarning.
    json_df = df[['x', 'y', 'term']].copy()
    if self.scatterchartdata.term_significance:
        json_df['p'] = df['p']
    self._add_term_freq_to_json_df(json_df, df, category)
    json_df['s'] = percentile_min(df['color_scores'])
    json_df['os'] = df['color_scores']
    if not self.scatterchartdata.use_non_text_features:
        json_df['bg'] = self._get_corpus_characteristic_scores(json_df)

    # Highest-scoring terms characterise the category, lowest-scoring
    # terms the not-category.  (Previously both lists were built with the
    # same ascending sort and were therefore identical.)
    category_terms = list(
        json_df.sort_values('s', ascending=False)['term'][:10])
    not_category_terms = list(
        json_df.sort_values('s', ascending=True)['term'][:10])

    if category_name is None:
        category_name = category
    if not_category_name is None:
        not_category_name = 'Not ' + category_name

    def better_title(x):
        # Title-case each whitespace-separated token.
        return ' '.join([t[0].upper() + t[1:].lower() for t in x.split()])

    j = {
        'info': {
            'category_name': better_title(category_name),
            'not_category_name': better_title(not_category_name),
            'category_terms': category_terms,
            'not_category_terms': not_category_terms,
            'category_internal_name': category
        }
    }
    j['data'] = (json_df.sort_values(by=['x', 'y', 'term'])
                 .to_dict(orient='records'))
    return j
def to_dict(self,
            category,
            category_name=None,
            not_category_name=None,
            scores=None,
            transform=percentile_alphabetical,
            title_case_names=False,
            not_categories=None):
    '''
    Encode the scatter chart as a JSON-serializable dictionary.

    A chart object may only be serialized once: the first call marks the
    instance as used and any later call raises.

    Parameters
    ----------
    category : str
        Category to annotate.  Exact value of category.
    category_name : str, optional
        Name of category which will appear on web site.
        Default None is same as category.
    not_category_name : str, optional
        Name of ~category which will appear on web site.
        Default None is same as "Not " + category_name.
    scores : np.array, optional
        Scores to use for coloring.  Passed through to
        self._term_rank_score_and_frequency_df, which decides the default
        when None (presumably scaled F-scores -- that helper is not
        visible here).
    transform : function, optional
        Function for ranking terms.  Defaults to percentile_alphabetical.
    title_case_names : bool, default False
        Title case category name and no-category name?
    not_categories : list, optional
        List of categories to use as "not category".  Defaults to all
        others.  When given, every remaining category (neither `category`
        nor listed here) is treated as neutral.

    Returns
    -------
    dict
        Dictionary that encodes the scatter chart information.  The
        dictionary can be dumped as a json document, and used in
        scattertext.html

        {info: {category_name: ..., not_category_name: ...,
                category_terms: [...], not_category_terms: [...], ...},
         data: [{term:, x:, y:, ox:, oy:, s: score, os: original score,
                 p: p-val (only if term significance is configured),
                 bg: background score (text features only),
                 ...term-frequency fields added by
                 self._add_term_freq_to_json_df...}, ...]}

        The data records are sorted by x, then y, then term.

    Raises
    ------
    Exception
        If this chart object has already been serialized.
    AssertionError
        If not_categories names a category that is not among the
        non-target categories.
    '''
    if self.used:
        raise Exception("Cannot reuse a ScatterChart constructor")
    self.used = True

    # presumably these are frequency-column names ending in ' freq';
    # the [:-5] slices below strip that 5-character suffix.
    all_categories, other_categories = self._get_category_names(category)
    neutral_categories = []
    if not_categories is not None:
        known_other = set(c[:-5] for c in other_categories)
        assert set(not_categories) <= known_other
        other_categories = [c + ' freq' for c in not_categories]
        neutral_categories = [
            c[:-5] for c in all_categories
            if c != category + ' freq' and c not in other_categories
        ]

    df = self._term_rank_score_and_frequency_df(
        all_categories, category, other_categories, scores)

    # Coordinates are computed once and cached on the instance.
    if self.x_coords is None:
        self.x_coords, self.y_coords = (
            self._get_coordinates_from_transform_and_jitter_frequencies(
                category, df, other_categories, transform))
    df['x'], df['y'] = self.x_coords, self.y_coords
    # 'ox'/'oy' keep the coordinates as they were before axis rescaling.
    df['ox'], df['oy'] = self.x_coords, self.y_coords

    df['not cat freq'] = df[list(other_categories)].sum(axis=1)
    if neutral_categories != []:
        df['neut cat freq'] = (
            df[[x + ' freq' for x in neutral_categories]]
            .sum(axis=1).fillna(0))

    # Explicit copy: columns are assigned into json_df below, and
    # writing into a bare slice of df triggers SettingWithCopyWarning.
    json_df = df[['x', 'y', 'ox', 'oy', 'term']].copy()
    if self.scatterchartdata.term_significance:
        json_df['p'] = df['p']
    self._add_term_freq_to_json_df(json_df, df, category)
    json_df['s'] = percentile_min(df['color_scores'])
    json_df['os'] = df['color_scores']
    if not self.scatterchartdata.use_non_text_features:
        json_df['bg'] = self._get_corpus_characteristic_scores(json_df)

    self._preform_axis_rescale(json_df, self._rescale_x, 'x')
    self._preform_axis_rescale(json_df, self._rescale_y, 'y')

    if self.scatterchartdata.terms_to_include is not None:
        json_df = self._use_only_selected_terms(json_df)

    # Highest-scoring terms characterise the category, lowest-scoring
    # terms the not-category.  (Previously both lists were built with the
    # same ascending sort and were therefore identical.)
    category_terms = list(
        json_df.sort_values('s', ascending=False)['term'][:10])
    not_category_terms = list(
        json_df.sort_values('s', ascending=True)['term'][:10])

    if category_name is None:
        category_name = category
    if not_category_name is None:
        not_category_name = 'Not ' + category_name

    def better_title(x):
        # Title-case each whitespace-separated token, only when requested.
        if title_case_names:
            return ' '.join(
                [t[0].upper() + t[1:].lower() for t in x.split()])
        else:
            return x

    j = {
        'info': {
            'category_name': better_title(category_name),
            'not_category_name': better_title(not_category_name),
            'category_terms': category_terms,
            'not_category_terms': not_category_terms,
            'category_internal_name': category,
            'not_category_internal_names': [c[:-5]
                                            for c in other_categories],
            'categories': self.term_doc_matrix.get_categories()
        }
    }
    j['data'] = (json_df.sort_values(by=['x', 'y', 'term'])
                 .to_dict(orient='records'))
    return j